In [2]:
import pandas as pd
import numpy as np
import geopandas as gpd
import psycopg2
from geoalchemy2 import Geometry, WKTElement
from sqlalchemy import *
from shapely.geometry import MultiPolygon
from zipfile import ZipFile
import requests 
import sys
from tqdm import tqdm

In [3]:
import yaml

with open('../../config/postgres.yaml') as f:
    engine_configs = yaml.load(f, Loader=yaml.FullLoader)
    
try:
    engine = create_engine('postgresql://{username}:{password}@{host}:{port}/{dbname}'.format(**engine_configs))
except Exception as e:
    print("Uh oh, can't connect. Invalid dbname, user or password?")
    print(e)

In [4]:
def process_geometry_SQL_insert(gdf):
    gdf['geom'] = gdf['geometry'].apply(lambda x: WKTElement((MultiPolygon([x]) if x.geom_type == 'Polygon' else x).wkt, srid=4326))
    gdf = gdf.drop('geometry', 1)
    return gdf

In [5]:
CITY='bogota'
NEIGHBORHOOD_SIZE = 805 # 805 OR 1609

In [5]:
bounds_gdf = gpd.read_file('../../data/bogota/boundary/boundary.gpkg')
bounds_gdf = bounds_gdf[['geometry']]
bounds_gdf = bounds_gdf.to_crs("EPSG:4326") 
bounds_gdf['city'] = CITY
bounds_gdf.head()

Unnamed: 0,geometry,city
0,"MULTIPOLYGON (((-74.12742 4.47120, -74.12799 4...",bogota1m


In [6]:
insert_gdf = process_geometry_SQL_insert(bounds_gdf)
insert_gdf.to_sql('boundary', engine, if_exists='append', index=False, dtype={'geom': Geometry('MultiPolygon', srid=4326)})

## Spatial groups and blocks_group

In [7]:
block_groups_gdf = gpd.read_file('../../data/bogota/blocks_group/barrios_2014_merged.geojson')
#No rural
block_groups_gdf = block_groups_gdf[block_groups_gdf['SCaTipo']!=1]
block_groups_gdf = block_groups_gdf[['SCaCodigo', 'geometry', 'SCaNombre']]
#block_groups_gdf = block_groups_gdf.to_crs({'init': 'epsg:4326'}) 

#Remove corridors
block_groups_gdf = block_groups_gdf[block_groups_gdf['SCaCodigo'] != '004572']

# Remove isolated points
block_groups_gdf = block_groups_gdf[~block_groups_gdf['SCaCodigo'].isin({'009138', '009139', '009140', '009141', '009142', '002585', '004624', '006533', '008545', '008221', '002636', '009261'})]

# Lack of data
block_groups_gdf = block_groups_gdf[~block_groups_gdf['SCaCodigo'].isin({'208110'})]

# Too small
block_groups_gdf = block_groups_gdf[~block_groups_gdf['SCaCodigo'].isin({'008319', '008419', '008424'})]

# Some issues with the ids
block_groups_gdf['SCaCodigo'] = block_groups_gdf['SCaCodigo'].fillna(0)
import random

for i, row in block_groups_gdf[block_groups_gdf['SCaCodigo'].isin({'006511', '002456'})].iterrows():
    block_groups_gdf.loc[i, 'SCaCodigo'] = str(random.randint(0, 1000000))

block_groups_gdf.head()

Unnamed: 0,SCaCodigo,geometry,SCaNombre
0,4316,"MULTIPOLYGON (((-74.17498 4.63674, -74.17298 4...",LAS MARGARITAS
1,4609,"MULTIPOLYGON (((-74.17828 4.64850, -74.17771 4...",LOS ALMENDROS
2,4607,"MULTIPOLYGON (((-74.17213 4.63790, -74.17549 4...",CIUDAD DE CALI
3,4612,"MULTIPOLYGON (((-74.17617 4.63847, -74.17599 4...",DINDALITO
4,4601,"MULTIPOLYGON (((-74.17042 4.64452, -74.17068 4...",CALANDAIMA


In [8]:
block_groups_gdf = gpd.sjoin(block_groups_gdf, bounds_gdf, how="inner", op='intersects').drop('index_right', axis=1)

In [9]:
block_groups_gdf = block_groups_gdf.rename(columns={'SCaCodigo': 'original_id'})
block_groups_gdf['city'] = CITY

In [10]:
insert_gdf = process_geometry_SQL_insert(block_groups_gdf.drop('SCaNombre', axis=1))
insert_gdf.to_sql('temptable_{}'.format(CITY.lower()), engine, if_exists='replace', index=False, dtype={'geom': Geometry('MultiPolygon', srid=4326)})

In [11]:
sql = """
INSERT INTO blocks_group (original_id, city, geom) 
SELECT s.original_id, s.city, ST_Multi(ST_Intersection(s.geom, b.geom))
FROM temptable_{tempname} as s
INNER JOIN boundary b ON ST_Intersects(s.geom, b.geom) AND NOT ST_Touches(s.geom, b.geom) AND s.city=b.city
where s.city='{city}' and ST_Area(ST_Intersection(s.geom, b.geom))/ST_Area(s.geom) >= 0.5
AND ST_GeometryType(ST_Multi(ST_Intersection(s.geom, b.geom))) = 'ST_MultiPolygon';
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

### Neighborhoods

In [12]:
sql = """INSERT INTO spatial_groups (city, core_geom, core_id, lower_ids, spatial_name, approx_geom)
SELECT  a.city, a.geom as core_geom, a.bid as core_id, array_agg(b.bid), 'ego', ST_multi(ST_Union(b.geom))
FROM blocks_group a
INNER JOIN blocks_group b ON a.city = b.city AND (a.bid = b.bid OR ST_DWithin(a.geom::geography, ST_Centroid(b.geom)::geography, {distance}) OR st_touches(a.geom, b.geom)) 
where a.city='{city}'
GROUP BY a.bid, a.geom, a.city;
delete from spatial_groups where ST_Area(approx_geom::geography) < 250000 and spatial_name='ego';
""".format(city=CITY, tempname=CITY.lower(), distance=NEIGHBORHOOD_SIZE)

result = engine.execute(text(sql))

## Blocks

In [13]:
block_gdf = gpd.read_file('zip://../../data/bogota/block/Manzana.zip')
block_gdf = block_gdf[['geometry', 'MANZ_CCNCT']]
block_gdf.head()

Unnamed: 0,geometry,MANZ_CCNCT
0,"POLYGON ((-74.04383 4.81932, -74.04293 4.81872...",1100110000000091240208
1,"POLYGON ((-74.05200 4.81899, -74.05180 4.81911...",1100110000000091240217
2,"POLYGON ((-74.05572 4.81415, -74.05575 4.81418...",1100110000000091240224
3,"POLYGON ((-74.04858 4.81827, -74.04828 4.81819...",1100110000000091240210
4,"POLYGON ((-74.03558 4.81683, -74.03387 4.81625...",1100110000000085370129


In [14]:
insert_gdf = process_geometry_SQL_insert(block_gdf)
insert_gdf.to_sql('temptable_{}'.format(CITY.lower()), engine, if_exists='replace', index=False, dtype={'geom': Geometry('MultiPolygon', srid=4326)})

In [15]:
sql = """
insert into block (sp_id, geom, city, geog, greater_1sm) 
select bid, geom, city, geom::geography, ST_AREA(geom::geography)>2.59e+6 
from(
    SELECT bid, st_multi(geom) as geom, city, ROW_NUMBER() OVER (PARTITION BY geom ORDER by area DESC) AS r
    from (
        select b.bid, c.geom, b.city, ST_Area(ST_Intersection(b.geom, c.geom)) as area
        from temptable_{tempname} as c
        inner join blocks_group as b on ST_Intersects(b.geom, c.geom)
        where b.city = '{city}' AND ST_Area(ST_Intersection(b.geom, c.geom))/ST_Area(c.geom) >= 0.5
    ) as dtable
) x
WHERE x.r = 1;
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

In [16]:
sql = """
UPDATE block AS b SET geom=ST_Multi(ST_Intersection(b.geom, s.geom))
FROM boundary AS s
WHERE ST_Intersects(b.geom, s.geom) AND b.city=s.city AND s.city='{city}' AND (NOT ST_Contains(s.geom, b.geom))
AND ST_GeometryType(ST_Intersection(b.geom, s.geom)) = 'ST_MultiPolygon';;
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

In [17]:
sql = """
DELETE FROM block as b
USING boundary AS s
WHERE ST_Intersects(b.geom, s.geom) AND b.city=s.city AND s.city='{city}' 
AND ST_GeometryType(ST_Multi(ST_Intersection(b.geom, s.geom))) != 'ST_MultiPolygon';
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

In [18]:
## Some blocks_group do not have blocks
sql = """
DELETE FROM blocks_group bg
WHERE NOT EXISTS(SELECT * FROM block b WHERE b.sp_id = bg.bid AND b.city = bg.city) AND bg.city='{city}';
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

## Additional for bogota

In [19]:
## Create spatial_group shapefiles
sql = """
select bid, census_block, original_id
from(
    SELECT bid, census_block, original_id, ROW_NUMBER() OVER (PARTITION BY geom ORDER by area DESC) AS r
    from (
        select b.bid, c.geom, c."MANZ_CCNCT" as census_block, b.original_id, ST_Area(ST_Intersection(b.geom, c.geom)) as area
        from temptable_{tempname} as c
        inner join blocks_group as b on ST_Intersects(b.geom, c.geom)
        where b.city = '{city}'
    ) as dtable
) x
WHERE x.r = 1;
""".format(city=CITY, tempname=CITY.lower())

census_block_to_barrios = pd.read_sql(sql, engine)
census_block_to_barrios.head()

Unnamed: 0,bid,census_block,original_id
0,500853,1100110000000025690212,2599
1,500853,1100110000000025690228,2599
2,500853,1100110000000025690227,2599
3,500853,1100110000000025690225,2599
4,500853,1100110000000025690226,2599


In [20]:
def census2barrios(census_block_to_barrios, census_df):
    census_df = census_df.rename(columns={'original_id': 'census_block'})
    merged_df = pd.merge(census_block_to_barrios, census_df, on='census_block')
    
    merged_df = merged_df.groupby('original_id', as_index=False).sum()
    return merged_df

## Census

In [21]:
census_df = pd.read_excel('../../data/bogota/census/CENSO2005_BOGOTA_VIHOPE_AG.xlsx').reset_index()
census_df.columns = census_df.iloc[0].values
census_df = census_df.iloc[1:]

census_df['original_id'] = census_df['REDCODE'].str[:6].astype(str) + census_df['REDCODE'].str[8:]
census_df['inforce'] = census_df['TRABAJÓ'].astype(int) + census_df['NO TRABAJÓ PERO TENIA TRABAJO'].astype(int) + census_df['BUSCO TRABAJO PERO HABIA TRABAJADO ANTES'].astype(int) + census_df['BUSCO TRABAJO POR PRIMERA VEZ'].astype(int) 
census_df['employed'] = census_df['TRABAJÓ'].astype(int) #+ census_df['NO TRABAJÓ PERO TENIA TRABAJO'].astype(int)
census_df['population'] = census_df['PERSONAS'].astype(int)
census_df['dwellings'] = census_df['VIVIENDAS'].astype(int)

# Fix for barrios
census_df = census2barrios(census_block_to_barrios, census_df)
## Fix for Sectors
#census_df.loc[:, 'original_id'] = census_df['original_id'].str[:-2]
#census_df = census_df.groupby('original_id', as_index=False).sum()

census_df['armed'] = 0
census_df = census_df[['original_id', 'population', 'inforce', 'employed', 'armed', 'dwellings']]

census_df.head()

Unnamed: 0,original_id,population,inforce,employed,armed,dwellings
0,0,14,7,5,0,3
1,1101,4212,1862,1720,0,1287
2,1102,7812,3164,2865,0,2033
3,1103,6994,2816,2557,0,1833
4,1104,722,295,256,0,176


In [22]:
census_df.to_sql('temptable_{}'.format(CITY.lower()), engine, if_exists='replace', index=False)

In [23]:
sql = """
insert into census (bid, population, employed, inforce, tot_survey, dwellings, city) 
select b.bid, c.population, c.employed+c.armed, c.inforce, c.population, c.dwellings, '{city}' 
from temptable_{tempname} c 
inner join blocks_group b on b.original_id = c.original_id
where b.city='{city}';
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

### Residential stability

In [24]:
stab_df = pd.read_csv('../../data/bogota/residential_stability/residential_stability.csv', dtype={'ccnct': str})

stab_df['total'] = stab_df['changed'] + stab_df['nochanged']
stab_df['stable'] = stab_df['nochanged']
stab_df['original_id'] = stab_df['ccnct']
stab_df = stab_df[['original_id', 'stable', 'total']]

stab_df['original_id'] = stab_df['original_id'].str[:6].astype(str) + stab_df['original_id'].str[8:]

stab_df['total'] = stab_df['total'].astype(int)
stab_df['stable'] = stab_df['stable'].astype(int)
stab_df['total2'] = stab_df['total']
stab_df['owner'] = stab_df['total']

# Fix for barrios
stab_df = census2barrios(census_block_to_barrios, stab_df)

stab_df.head()

Unnamed: 0,original_id,bid,stable,total,total2,owner
0,0,501165,8,14,14,14
1,1101,3502352,2542,4212,4212,4212
2,1102,19524102,4786,7612,7612,7612
3,1103,26038948,4689,6743,6743,6743
4,1104,5003600,613,722,722,722


In [25]:
stab_df.to_sql('temptable_{}'.format(CITY.lower()), engine, if_exists='replace', index=False)

In [26]:
sql = """
INSERT INTO residential_stability (bid, city, total, stable, total2, owner) 
SELECT b.bid, '{city}', c.total, c.stable, c.total2, c.owner 
FROM temptable_{tempname} c 
INNER JOIN blocks_group b ON b.original_id = c.original_id
where b.city='{city}';
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

### Ethnic diversity

In [27]:
eth_df = pd.read_csv('../../data/bogota/ethnic_diversity/ethnic_diversity.csv', dtype={'ccnct': str})
eth_df = eth_df.rename(columns={
    'ccnct': 'original_id'
})
eth_df['original_id'] = eth_df['original_id'].str[:6].astype(str) + eth_df['original_id'].str[8:]


# Fix for barrios
eth_df = census2barrios(census_block_to_barrios, eth_df)
eth_df.head()

Unnamed: 0,original_id,bid,indigena,rom,raizal,palanguero,negro,ninguno
0,0,501165,0,0,0,0,0,14
1,1101,3502352,15,0,2,0,156,4038
2,1102,19524102,12,0,0,0,191,7409
3,1103,26038948,27,0,1,0,105,6610
4,1104,5003600,0,0,0,0,1,721


In [28]:
eth_df.to_sql('temptable_{}'.format(CITY.lower()), engine, if_exists='replace', index=False)

In [29]:
sql = """
INSERT INTO ethnic_diversity (bid, city, race1, race2, race3, race4, race5, race6) 
SELECT b.bid, '{city}', c.ninguno, c.negro, c.indigena, c.raizal, c.palanguero, c.rom
FROM temptable_{tempname} c 
INNER JOIN blocks_group b ON b.original_id = c.original_id
where b.city='{city}';
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

### Poverty

In [30]:
pov_gdf = gpd.read_file('zip://../../data/bogota/poverty/povert_filtered.zip')
pov_gdf = pov_gdf[(~(pov_gdf['geometry'].isnull()))]

pov_gdf.crs = {'init': 'epsg:4326'}

pov_gdf.head()

  return _prepare_from_string(" ".join(pjargs))


Unnamed: 0,npersons,geometry
0,4,"POLYGON ((-74.33891 4.02470, -74.33886 4.02469..."
1,0,"POLYGON ((-74.32786 4.13038, -74.32783 4.13037..."
2,43,"POLYGON ((-74.31458 4.01474, -74.31456 4.01471..."
3,55,"POLYGON ((-74.11416 4.18183, -74.11413 4.18182..."
4,41,"POLYGON ((-74.15319 4.19681, -74.15319 4.19663..."


In [31]:
pov_gdf = gpd.read_file('../../data/bogota/poverty/poverty_SISBEN.gpkg')
pov_gdf = pov_gdf.rename(columns={'PERSONASENCUESTADAS': 'npersons'})
pov_gdf.head()

Unnamed: 0,SCACODIGO,npersons,geometry
0,4207,31.0,"MULTIPOLYGON (((-74.10587 4.61856, -74.10603 4..."
1,5615,3921.0,"MULTIPOLYGON (((-74.10289 4.69350, -74.10293 4..."
2,205318,1492.0,"MULTIPOLYGON (((-74.21200 4.61744, -74.21179 4..."
3,101307,0.0,"MULTIPOLYGON (((-74.08001 4.50041, -74.08015 4..."
4,2604,1323.0,"MULTIPOLYGON (((-74.10237 4.51860, -74.10225 4..."


In [32]:
insert_gdf = process_geometry_SQL_insert(pov_gdf)
insert_gdf.to_sql('temptable_{}'.format(CITY.lower()), engine, if_exists='replace', index=False, dtype={'geom': Geometry('MultiPolygon', srid=4326)})

In [33]:
sql = """
UPDATE temptable_{tempname} p SET geom=ST_Multi(ST_buffer(p.geom, 0.0)) 
WHERE (NOT ST_IsValid(p.geom)) ;
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

In [34]:
sql = """
INSERT INTO poverty_index (bid, city, total, poors) 
SELECT b.bid, b.city, COALESCE(ce.population, 0), SUM(COALESCE((ST_AREA(ST_INTERSECTION(c.geom, b.geom))/ST_AREA(c.geom)*npersons::float), 0))
FROM blocks_group b
LEFT JOIN census ce ON ce.bid = b.bid AND ce.city = b.city
LEFT JOIN temptable_{tempname} c ON ST_INTERSECTS(c.geom, b.geom) and not st_touches(c.geom, b.geom)
WHERE b.city = '{city}'
GROUP by b.bid, b.city, population;
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

## Buildings

In [35]:
bld_gdf = gpd.read_file('zip://../../data/bogota/buildings/Cons.zip')

bld_gdf = bld_gdf.rename(columns={'ConNPisos': 'floors'})
bld_gdf = bld_gdf[(~((bld_gdf['floors'] == 1) & (bld_gdf['floors'] == bld_gdf['ConNSotano']))) & ((bld_gdf['floors'] > 0))]

bld_gdf = bld_gdf[['floors', 'geometry', 'ConCodigo']]
#bld_gdf = bld_gdf.to_crs("EPSG:4326") 

bld_gdf.head()

Unnamed: 0,floors,geometry,ConCodigo
0,3,"POLYGON ((-74.06812 4.62824, -74.06809 4.62824...",0081010010080000000000000
1,2,"POLYGON ((-74.06798 4.62810, -74.06798 4.62810...",0081010010090000000000000
2,2,"POLYGON ((-74.06794 4.62807, -74.06791 4.62804...",0081010010100000000000000
3,1,"POLYGON ((-74.06813 4.62805, -74.06813 4.62805...",0081010010050000000000000
4,3,"POLYGON ((-74.06795 4.62796, -74.06794 4.62797...",0081010010040000000000000


In [36]:
len(bld_gdf)

2295275

In [37]:
insert_gdf = process_geometry_SQL_insert(bld_gdf)
insert_gdf.to_sql('temptable_{}'.format(CITY.lower()), engine, if_exists='replace', index=False, dtype={'geom': Geometry('MultiPolygon', srid=4326)})

In [38]:
sql = """
UPDATE temptable_{tempname} p SET geom=ST_Multi(ST_Transform(ST_buffer(p.geom, 0.0), 4326));
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

In [39]:
sql = """
INSERT INTO building (bid, city, geom, floors, area) 
SELECT bid, '{city}', geom, floors, barea
FROM (
    SELECT bid, geom, floors, barea, ROW_NUMBER() OVER (PARTITION BY geom ORDER BY area DESC) AS r
    from (
        SELECT p.geom, p.floors, ST_Area(p.geom::geography) as barea, d.bid, ST_Area(ST_Intersection(p.geom, d.geom)) as area
        FROM temptable_{tempname} as p
        INNER JOIN blocks_group as d on ST_Intersects(p.geom, d.geom) AND NOT ST_Touches(p.geom, d.geom)
        WHERE d.city = '{city}' AND ST_Area(p.geom::geography) >= 40
        ) as dtable
    order by area
) x
WHERE x.r = 1;
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

## Land use

In [40]:
land_gdf = gpd.read_file('../../data/bogota/land_use/Lots_2014.gpkg')
land_gdf = land_gdf[(~(land_gdf['LotCodigo'].isnull()))]

land_gdf = land_gdf[['LotCodigo', 'geometry']]

#land_gdf = land_gdf.to_crs({'init': 'epsg:4326'}) 

land_gdf.head()

Unnamed: 0,LotCodigo,geometry
0,4597039009,"MULTIPOLYGON (((-74.20334 4.60793, -74.20339 4..."
1,4593071010,"MULTIPOLYGON (((-74.18929 4.62773, -74.18935 4..."
2,4597039035,"MULTIPOLYGON (((-74.20295 4.60799, -74.20300 4..."
3,4597039020,"MULTIPOLYGON (((-74.20312 4.60792, -74.20315 4..."
4,1401046024,"MULTIPOLYGON (((-74.09880 4.57604, -74.09883 4..."


In [41]:
land_use_df = pd.read_csv('../../data/bogota/land_use/uso.csv.zip', dtype={'UsoCLote': str, 'UsoTUso': str})
land_use_df = land_use_df.rename(columns={
    'UsoArea': 'sqftmain', 
    'UsoTUso': 'usecode',
    'UsoCLote': 'LotCodigo'
})
land_use_df.head()

Unnamed: 0,OBJECTID,LotCodigo,usecode,sqftmain
0,1,5402013032,4,40.5
1,2,5626003014,10,1171.8
2,3,2205012067,1,129.8
3,4,9203069030,1,75.0
4,5,2430034026,22,91.61


In [42]:
'''
from simpledbf import Dbf5

dbf = Dbf5('../../data/bogota/land_use/Uso.dbf')
land_use_df = dbf.to_dataframe()

land_use_df = land_use_df.rename(columns={
    'USOAREA': 'sqftmain', 
    'USOTUSO': 'usecode',
    'USOCLOTE': 'LotCodigo'
})#.drop('OBJECTID', axis=1)

land_use_df.head()
'''

"\nfrom simpledbf import Dbf5\n\ndbf = Dbf5('../../data/bogota/land_use/Uso.dbf')\nland_use_df = dbf.to_dataframe()\n\nland_use_df = land_use_df.rename(columns={\n    'USOAREA': 'sqftmain', \n    'USOTUSO': 'usecode',\n    'USOCLOTE': 'LotCodigo'\n})#.drop('OBJECTID', axis=1)\n\nland_use_df.head()\n"

In [43]:
land_gdf = pd.merge(land_gdf, land_use_df, on='LotCodigo', how='left')
land_gdf.loc[land_gdf['usecode'].isnull(), 'usecode'] = '999' # In bogota land use is about buildings. Missing lote are correlated with parks and sport courts

In [44]:
land_gdf['landuse'] = 'none'

land_gdf.loc[land_gdf['usecode'].isin({'001', '002', '037', '038'}), 'landuse'] = 'residential'

# Be careful of the NOT in the query
land_gdf.loc[~land_gdf['usecode'].isin({'001', '002', '023', '029', '030', '031', '032', '036', '037', '038', '047', '048', '052', '065', '090'}), 'landuse'] = 'commercial'

land_gdf.loc[land_gdf['usecode'].isin({'023', '029', '030', '031', '032', '036',  '047', '052', '065', '999'}), 'landuse'] = 'recreational'
land_gdf.loc[land_gdf['usecode'].isin({'090', '048'}), 'landuse'] = 'vacant'

In [45]:
land_gdf = land_gdf.reset_index()
land_gdf.head()

Unnamed: 0,index,LotCodigo,geometry,OBJECTID,usecode,sqftmain,landuse
0,0,4597039009,"MULTIPOLYGON (((-74.20334 4.60793, -74.20339 4...",209953.0,1,145.0,residential
1,1,4593071010,"MULTIPOLYGON (((-74.18929 4.62773, -74.18935 4...",218802.0,1,196.09,residential
2,2,4597039035,"MULTIPOLYGON (((-74.20295 4.60799, -74.20300 4...",800131.0,1,796.0,residential
3,3,4597039020,"MULTIPOLYGON (((-74.20312 4.60792, -74.20315 4...",273787.0,1,204.0,residential
4,4,1401046024,"MULTIPOLYGON (((-74.09880 4.57604, -74.09883 4...",666530.0,1,3692.0,residential


In [46]:
ins_gdf = process_geometry_SQL_insert(land_gdf)
ins_gdf.to_sql('temptable_{}'.format(CITY.lower()), engine, if_exists='replace', index=False, dtype={'geom': Geometry('MultiPolygon', srid=4326)})

In [47]:
sql = """
UPDATE temptable_{tempname} p SET geom=ST_Multi(ST_buffer(p.geom, 0.0))  
WHERE NOT ST_Isvalid(p.geom);
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

In [48]:
sql = """
INSERT INTO land_uses (bid, city, use_type, area) 
SELECT bid, '{city}', landuse, SUM(sqftmain) 
FROM (
    SELECT bid, landuse, sqftmain, ROW_NUMBER() OVER (PARTITION BY index ORDER BY area DESC) AS r
    from (
        SELECT p.index, p.\"OBJECTID\" as pid, p.landuse, ST_AREA(p.geom::geography) as sqftmain, d.bid, ST_Area(ST_Intersection(p.geom, d.geom)) as area
        FROM temptable_{tempname} as p
        INNER JOIN blocks_group as d on ST_Intersects(p.geom, d.geom) AND NOT ST_Touches(p.geom, d.geom)
        WHERE d.city = '{city}' AND p.landuse <> 'none' AND ST_Isvalid(p.geom)
        ) as dtable
) x
WHERE x.r = 1
GROUP BY bid, landuse;
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

## Income (Extra)

In [49]:
from simpledbf import Dbf5

dbf = Dbf5('../../data/bogota/poverty/ESoc.dbf')
strata_df = dbf.to_dataframe()

strata_df = strata_df.rename(columns={
    'ESoCLote': 'LotCodigo',
}).drop('OBJECTID', axis=1)#.drop('ESoChip', axis=1)

strata_df.head()

PyTables is not installed. No support for HDF output.


Unnamed: 0,LotCodigo,ESoChip,ESoEstrato
0,4630014009,AAA0148WSAW,1
1,4630014011,AAA0148WSCN,1
2,4630014015,AAA0148WSHK,1
3,4630014016,AAA0148WSJZ,1
4,4630014019,AAA0148WSMR,1


In [50]:
lote_gdf = gpd.read_file('zip://../../data/bogota/land_use/Lote.zip')
#pvalues_gdf = pvalues_gdf[(~(pvalues_gdf['geometry'].isnull()))] 

lote_gdf.head()

Unnamed: 0,OBJECTID,LotCodigo,LotDispers,LotILDispe,LotUPredia,ManzCodigo,SHAPE_Leng,SHAPE_Area,geometry
0,1,2527077002,D,2604016001.0,0,2527077,0.006068,2.426556e-07,"POLYGON ((-74.10744 4.51206, -74.10743 4.51205..."
1,2,6104034020,N,,0,6104034,0.000588,1.546601e-08,"POLYGON ((-74.08081 4.61418, -74.08095 4.61399..."
2,3,6104034010,N,,0,6104034,0.000619,1.785291e-08,"POLYGON ((-74.08047 4.61374, -74.08053 4.61367..."
3,4,6104032016,N,,0,6104032,0.000505,1.349744e-08,"POLYGON ((-74.08007 4.61339, -74.08008 4.61338..."
4,5,6104032009,N,,0,6104032,0.000433,1.072869e-08,"POLYGON ((-74.08022 4.61320, -74.08026 4.61315..."


In [51]:
lots_list_df = strata_df.drop_duplicates(subset={'LotCodigo'})[['LotCodigo']]
lots_list_df.head()

Unnamed: 0,LotCodigo
0,4630014009
1,4630014011
2,4630014015
3,4630014016
4,4630014019


In [52]:
import os
from math import modf
from joblib import Parallel, delayed

def fetch_new_objectid(row):
    payload = {'where': "LOTCODIGO='{}'".format(row), 'returnIdsOnly': True, 'f': 'json'}
    r = requests.post("https://serviciosgis.catastrobogota.gov.co/arcgis/rest/services/catastro/lote/MapServer/0/query", params=payload)
    try:
        ids = r.json()['objectIds'][0]
    except:
        ids = None
    return {'LotCodigo': str(row), 'objectid': ids}



if os.path.isfile('../../data/bogota/poverty/lot2objectid.csv'):
    lot2objectid_df = pd.read_csv('../../data/bogota/poverty/lot2objectid.csv')
else:
    lotcodigos = lots_list_df['LotCodigo'].values
    dict_lot_objid = Parallel(n_jobs=30)(delayed(fetch_new_objectid)(x) for x in tqdm(lotcodigos))
    
    lot2objectid_df = pd.DataFrame(dict_lot_objid)
    lot2objectid_df = lot2objectid_df[~lot2objectid_df.objectid.isnull()]
    lot2objectid_df['objectid'] = lot2objectid_df['objectid'].astype(str)
    lot2objectid_df.to_csv('../../data/bogota/poverty/lot2objectid.csv', index=False)

lot2objectid_df.head()

Unnamed: 0,LotCodigo,objectid
0,4630014009,724028.0
1,4630014011,724033.0
2,4630014015,728447.0
3,4630014016,728450.0
4,4630014019,725431.0


In [53]:
import itertools 

def fetch_object_info(row):
    payload = {'objectIds': str(int(row)), 'f': 'json', 'relationshipId': 2, 'outFields': 'PRECHIP,PREACONST,PREATERRE,PREVETUSTZ', 'returnGeometry': False}
    r = requests.get("https://serviciosgis.catastrobogota.gov.co/arcgis/rest/services/catastro/lote/MapServer/0/queryRelatedRecords", params=payload)
    try:
        infos = r.json()['relatedRecordGroups'][0]['relatedRecords']
        new_infos = []
        for x in infos:
            x['attributes']['objectid'] = str(int(row))
            new_infos.append(x['attributes'])
        infos = new_infos
    except:
        infos = [{'objectid': str(int(row)), 'PRECHIP': None,'PREACONST': None,'PREATERRE': None,'PREVETUSTZ': None}]
    return infos


if os.path.isfile('../../data/bogota/poverty/chips.csv'):
    chip_info_df = pd.read_csv('../../data/bogota/poverty/chips.csv')
else:
    dict_chips = Parallel(n_jobs=30)(delayed(fetch_object_info)(x) for x in tqdm(lot2objectid_df['objectid'].values))
    dict_chips_flatten = list(itertools.chain.from_iterable(dict_chips))
    chip_info_df = pd.DataFrame(dict_chips_flatten)
    #chip_info_df = chip_info_df.drop_duplicates(subset='PRECHIP')
    chip_info_df.to_csv('../../data/bogota/poverty/chips.csv', index=False)

chip_info_df.head()

Unnamed: 0,PREACONST,PREATERRE,PRECHIP,PREVETUSTZ,objectid
0,147.5,71.8,AAA0148WSAW,1996.0,724028
1,293.2,70.2,AAA0148WSCN,1996.0,724033
2,98.4,70.9,AAA0148WSHK,2008.0,728447
3,124.4,71.8,AAA0148WSJZ,1986.0,728450
4,25.0,77.5,AAA0148WSMR,1986.0,725431


In [54]:
chip_info_df['objectid'] = chip_info_df['objectid'].astype(str)
lot2objectid_df['objectid'] = lot2objectid_df['objectid'].astype(str).str[:-2]
val_chips_df = pd.merge(chip_info_df, lot2objectid_df, on='objectid')
val_chips_df.head()

Unnamed: 0,PREACONST,PREATERRE,PRECHIP,PREVETUSTZ,objectid,LotCodigo
0,147.5,71.8,AAA0148WSAW,1996.0,724028,4630014009
1,293.2,70.2,AAA0148WSCN,1996.0,724033,4630014011
2,98.4,70.9,AAA0148WSHK,2008.0,728447,4630014015
3,124.4,71.8,AAA0148WSJZ,1986.0,728450,4630014016
4,25.0,77.5,AAA0148WSMR,1986.0,725431,4630014019


In [55]:
len(val_chips_df)

2254424

In [56]:
strata_chips_df = pd.merge(val_chips_df[['PREACONST', 'PRECHIP']].rename(columns={'PRECHIP': 'ESoChip'}), 
         strata_df[['ESoChip', 'LotCodigo', 'ESoEstrato']], on=['ESoChip'])
strata_chips_df.head()

Unnamed: 0,PREACONST,ESoChip,LotCodigo,ESoEstrato
0,147.5,AAA0148WSAW,4630014009,1
1,293.2,AAA0148WSCN,4630014011,1
2,98.4,AAA0148WSHK,4630014015,1
3,124.4,AAA0148WSJZ,4630014016,1
4,25.0,AAA0148WSMR,4630014019,1


In [57]:
unique_land_gdf = land_gdf.copy()
unique_land_gdf.loc[:, 'x'] = unique_land_gdf.geometry.centroid.x
unique_land_gdf.loc[:, 'y'] = unique_land_gdf.geometry.centroid.y
unique_land_gdf = unique_land_gdf.drop_duplicates(subset=['x', 'y'])

## Property age

In [62]:
bld_gdf.loc[:, 'ConCodigo'] = bld_gdf['ConCodigo'].str[:-13]
bld_gdf.head()

Unnamed: 0,floors,geometry,ConCodigo,geom
0,3,"POLYGON ((-74.06812 4.62824, -74.06809 4.62824...",8101001008,MULTIPOLYGON (((-74.06812331399993 4.628244778...
1,2,"POLYGON ((-74.06798 4.62810, -74.06798 4.62810...",8101001009,MULTIPOLYGON (((-74.06798451299994 4.628095871...
2,2,"POLYGON ((-74.06794 4.62807, -74.06791 4.62804...",8101001010,MULTIPOLYGON (((-74.0679410539999 4.6280662830...
3,1,"POLYGON ((-74.06813 4.62805, -74.06813 4.62805...",8101001005,MULTIPOLYGON (((-74.06812777099992 4.628047357...
4,3,"POLYGON ((-74.06795 4.62796, -74.06794 4.62797...",8101001004,MULTIPOLYGON (((-74.06795040799994 4.627963112...


In [63]:
age_df = val_chips_df[~val_chips_df['PREVETUSTZ'].isnull()][['PRECHIP', 'LotCodigo', 'PREACONST', 'PREVETUSTZ']].copy()
age_df['LotCodigo'] = age_df['LotCodigo'].astype(str)
age_df.loc[:, 'LotCodigo'] = age_df['LotCodigo'].str.zfill(12)
age_df.head()

Unnamed: 0,PRECHIP,LotCodigo,PREACONST,PREVETUSTZ
0,AAA0148WSAW,4630014009,147.5,1996.0
1,AAA0148WSCN,4630014011,293.2,1996.0
2,AAA0148WSHK,4630014015,98.4,2008.0
3,AAA0148WSJZ,4630014016,124.4,1986.0
4,AAA0148WSMR,4630014019,25.0,1986.0


In [64]:
age_df = age_df[age_df['PREVETUSTZ'] <= 2014]
age_df = age_df.drop_duplicates(subset=['LotCodigo', 'PREVETUSTZ'])

In [65]:
age_lot_gdf = pd.merge(bld_gdf.rename(columns={'ConCodigo': 'LotCodigo'}), age_df, on='LotCodigo')
age_lot_gdf = age_lot_gdf.drop(['PRECHIP', 'PREACONST'], axis=1)
age_lot_gdf.head()

Unnamed: 0,floors,geometry,LotCodigo,geom,PREVETUSTZ
0,3,"POLYGON ((-74.06812 4.62824, -74.06809 4.62824...",8101001008,MULTIPOLYGON (((-74.06812331399993 4.628244778...,1947.0
1,2,"POLYGON ((-74.06798 4.62810, -74.06798 4.62810...",8101001009,MULTIPOLYGON (((-74.06798451299994 4.628095871...,1957.0
2,2,"POLYGON ((-74.06794 4.62807, -74.06791 4.62804...",8101001010,MULTIPOLYGON (((-74.0679410539999 4.6280662830...,1957.0
3,1,"POLYGON ((-74.06813 4.62805, -74.06813 4.62805...",8101001005,MULTIPOLYGON (((-74.06812777099992 4.628047357...,1947.0
4,1,"POLYGON ((-74.06807 4.62802, -74.06803 4.62800...",8101001005,MULTIPOLYGON (((-74.06807286599991 4.628018886...,1947.0


In [66]:
len(age_lot_gdf)

2203207

In [67]:
ins_gdf = process_geometry_SQL_insert(age_lot_gdf)
ins_gdf.to_sql('temptable3_{}'.format(CITY.lower()).format(CITY.lower()), engine, if_exists='replace', index=False, dtype={'geom': Geometry('MultiPolygon', srid=4326)})

In [68]:
sql = """
INSERT INTO property_age (bid, age, area, city) 
SELECT x.bid, age, area_building, x.city
FROM (
    SELECT bid, age, city, area as area_building, ROW_NUMBER() OVER (PARTITION BY lotid ORDER BY area DESC) AS r
    from (
        SELECT p."LotCodigo" lotid, d.city, d.bid, p."PREVETUSTZ"::int as age, ST_Area((CASE WHEN ST_CoveredBy(p.geom, d.geom) THEN p.geom ELSE ST_Intersection(p.geom, d.geom) END)::geography) as area
        FROM temptable3_{tempname} as p
        INNER JOIN blocks_group as d on ST_Intersects(p.geom, d.geom) AND NOT ST_Touches(p.geom, d.geom)
        WHERE d.city = '{city}' 
        ) as dtable
) x
WHERE x.r = 1;
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

## Land values

In [69]:
pvalues_gdf = gpd.read_file('zip://../../data/bogota/land_use/property_values.geojson.zip')
pvalues_gdf = pvalues_gdf[(~(pvalues_gdf['geometry'].isnull()))] 

pvalues_gdf.head()

Unnamed: 0,gid,vrevalor,value,geometry
0,35019,450000,450000,"MULTIPOLYGON (((-74.10787 4.60391, -74.10793 4..."
1,12,100,100,"MULTIPOLYGON (((-74.18296 4.22776, -74.18300 4..."
2,584,1800,1800,"MULTIPOLYGON (((-74.18257 4.34028, -74.18226 4..."
3,97,100,100,"MULTIPOLYGON (((-74.20165 4.17417, -74.20132 4..."
4,108,100,100,"MULTIPOLYGON (((-74.20409 4.15925, -74.20425 4..."


In [70]:
ins_gdf = process_geometry_SQL_insert(pvalues_gdf)
ins_gdf.to_sql('temptable6_{}'.format(CITY.lower()), engine, if_exists='replace', index=False, dtype={'geom': Geometry('MultiPolygon', srid=4326)})

In [71]:
sql = """
UPDATE temptable6_{tempname} p SET geom=ST_Multi(ST_buffer(p.geom, 0.0))
WHERE (NOT ST_IsValid(p.geom));
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

In [72]:
sql = """
DELETE FROM temptable6_{tempname} t WHERE NOT EXISTS(
    SELECT * 
    FROM temptable_{tempname} l WHERE l.landuse <> 'none' AND ST_INTERSECTS(l.geom, t.geom) AND ST_AREA(ST_Intersection(l.geom, t.geom))/ST_AREA(l.geom) > 0.5
);
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

In [73]:
sql = """
INSERT INTO property_value (bid, area, value, city) 
SELECT bid, area, value, '{city}'
FROM (
    SELECT bid, area, value, ROW_NUMBER() OVER (PARTITION BY bid ORDER BY area DESC) AS r
    from (
        SELECT ST_Area(ST_Intersection(p.geom, d.geom)::geography) as area, d.bid, p.vrevalor::float as value
        FROM temptable6_{tempname} as p
        INNER JOIN blocks_group as d on ST_Intersects(p.geom, d.geom) AND NOT ST_Touches(p.geom, d.geom)
        WHERE d.city = '{city}' AND ST_Isvalid(p.geom)
        ) as dtable
    order by area
) x
WHERE x.r = 1;
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

## Unused areas

In [74]:
unused_gdf = gpd.read_file('zip://../../data/bogota/unused_area/CAgu.zip')
unused_gdf = unused_gdf[['geometry']]

unused_gdf['type'] = 'water'
unused_gdf['city'] = CITY
unused_gdf.head()

Unnamed: 0,geometry,type,city
0,"POLYGON ((-74.12167 4.32736, -74.12170 4.32735...",water,bogota1m
1,"POLYGON ((-74.12028 4.32475, -74.12031 4.32474...",water,bogota1m
2,"POLYGON ((-74.12679 4.30599, -74.12676 4.30599...",water,bogota1m
3,"POLYGON ((-74.17003 4.38655, -74.16999 4.38652...",water,bogota1m
4,"POLYGON ((-74.06112 4.67783, -74.06103 4.67788...",water,bogota1m


In [75]:
ins_gdf = process_geometry_SQL_insert(unused_gdf)
ins_gdf.to_sql('unused_areas', engine, if_exists='append', index=False, dtype={'geom': Geometry('MultiPolygon', srid=4326)})

In [76]:
unused_gdf = gpd.read_file('../../data/bogota/unused_area/parksandrivers.geojson')
unused_gdf = unused_gdf[['geometry']]

unused_gdf['type'] = 'park'
unused_gdf['city'] = CITY
unused_gdf.head()

Unnamed: 0,geometry,type,city
0,"POLYGON ((-74.04808 4.69141, -74.04824 4.69109...",park,bogota1m
1,"POLYGON ((-74.20717 4.80086, -74.20708 4.80109...",park,bogota1m
2,"POLYGON ((-74.08523 4.63937, -74.08519 4.63939...",park,bogota1m
3,"POLYGON ((-74.08620 4.63813, -74.08574 4.63796...",park,bogota1m
4,"POLYGON ((-74.10028 4.71777, -74.10076 4.71731...",park,bogota1m


In [77]:
unused_gdf = unused_gdf[(unused_gdf.geometry.geom_type == 'Polygon') | (unused_gdf.geometry.geom_type == 'MultiPolygon')]
    

In [78]:
ins_gdf = process_geometry_SQL_insert(unused_gdf)
ins_gdf.to_sql('unused_areas', engine, if_exists='append', index=False, dtype={'geom': Geometry('MultiPolygon', srid=4326)})

Bogota has streets in the parcels, so let's remove them

In [79]:
unused_gdf = gpd.read_file('../../data/bogota/unused_area/street_areas.gpkg')
unused_gdf = unused_gdf[['geometry']]

unused_gdf['type'] = 'street'
unused_gdf['city'] = CITY
unused_gdf.head()

Unnamed: 0,geometry,type,city
0,"MULTIPOLYGON (((-74.10046 4.67042, -74.09994 4...",street,bogota1m
1,"MULTIPOLYGON (((-74.09210 4.66453, -74.09212 4...",street,bogota1m
2,"MULTIPOLYGON (((-74.09187 4.66423, -74.09188 4...",street,bogota1m
3,"MULTIPOLYGON (((-74.09126 4.66332, -74.09127 4...",street,bogota1m
4,"MULTIPOLYGON (((-74.08902 4.65982, -74.08903 4...",street,bogota1m


In [80]:
ins_gdf = process_geometry_SQL_insert(unused_gdf)
ins_gdf.to_sql('unused_areas', engine, if_exists='append', index=False, dtype={'geom': Geometry('MultiPolygon', srid=4326)})

In [81]:
sql = """
update unused_areas set geom=st_multi(st_buffer(geom, 0.0)) WHERE city = '{city}';
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

In [82]:
unused_gdf = gpd.read_file('../../data/bogota/unused_area/parksandrivers.geojson')
unused_gdf = unused_gdf[['geometry']]
unused_gdf = unused_gdf.to_crs("EPSG:4326")
unused_gdf['type'] = 'parksrivers'
unused_gdf['city'] = CITY

unused_gdf = unused_gdf[unused_gdf.geometry.geom_type != 'LineString']
unused_gdf.head()

Unnamed: 0,geometry,type,city
0,"POLYGON ((-74.04808 4.69141, -74.04824 4.69109...",parksrivers,bogota1m
1,"POLYGON ((-74.20717 4.80086, -74.20708 4.80109...",parksrivers,bogota1m
2,"POLYGON ((-74.08523 4.63937, -74.08519 4.63939...",parksrivers,bogota1m
3,"POLYGON ((-74.08620 4.63813, -74.08574 4.63796...",parksrivers,bogota1m
4,"POLYGON ((-74.10028 4.71777, -74.10076 4.71731...",parksrivers,bogota1m


In [83]:
ins_gdf = process_geometry_SQL_insert(unused_gdf)
ins_gdf.to_sql('temptable_{tempname}'.format(tempname=CITY.lower()), engine, if_exists='replace', index=False, dtype={'geom': Geometry('MultiPolygon', srid=4326)})

In [84]:
sql = """
update temptable_{tempname} set geom=st_multi(st_buffer(geom, 0.0));
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

In [85]:
sql = """
DROP TABLE IF EXISTS temptable_unusedhelper_{tempname};
CREATE TEMPORARY TABLE temptable_unusedhelper_{tempname} AS
SELECT ST_Union(geom) as geom FROM unused_areas u 
WHERE city='{city}';
CREATE INDEX ON temptable_unusedhelper_{tempname} USING GIST (geom);

DROP TABLE IF EXISTS temptable_unusedhelper_exp_{tempname};
CREATE TEMPORARY TABLE temptable_unusedhelper_exp_{tempname} AS
SELECT (ST_Dump(geom)).geom as geom FROM temptable_unusedhelper_{tempname} u;
CREATE INDEX ON temptable_unusedhelper_exp_{tempname} USING GIST (geom);
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

In [86]:
sql = """
update temptable_{tempname} t set geom=ST_Multi(ST_Difference(t.geom, h.geom))
FROM temptable_unusedhelper_{tempname} h
WHERE st_intersects(t.geom, h.geom) AND (NOT ST_Touches(t.geom, h.geom)) AND ST_GeometryType(ST_Multi(ST_Difference(t.geom, h.geom))) <> 'ST_GeometryCollection';
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

In [87]:
sql = """
DELETE FROM temptable_{tempname} t 
USING temptable_unusedhelper_exp_{tempname} h
WHERE ST_Within(t.geom, h.geom) OR (st_intersects(t.geom, h.geom) AND (NOT ST_Touches(t.geom, h.geom)) AND ST_GeometryType(ST_Multi(ST_Difference(t.geom, h.geom))) = 'ST_GeometryCollection');
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

In [88]:
sql = """
update temptable_{tempname} set geom=st_multi(st_buffer(geom, 0.0));
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

In [89]:
sql = """
INSERT INTO unused_areas (geom, type, city) 
SELECT st_multi(st_buffer(p.geom, 0.0)), p.type, p.city
FROM temptable_{tempname} as p
WHERE NOT EXISTS(SELECT * FROM unused_areas u WHERE ST_Intersects(u.geom, st_buffer(p.geom, 0.0)) AND u.city=p.city)
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

## Net area

In [90]:
land_gdf = process_geometry_SQL_insert(unique_land_gdf)
land_gdf.to_sql('temptable_u_{}'.format(CITY.lower()), engine, if_exists='replace', index=False, dtype={'geom': Geometry('MultiPolygon', srid=4326)})

In [91]:
sql = """
UPDATE temptable_u_{tempname} p SET geom=ST_Multi(ST_buffer(p.geom, 0.0)) 
WHERE (NOT ST_IsValid(p.geom));
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

In [92]:
sql = """
DELETE 
FROM temptable_u_{tempname} t
USING unused_areas u 
WHERE u.city = '{city}' AND ST_Intersects(u.geom, t.geom) AND (NOT ST_Touches(u.geom, t.geom)) 
AND (ST_Contains(u.geom, t.geom) OR ST_AREA(ST_Intersection(t.geom, u.geom))/ST_Area(t.geom) > 0.5);
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

In [93]:
sql = """
INSERT INTO spatial_groups_net_area (sp_id, city, spatial_name, used_area) 
SELECT sp_id, city, spatial_name, SUM(ST_Area((CASE WHEN ST_Within(t.geom, s.approx_geom) THEN t.geom ELSE ST_Intersection(s.approx_geom, t.geom) END)::geography))/1000000.
FROM temptable_u_{tempname} t
INNER JOIN spatial_groups s ON ST_Intersects(s.approx_geom, t.geom) AND NOT ST_Touches(s.approx_geom, t.geom)
WHERE s.city = '{city}' 
GROUP BY sp_id, city, spatial_name;
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

We don't clean directly the blocks as land use is not clean

In [94]:
land_gdf = gpd.read_file('../../data/bogota/land_use/Lots_2014.gpkg')
land_gdf = land_gdf[(~(land_gdf['LotCodigo'].isnull()))]

land_gdf = land_gdf[['LotCodigo', 'geometry']]

#land_gdf = land_gdf.to_crs({'init': 'epsg:4326'}) 

land_gdf.head()

Unnamed: 0,LotCodigo,geometry
0,4597039009,"MULTIPOLYGON (((-74.20334 4.60793, -74.20339 4..."
1,4593071010,"MULTIPOLYGON (((-74.18929 4.62773, -74.18935 4..."
2,4597039035,"MULTIPOLYGON (((-74.20295 4.60799, -74.20300 4..."
3,4597039020,"MULTIPOLYGON (((-74.20312 4.60792, -74.20315 4..."
4,1401046024,"MULTIPOLYGON (((-74.09880 4.57604, -74.09883 4..."


In [96]:
ins_gdf = process_geometry_SQL_insert(land_gdf)
ins_gdf.to_sql('temptable_{}'.format(CITY.lower()), engine, if_exists='replace', index=False, dtype={'geom': Geometry('MultiPolygon', srid=4326)})

In [97]:
## This deletes the blocks that are related to streets
sql = """
DELETE FROM block b
WHERE city='{city}' and NOT EXISTS (select * from temptable_{tempname} t where st_intersects(t.geom, b.geom));
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

## Vacuums

In [None]:
sql = """
REFRESH MATERIALIZED VIEW block_centroids;
"""

result = engine.execute(text(sql))

In [None]:
sql = """
REFRESH MATERIALIZED VIEW pois_requests;
"""

result = engine.execute(text(sql))

In [7]:
from collections import defaultdict
from joblib import Parallel, delayed

In [8]:
def make_trip(lon1, lat1, dest):
    # PERSONALIZE HERE
    r = requests.get(
        'http://localhost:5000/table/v1/foot/{lon1},{lat1};{dest}?annotations=distance&sources=0'.format(lon1=lon1, lat1=lat1, dest=dest))
    distances = r.json()['distances']
    distance = np.array(distances[0][1:], np.float32)
    distance[distance < 0] = 0
    return distance

def walkscore_list(bid, clon, clat, list_dests, ws, straight_distances):
    dists = make_trip(clon, clat, list_dests)
    straight_distances = np.array(straight_distances)
    #good_idxs = np.argwhere(dists < 1600)
    ws = np.array(ws)+ 0.00000001 #[good_idxs] 
    zeros_idxs = np.argwhere(dists == 0)
    dists[zeros_idxs] = 1
    if np.sum(ws) == 0 or len(ws) == 0:
        return bid, -1
    return bid, np.average(straight_distances/dists, weights=ws) #[good_idxs] [good_idxs]

cat_weights = {
    'grocery': [3],
    'Food': [.75,.45,.25,.25,.225,.225,.225,.225,.2,.2],
    'Shops': [.5,.45,.4,.35,.3],
    'Schools': [1],
    'Entertainment': [1],
    'Parks and outside': [1],
    'Coffee': [1.25,.75],
    'Banks': [1],
    'Books': [1]
}


def walkscore(meters):
	max_walk = 1500
	score = np.exp(-5 * (meters / max_walk) ** 2.5)
	score = np.clip(score, 0, 1)
	return score

def walkscore2_list(bid, clon, clat, list_dests, c):
    dists = make_trip(clon, clat, list_dests)
    #good_idxs = np.argwhere(dists < 1600)
    scores = np.sort(walkscore(dists))[::-1]
    n = len(cat_weights[c])
    d = np.zeros(n)
    d[:scores.shape[0]] = scores[:n]
    w = np.sum(np.array(d)*np.array(cat_weights[c]))
    assert w <= np.sum(cat_weights[c]) and w >= 0
    
    return bid, w #[good_idxs] [good_idxs]

In [9]:
sql = """
SELECT bid, lon, lat, dests, parent_cat FROM pois_requests p WHERE p.city = '{city}' 
""".format(city=CITY, tempname=CITY.lower())

blocks_df = pd.read_sql_query(sql, con=engine)
blocks_df.head()

Unnamed: 0,bid,lon,lat,dests,parent_cat
0,500248,-74.175807,4.625664,"-74.168640,4.629927;-74.168129,4.629320",Banks
1,500248,-74.176061,4.627078,"-74.168640,4.629927;-74.168129,4.629320",Banks
2,500248,-74.171836,4.627394,"-74.168640,4.629927;-74.168129,4.629320",Banks
3,500248,-74.17471,4.628239,"-74.168640,4.629927;-74.168129,4.629320",Banks
4,500248,-74.176425,4.628327,"-74.168640,4.629927;-74.168129,4.629320",Banks


In [10]:
sql = """
SELECT bid, COUNT(*) as size
FROM block_centroids b WHERE b.city = '{city}' 
GROUP BY bid
ORDER BY bid
""".format(city=CITY, tempname=CITY.lower())

n_blocks_df = pd.read_sql_query(sql, con=engine).set_index('bid')
n_blocks_df.head()

Unnamed: 0_level_0,size
bid,Unnamed: 1_level_1
500248,22
500249,75
500250,96
500251,97
500252,79


In [11]:
list(set(blocks_df.parent_cat.values))

['Banks',
 'Books',
 'Food',
 'grocery',
 'Schools',
 'Entertainment',
 'Coffee',
 'Shops']

In [12]:
block_groups = defaultdict(list)
for index, row in blocks_df.iterrows():
    block_groups[row['bid']].append(row.values[1:])

In [13]:
from tqdm import tqdm

print(len(blocks_df))
results = [(idx, score) for idx, score in Parallel(n_jobs=10)(delayed(walkscore2_list)(bid, req[0], req[1], req[2], req[3]) for bid, reqs in tqdm(block_groups.items()) for req in reqs)]

  0%|          | 0/918 [00:00<?, ?it/s]

271370


100%|██████████| 918/918 [02:02<00:00,  7.48it/s]


In [14]:
block_vacuum_index = defaultdict(list)
bid2size = {k: v['size'] for k, v in n_blocks_df.iterrows()}

for bid, score in results:
    block_vacuum_index[bid].append(score)
    
sum_cat_weights = np.sum([y for x in cat_weights.values() for y in x])

for bid, score in block_vacuum_index.items():
    if len(score) > 0:
        score = (np.sum(score)/bid2size[bid])/sum_cat_weights
        assert score <= 1.01
        sql = "INSERT INTO walk_index (bid, score, city) VALUES ({}, {}, '{}')".format(bid, score, CITY)
        result = engine.execute(text(sql))

## Crime

In [15]:
crime_dataframes = []

def process_crime_data(filename, type):
    df_hurto = pd.read_csv('../../data/bogota/crime/points/{filename}'.format(filename=filename), delimiter=';')
    #df_hurto = df_hurto[(df_hurto['Municipio'] == 'MEDELLÍN (CT)')]'BOGOTÁ D.C. (CT)'
    df_hurto = df_hurto[df_hurto['Zona'] == 'URBANA']
    df_hurto = df_hurto[((df_hurto['AÑO'] == 2014) & (df_hurto['Municipio'] == 'BOGOTÁ D.C. (CT)'))]
    
    if type == 'LESIONES':
        df_hurto.loc[(df_hurto['MODALIDAD'] != 'ACCIDENTAL') & (~df_hurto['Arma empleada'].isin({'-', 'MOTO', 'AGUA CALIENTE', 'CINTAS/CINTURON', 'VEHICULO', 'POLVORA(FUEGOS PIROTECNICOS)', 'SIN EMPLEO DE ARMAS'})), 'MODALIDAD'] = 'ARMA'
    '''elif type == 'COMERCIO' or type == 'PERSONAS':
        df_hurto.loc[(df_hurto['MODALIDAD'] == 'FACTOR DE OPORTUNIDAD') & (~df_hurto['Arma empleada'].isin({'-', 'MOTO', 'AGUA CALIENTE', 'CINTAS/CINTURON', 'VEHICULO', 'POLVORA(FUEGOS PIROTECNICOS)', 'SIN EMPLEO DE ARMAS'})), 'MODALIDAD'] = 'ATRACO'
        df_hurto.loc[(df_hurto['MODALIDAD'] == 'ENGAÑO') & (~df_hurto['Arma empleada'].isin({'-', 'MOTO', 'AGUA CALIENTE', 'CINTAS/CINTURON', 'VEHICULO', 'POLVORA(FUEGOS PIROTECNICOS)', 'SIN EMPLEO DE ARMAS'})), 'MODALIDAD'] = 'ATRACO'
        df_hurto.loc[(df_hurto['MODALIDAD'] == 'NO REPORTADA') & (~df_hurto['Arma empleada'].isin({'-', 'MOTO', 'AGUA CALIENTE', 'CINTAS/CINTURON', 'VEHICULO', 'POLVORA(FUEGOS PIROTECNICOS)', 'SIN EMPLEO DE ARMAS'})), 'MODALIDAD'] = 'ATRACO'
        
    '''
    df_hurto = df_hurto[['MODALIDAD', 'LONGITUD', 'LATITUD', 'Cantidad']]
    df_hurto['TYPE'] = type
    return df_hurto

In [16]:
df_hurto_personas = process_crime_data('h_personas.csv', 'PERSONAS')
crime_dataframes.append(df_hurto_personas)
df_hurto_personas.head()

  if self.run_code(code, result):


Unnamed: 0,MODALIDAD,LONGITUD,LATITUD,Cantidad,TYPE
114311,ENGAÑO,-74141053,4621416615,1,PERSONAS
114312,ATRACO,-7410123256,450116055,1,PERSONAS
114313,FACTOR DE OPORTUNIDAD,-74180603,4314790546,1,PERSONAS
114314,USO SUSTANCIAS TOXICAS / ESCOPOLAMINADO,-7414620928,4682309673,1,PERSONAS
114315,FACTOR DE OPORTUNIDAD,-7408589352,4618172064,1,PERSONAS


In [17]:
df_hurto_moto = process_crime_data('h_motocicletas.csv', 'MOTOCICLETAS')
crime_dataframes.append(df_hurto_moto)
df_hurto_moto.head()

Unnamed: 0,MODALIDAD,LONGITUD,LATITUD,Cantidad,TYPE
28073,HALADO,-7419754105,4610577803,1,MOTOCICLETAS
28074,HALADO,-7408879893,4595167412,1,MOTOCICLETAS
28075,HALADO,-7409233409,456673057,1,MOTOCICLETAS
28076,HALADO,-7413803356,4542092877,1,MOTOCICLETAS
28092,HALADO,-7418038028,4628180103,1,MOTOCICLETAS


In [18]:
df_hurto_auto = process_crime_data('h_automotores.csv', 'AUTOMOTORES')
crime_dataframes.append(df_hurto_auto)
df_hurto_auto.head()

Unnamed: 0,MODALIDAD,LONGITUD,LATITUD,Cantidad,TYPE
18805,HALADO,-740979103975,47333596368,1,AUTOMOTORES
18807,HALADO,-740359319362,47393435282,1,AUTOMOTORES
18808,HALADO,-74083544486,4599856869,1,AUTOMOTORES
18809,USO SUSTANCIAS TOXICAS / ESCOPOLAMINADO,-741448989258,45990841611,1,AUTOMOTORES
18814,HALADO,-740913531756,46741457013,1,AUTOMOTORES


In [19]:
df_hurto_comercio = process_crime_data('h_comercio.csv', 'COMERCIO')
crime_dataframes.append(df_hurto_comercio)
df_hurto_comercio.head()

Unnamed: 0,MODALIDAD,LONGITUD,LATITUD,Cantidad,TYPE
25451,FACTOR DE OPORTUNIDAD,-7415401987,4691783633,1,COMERCIO
25457,MECHERO/HORMIGUEO,-7408209859,468254554,1,COMERCIO
25458,MECHERO/HORMIGUEO,-7409920999,4570433187,1,COMERCIO
25459,VIOLACION DE CERRADURA,-7413419787,4607595257,1,COMERCIO
25460,MECHERO/HORMIGUEO,-74111558,4713794007,1,COMERCIO


In [20]:
df_hurto_residencia = process_crime_data('h_residencias.csv', 'RESIDENCIAS')
crime_dataframes.append(df_hurto_residencia)
df_hurto_residencia.head()

  if self.run_code(code, result):


Unnamed: 0,MODALIDAD,LONGITUD,LATITUD,Cantidad,TYPE
24580,FACTOR DE OPORTUNIDAD,-740976769102,45813512979,1,RESIDENCIAS
24581,LLAVE MAESTRA,-741190813286,47413748275,1,RESIDENCIAS
24582,FACTOR DE OPORTUNIDAD,-741013998163,44889538653,1,RESIDENCIAS
24583,VENTOSA,-740996758636,47408182336,1,RESIDENCIAS
24584,VENTOSA,-740580834491,46993717809,1,RESIDENCIAS


In [21]:
df_homicidios = process_crime_data('homicidios.csv', 'PERSONAS')
df_homicidios['MODALIDAD'] = 'homicidios'
crime_dataframes.append(df_homicidios)
df_homicidios.head()

Unnamed: 0,MODALIDAD,LONGITUD,LATITUD,Cantidad,TYPE
13595,homicidios,-74081402,45912265793,1,PERSONAS
13596,homicidios,-741554095251,46169752264,1,PERSONAS
13597,homicidios,-741026157521,45684522977,1,PERSONAS
13598,homicidios,-74163874,4627767041,1,PERSONAS
13599,homicidios,-7413948,46003543345,1,PERSONAS


In [22]:
df_lesiones = process_crime_data('lesiones.csv', 'LESIONES')
crime_dataframes.append(df_lesiones)
df_lesiones.head()

  if self.run_code(code, result):


Unnamed: 0,MODALIDAD,LONGITUD,LATITUD,Cantidad,TYPE
64431,ARMA,-7410029475,4506538982,1,LESIONES
64432,ARMA,-7413813338,4536659219,1,LESIONES
64433,ARMA,-7409253805,451730633,1,LESIONES
64434,ARMA,-7418107497,4596349137,1,LESIONES
64435,ARMA,-7417833273,459163135,1,LESIONES


In [23]:
crime_df = pd.concat(crime_dataframes)
crime_df = crime_df.rename(columns={'Cantidad' :'num'})
crime_df.head()

Unnamed: 0,MODALIDAD,LONGITUD,LATITUD,num,TYPE
114311,ENGAÑO,-74141053,4621416615,1,PERSONAS
114312,ATRACO,-7410123256,450116055,1,PERSONAS
114313,FACTOR DE OPORTUNIDAD,-74180603,4314790546,1,PERSONAS
114314,USO SUSTANCIAS TOXICAS / ESCOPOLAMINADO,-7414620928,4682309673,1,PERSONAS
114315,FACTOR DE OPORTUNIDAD,-7408589352,4618172064,1,PERSONAS


In [24]:
len(crime_df)

58155

### Crime types

In [25]:
crime_types_df = pd.read_csv('../../data/crime_types/bogota_types_categorized.csv', delimiter=';')
crime_types_df['UCR part'] = crime_types_df['UCR part'].astype(int)
crime_types_df.head()

Unnamed: 0,TYPE,MODALIDAD,UCR part,UCR1
0,PERSONAS,homicidios,1,Criminal homicide
1,COMERCIO,FACTOR DE OPORTUNIDAD,1,Larceny-theft (except motor vehicle theft)
2,COMERCIO,MECHERO,1,Larceny-theft (except motor vehicle theft)
3,COMERCIO,ATRACO,1,Robbery
4,COMERCIO,VIOLACIÓN DE CERRADURAS,1,Burglary (breaking or entering)


In [26]:
crime_types_df.loc[crime_types_df['UCR1'] == 'Larceny-theft (except motor vehicle theft)', 'UCR part'] = '11'

In [27]:
not_joined_df = pd.merge(crime_df, crime_types_df, on=['TYPE', 'MODALIDAD'], how='left')
not_joined_df = not_joined_df[not_joined_df['UCR part'].isnull()]
not_joined_df = not_joined_df[['TYPE', 'MODALIDAD']].drop_duplicates()
not_joined_df

Unnamed: 0,TYPE,MODALIDAD


In [28]:
print(crime_df['num'].count())
crime_df = pd.merge(crime_df, crime_types_df, on=['TYPE', 'MODALIDAD'])
print(crime_df['num'].count())

58155
58155


In [29]:
crime_df = crime_df[crime_df['UCR part'] == 1]
print(crime_df['num'].count())

35412


In [30]:
set(crime_df.UCR1.values)

{'Aggravated assault',
 'Arson',
 'Burglary (breaking or entering)',
 'Criminal homicide',
 'Motor vehicle theft',
 'Robbery'}

#### Subtypes of crimes

In [31]:
ucr_crimes_df = pd.read_csv('../../data/crime_types/UCR_crimes.csv')
ucr_crimes_df.head()

Unnamed: 0,Name,Category
0,Criminal homicide,Violent crime
1,Rape,Violent crime
2,Robbery,Violent crime
3,Aggravated assault,Violent crime
4,Burglary (breaking or entering),Property crime


In [32]:
df_ucr1 = pd.merge(crime_df, ucr_crimes_df.rename(columns={'Name': 'UCR1'}), on='UCR1')

a = set(df_ucr1['UCR1'].drop_duplicates().values)
b = set(crime_df['UCR1'].drop_duplicates().values)
assert(a.intersection(b) == a)

# Categories not present in crime dataset
df_ucr1[~(df_ucr1['UCR1'].isin(b))]

Unnamed: 0,MODALIDAD,LONGITUD,LATITUD,num,TYPE,UCR part,UCR1,Category


In [33]:
crime_df = df_ucr1.rename(columns={'LATITUD': 'lat', 'LONGITUD':'lng', 'MODALIDAD': 'description'})
crime_df['lng'] = crime_df['lng'].str.replace(',', '.').astype(float)
crime_df['lat'] = crime_df['lat'].str.replace(',', '.').astype(float)
crime_df.head()

Unnamed: 0,description,lng,lat,num,TYPE,UCR part,UCR1,Category
0,ATRACO,-74.101233,4.501161,1,PERSONAS,1,Robbery,Violent crime
1,ATRACO,-74.063506,4.668669,1,PERSONAS,1,Robbery,Violent crime
2,ATRACO,-74.077625,4.589667,1,PERSONAS,1,Robbery,Violent crime
3,ATRACO,-74.182119,4.616964,1,PERSONAS,1,Robbery,Violent crime
4,ATRACO,-74.079276,4.683952,1,PERSONAS,1,Robbery,Violent crime


In [34]:
df_2014 = crime_df[['lng', 'lat', 'description', 'num', 'UCR1', 'Category']]
df_2014.count()

lng            35412
lat            35412
description    35412
num            35412
UCR1           35412
Category       35412
dtype: int64

In [35]:
from geopandas import GeoDataFrame
from shapely.geometry import Point

geometry = [Point(xy) for xy in zip(df_2014.lng, df_2014.lat)]
df_2014 = df_2014.drop(['lng', 'lng'], axis=1)
crs = {'init': 'epsg:4326'}
gdf = GeoDataFrame(df_2014, crs=crs, geometry=geometry)
gdf.head()

  return _prepare_from_string(" ".join(pjargs))


Unnamed: 0,lat,description,num,UCR1,Category,geometry
0,4.501161,ATRACO,1,Robbery,Violent crime,POINT (-74.10123 4.50116)
1,4.668669,ATRACO,1,Robbery,Violent crime,POINT (-74.06351 4.66867)
2,4.589667,ATRACO,1,Robbery,Violent crime,POINT (-74.07762 4.58967)
3,4.616964,ATRACO,1,Robbery,Violent crime,POINT (-74.18212 4.61696)
4,4.683952,ATRACO,1,Robbery,Violent crime,POINT (-74.07928 4.68395)


In [36]:
insert_gdf = process_geometry_SQL_insert(gdf)
insert_gdf.to_sql('temptable_{}'.format(CITY), engine, if_exists='replace', index=True, dtype={'geom': Geometry('Point', srid=4326)})

In [37]:
sql = """
DROP TABLE IF EXISTS temptable2_{tempname};
CREATE TABLE temptable2_{tempname} AS
SELECT index, description, num, "UCR1", "Category", st_buffer(geom::geography, 30)::geometry as geom
FROM temptable_{tempname};
CREATE INDEX ON temptable2_{tempname} USING GIST (geom);
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

In [38]:
sql = """
insert into crime (sp_id, num, city, ucr1, ucr_category) 
select bid, SUM(num), '{city}', "UCR1", "Category" from(
SELECT num, bid, "UCR1", "Category", ROW_NUMBER() OVER (PARTITION BY index) AS r
from (
select c.index, c.num, b.bid, "UCR1", "Category"
from temptable2_{tempname} as c
inner join blocks_group as b on ST_Intersects(b.geom, c.geom)
where b.city='{city}'
    ) as dtable
) x
group by bid, "UCR1", "Category";
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

## Refresh materialized views

In [41]:
sql = """
REFRESH MATERIALIZED VIEW spatial_groups_unused_areas;
"""

result = engine.execute(text(sql))

In [42]:
sql = """
REFRESH MATERIALIZED VIEW block_building;
"""

result = engine.execute(text(sql))

In [43]:
sql = """
REFRESH MATERIALIZED VIEW blocks_group_with_building;
"""

result = engine.execute(text(sql))