In [225]:
import pandas as pd
import numpy as np
import geopandas as gpd
import psycopg2
from geoalchemy2 import Geometry, WKTElement
from sqlalchemy import *
from shapely.geometry import MultiPolygon
from zipfile import ZipFile
import requests 
import sys
from tqdm import tqdm

In [226]:
try:
    engine = create_engine('postgresql://denadai:lollone@localhost:50013/crime-environment')
except Exception as e:
    print("Uh oh, can't connect. Invalid dbname, user or password?")
    print(e)

In [227]:
def process_geometry_SQL_insert(gdf):
    gdf['geom'] = gdf['geometry'].apply(lambda x: WKTElement((MultiPolygon([x]) if x.geom_type == 'Polygon' else x).wkt, srid=4326))
    gdf = gdf.drop('geometry', 1)
    return gdf

In [228]:
CITY='bogota'

In [229]:
bounds_gdf = gpd.read_file('zip://../../data/bogota/boundary/bogota.zip')
bounds_gdf = bounds_gdf[['geometry']]
bounds_gdf = bounds_gdf.to_crs({'init': 'epsg:4326'}) 
bounds_gdf['city'] = 'bogota'
bounds_gdf.head()

Unnamed: 0,geometry,city
0,(POLYGON ((-74.07884062499994 4.47324469200003...,bogota


In [230]:
insert_gdf = process_geometry_SQL_insert(bounds_gdf)
insert_gdf.to_sql('boundary', engine, if_exists='append', index=False, dtype={'geom': Geometry('MultiPolygon', srid=4326)})

## Spatial groups and blocks_group

In [234]:
block_groups_gdf = gpd.read_file('../../data/bogota/blocks_group/barrios_2014.gpkg')
block_groups_gdf = block_groups_gdf[['SCaCodigo', 'geometry']]
#block_groups_gdf = block_groups_gdf.to_crs({'init': 'epsg:4326'}) 
block_groups_gdf.head()

Unnamed: 0,SCaCodigo,geometry
0,4316,(POLYGON ((-74.17498309299992 4.63673892400004...
1,4609,(POLYGON ((-74.17827740599995 4.64849833900007...
2,4607,(POLYGON ((-74.17213319099994 4.63789502300005...
3,4612,(POLYGON ((-74.17617317999994 4.63847400400004...
4,4626,(POLYGON ((-74.17864628299992 4.64915323600007...


In [235]:
block_groups_gdf = gpd.sjoin(block_groups_gdf, bounds_gdf, how="inner", op='intersects').drop('index_right', axis=1)

In [42]:
block_groups_gdf = block_groups_gdf.rename(columns={'SCaCodigo': 'original_id'})
block_groups_gdf['city'] = CITY

In [43]:
insert_gdf = process_geometry_SQL_insert(block_groups_gdf)
insert_gdf.to_sql('temptable_{}'.format(CITY.lower()), engine, if_exists='replace', index=False, dtype={'geom': Geometry('MultiPolygon', srid=4326)})

In [44]:
sql = """
INSERT INTO blocks_group (original_id, city, geom) 
SELECT s.original_id, s.city, ST_Multi(ST_Intersection(s.geom, b.geom))
FROM temptable_{tempname} as s
INNER JOIN boundary b ON ST_Intersects(s.geom, b.geom) AND NOT ST_Touches(s.geom, b.geom)
where s.city='{city}' and ST_Area(ST_Intersection(s.geom, b.geom))/ST_Area(s.geom) >= 0.5
AND ST_GeometryType(ST_Multi(ST_Intersection(s.geom, b.geom))) = 'ST_MultiPolygon';
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

### Neighborhoods

In [45]:
sql = """INSERT INTO spatial_groups (city, core_geom, core_id, lower_ids, spatial_name, approx_geom)
SELECT  a.city, a.geom as core_geom, a.bid as core_id, array_agg(b.bid), 'ego', ST_multi(ST_Union(b.geom))
FROM blocks_group a
INNER JOIN blocks_group b ON a.city = b.city AND ST_DWithin(a.geom::geography, ST_Centroid(b.geom)::geography, 805)
where a.city='{city}'
GROUP BY a.bid, a.geom, a.city;
delete from spatial_groups where ST_Area(approx_geom::geography) < 250000 and spatial_name='ego';
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

## Blocks

In [46]:
block_gdf = gpd.read_file('zip://../../data/bogota/block/Manzana.zip')
block_gdf = block_gdf[['geometry']]
block_gdf = block_gdf.to_crs({'init': 'epsg:4326'}) 
block_gdf.head()

Unnamed: 0,geometry
0,POLYGON ((-74.04382887299994 4.819320739000034...
1,POLYGON ((-74.05200494899998 4.818991145000041...
2,POLYGON ((-74.05571883699997 4.814150852000068...
3,POLYGON ((-74.04858412899995 4.818270103000032...
4,"POLYGON ((-74.03558100499998 4.81683423100003,..."


In [47]:
insert_gdf = process_geometry_SQL_insert(block_gdf)
insert_gdf.to_sql('temptable_{}'.format(CITY.lower()), engine, if_exists='replace', index=False, dtype={'geom': Geometry('MultiPolygon', srid=4326)})

In [48]:
sql = """
insert into block (sp_id, geom, city, geog, greater_1sm) select bid, geom, city, geom::geography, ST_AREA(geom::geography)>2.59e+6 
from(
    SELECT bid, st_multi(geom) as geom, city, ROW_NUMBER() OVER (PARTITION BY geom ORDER by area DESC) AS r
    from (
        select b.bid, c.geom, b.city, ST_Area(ST_Intersection(b.geom, c.geom)) as area
        from temptable_{tempname} as c
        inner join blocks_group as b on ST_Intersects(b.geom, c.geom)
        where b.city = '{city}'
    ) as dtable
) x
WHERE x.r = 1;
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

In [49]:
sql = """
UPDATE block AS b SET geom=ST_Multi(ST_Intersection(b.geom, s.geom))
FROM boundary AS s
WHERE ST_Intersects(b.geom, s.geom) AND b.city=s.city AND s.city='{city}' AND NOT ST_Contains(s.geom, b.geom)
AND ST_GeometryType(ST_Intersection(b.geom, s.geom)) = 'ST_MultiPolygon';;
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

In [50]:
sql = """
DELETE FROM block as b
USING boundary AS s
WHERE ST_Intersects(b.geom, s.geom) AND b.city=s.city AND s.city='{city}' 
AND ST_GeometryType(ST_Multi(ST_Intersection(b.geom, s.geom))) != 'ST_MultiPolygon';
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

In [51]:
## Some blocks_group do not have blocks
sql = """
DELETE FROM blocks_group bg
WHERE NOT EXISTS(SELECT * FROM block b WHERE b.sp_id = bg.bid AND b.city = bg.city) AND bg.city='{city}';
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

## Census

In [52]:
census_df = pd.read_excel('../../data/bogota/census/CENSO2005_BOGOTA_VIHOPE_AG.xlsx').reset_index()
census_df.columns = census_df.iloc[0].values
census_df = census_df.iloc[1:]

census_df['original_id'] = census_df['REDCODE'].str[:6].astype(str) + census_df['REDCODE'].str[8:]
# Fix for Sectors
census_df.loc[:, 'original_id'] = census_df['original_id'].str[:-2]
census_df = census_df.groupby('original_id', as_index=False).sum()

census_df['inforce'] = census_df['PER_DE 5 AÑOS Y MAS'].astype(int) - census_df['JUBILADO PENSIONADO'].astype(int) - census_df['INCAPACITADO PARA TRABAJAR'].astype(int) - census_df['ESTUDIO'].astype(int) - census_df['REALIZÓ OFICIOS DEL HOGAR'].astype(int)
census_df['employed'] = census_df['TRABAJÓ'].astype(int) + census_df['NO TRABAJÓ PERO TENIA TRABAJO'].astype(int)
census_df['armed'] = 0
census_df['population'] = census_df['PERSONAS']

census_df = census_df[['original_id', 'population', 'inforce', 'employed', 'armed']]

census_df.head()

Unnamed: 0,original_id,population,inforce,employed,armed
0,11001100000000110101,4224,2090,1749,0
1,11001100000000110201,2658,1483,878,0
2,11001100000000110202,2305,1122,948,0
3,11001100000000110203,2159,983,810,0
4,11001100000000110301,6047,3036,2252,0


In [53]:
census_df.to_sql('temptable_{}'.format(CITY.lower()), engine, if_exists='replace', index=False)

In [54]:
sql = """
insert into census (bid, population, employed, inforce, tot_survey, city) 
select b.bid, c.population, c.employed+c.armed, c.inforce, c.population, '{city}' 
from temptable_{tempname} c 
inner join blocks_group b on b.original_id = c.original_id;
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

### Residential stability

In [55]:
stab_df = pd.read_csv('../../data/bogota/residential_stability/residential_stability.csv', dtype={'ccnct': str})

stab_df['total'] = stab_df['changed'] + stab_df['nochanged']
stab_df['stable'] = stab_df['nochanged']
stab_df['original_id'] = stab_df['ccnct']
stab_df = stab_df[['original_id', 'stable', 'total']]

stab_df['original_id'] = stab_df['original_id'].str[:6].astype(str) + stab_df['original_id'].str[8:]
# Fix for Sectors
stab_df.loc[:, 'original_id'] = stab_df['original_id'].str[:-2]
stab_df = stab_df.groupby('original_id', as_index=False).sum()

stab_df['total'] = stab_df['total'].astype(int)
stab_df['stable'] = stab_df['stable'].astype(int)
stab_df['total2'] = stab_df['total']
stab_df['owner'] = stab_df['total']

stab_df.head()

Unnamed: 0,original_id,stable,total,total2,owner
0,11001100000000110101,2549,4224,4224,4224
1,11001100000000110201,1331,2304,2304,2304
2,11001100000000110202,1515,2305,2305,2305
3,11001100000000110203,1379,2159,2159,2159
4,11001100000000110301,4060,5788,5788,5788


In [56]:
stab_df.to_sql('temptable', engine, if_exists='replace', index=False)

In [57]:
sql = """
INSERT INTO residential_stability (bid, city, total, stable, total2, owner) 
SELECT b.bid, 'bogota', c.total, c.stable, c.total2, c.owner 
FROM temptable c 
INNER JOIN blocks_group b ON b.original_id = c.original_id;
"""

result = engine.execute(text(sql))

### Ethnic diversity

In [58]:
eth_df = pd.read_csv('../../data/bogota/ethnic_diversity/ethnic_diversity.csv', dtype={'ccnct': str})
eth_df = eth_df.rename(columns={
    'ccnct': 'original_id'
})
eth_df['original_id'] = eth_df['original_id'].str[:6].astype(str) + eth_df['original_id'].str[8:]
# Fix for Sectors
eth_df.loc[:, 'original_id'] = eth_df['original_id'].str[:-2]
eth_df = eth_df.groupby('original_id', as_index=False).sum()
eth_df.head()

Unnamed: 0,original_id,indigena,rom,raizal,palanguero,negro,ninguno
0,11001100000000110101,15,0,2,0,156,4050
1,11001100000000110201,8,0,0,0,73,2223
2,11001100000000110202,1,0,0,0,47,2257
3,11001100000000110203,0,0,0,0,68,2091
4,11001100000000110301,24,0,1,0,94,5669


In [59]:
eth_df.to_sql('temptable_{}'.format(CITY.lower()), engine, if_exists='replace', index=False)

In [60]:
sql = """
INSERT INTO ethnic_diversity (bid, city, race1, race2, race3, race4, race5, race6) 
SELECT b.bid, '{city}', c.ninguno, c.negro, c.indigena, c.raizal, c.palanguero, c.rom
FROM temptable_{tempname} c 
INNER JOIN blocks_group b ON b.original_id = c.original_id;
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

### Poverty

In [61]:
pov_gdf = gpd.read_file('zip://../../data/bogota/poverty/povert_filtered.zip')
pov_gdf = pov_gdf[(~(pov_gdf['geometry'].isnull()))]

pov_gdf.crs = {'init': 'epsg:4326'}

pov_gdf.head()

Unnamed: 0,npersons,geometry
0,4,"POLYGON ((-74.33891300000001 4.024703, -74.338..."
1,0,"POLYGON ((-74.327859 4.130376, -74.327828 4.13..."
2,43,"POLYGON ((-74.314576 4.014743, -74.314564 4.01..."
3,55,"POLYGON ((-74.114158 4.181829, -74.11412900000..."
4,41,"POLYGON ((-74.15319 4.196809, -74.153191000000..."


In [62]:
insert_gdf = process_geometry_SQL_insert(pov_gdf)
insert_gdf.to_sql('temptable', engine, if_exists='replace', index=False, dtype={'geom': Geometry('MultiPolygon', srid=4326)})

In [63]:
sql = """
UPDATE temptable p SET geom=ST_Multi(ST_buffer(p.geom, 0.0)) 
FROM
boundary b
WHERE b.city = 'bogota' AND ST_Intersects(p.geom, b.geom) AND NOT ST_Touches(p.geom, b.geom)
"""

result = engine.execute(text(sql))

In [64]:
sql = """
INSERT INTO poverty_index (bid, city, total, poors) 
SELECT b.bid, b.city, ce.population, AVG((ST_AREA(ST_INTERSECTION(c.geom, b.geom))/ST_AREA(c.geom)*npersons::float))
FROM blocks_group b
INNER JOIN census ce ON ce.bid = b.bid AND ce.city = b.city
LEFT JOIN temptable c ON ST_INTERSECTS(c.geom, b.geom) AND ST_AREA(ST_INTERSECTION(c.geom, b.geom))/ST_AREA(b.geom) > 0.5
WHERE b.city = 'bogota'
GROUP by b.bid, b.city, population;
"""

result = engine.execute(text(sql))

## Buildings

In [65]:
bld_gdf = gpd.read_file('zip://../../data/bogota/buildings/Cons.zip')

bld_gdf = bld_gdf.rename(columns={'ConNPisos': 'floors'})
bld_gdf = bld_gdf[(~((bld_gdf['floors'] == 1) & (bld_gdf['floors'] == bld_gdf['ConNSotano']))) & ((bld_gdf['floors'] > 0))]

bld_gdf = bld_gdf[['floors', 'geometry']]
bld_gdf = bld_gdf.to_crs({'init': 'epsg:4326'}) 

bld_gdf.head()

Unnamed: 0,floors,geometry
0,3,POLYGON ((-74.06812331399993 4.628244778000067...
1,2,POLYGON ((-74.06798451299994 4.628095871000085...
2,2,"POLYGON ((-74.0679410539999 4.628066283000067,..."
3,1,POLYGON ((-74.06812777099992 4.628047357000071...
4,3,POLYGON ((-74.06795040799994 4.627963112000089...


In [66]:
len(bld_gdf)

2295275

In [67]:
insert_gdf = process_geometry_SQL_insert(bld_gdf)
insert_gdf.to_sql('temptable', engine, if_exists='replace', index=False, dtype={'geom': Geometry('MultiPolygon', srid=4326)})

In [68]:
sql = """
UPDATE temptable p SET geom=ST_Multi(ST_buffer(p.geom, 0.0)) 
FROM
boundary b
WHERE b.city = 'bogota' AND ST_Intersects(p.geom, b.geom) AND NOT ST_Touches(p.geom, b.geom)
"""

result = engine.execute(text(sql))

In [69]:
sql = """
INSERT INTO building (bid, city, geom, floors, area) 
SELECT bid, 'bogota', geom, floors, barea
FROM (
    SELECT bid, geom, floors, barea, ROW_NUMBER() OVER (PARTITION BY geom ORDER BY area DESC) AS r
    from (
        SELECT p.geom, p.floors, ST_Area(p.geom::geography) as barea, d.bid, ST_Area(ST_Intersection(p.geom, d.geom)) as area
        FROM temptable as p
        INNER JOIN blocks_group as d on ST_Intersects(p.geom, d.geom) AND NOT ST_Touches(p.geom, d.geom)
        WHERE d.city = 'bogota' AND ST_Area(p.geom::geography) >= 40
        ) as dtable
    order by area
) x
WHERE x.r = 1;
"""

result = engine.execute(text(sql))

## Land use

In [167]:
land_gdf = gpd.read_file('../../data/bogota/land_use/Lots_2014.gpkg')
land_gdf = land_gdf[(~(land_gdf['geometry'].isnull())) & (~(land_gdf['LotCodigo'].isnull()))]

land_gdf = land_gdf[['LotCodigo', 'geometry']]

#land_gdf = land_gdf.to_crs({'init': 'epsg:4326'}) 

land_gdf.head()

Unnamed: 0,LotCodigo,geometry
0,4597039009,(POLYGON ((-74.20333918599995 4.60793434200007...
1,4593071010,(POLYGON ((-74.18928611899992 4.62772831500007...
2,4597039035,(POLYGON ((-74.20295012199995 4.60799430000008...
3,4597039020,(POLYGON ((-74.20311757299993 4.60791707800007...
4,1401046024,(POLYGON ((-74.09880285599991 4.57603526100007...


In [168]:
from simpledbf import Dbf5

dbf = Dbf5('../../data/bogota/land_use/Uso.dbf')
land_use_df = dbf.to_dataframe()

land_use_df = land_use_df.rename(columns={
    'USOAREA': 'sqftmain', 
    'USOTUSO': 'usecode',
    'USOCLOTE': 'LotCodigo'
})#.drop('OBJECTID', axis=1)

land_use_df.head()

Unnamed: 0,OBJECTID,LotCodigo,usecode,sqftmain
0,5100440.0,9233060055,38,291.3
1,5100441.0,8512012050,1,281.4
2,5100442.0,9209084054,1,57.0
3,5100443.0,9117040002,1,265.03
4,5100444.0,8512008010,4,159.13


In [170]:
land_gdf = pd.merge(land_gdf, land_use_df, on='LotCodigo')

In [175]:
land_gdf['landuse'] = 'none'

land_gdf.loc[land_gdf['usecode'].isin({'001', '002', '037', '038'}), 'landuse'] = 'residential'

# Be careful of the NOT in the query
land_gdf.loc[~land_gdf['usecode'].isin({'001', '002', '023', '029', '030', '031', '032', '036', '037', '038', '047', '048', '052', '065', '090'}), 'landuse'] = 'commercial'

land_gdf.loc[land_gdf['usecode'].isin({'023', '029', '030', '031', '032', '036',  '047', '052', '065'}), 'landuse'] = 'recreational'
land_gdf.loc[land_gdf['usecode'].isin({'090'}), 'landuse'] = 'vacant'

In [178]:
ins_gdf = process_geometry_SQL_insert(land_gdf)
ins_gdf.to_sql('temptable_{}'.format(CITY.lower()), engine, if_exists='replace', index=False, dtype={'geom': Geometry('MultiPolygon', srid=4326)})

In [179]:
sql = """
UPDATE temptable_{tempname} p SET geom=ST_Multi(ST_buffer(p.geom, 0.0)) 
FROM
boundary b
WHERE b.city = '{city}' AND ST_Intersects(p.geom, b.geom) AND NOT ST_Touches(p.geom, b.geom);
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

In [180]:
sql = """
INSERT INTO land_uses (bid, city, use_type, area) 
SELECT bid, '{city}', landuse, SUM(sqftmain) 
FROM (
    SELECT bid, landuse, sqftmain, ROW_NUMBER() OVER (PARTITION BY pid ORDER BY area DESC) AS r
    from (
        SELECT p.\"OBJECTID\" as pid, p.landuse, (CASE WHEN p.landuse <> 'vacant' THEN p.sqftmain ELSE ST_AREA(p.geom::geography) END) as sqftmain, d.bid, ST_Area(ST_Intersection(p.geom, d.geom)) as area
        FROM temptable_{tempname} as p
        INNER JOIN blocks_group as d on ST_Intersects(p.geom, d.geom) AND NOT ST_Touches(p.geom, d.geom)
        WHERE d.city = '{city}' AND p.landuse <> 'none' AND ST_Isvalid(p.geom)
        ) as dtable
    order by area
) x
WHERE x.r = 1
GROUP BY bid, landuse;
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

## Income (Extra)

In [181]:
from simpledbf import Dbf5

dbf = Dbf5('../../data/bogota/poverty/ESoc.dbf')
strata_df = dbf.to_dataframe()

strata_df = strata_df.rename(columns={
    'ESoCLote': 'LotCodigo',
}).drop('OBJECTID', axis=1)#.drop('ESoChip', axis=1)

strata_df.head()

Unnamed: 0,LotCodigo,ESoChip,ESoEstrato
0,4630014009,AAA0148WSAW,1
1,4630014011,AAA0148WSCN,1
2,4630014015,AAA0148WSHK,1
3,4630014016,AAA0148WSJZ,1
4,4630014019,AAA0148WSMR,1


In [182]:
len(strata_df), len(strata_df.drop_duplicates(subset={'LotCodigo'}))

(2221946, 763864)

In [183]:
lots_list_df = strata_df.drop_duplicates(subset={'LotCodigo'})[['LotCodigo']]
lots_list_df.head()

Unnamed: 0,LotCodigo
0,4630014009
1,4630014011
2,4630014015
3,4630014016
4,4630014019


In [184]:
import os
from math import modf
from joblib import Parallel, delayed

def fetch_new_objectid(row):
    payload = {'where': "LOTCODIGO='{}'".format(row), 'returnIdsOnly': True, 'f': 'json'}
    r = requests.post("https://serviciosgis.catastrobogota.gov.co/arcgis/rest/services/catastro/lote/MapServer/0/query", params=payload)
    try:
        ids = r.json()['objectIds'][0]
    except:
        ids = None
    return {'LotCodigo': str(row), 'objectid': ids}



if os.path.isfile('../../data/bogota/poverty/lot2objectid.csv'):
    lot2objectid_df = pd.read_csv('../../data/bogota/poverty/lot2objectid.csv')
else:
    lotcodigos = lots_list_df['LotCodigo'].values
    dict_lot_objid = Parallel(n_jobs=30)(delayed(fetch_new_objectid)(x) for x in tqdm(lotcodigos))
    
    lot2objectid_df = pd.DataFrame(dict_lot_objid)
    lot2objectid_df = lot2objectid_df[~lot2objectid_df.objectid.isnull()]
    lot2objectid_df['objectid'] = lot2objectid_df['objectid'].astype(str)
    lot2objectid_df.to_csv('../../data/bogota/poverty/lot2objectid.csv', index=False)

lot2objectid_df.head()

Unnamed: 0,LotCodigo,objectid
0,4630014009,724028.0
1,4630014011,724033.0
2,4630014015,728447.0
3,4630014016,728450.0
4,4630014019,725431.0


In [185]:
import itertools 

def fetch_object_info(row):
    payload = {'objectIds': str(int(row)), 'f': 'json', 'relationshipId': 2, 'outFields': 'PRECHIP,PREACONST,PREATERRE,PREVETUSTZ', 'returnGeometry': False}
    r = requests.get("https://serviciosgis.catastrobogota.gov.co/arcgis/rest/services/catastro/lote/MapServer/0/queryRelatedRecords", params=payload)
    try:
        infos = r.json()['relatedRecordGroups'][0]['relatedRecords']
        new_infos = []
        for x in infos:
            x['attributes']['objectid'] = str(int(row))
            new_infos.append(x['attributes'])
        infos = new_infos
    except:
        infos = [{'objectid': str(int(row)), 'PRECHIP': None,'PREACONST': None,'PREATERRE': None,'PREVETUSTZ': None}]
    return infos


if os.path.isfile('../../data/bogota/poverty/chips.csv'):
    chip_info_df = pd.read_csv('../../data/bogota/poverty/chips.csv')
else:
    dict_chips = Parallel(n_jobs=30)(delayed(fetch_object_info)(x) for x in tqdm(lot2objectid_df['objectid'].values))
    dict_chips_flatten = list(itertools.chain.from_iterable(dict_chips))
    chip_info_df = pd.DataFrame(dict_chips_flatten)
    #chip_info_df = chip_info_df.drop_duplicates(subset='PRECHIP')
    chip_info_df.to_csv('../../data/bogota/poverty/chips.csv', index=False)

chip_info_df.head()

Unnamed: 0,PREACONST,PREATERRE,PRECHIP,PREVETUSTZ,objectid
0,147.5,71.8,AAA0148WSAW,1996.0,724028
1,293.2,70.2,AAA0148WSCN,1996.0,724033
2,98.4,70.9,AAA0148WSHK,2008.0,728447
3,124.4,71.8,AAA0148WSJZ,1986.0,728450
4,25.0,77.5,AAA0148WSMR,1986.0,725431


In [186]:
chip_info_df['objectid'] = chip_info_df['objectid'].astype(str)
lot2objectid_df['objectid'] = lot2objectid_df['objectid'].astype(str).str[:-2]
val_chips_df = pd.merge(chip_info_df, lot2objectid_df, on='objectid')
val_chips_df.head()

Unnamed: 0,PREACONST,PREATERRE,PRECHIP,PREVETUSTZ,objectid,LotCodigo
0,147.5,71.8,AAA0148WSAW,1996.0,724028,4630014009
1,293.2,70.2,AAA0148WSCN,1996.0,724033,4630014011
2,98.4,70.9,AAA0148WSHK,2008.0,728447,4630014015
3,124.4,71.8,AAA0148WSJZ,1986.0,728450,4630014016
4,25.0,77.5,AAA0148WSMR,1986.0,725431,4630014019


In [187]:
len(val_chips_df)

2254424

In [188]:
strata_chips_df = pd.merge(val_chips_df[['PREACONST', 'PRECHIP']].rename(columns={'PRECHIP': 'ESoChip'}), 
         strata_df[['ESoChip', 'LotCodigo', 'ESoEstrato']], on=['ESoChip'])
strata_chips_df.head()

Unnamed: 0,PREACONST,ESoChip,LotCodigo,ESoEstrato
0,147.5,AAA0148WSAW,4630014009,1
1,293.2,AAA0148WSCN,4630014011,1
2,98.4,AAA0148WSHK,4630014015,1
3,124.4,AAA0148WSJZ,4630014016,1
4,25.0,AAA0148WSMR,4630014019,1


In [189]:
unique_land_gdf = land_gdf.copy()
unique_land_gdf.loc[:, 'x'] = unique_land_gdf.geometry.centroid.x
unique_land_gdf.loc[:, 'y'] = unique_land_gdf.geometry.centroid.y
unique_land_gdf = unique_land_gdf.drop_duplicates(subset=['x', 'y'])

In [190]:
strata_chips_df.to_sql('temptable2', engine, if_exists='replace', index=False)

In [191]:
ins_gdf = process_geometry_SQL_insert(unique_land_gdf)
ins_gdf.to_sql('temptable3'.format(CITY.lower()), engine, if_exists='replace', index=False, dtype={'geom': Geometry('MultiPolygon', srid=4326)})

In [192]:
sql = """
CREATE INDEX ON temptable2 ("LotCodigo");
CREATE INDEX ON temptable3 ("LotCodigo");
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

In [193]:
sql = """
INSERT INTO property_valuev2 (bid, value, area) 
SELECT bid, strato, area_building
FROM (
    SELECT bid, area_building, strato, ROW_NUMBER() OVER (PARTITION BY chip_id ORDER BY area DESC) AS r
    from (
        SELECT p2."ESoChip"::text as chip_id, d.bid, p2."PREACONST"::float as area_building, p2."ESoEstrato"::float as strato, ST_Area(ST_Intersection(p.geom, d.geom)::geography) as area
        FROM temptable2 as p2
        INNER JOIN temptable3 as p ON p."LotCodigo" = p2."LotCodigo"
        INNER JOIN blocks_group as d on ST_Intersects(p.geom, d.geom) AND NOT ST_Touches(p.geom, d.geom)
        WHERE d.city = '{city}' 
        ) as dtable
    order by area
) x
WHERE x.r = 1;
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

## Property age

In [194]:
age_df = val_chips_df[~val_chips_df['PREVETUSTZ'].isnull()][['PRECHIP', 'LotCodigo', 'PREACONST', 'PREVETUSTZ']].copy()
age_df['LotCodigo'] = age_df['LotCodigo'].astype(str)
age_df.loc[:, 'LotCodigo'] = age_df['LotCodigo'].str.zfill(12)
age_df.head()

Unnamed: 0,PRECHIP,LotCodigo,PREACONST,PREVETUSTZ
0,AAA0148WSAW,4630014009,147.5,1996.0
1,AAA0148WSCN,4630014011,293.2,1996.0
2,AAA0148WSHK,4630014015,98.4,2008.0
3,AAA0148WSJZ,4630014016,124.4,1986.0
4,AAA0148WSMR,4630014019,25.0,1986.0


In [195]:
age_df.to_sql('temptable4', engine, if_exists='replace', index=False)

In [196]:
sql = """
CREATE INDEX ON temptable4 ("LotCodigo");
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

In [197]:
sql = """
INSERT INTO property_age (bid, age, area, city) 
SELECT bid, age, area_building, '{city}'
FROM (
    SELECT bid, area_building, age, ROW_NUMBER() OVER (PARTITION BY chip_id ORDER BY area DESC) AS r
    from (
        SELECT p2."PRECHIP"::text as chip_id, d.bid, p2."PREACONST"::float as area_building, p2."PREVETUSTZ"::int as age, ST_Area(ST_Intersection(p.geom, d.geom)::geography) as area
        FROM temptable4 as p2
        INNER JOIN temptable3 as p ON p."LotCodigo" = p2."LotCodigo"
        INNER JOIN blocks_group as d on ST_Intersects(p.geom, d.geom) AND NOT ST_Touches(p.geom, d.geom)
        WHERE d.city = '{city}' 
        ) as dtable
    order by area
) x
WHERE x.r = 1;
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

## Land values

In [198]:
pvalues_gdf = gpd.read_file('zip://../../data/bogota/land_use/property_values.geojson.zip')
pvalues_gdf = pvalues_gdf[(~(pvalues_gdf['geometry'].isnull()))] 

pvalues_gdf.head()

Unnamed: 0,gid,vrevalor,value,geometry
0,35019,450000,450000,(POLYGON ((-74.10787133899998 4.60391339900002...
1,12,100,100,(POLYGON ((-74.18296181099998 4.22776320700001...
2,584,1800,1800,(POLYGON ((-74.18256634399999 4.34027581100002...
3,97,100,100,(POLYGON ((-74.20164602599999 4.17417431899997...
4,108,100,100,(POLYGON ((-74.20409400699998 4.15925419500001...


In [199]:
ins_gdf = process_geometry_SQL_insert(pvalues_gdf)
ins_gdf.to_sql('temptable6_{}'.format(CITY.lower()), engine, if_exists='replace', index=False, dtype={'geom': Geometry('MultiPolygon', srid=4326)})

In [200]:
sql = """
UPDATE temptable6_{tempname} p SET geom=ST_Multi(ST_buffer(p.geom, 0.0));
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

In [201]:
sql = """
DELETE FROM temptable6_{tempname} t WHERE NOT EXISTS(
    SELECT * 
    FROM temptable_{tempname} l WHERE l.landuse <> 'none' AND ST_INTERSECTS(l.geom, t.geom) AND ST_AREA(ST_Intersection(l.geom, t.geom))/ST_AREA(l.geom) > 0.5
);
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

In [202]:
sql = """
INSERT INTO property_value (bid, area, value, city) 
SELECT bid, area, value, '{city}'
FROM (
    SELECT bid, area, value, ROW_NUMBER() OVER (PARTITION BY bid ORDER BY area DESC) AS r
    from (
        SELECT ST_Area(ST_Intersection(p.geom, d.geom)::geography) as area, d.bid, p.vrevalor::float as value
        FROM temptable6_{tempname} as p
        INNER JOIN blocks_group as d on ST_Intersects(p.geom, d.geom) AND NOT ST_Touches(p.geom, d.geom)
        WHERE d.city = '{city}' AND ST_Isvalid(p.geom)
        ) as dtable
    order by area
) x
WHERE x.r = 1;
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

## Unused areas

In [203]:
unused_gdf = gpd.read_file('zip://../../data/bogota/unused_area/CAgu.zip')
unused_gdf = unused_gdf[['geometry']]

unused_gdf['type'] = 'water'
unused_gdf['city'] = CITY
unused_gdf.head()

Unnamed: 0,geometry,type,city
0,POLYGON ((-74.12166546899994 4.327357672000062...,water,bogota
1,"POLYGON ((-74.12027904299993 4.32474654300006,...",water,bogota
2,POLYGON ((-74.12679069799992 4.305990166000072...,water,bogota
3,"POLYGON ((-74.1700288479999 4.386548856000047,...",water,bogota
4,POLYGON ((-74.06112364799992 4.677830766000056...,water,bogota


In [204]:
ins_gdf = process_geometry_SQL_insert(unused_gdf)
ins_gdf.to_sql('unused_areas', engine, if_exists='append', index=False, dtype={'geom': Geometry('MultiPolygon', srid=4326)})

In [205]:
unused_gdf = gpd.read_file('../../data/bogota/unused_area/parks_rivers_osm.geojson')
unused_gdf = unused_gdf[['geometry']]

unused_gdf['type'] = 'park'
unused_gdf['city'] = CITY
unused_gdf.head()

Unnamed: 0,geometry,type,city
0,"POLYGON ((-74.1293891 4.6688163, -74.1294596 4...",park,bogota
1,"POLYGON ((-74.20716830000001 4.800857, -74.207...",park,bogota
2,"(POLYGON ((-74.04452259999999 4.7822051, -74.0...",park,bogota
3,"POLYGON ((-74.12229170000001 4.675506, -74.122...",park,bogota
4,"POLYGON ((-74.1612729 4.6214761, -74.1614759 4...",park,bogota


In [206]:
unused_gdf = unused_gdf[(unused_gdf.geometry.geom_type == 'Polygon') | (unused_gdf.geometry.geom_type == 'MultiPolygon')]
    

In [207]:
ins_gdf = process_geometry_SQL_insert(unused_gdf)
ins_gdf.to_sql('unused_areas', engine, if_exists='append', index=False, dtype={'geom': Geometry('MultiPolygon', srid=4326)})

Bogota has streets in the parcels, so let's remove them

In [210]:
unused_gdf = gpd.read_file('../../data/bogota/unused_area/street_areas.gpkg')
unused_gdf = unused_gdf[['geometry']]

unused_gdf['type'] = 'street'
unused_gdf['city'] = CITY
unused_gdf.head()

Unnamed: 0,geometry,type,city
0,(POLYGON ((-74.10046490799994 4.67042331700008...,street,bogota
1,(POLYGON ((-74.09210463799991 4.66452891800008...,street,bogota
2,(POLYGON ((-74.09187302299995 4.66423139500005...,street,bogota
3,(POLYGON ((-74.09125876699994 4.66331810700006...,street,bogota
4,(POLYGON ((-74.08901989099991 4.65981915200006...,street,bogota


In [211]:
ins_gdf = process_geometry_SQL_insert(unused_gdf)
ins_gdf.to_sql('unused_areas', engine, if_exists='append', index=False, dtype={'geom': Geometry('MultiPolygon', srid=4326)})

In [212]:
unused_gdf = gpd.read_file('../../data/bogota/unused_area/parksandrivers.geojson')
unused_gdf = unused_gdf[['geometry']]
unused_gdf = unused_gdf.to_crs({'init': 'epsg:4326'}) 
unused_gdf['type'] = 'parksrivers'
unused_gdf['city'] = CITY
unused_gdf.head()

Unnamed: 0,geometry,type,city
0,"POLYGON ((-74.0480755 4.6914119, -74.048240000...",parksrivers,bogota
1,"POLYGON ((-74.085229 4.6393695, -74.0851914 4....",parksrivers,bogota
2,"POLYGON ((-74.0861987 4.63813, -74.08573730000...",parksrivers,bogota
3,"POLYGON ((-74.10028149999999 4.7177718, -74.10...",parksrivers,bogota
4,"POLYGON ((-74.12229170000001 4.675506, -74.122...",parksrivers,bogota


In [213]:
ins_gdf = process_geometry_SQL_insert(unused_gdf)
ins_gdf.to_sql('temptable_{tempname}'.format(tempname=CITY.lower()), engine, if_exists='replace', index=False, dtype={'geom': Geometry('MultiPolygon', srid=4326)})

In [214]:
sql = """
INSERT INTO unused_areas (geom, type, city) 
SELECT p.geom, p.type, p.city
FROM temptable_{tempname} as p
WHERE ST_Isvalid(p.geom) AND NOT EXISTS(SELECT * FROM unused_areas u WHERE ST_Intersects(u.geom, p.geom) AND u.city=p.city)
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

In [215]:
sql = """
update unused_areas set geom=st_multi(st_buffer(geom, 0.0)) WHERE city = '{city}';
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

## Net area

In [216]:
land_gdf = process_geometry_SQL_insert(unique_land_gdf)
land_gdf.to_sql('temptable_u_{}'.format(CITY.lower()), engine, if_exists='replace', index=False, dtype={'geom': Geometry('MultiPolygon', srid=4326)})

In [217]:
sql = """
UPDATE temptable_u_{tempname} p SET geom=ST_Multi(ST_buffer(p.geom, 0.0)) 
FROM
boundary b
WHERE b.city = '{city}' AND ST_Intersects(p.geom, b.geom) AND NOT ST_Touches(p.geom, b.geom);
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

In [218]:
sql = """
DELETE 
FROM temptable_u_{tempname} t
USING unused_areas u 
WHERE u.city = '{city}' AND ST_Intersects(u.geom, t.geom) AND (NOT ST_Touches(u.geom, t.geom)) 
AND (ST_Contains(u.geom, t.geom) OR ST_AREA(ST_Intersection(t.geom, u.geom))/ST_Area(t.geom) > 0.5);
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

In [220]:
sql = """
INSERT INTO spatial_groups_net_area (sp_id, city, spatial_name, used_area) 
SELECT sp_id, city, spatial_name, SUM(ST_Area(t.geom::geography))/1000000.
FROM temptable_u_{tempname} t
INNER JOIN spatial_groups s ON ST_Contains(s.approx_geom, t.geom)
WHERE s.city = '{city}' 
GROUP BY sp_id, city, spatial_name;
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

We don't clean directly the blocks as land use is not clean

In [221]:
land_gdf = gpd.read_file('../../data/bogota/land_use/Lots_2014.gpkg')
land_gdf = land_gdf[(~(land_gdf['geometry'].isnull())) & (~(land_gdf['LotCodigo'].isnull()))]

land_gdf = land_gdf[['LotCodigo', 'geometry']]

#land_gdf = land_gdf.to_crs({'init': 'epsg:4326'}) 

land_gdf.head()

Unnamed: 0,LotCodigo,geometry
0,4597039009,(POLYGON ((-74.20333918599995 4.60793434200007...
1,4593071010,(POLYGON ((-74.18928611899992 4.62772831500007...
2,4597039035,(POLYGON ((-74.20295012199995 4.60799430000008...
3,4597039020,(POLYGON ((-74.20311757299993 4.60791707800007...
4,1401046024,(POLYGON ((-74.09880285599991 4.57603526100007...


In [222]:
ins_gdf = process_geometry_SQL_insert(land_gdf)
ins_gdf.to_sql('temptable_{}'.format(CITY.lower()), engine, if_exists='replace', index=False, dtype={'geom': Geometry('MultiPolygon', srid=4326)})

In [223]:
## This deletes the blocks that are related to streets
sql = """
DELETE FROM block b
WHERE city='{city}' and NOT EXISTS (select * from temptable_{tempname} t where st_intersects(t.geom, b.geom));
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

## Roads

Run this command

'osm2pgsql -c -d crime-environment --create --style "config/osm2pgsql.style" --multi-geometry  --port 50013 --number-processes 5 --latlong -C 30000 data/bogota/OSM/bogota_colombia.osm.bz2'

In [110]:
sql = """
INSERT INTO roads (geom, city) 
SELECT ST_MULTI(ST_LineSubstring(geom, 0.002*n/length,
  CASE
	WHEN 0.002*(n+1) < length THEN 0.002*(n+1)/length
	ELSE 1
  END)) As geom, city 
FROM
  (SELECT b.city, ST_LineMerge(p.way) AS geom,
  ST_Length(p.way) As length
  FROM planet_osm_roads p
  INNER JOIN boundary b ON ST_Intersects(p.way, b.geom) AND NOT ST_Touches(p.way, b.geom)
  WHERE b.city = 'bogota'
  ) AS t
CROSS JOIN generate_series(0, 50) AS n
WHERE n*0.002/length < 1;
"""

result = engine.execute(text(sql))

## Vacuums

In [111]:
sql = """
REFRESH MATERIALIZED VIEW block_w_buildings;
"""

result = engine.execute(text(sql))

In [112]:
sql = """
REFRESH MATERIALIZED VIEW block_centroids;
"""

result = engine.execute(text(sql))

In [113]:
sql = """
REFRESH MATERIALIZED VIEW pois_requests;
"""

result = engine.execute(text(sql))

In [114]:
from collections import defaultdict
from joblib import Parallel, delayed

In [115]:
def make_trip(lon1, lat1, dest):
    # PERSONALIZE HERE
    r = requests.get(
        'http://localhost:5000/table/v1/foot/{lon1},{lat1};{dest}?annotations=distance&sources=0'.format(lon1=lon1, lat1=lat1, dest=dest))
    distances = r.json()['distances']
    distance = np.array(distances[0][1:], np.float32)
    return distance

def walkscore_list(bid, clon, clat, list_dests, ws, straight_distances):
    dists = make_trip(clon, clat, list_dests)
    straight_distances = np.array(straight_distances)
    #good_idxs = np.argwhere(dists < 1600)
    ws = np.array(ws)+ 0.00000001 #[good_idxs] 
    zeros_idxs = np.argwhere(dists == 0)
    dists[zeros_idxs] = 1
    if np.sum(ws) == 0 or len(ws) == 0:
        return bid, -1
    return bid, np.average(straight_distances/dists, weights=ws) #[good_idxs] [good_idxs]

cat_limit = {
		'grocery': 3,
		'Food': 3,
		'Shops': 2,
		'Schools': 1,
		'Entertainment': 1,
		'Parks and outside': 1,
		'Coffee': 2,
		'Banks': 1,
		'Books': 1
	}

def walkscore(meters):
	max_walk = 1500
	score = np.exp(-5 * (meters / max_walk) ** 5)
	score = np.clip(score, 0, 1)
	return score

def walkscore2_list(bid, clon, clat, list_dests, c):
    dists = make_trip(clon, clat, list_dests)
    #good_idxs = np.argwhere(dists < 1600)
    limit = cat_limit[c]
    
    w = walkscore(dists)
    d = np.zeros(limit)
    d[:w.shape[0]] = w
    
    w = np.average(d)
    
    return bid, w, cat_limit[c] #[good_idxs] [good_idxs]

In [116]:
sql = """
SELECT bid, lon, lat, dests, parent_cat FROM pois_requests WHERE city = 'bogota'
"""

blocks_df = pd.read_sql_query(sql, con=engine)
blocks_df.head()

Unnamed: 0,bid,lon,lat,dests,parent_cat
0,248293,-74.051166,4.764691,"-74.046939,4.761992",Banks
1,248293,-74.051416,4.764832,"-74.046939,4.761992",Banks
2,248293,-74.05165,4.764976,"-74.046939,4.761992",Banks
3,248293,-74.052008,4.764952,"-74.046939,4.761992",Banks
4,248293,-74.050987,4.765408,"-74.046939,4.761992",Banks


In [117]:
list(set(blocks_df.parent_cat.values))

['Coffee',
 'grocery',
 'Food',
 'Shops',
 'Schools',
 'Banks',
 'Books',
 'Entertainment']

In [118]:
block_groups = defaultdict(list)
for index, row in blocks_df.iterrows():
    block_groups[row['bid']].append(row.values[1:])

In [119]:
print(len(blocks_df))
results = [(idx, score, w) for idx, score, w in Parallel(n_jobs=10, verbose=5)(delayed(walkscore2_list)(bid, req[0], req[1], req[2], req[3]) for bid, reqs in block_groups.items() for req in reqs)]

247980


[Parallel(n_jobs=10)]: Done 1444 tasks      | elapsed:    0.9s
[Parallel(n_jobs=10)]: Done 10624 tasks      | elapsed:    4.4s
[Parallel(n_jobs=10)]: Done 23476 tasks      | elapsed:    9.3s
[Parallel(n_jobs=10)]: Done 40000 tasks      | elapsed:   15.8s
[Parallel(n_jobs=10)]: Done 60196 tasks      | elapsed:   23.8s
[Parallel(n_jobs=10)]: Done 84064 tasks      | elapsed:   33.2s
[Parallel(n_jobs=10)]: Done 111604 tasks      | elapsed:   44.1s
[Parallel(n_jobs=10)]: Done 142816 tasks      | elapsed:   56.4s
[Parallel(n_jobs=10)]: Done 177700 tasks      | elapsed:  1.2min
[Parallel(n_jobs=10)]: Done 216256 tasks      | elapsed:  1.4min
[Parallel(n_jobs=10)]: Done 247980 out of 247980 | elapsed:  1.6min finished


In [120]:
block_vacuum_index = defaultdict(list)
block_vacuum_index2 = defaultdict(list)

for bid, score, ws in results:
    block_vacuum_index[bid].append(score)
    block_vacuum_index2[bid].append(ws)

for (bid, score), ws in zip(block_vacuum_index.items(), block_vacuum_index2.values()):
    scores = np.array(score)
    if len(scores) > 0:
        sql = "INSERT INTO walk_index (bid, score) VALUES ({}, {})".format(bid, np.average(scores, weights=np.array(ws)))
        result = engine.execute(text(sql))

## Crime

In [121]:
df = pd.read_csv('../../data/bogota/crime/homicidios.csv')
df = df.rename(columns={'2014': 'num', 'fecha': 'FECHA'})
df['MODALIDAD'] = 'homicidios'
df.head()

Unnamed: 0,LOCALIDADES,LATITUD,LONGITUD,HORA,MODALIDAD,num,FECHA
0,COMANDO ZONA SUMAPAZ,4.049379,-74.282926,00:00,homicidios,3,01/11/2014
1,COMANDO ZONA SUMAPAZ,4.442501,-74.164324,17:30,homicidios,1,04/05/2014
2,COMANDO ZONA SUMAPAZ,4.443186,-74.139348,19:00,homicidios,1,03/16/2014
3,COMANDO ZONA SUMAPAZ,4.46321,-74.096604,13:42,homicidios,1,11/09/2014
4,ESTACION E-01 USAQUEN,4.687711,-74.044076,05:00,homicidios,1,12/18/2014


In [122]:
df2 = pd.read_csv('../../data/bogota/crime/hurto.csv')
df2['num'] = df2['HURTO A RESIDENCIAS'] + df2['HURTO A COMERCIO'] + df2['HURTO A PERSONAS']
df2.head()

Unnamed: 0,LATITUD,LONGITUD,HORA,MODALIDAD,HURTO A RESIDENCIAS,HURTO A COMERCIO,HURTO A PERSONAS,FECHA,num
0,4.314791,-74.180603,12:18,FACTOR DE OPORTUNIDAD,0,0,1,02/08/2014,1
1,4.599425,-74.026051,18:33,FACTOR DE OPORTUNIDAD,0,0,1,10/08/2014,1
2,4.6049,-74.025365,10:15,FACTOR DE OPORTUNIDAD,0,0,1,08/28/2014,1
3,4.68791,-74.137882,07:30,FACTOR DE OPORTUNIDAD,0,1,0,07/03/2014,1
4,4.68851,-74.12451,13:40,RAPONAZO,0,0,1,09/18/2014,1


In [123]:
print(df.count())
print(df2.count())
crime_df = df[['LATITUD', 'LONGITUD', 'HORA', 'FECHA', 'num', 'MODALIDAD']].append(df2[['LATITUD', 'LONGITUD', 'HORA', 'FECHA', 'num', 'MODALIDAD']])
print(crime_df.count())
crime_df.head()

LOCALIDADES    1286
LATITUD        1286
LONGITUD       1286
HORA           1286
MODALIDAD      1286
num            1286
FECHA          1286
dtype: int64
LATITUD                39165
LONGITUD               39165
HORA                   39165
MODALIDAD              39165
HURTO A RESIDENCIAS    39165
HURTO A COMERCIO       39165
HURTO A PERSONAS       39165
FECHA                  39162
num                    39165
dtype: int64
LATITUD      40451
LONGITUD     40451
HORA         40451
FECHA        40448
num          40451
MODALIDAD    40451
dtype: int64


Unnamed: 0,LATITUD,LONGITUD,HORA,FECHA,num,MODALIDAD
0,4.049379,-74.282926,00:00,01/11/2014,3,homicidios
1,4.442501,-74.164324,17:30,04/05/2014,1,homicidios
2,4.443186,-74.139348,19:00,03/16/2014,1,homicidios
3,4.46321,-74.096604,13:42,11/09/2014,1,homicidios
4,4.687711,-74.044076,05:00,12/18/2014,1,homicidios


In [124]:
print(crime_df.count())
crime_df = crime_df.dropna()
print(crime_df.count())

LATITUD      40451
LONGITUD     40451
HORA         40451
FECHA        40448
num          40451
MODALIDAD    40451
dtype: int64
LATITUD      40448
LONGITUD     40448
HORA         40448
FECHA        40448
num          40448
MODALIDAD    40448
dtype: int64


In [125]:
crime_df = crime_df[~(crime_df['MODALIDAD'].isin(['NO REPORTADO', 'NO REPORTADA']))]

In [126]:
crime_df['datetime'] = pd.to_datetime(crime_df['FECHA'] + ' ' + crime_df['HORA'], format='%m/%d/%Y %H:%M')
crime_df.head()

Unnamed: 0,LATITUD,LONGITUD,HORA,FECHA,num,MODALIDAD,datetime
0,4.049379,-74.282926,00:00,01/11/2014,3,homicidios,2014-01-11 00:00:00
1,4.442501,-74.164324,17:30,04/05/2014,1,homicidios,2014-04-05 17:30:00
2,4.443186,-74.139348,19:00,03/16/2014,1,homicidios,2014-03-16 19:00:00
3,4.46321,-74.096604,13:42,11/09/2014,1,homicidios,2014-11-09 13:42:00
4,4.687711,-74.044076,05:00,12/18/2014,1,homicidios,2014-12-18 05:00:00


### Crime types

In [127]:
crime_types_df = pd.read_csv('../../data/crime_types/bogota_types_categorized.csv')[['MODALIDAD', 'UCR part', 'UCR1']]
crime_types_df['UCR part'] = crime_types_df['UCR part'].astype(int)
crime_types_df.head()

Unnamed: 0,MODALIDAD,UCR part,UCR1
0,FACTOR DE OPORTUNIDAD,1,Larceny-theft (except motor vehicle theft)
1,ATRACO,1,Robbery
2,RAPONAZO,1,Larceny-theft (except motor vehicle theft)
3,COSQUILLEO,1,Larceny-theft (except motor vehicle theft)
4,MECHERO,1,Arson


In [128]:
print(crime_df['num'].count())
crime_df = pd.merge(crime_df, crime_types_df, on='MODALIDAD')
print(crime_df['num'].count())

40401
40401


In [129]:
crime_df = crime_df[crime_df['UCR part'] == 1]
print(crime_df['num'].count())

38135


#### Subtypes of crimes

In [130]:
ucr_crimes_df = pd.read_csv('../../data/crime_types/UCR_crimes.csv')
ucr_crimes_df.head()

Unnamed: 0,Name,Category
0,Criminal homicide,Violent crime
1,Rape,Violent crime
2,Robbery,Violent crime
3,Aggravated assault,Violent crime
4,Burglary (breaking or entering),Property crime


In [131]:
df_ucr1 = pd.merge(crime_df, ucr_crimes_df.rename(columns={'Name': 'UCR1'}), on='UCR1')

a = set(df_ucr1['UCR1'].drop_duplicates().values)
b = set(crime_df['UCR1'].drop_duplicates().values)
assert(a.intersection(b) == a)

# Categories not present in crime dataset
df_ucr1[~(df_ucr1['UCR1'].isin(b))]

Unnamed: 0,LATITUD,LONGITUD,HORA,FECHA,num,MODALIDAD,datetime,UCR part,UCR1,Category


In [132]:
crime_df = df_ucr1.rename(columns={'LATITUD': 'lat', 'LONGITUD':'lng', 'MODALIDAD': 'description'})
crime_df.head()

Unnamed: 0,lat,lng,HORA,FECHA,num,description,datetime,UCR part,UCR1,Category
0,4.049379,-74.282926,00:00,01/11/2014,3,homicidios,2014-01-11 00:00:00,1,Criminal homicide,Violent crime
1,4.442501,-74.164324,17:30,04/05/2014,1,homicidios,2014-04-05 17:30:00,1,Criminal homicide,Violent crime
2,4.443186,-74.139348,19:00,03/16/2014,1,homicidios,2014-03-16 19:00:00,1,Criminal homicide,Violent crime
3,4.46321,-74.096604,13:42,11/09/2014,1,homicidios,2014-11-09 13:42:00,1,Criminal homicide,Violent crime
4,4.687711,-74.044076,05:00,12/18/2014,1,homicidios,2014-12-18 05:00:00,1,Criminal homicide,Violent crime


In [133]:
df_2014 = crime_df[['lng', 'lat', 'description', 'num', 'UCR1', 'Category']]
df_2014.count()

lng            38135
lat            38135
description    38135
num            38135
UCR1           38135
Category       38135
dtype: int64

In [134]:
from geopandas import GeoDataFrame
from shapely.geometry import Point

geometry = [Point(xy) for xy in zip(df_2014.lng, df_2014.lat)]
df_2014 = df_2014.drop(['lng', 'lng'], axis=1)
crs = {'init': 'epsg:4326'}
gdf = GeoDataFrame(df_2014, crs=crs, geometry=geometry)
gdf.head()

Unnamed: 0,lat,description,num,UCR1,Category,geometry
0,4.049379,homicidios,3,Criminal homicide,Violent crime,POINT (-74.2829260745 4.0493789544)
1,4.442501,homicidios,1,Criminal homicide,Violent crime,POINT (-74.1643244398 4.4425012194)
2,4.443186,homicidios,1,Criminal homicide,Violent crime,POINT (-74.1393477095 4.4431858016)
3,4.46321,homicidios,1,Criminal homicide,Violent crime,POINT (-74.0966040266 4.4632095497)
4,4.687711,homicidios,1,Criminal homicide,Violent crime,POINT (-74.0440756453 4.6877113861)


In [135]:
insert_gdf = process_geometry_SQL_insert(gdf)
insert_gdf.to_sql('temptable2', engine, if_exists='replace', index=True, dtype={'geom': Geometry('Point', srid=4326)})

In [136]:
sql = """
insert into crime (sp_id, num, city, ucr1, ucr_category) 
select bid, SUM(num), 'bogota', "UCR1", "Category" from(
SELECT num, bid, "UCR1", "Category", ROW_NUMBER() OVER (PARTITION BY index) AS r
from (
select c.index, c.num, b.bid, "UCR1", "Category"
from temptable2 as c
inner join blocks_group as b on ST_Intersects(b.geom, c.geom)
    ) as dtable
) x
WHERE x.r = 1
group by bid, "UCR1", "Category";
"""

result = engine.execute(text(sql))

## Refresh materialized views

In [137]:
sql = """
REFRESH MATERIALIZED VIEW join_building_ways;
"""

result = engine.execute(text(sql))

In [151]:
sql = """
REFRESH MATERIALIZED VIEW spatial_groups_unused_areas;
"""

result = engine.execute(text(sql))

In [224]:
sql = """
REFRESH MATERIALIZED VIEW block_building;
"""

result = engine.execute(text(sql))

In [153]:
sql = """
REFRESH MATERIALIZED VIEW blocks_group_with_building;
"""

result = engine.execute(text(sql))