In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd
import psycopg2
from geoalchemy2 import Geometry, WKTElement
from sqlalchemy import *
from shapely.geometry import MultiPolygon
from zipfile import ZipFile
import requests 
import sys

In [2]:
import yaml

with open('../config/postgres.yaml') as f:
    engine_configs = yaml.load(f, Loader=yaml.FullLoader)
    
try:
    engine = create_engine('postgresql://{username}:{password}@{host}:{port}/{dbname}'.format(**engine_configs))
except Exception as e:
    print("Uh oh, can't connect. Invalid dbname, user or password?")
    print(e)

In [3]:
def process_geometry_SQL_insert(gdf):
    gdf['geom'] = gdf['geometry'].apply(lambda x: WKTElement((MultiPolygon([x]) if x.geom_type == 'Polygon' else x).wkt, srid=4326))
    gdf = gdf.drop('geometry', 1)
    return gdf

# Often when reading in a ShapeFile from Basemap, you'll get: "ValueError: readshapefile can only handle 2D shape types"
# A trick can be to convert your geometry in your GeoPandas Dataframe and restoring the new flattened 2D geometry
# series back into a shapefile and try again.

# edit from http://stackoverflow.com/questions/33417764/basemap-readshapefile-valueerror  

from shapely.geometry import Polygon, MultiPolygon, shape, Point
def convert_3D_2D(geometry):
    '''
    Takes a GeoSeries of 3D Multi/Polygons (has_z) and returns a list of 2D Multi/Polygons
    '''
    new_geo = []
    for p in geometry:
        if p.has_z:
            if p.geom_type == 'Polygon':
                lines = [xy[:2] for xy in list(p.exterior.coords)]
                new_p = Polygon(lines)
                new_geo.append(new_p)
            elif p.geom_type == 'MultiPolygon':
                new_multi_p = []
                for ap in p:
                    lines = [xy[:2] for xy in list(ap.exterior.coords)]
                    new_p = Polygon(lines)
                    new_multi_p.append(new_p)
                new_geo.append(MultiPolygon(new_multi_p))
    return new_geo


In [4]:
CITY='chicago'
NEIGHBORHOOD_SIZE = 805 # 805 OR 1609

In [5]:
bounds_gdf = gpd.read_file('../../data/chicago/boundary/chicago_cook.geojson')
bounds_gdf = bounds_gdf[['geometry']]
bounds_gdf['city'] = CITY
bounds_gdf.head()

Unnamed: 0,geometry,city
0,"MULTIPOLYGON (((-87.52978 41.74055, -87.52979 ...",chicago1m


In [6]:
insert_gdf = process_geometry_SQL_insert(bounds_gdf)
insert_gdf.to_sql('boundary', engine, if_exists='append', index=False, dtype={'geom': Geometry('MultiPolygon', srid=4326)})

## Spatial groups and blocks_group

In [7]:
block_groups_gdf = gpd.read_file('zip://../../data/chicago/blocks_group/cb_2014_17_bg_500k_edited.zip')
block_groups_gdf = block_groups_gdf[['GEOID', 'geometry']]
block_groups_gdf = block_groups_gdf.to_crs({'init': 'epsg:4326'}) 
block_groups_gdf.geometry = convert_3D_2D(block_groups_gdf.geometry)
block_groups_gdf.head()

  return _prepare_from_string(" ".join(pjargs))


Unnamed: 0,GEOID,geometry
0,170318281002,"POLYGON ((-87.53519 41.56473, -87.53216 41.564..."
1,170679542001,"POLYGON ((-91.45224 40.35367, -91.44783 40.359..."
2,170310801004,"POLYGON ((-87.62889 41.91122, -87.62627 41.911..."
3,170318261003,"POLYGON ((-87.54941 41.60634, -87.54924 41.608..."
4,170370009003,"POLYGON ((-88.76821 41.93952, -88.76820 41.942..."


In [8]:
block_groups_gdf = gpd.sjoin(block_groups_gdf, bounds_gdf, how="inner", op='intersects').drop('index_right', axis=1)

  "(%s != %s)" % (left_df.crs, right_df.crs)


In [9]:
block_groups_gdf = block_groups_gdf.rename(columns={'GEOID': 'original_id'})
block_groups_gdf['city'] = CITY

In [10]:
insert_gdf = process_geometry_SQL_insert(block_groups_gdf)
insert_gdf.to_sql('temptable_{}'.format(CITY.lower()), engine, if_exists='replace', index=False, dtype={'geom': Geometry('MultiPolygon', srid=4326)})

In [13]:
sql = """
INSERT INTO blocks_group (original_id, city, geom) 
SELECT s.original_id, s.city, ST_Multi(ST_Intersection(s.geom, b.geom))
FROM temptable_{tempname} as s
INNER JOIN boundary b ON ST_Intersects(s.geom, b.geom) AND NOT ST_Touches(s.geom, b.geom) AND s.city=b.city
where s.city='{city}' and ST_Area(ST_Intersection(s.geom, b.geom))/ST_Area(s.geom) > 0.5;
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

### Neighborhoods

In [14]:
sql = """INSERT INTO spatial_groups (city, core_geom, core_id, lower_ids, spatial_name, approx_geom)
SELECT  a.city, a.geom as core_geom, a.bid as core_id, array_agg(b.bid), 'ego', ST_multi(ST_Union(b.geom))
FROM blocks_group a
INNER JOIN blocks_group b ON a.city = b.city AND (a.bid = b.bid OR ST_DWithin(a.geom::geography, ST_Centroid(b.geom)::geography, {distance}) OR st_touches(a.geom, b.geom)) 
where a.city='{city}'
GROUP BY a.bid, a.geom, a.city;
delete from spatial_groups where ST_Area(approx_geom::geography) < 250000 and spatial_name='ego';
""".format(city=CITY, tempname=CITY.lower(), distance=NEIGHBORHOOD_SIZE)

result = engine.execute(text(sql))

## Blocks

In [15]:
block_gdf = gpd.read_file('zip://../../data/chicago/block/tl_2014_17_tabblock10.zip')
block_gdf = block_gdf[['geometry']]
block_gdf = block_gdf.to_crs({'init': 'epsg:4326'}) 
block_gdf.head()

  return _prepare_from_string(" ".join(pjargs))


Unnamed: 0,geometry
0,"POLYGON ((-87.78590 41.80641, -87.78544 41.806..."
1,"POLYGON ((-89.04697 42.39051, -89.04686 42.390..."
2,"POLYGON ((-88.72926 41.93660, -88.72891 41.936..."
3,"POLYGON ((-89.81550 38.86064, -89.81543 38.863..."
4,"POLYGON ((-88.18369 42.26466, -88.18356 42.264..."


In [16]:
insert_gdf = process_geometry_SQL_insert(block_gdf)
insert_gdf.to_sql('temptable_{}'.format(CITY.lower()), engine, if_exists='replace', index=False, dtype={'geom': Geometry('MultiPolygon', srid=4326)})

In [17]:
sql = """
insert into block (sp_id, geom, city, geog, greater_1sm) select bid, geom, city, geom::geography, ST_AREA(geom::geography)>2.59e+6 
from(
    SELECT bid, st_multi(geom) as geom, city, ROW_NUMBER() OVER (PARTITION BY geom ORDER by area DESC) AS r
    from (
        select b.bid, c.geom, b.city, ST_Area(ST_Intersection(b.geom, c.geom)) as area
        from temptable_{tempname} as c
        inner join blocks_group as b on ST_Intersects(b.geom, c.geom)
        where b.city = '{city}'
    ) as dtable
) x
WHERE x.r = 1;
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

In [18]:
sql = """
UPDATE block AS b SET geom=ST_Multi(ST_Intersection(b.geom, s.geom))
FROM boundary AS s
WHERE ST_Intersects(b.geom, s.geom) AND b.city=s.city AND s.city='{city}' AND NOT ST_Contains(s.geom, b.geom);
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

In [19]:
## Some blocks_group do not have blocks
sql = """
DELETE FROM blocks_group bg
WHERE NOT EXISTS(SELECT * FROM block b WHERE b.sp_id = bg.bid) AND bg.city='{city}';
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

## Census

In [20]:
zip_file = ZipFile('../../data/chicago/employment/ACS_14_5YR_B23025.zip')
zip_file.infolist()

[<ZipInfo filename='ACS_14_5YR_B23025_with_ann.csv' compress_type=deflate file_size=586146 compress_size=123334>,
 <ZipInfo filename='ACS_14_5YR_B23025_metadata.csv' compress_type=deflate file_size=830 compress_size=218>,
 <ZipInfo filename='ACS_14_5YR_B23025.txt' compress_type=deflate file_size=3722 compress_size=1361>,
 <ZipInfo filename='aff_download_readme_ann.txt' compress_type=deflate file_size=1062 compress_size=463>]

In [21]:
employment_df = pd.read_csv(zip_file.open('ACS_14_5YR_B23025_with_ann.csv'))
employment_df = employment_df.rename(columns={'GEO.id2': 'original_id', 'HD01_VD02': 'inforce', 'HD01_VD04': 'employed', 'HD01_VD06': 'armed'})
employment_df = employment_df[['original_id', 'inforce', 'employed', 'armed']]
# Skip first header line
employment_df = employment_df[employment_df['original_id'] != 'Id2']

employment_df['inforce'] = employment_df['inforce'].astype(int)
employment_df['employed'] = employment_df['employed'].astype(int)
employment_df['armed'] = employment_df['armed'].astype(int)
employment_df.head()

Unnamed: 0,original_id,inforce,employed,armed
1,170310101001,234,183,0
2,170310101002,678,476,0
3,170310101003,1634,1502,0
4,170310102011,863,824,0
5,170310102012,2307,1977,0


In [22]:
zip_file = ZipFile('../../data/chicago/population/ACS_14_5YR_B01003.zip')
zip_file.infolist()

[<ZipInfo filename='ACS_14_5YR_B01003_with_ann.csv' compress_type=deflate file_size=413967 compress_size=56405>,
 <ZipInfo filename='ACS_14_5YR_B01003_metadata.csv' compress_type=deflate file_size=114 compress_size=91>,
 <ZipInfo filename='ACS_14_5YR_B01003.txt' compress_type=deflate file_size=3672 compress_size=1319>,
 <ZipInfo filename='aff_download_readme_ann.txt' compress_type=deflate file_size=1062 compress_size=463>]

In [23]:
pop_df = pd.read_csv(zip_file.open('ACS_14_5YR_B01003_with_ann.csv'))
pop_df = pop_df.rename(columns={'GEO.id2': 'original_id', 'HD01_VD01': 'population'})
pop_df = pop_df[['original_id', 'population']]
# Skip first header line
pop_df = pop_df[pop_df['original_id'] != 'Id2']

pop_df['population'] = pop_df['population'].astype(int)
pop_df.head()

Unnamed: 0,original_id,population
1,170310101001,421
2,170310101002,1525
3,170310101003,2243
4,170310102011,1716
5,170310102012,4592


In [25]:
zip_file = ZipFile('../../data/chicago/population/ACS_14_5YR_B25001.zip')
zip_file.infolist()

[<ZipInfo filename='ACS_14_5YR_B25001_with_ann.csv' compress_type=deflate file_size=409039 compress_size=53034>,
 <ZipInfo filename='ACS_14_5YR_B25001_metadata.csv' compress_type=deflate file_size=114 compress_size=91>,
 <ZipInfo filename='ACS_14_5YR_B25001.txt' compress_type=deflate file_size=3666 compress_size=1316>,
 <ZipInfo filename='aff_download_readme_ann.txt' compress_type=deflate file_size=1062 compress_size=463>]

In [26]:
dwellings_df = pd.read_csv(zip_file.open('ACS_14_5YR_B25001_with_ann.csv'), dtype={'GEO.id2': str})
dwellings_df = dwellings_df.rename(columns={'GEO.id2': 'original_id', 'HD01_VD01': 'dwellings'})
dwellings_df = dwellings_df[['original_id', 'dwellings']]

dwellings_df['dwellings'] = dwellings_df['dwellings'].astype(int)
dwellings_df.head()

Unnamed: 0,original_id,dwellings
0,170310101001,365
1,170310101002,837
2,170310101003,1412
3,170310102011,727
4,170310102012,1743


In [27]:
print(len(pop_df))
census_df = pd.merge(employment_df, pop_df, on='original_id')
census_df = pd.merge(census_df, dwellings_df, on='original_id')
print(len(census_df))
census_df.head()

3993
3993


Unnamed: 0,original_id,inforce,employed,armed,population,dwellings
0,170310101001,234,183,0,421,365
1,170310101002,678,476,0,1525,837
2,170310101003,1634,1502,0,2243,1412
3,170310102011,863,824,0,1716,727
4,170310102012,2307,1977,0,4592,1743


In [28]:
census_df.to_sql('temptable_{}'.format(CITY.lower()), engine, if_exists='replace', index=False)

In [35]:
sql = """
insert into census (bid, population, employed, inforce, tot_survey, dwellings, city) 
select b.bid, c.population, c.employed+c.armed, c.inforce, c.population, c.dwellings, '{city}' 
from temptable_{tempname} c 
inner join blocks_group b on b.original_id = c.original_id
where b.city = '{city}';
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

### Residential stability

In [36]:
zip_file = ZipFile('../../data/chicago/residential_stability/ACS_14_5YR_B07201.zip')
zip_file.infolist()

[<ZipInfo filename='ACS_14_5YR_B07201.csv' compress_type=deflate file_size=720413 compress_size=132489>,
 <ZipInfo filename='ACS_14_5YR_B07201_metadata.csv' compress_type=deflate file_size=3381 compress_size=400>,
 <ZipInfo filename='ACS_14_5YR_B07201.txt' compress_type=deflate file_size=4052 compress_size=1513>,
 <ZipInfo filename='aff_download_readme.txt' compress_type=deflate file_size=1951 compress_size=802>]

In [37]:
stab_df = pd.read_csv(zip_file.open('ACS_14_5YR_B07201.csv'))
stab_df = stab_df.rename(columns={'GEO.id2': 'original_id', 'HD01_VD01': 'total', 'HD01_VD02': 'stable'})
stab_df = stab_df[['original_id', 'total', 'stable']]
# Skip first header line
stab_df = stab_df[stab_df['original_id'] != 'Id2']

stab_df['total'] = stab_df['total'].astype(int)
stab_df['stable'] = stab_df['stable'].astype(int)
stab_df.head()

Unnamed: 0,original_id,total,stable
1,170310101001,421,355
2,170310101002,1460,1122
3,170310101003,2190,1719
4,170310102011,1690,1021
5,170310102012,4445,3947


In [38]:
zip_file = ZipFile('../../data/chicago/tenure/ACS_14_5YR_B25003.zip')
zip_file.infolist()

[<ZipInfo filename='ACS_14_5YR_B25003.csv' compress_type=deflate file_size=464400 compress_size=80587>,
 <ZipInfo filename='ACS_14_5YR_B25003_metadata.csv' compress_type=deflate file_size=312 compress_size=140>,
 <ZipInfo filename='ACS_14_5YR_B25003.txt' compress_type=deflate file_size=3668 compress_size=1312>,
 <ZipInfo filename='aff_download_readme.txt' compress_type=deflate file_size=1951 compress_size=802>]

In [39]:
tenure_df = pd.read_csv(zip_file.open('ACS_14_5YR_B25003.csv'))
tenure_df = tenure_df.rename(columns={'GEO.id2': 'original_id', 'HD01_VD01': 'total2', 'HD01_VD02': 'owner'})
tenure_df = tenure_df[['original_id', 'total2', 'owner']]
# Skip first header line
tenure_df = tenure_df[tenure_df['original_id'] != 'Id2']

tenure_df['total2'] = tenure_df['total2'].astype(int)
tenure_df['owner'] = tenure_df['owner'].astype(int)
tenure_df.head()

Unnamed: 0,original_id,total2,owner
1,170310101001,304,60
2,170310101002,703,59
3,170310101003,1140,270
4,170310102011,590,195
5,170310102012,1522,412


In [40]:
res_stability_df = pd.merge(stab_df, tenure_df, on='original_id')
res_stability_df.head()

Unnamed: 0,original_id,total,stable,total2,owner
0,170310101001,421,355,304,60
1,170310101002,1460,1122,703,59
2,170310101003,2190,1719,1140,270
3,170310102011,1690,1021,590,195
4,170310102012,4445,3947,1522,412


In [41]:
res_stability_df.to_sql('temptable_{}'.format(CITY.lower()), engine, if_exists='replace', index=False)

In [42]:
sql = """
INSERT INTO residential_stability (bid, city, total, stable, total2, owner) 
SELECT b.bid, '{city}', c.total, c.stable, c.total2, c.owner 
FROM temptable_{tempname} c 
INNER JOIN blocks_group b ON b.original_id = c.original_id
where b.city = '{city}';
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

### Ethnic diversity

In [43]:
zip_file = ZipFile('../../data/chicago/ethnic_diversity/ACS_14_5YR_B02001.zip')
zip_file.infolist()

[<ZipInfo filename='ACS_14_5YR_B02001.csv' compress_type=deflate file_size=627420 compress_size=122766>,
 <ZipInfo filename='ACS_14_5YR_B02001_metadata.csv' compress_type=deflate file_size=1373 compress_size=336>,
 <ZipInfo filename='ACS_14_5YR_B02001.txt' compress_type=deflate file_size=3660 compress_size=1307>,
 <ZipInfo filename='aff_download_readme.txt' compress_type=deflate file_size=1951 compress_size=802>]

In [44]:
eth_df = pd.read_csv(zip_file.open('ACS_14_5YR_B02001.csv'))
eth_df = eth_df.rename(columns={'GEO.id2': 'original_id', 'HD01_VD02': 'white', 'HD01_VD03': 'black', 'HD01_VD04': 'native', 'HD01_VD05': 'asian', 'HD01_VD06': 'native2', 
                               'HD01_VD08': 'o1', 'HD01_VD09': 'o2', 'HD01_VD10': 'o3'})
eth_df = eth_df[['original_id', 'white', 'black', 'asian', 'native', 'native2', 'o1', 'o2', 'o3']]
# Skip first header line
eth_df = eth_df[eth_df['original_id'] != 'Id2']

for x in ['white', 'black', 'asian', 'native', 'native2', 'o1', 'o2', 'o3']:
    eth_df[x] = eth_df[x].astype(int)
    
eth_df['other'] = eth_df['o1'] + eth_df['o2'] + eth_df['o3']
eth_df = eth_df.drop(['o1', 'o2', 'o3'], axis=1)
eth_df.head()

Unnamed: 0,original_id,white,black,asian,native,native2,other
1,170310101001,211,191,0,0,0,38
2,170310101002,354,1085,2,0,0,102
3,170310101003,1508,411,160,0,0,208
4,170310102011,971,633,62,0,0,66
5,170310102012,1848,2562,66,0,0,204


In [45]:
eth_df.to_sql('temptable_{}'.format(CITY.lower()), engine, if_exists='replace', index=False)

In [46]:
sql = """
INSERT INTO ethnic_diversity (bid, city, race1, race2, race3, race4, race5, race6) 
SELECT b.bid, '{city}', c.white, c.black, c.native, c.asian, c.native2, c.other
FROM temptable_{tempname} c 
INNER JOIN blocks_group b ON b.original_id = c.original_id
where b.city = '{city}';
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

### Poverty

In [47]:
zip_file = ZipFile('../../data/chicago/poverty/ACS_14_5YR_C17002.zip')
zip_file.infolist()

[<ZipInfo filename='ACS_14_5YR_C17002_with_ann.csv' compress_type=deflate file_size=597401 compress_size=134682>,
 <ZipInfo filename='ACS_14_5YR_C17002_metadata.csv' compress_type=deflate file_size=759 compress_size=222>,
 <ZipInfo filename='ACS_14_5YR_C17002.txt' compress_type=deflate file_size=3742 compress_size=1373>,
 <ZipInfo filename='aff_download_readme_ann.txt' compress_type=deflate file_size=1062 compress_size=463>]

In [48]:
pov_df = pd.read_csv(zip_file.open('ACS_14_5YR_C17002_with_ann.csv'))
pov_df = pov_df.rename(columns={'GEO.id2': 'original_id', 'HD01_VD01': 'total', 'HD01_VD02': 'p50', 'HD01_VD03': 'p99'})
pov_df = pov_df[['original_id', 'total', 'p50', 'p99']]
# Skip first header line
pov_df = pov_df[pov_df['original_id'] != 'Id2']

for x in ['total', 'p50', 'p99']:
    pov_df[x] = pov_df[x].astype(int)
    
pov_df['poors'] = pov_df['p50'] + pov_df['p99']
pov_df = pov_df.drop(['p50', 'p99'], axis=1)
pov_df.head()

Unnamed: 0,original_id,total,poors
1,170310101001,421,164
2,170310101002,1516,622
3,170310101003,2146,426
4,170310102011,1716,598
5,170310102012,4582,1475


In [49]:
pov_df.to_sql('temptable_{}'.format(CITY.lower()), engine, if_exists='replace', index=False)

In [50]:
sql = """
INSERT INTO poverty_index (bid, city, total, poors) 
SELECT b.bid, '{city}', c.total, c.poors
FROM temptable_{tempname} c 
INNER JOIN blocks_group b ON b.original_id = c.original_id
where b.city = '{city}';
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

## Buildings

In [51]:
bld_gdf = gpd.read_file('zip://../../data/chicago/buildings/Building Footprints (deprecated August 2015).zip')

bld_gdf = bld_gdf[(bld_gdf["bldg_statu"] == 'ACTIVE') & (bld_gdf['non_standa'].isnull())]
bld_gdf = bld_gdf[~((bld_gdf["st_name1"].isnull()) & (bld_gdf["comments"].isnull()))]

bld_gdf = bld_gdf[['geometry', 'year_built']]
bld_gdf = bld_gdf[~(bld_gdf['geometry'].isnull())]

bld_gdf = bld_gdf.to_crs({'init': 'epsg:4326'}) 

bld_gdf.head()

  return _prepare_from_string(" ".join(pjargs))


Unnamed: 0,geometry,year_built
0,"POLYGON ((-87.61997 41.69755, -87.61997 41.697...",1893.0
2,"POLYGON ((-87.79547 41.97488, -87.79547 41.974...",1943.0
3,"POLYGON ((-87.66417 41.68906, -87.66422 41.689...",0.0
4,"POLYGON ((-87.66673 41.70798, -87.66687 41.707...",1899.0
5,"POLYGON ((-87.79347 41.93183, -87.79347 41.931...",1957.0


In [52]:
insert_gdf = process_geometry_SQL_insert(bld_gdf)
insert_gdf.to_sql('temptable_{}'.format(CITY.lower()), engine, if_exists='replace', index=False, dtype={'geom': Geometry('MultiPolygon', srid=4326)})

In [53]:
sql = """
UPDATE temptable_{tempname} p SET geom=ST_Multi(ST_buffer(p.geom, 0.0)) 
WHERE NOT ST_ISValid(p.geom);
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

In [54]:
sql = """
INSERT INTO building (bid, city, geom, area) 
SELECT bid, '{city}', geom, barea
FROM (
    SELECT bid, geom, barea, ROW_NUMBER() OVER (PARTITION BY geom ORDER BY area DESC) AS r
    from (
        SELECT p.geom, ST_Area(p.geom::geography) as barea, d.bid, ST_Area(ST_Intersection(p.geom, d.geom)) as area
        FROM temptable_{tempname} as p
        INNER JOIN blocks_group as d on ST_Intersects(p.geom, d.geom) AND NOT ST_Touches(p.geom, d.geom)
        WHERE d.city = '{city}' AND ST_Area(p.geom::geography) >= 40
        ) as dtable
    order by area
) x
WHERE x.r = 1;
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

## Property age

In [55]:
sql = """
INSERT INTO property_age (bid, age, area, city) 
SELECT bid, age, area_building, '{city}'
FROM (
    SELECT bid, age, area_building, ROW_NUMBER() OVER (PARTITION BY geom ORDER BY area DESC) AS r
    from (
        SELECT p.geom, p."year_built"::int as age, ST_Area(p.geom::geography) as area_building, d.bid, ST_Area(ST_Intersection(p.geom, d.geom)) as area
        FROM temptable_{tempname} as p
        INNER JOIN blocks_group as d on ST_Intersects(p.geom, d.geom) AND NOT ST_Touches(p.geom, d.geom)
        WHERE d.city = '{city}' AND ST_Area(p.geom::geography) >= 40 AND p."year_built"::int > 0
        ) as dtable
    order by area
) x
WHERE x.r = 1;
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

## Land use

In [56]:
parcels_gdf = gpd.read_file('zip://../../data/chicago/land_use/ccgisdata - Parcel 2014.zip')

parcels_gdf = parcels_gdf.rename(columns={'pin14': 'pid'})

#land_gdf = land_gdf[['pid', 'sqftmain', 'usecode', 'usecode2', 'yearbuilt', 'geometry', 'value']]

parcels_gdf = parcels_gdf[~(parcels_gdf['geometry'].isnull())]
#land_gdf = land_gdf.to_crs({'init': 'epsg:4326'}) 

parcels_gdf.head()

Unnamed: 0,pin10,upper_elev,taxcode,pinsa,lower_elev,pid,pinac,pinu,pina,survey_cal,shape_star,job_no,parceltype,shape_stle,pinp,pinb,geometry
0,1601202047,0.0,0.0,1.0,0.0,16012020470000,0.0,0.0,16.0,0.0,2699.316881,0.0,1.0,265.972946,47.0,202.0,"POLYGON ((-87.69380 41.90989, -87.69380 41.909..."
1,1601206046,0.0,0.0,1.0,0.0,16012060460000,0.0,0.0,16.0,0.0,1452.845991,0.0,1.0,168.576188,46.0,206.0,"POLYGON ((-87.68955 41.90981, -87.68955 41.909..."
2,1601203004,0.0,0.0,1.0,0.0,16012030040000,0.0,0.0,16.0,0.0,2705.465621,0.0,1.0,266.094863,4.0,203.0,"POLYGON ((-87.69273 41.90983, -87.69282 41.909..."
3,1601203026,0.0,0.0,1.0,0.0,16012030260000,0.0,0.0,16.0,0.0,3164.710384,0.0,1.0,303.149471,26.0,203.0,"POLYGON ((-87.69221 41.90983, -87.69221 41.909..."
4,1613201006,0.0,0.0,13.0,0.0,16132010060000,0.0,0.0,16.0,0.0,8133.658481,0.0,1.0,376.816399,6.0,201.0,"POLYGON ((-87.69224 41.88067, -87.69232 41.880..."


In [57]:
ins_gdf = process_geometry_SQL_insert(parcels_gdf[['pid', 'geometry']].copy())
ins_gdf.to_sql('temptable_{}'.format(CITY.lower()), engine, if_exists='replace', index=False, dtype={'geom': Geometry('MultiPolygon', srid=4326)})

In [58]:
sql = """
UPDATE temptable_{tempname} p SET geom=ST_Multi(ST_buffer(p.geom, 0.0)) 
WHERE NOT ST_ISVALID(p.geom);
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

In [59]:
land_gdf = gpd.read_file('zip://../../data/chicago/land_use/land_use.zip', dtype={'LANDUSE': str})
land_gdf = land_gdf.to_crs({'init': 'epsg:4326'}) 
land_gdf.head()

  return _prepare_from_string(" ".join(pjargs))


Unnamed: 0,LANDUSE,OS_MGMT,FAC_NAME,PLATTED,MODIFIER,Shape_Leng,Shape_Area,geometry
0,1111,,,,,608.931265,20878.200769,"POLYGON ((-88.19304 41.20262, -88.19342 41.202..."
1,1111,,,,,787.202461,38719.34641,"POLYGON ((-88.18844 41.20272, -88.18879 41.202..."
2,1111,,,,,931.208086,45822.45737,"POLYGON ((-88.15646 41.20339, -88.15646 41.203..."
3,1111,,,,,740.975876,33953.759347,"POLYGON ((-88.11770 41.20415, -88.11845 41.204..."
4,1111,,,,,1396.015199,109212.531687,"POLYGON ((-88.12903 41.20399, -88.13071 41.203..."


In [60]:
land_gdf['landuse'] = 'none'

land_gdf.loc[(land_gdf['LANDUSE'].str[:2].isin({'11'})) | (land_gdf['LANDUSE'].isin({'1216'})), 'landuse'] = 'residential'
land_gdf.loc[(land_gdf['LANDUSE'].str[:2].isin({'12', '13', '14', '15', '20'})) & (~land_gdf['LANDUSE'].isin({'1510', '1511', '1512', '1520', '1550', '1561', '1565'})), 'landuse'] = 'commercial'
 
land_gdf.loc[land_gdf['LANDUSE'].str[:1].isin({'3'}), 'landuse'] = 'recreational'
land_gdf.loc[land_gdf['LANDUSE'].str[:1].isin({'4'}), 'landuse'] = 'vacant'
land_gdf.head()

Unnamed: 0,LANDUSE,OS_MGMT,FAC_NAME,PLATTED,MODIFIER,Shape_Leng,Shape_Area,geometry,landuse
0,1111,,,,,608.931265,20878.200769,"POLYGON ((-88.19304 41.20262, -88.19342 41.202...",residential
1,1111,,,,,787.202461,38719.34641,"POLYGON ((-88.18844 41.20272, -88.18879 41.202...",residential
2,1111,,,,,931.208086,45822.45737,"POLYGON ((-88.15646 41.20339, -88.15646 41.203...",residential
3,1111,,,,,740.975876,33953.759347,"POLYGON ((-88.11770 41.20415, -88.11845 41.204...",residential
4,1111,,,,,1396.015199,109212.531687,"POLYGON ((-88.12903 41.20399, -88.13071 41.203...",residential


In [61]:
ins_gdf = process_geometry_SQL_insert(land_gdf[['landuse', 'LANDUSE', 'geometry']].rename(columns={'LANDUSE': 'use'}))
ins_gdf.to_sql('temptable2_{}'.format(CITY.lower()), engine, if_exists='replace', index=False, dtype={'geom': Geometry('MultiPolygon', srid=4326)})

In [62]:
sql = """
UPDATE temptable2_{tempname} p SET geom=ST_Multi(ST_buffer(p.geom, 0.0)) 
WHERE NOT ST_ISVALID(p.geom);
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

In [64]:
sql = """
DROP TABLE IF EXISTS temptable_parcels_{tempname};
CREATE TABLE temptable_parcels_{tempname} AS 
SELECT pid, landuse, geom
FROM (
    SELECT pid, landuse, geom, ROW_NUMBER() OVER (PARTITION BY pid ORDER BY area DESC) AS r
    from (
        SELECT p.pid, p2.landuse, p.geom, ST_Area(ST_Intersection(p.geom, p2.geom)) as area
        FROM temptable_{tempname} as p
        INNER JOIN temptable2_chicago as p2 ON ST_Intersects(p.geom, p2.geom) AND NOT ST_Touches(p.geom, p2.geom) 
        WHERE ST_Isvalid(p.geom)
        ) as dtable
) x
WHERE x.r = 1;
CREATE INDEX ON temptable_parcels_{tempname} USING GIST (geom);
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

In [65]:
sql = """
INSERT INTO land_uses (bid, city, use_type, area) 
SELECT bid, '{city}', landuse, SUM(area) 
FROM (
    SELECT bid, landuse, area, ROW_NUMBER() OVER (PARTITION BY pid ORDER BY area DESC) AS r
    from (
        SELECT p.pid, p.landuse, d.bid, ST_Area(ST_Intersection(p.geom, d.geom)::geography) as area
        FROM temptable_parcels_{tempname} as p
        INNER JOIN blocks_group as d on ST_Intersects(p.geom, d.geom) AND NOT ST_Touches(p.geom, d.geom)
        WHERE d.city = '{city}' AND p.landuse <> 'none' AND ST_Isvalid(p.geom)
        ) as dtable
) x
WHERE x.r = 1
GROUP BY bid, landuse;
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

## Land values

In [66]:
zip_file = ZipFile('../../data/chicago/land_use/parcels.csv.zip')
zip_file.infolist()

[<ZipInfo filename='parcels.csv' compress_type=deflate filemode='-rw-r--r--' file_size=40857559 compress_size=11296328>]

In [67]:
parcels_df = pd.read_csv(zip_file.open('parcels.csv'), dtype={'pin14': str})
parcels_df = parcels_df[parcels_df.usecode > 0]
parcels_df = parcels_df.rename(columns={'pin14': 'pid'})
parcels_df['pid'] = parcels_df['pid'].astype(str)
parcels_df.head()

Unnamed: 0,pid,market_value,sqftmain,usecode
0,16012020470000,0,0,100
1,16012060460000,0,0,597
2,16012030040000,414650,4725,212
3,16012030260000,353600,2458,211
5,16012040300000,630830,2568,278


In [69]:
parcels_df.to_sql('temptable_market_{}'.format(CITY.lower()), engine, if_exists='replace')

In [70]:
sql = """
INSERT INTO property_value (bid, area, value, city) 
SELECT bid, sqftmain, market_value, '{city}'
FROM (
    SELECT bid, sqftmain, market_value, ROW_NUMBER() OVER (PARTITION BY pid ORDER BY area DESC) AS r
    from (
        SELECT p.pid, m.market_value, m.sqftmain, d.bid, ST_Area(ST_Intersection(p.geom, d.geom)) as area
        FROM temptable_parcels_{tempname} as p
        INNER JOIN temptable_market_{tempname} as m ON m.pid = p.pid
        INNER JOIN blocks_group as d on ST_Intersects(p.geom, d.geom) AND NOT ST_Touches(p.geom, d.geom)
        WHERE d.city = '{city}' AND p.landuse <> 'none'AND p.landuse <> 'vacant' AND ST_Isvalid(p.geom) AND m.market_value > 0
        ) as dtable
) x
WHERE x.r = 1;
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

## Unused areas

In [71]:
unused_gdf = gpd.read_file('zip://../../data/chicago/unused_areas/tl_2014_17_arealm.zip')
unused_gdf = unused_gdf[['geometry', 'MTFCC']]
unused_gdf = unused_gdf.to_crs({'init': 'epsg:4326'}) 

unused_gdf = unused_gdf[unused_gdf['MTFCC'].isin({'K2180', 'K2181', 'K2182', 'K2183', 'K2184', 'K2185',
                                                 'K2186', 'K2187', 'K2188', 'K2189', 'K2190'})].drop('MTFCC', axis=1)

unused_gdf['type'] = 'park'
unused_gdf['city'] = CITY
unused_gdf.head()

  return _prepare_from_string(" ".join(pjargs))


Unnamed: 0,geometry,type,city
9,"POLYGON ((-88.65277 40.89391, -88.65154 40.893...",park,chicago1m
12,"POLYGON ((-87.84328 41.83419, -87.84327 41.834...",park,chicago1m
14,"POLYGON ((-88.09474 41.04273, -88.09474 41.043...",park,chicago1m
23,"POLYGON ((-90.65858 39.88426, -90.65770 39.884...",park,chicago1m
24,"POLYGON ((-90.90720 39.87320, -90.90713 39.873...",park,chicago1m


In [72]:
ins_gdf = process_geometry_SQL_insert(unused_gdf)
ins_gdf.to_sql('unused_areas', engine, if_exists='append', index=False, dtype={'geom': Geometry('MultiPolygon', srid=4326)})

In [73]:
unused_gdf = gpd.read_file('zip://../../data/chicago/unused_areas/tl_2014_17031_areawater.zip')
unused_gdf = unused_gdf[['geometry']]
unused_gdf = unused_gdf.to_crs({'init': 'epsg:4326'}) 
unused_gdf['type'] = 'water'
unused_gdf['city'] = CITY
unused_gdf.head()

  return _prepare_from_string(" ".join(pjargs))


Unnamed: 0,geometry,type,city
0,"POLYGON ((-87.75900 42.15237, -87.75477 42.152...",water,chicago1m
1,"POLYGON ((-87.67317 42.04794, -87.67221 42.048...",water,chicago1m
2,"POLYGON ((-88.12929 42.14292, -88.12927 42.142...",water,chicago1m
3,"POLYGON ((-88.15167 42.13962, -88.15165 42.139...",water,chicago1m
4,"POLYGON ((-88.17768 42.14862, -88.17764 42.148...",water,chicago1m


In [74]:
ins_gdf = process_geometry_SQL_insert(unused_gdf)
ins_gdf.to_sql('unused_areas', engine, if_exists='append', index=False, dtype={'geom': Geometry('MultiPolygon', srid=4326)})

In [75]:
sql = """
update unused_areas set geom=st_multi(st_buffer(geom, 0.0)) WHERE city = '{city}';
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

In [76]:
unused_gdf = gpd.read_file('../../data/chicago/unused_areas/parkandrivers.geojson')
unused_gdf = unused_gdf[['geometry']]
unused_gdf = unused_gdf.to_crs({'init': 'epsg:4326'}) 
unused_gdf['type'] = 'parksrivers'
unused_gdf['city'] = CITY
unused_gdf.head()

  return _prepare_from_string(" ".join(pjargs))


Unnamed: 0,geometry,type,city
0,"MULTIPOLYGON (((-87.58285 41.78850, -87.58271 ...",parksrivers,chicago1m
1,"POLYGON ((-87.58801 41.68935, -87.58803 41.689...",parksrivers,chicago1m
2,"POLYGON ((-87.57597 41.69006, -87.57606 41.690...",parksrivers,chicago1m
3,"POLYGON ((-87.70994 41.76602, -87.70983 41.766...",parksrivers,chicago1m
4,"POLYGON ((-87.99389 41.67825, -87.99374 41.678...",parksrivers,chicago1m


In [77]:
ins_gdf = process_geometry_SQL_insert(unused_gdf)
ins_gdf.to_sql('temptable_{tempname}'.format(tempname=CITY.lower()), engine, if_exists='replace', index=False, dtype={'geom': Geometry('MultiPolygon', srid=4326)})

In [78]:
sql = """
update temptable_{tempname} set geom=st_multi(st_buffer(geom, 0.0)) WHERE not st_isvalid(geom);
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

In [79]:
sql = """
DROP TABLE IF EXISTS temptable_unusedhelper_{tempname};
CREATE TEMPORARY TABLE temptable_unusedhelper_{tempname} AS
SELECT ST_Union(geom) as geom FROM unused_areas u 
WHERE city='{city}';

DROP TABLE IF EXISTS temptable_unusedhelper_exp_{tempname};
CREATE TEMPORARY TABLE temptable_unusedhelper_exp_{tempname} AS
SELECT (ST_Dump(geom)).geom FROM temptable_unusedhelper_{tempname} u;
CREATE INDEX ON temptable_unusedhelper_exp_{tempname} USING GIST (geom);
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

In [80]:
sql = """
update temptable_{tempname} t set geom=ST_Multi(st_buffer(ST_Difference(t.geom, h.geom), 0.0))
FROM temptable_unusedhelper_{tempname} h
WHERE st_intersects(t.geom, h.geom) AND (NOT ST_Touches(t.geom, h.geom)) AND ST_GeometryType(ST_Multi(ST_Difference(t.geom, h.geom))) <> 'ST_GeometryCollection';
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

In [81]:
sql = """
DELETE FROM temptable_{tempname} t 
USING temptable_unusedhelper_exp_{tempname} h
WHERE ST_Within(t.geom, h.geom) OR (st_intersects(t.geom, h.geom) AND (NOT ST_Touches(t.geom, h.geom)) AND ST_GeometryType(ST_Multi(ST_Difference(t.geom, h.geom))) = 'ST_GeometryCollection');
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

In [82]:
sql = """
update temptable_{tempname} set geom=st_multi(st_buffer(geom, 0.0));
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

In [83]:
sql = """
INSERT INTO unused_areas (geom, type, city) 
SELECT p.geom, p.type, p.city
FROM temptable_{tempname} as p
WHERE ST_Isvalid(p.geom) AND NOT EXISTS(SELECT * FROM unused_areas u WHERE ST_Intersects(u.geom, p.geom) AND u.city=p.city)
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

## Net area

In [84]:
land_gdf_unique = land_gdf.copy()

land_gdf_unique.loc[:, 'x'] = land_gdf_unique.geometry.centroid.x
land_gdf_unique.loc[:, 'y'] = land_gdf_unique.geometry.centroid.y
land_gdf_unique = land_gdf_unique.drop_duplicates(subset=['x', 'y'])[['geometry', 'landuse']]

In [85]:
ins_gdf = process_geometry_SQL_insert(land_gdf_unique)
ins_gdf.to_sql('temptable_unique_{}'.format(CITY.lower()), engine, if_exists='replace', index=False, dtype={'geom': Geometry('MultiPolygon', srid=4326)})

In [86]:
sql = """
UPDATE temptable_unique_{tempname} p SET geom=ST_Multi(ST_buffer(p.geom, 0.0)) 
WHERE NOT ST_Isvalid(p.geom);
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

In [87]:
## This deletes the blocks that are related to streets
sql = """
DELETE FROM block b
WHERE city='{city}' and NOT EXISTS (select * from temptable_unique_{tempname} t where st_intersects(t.geom, b.geom) and t.landuse <> 'none');
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

In [88]:
sql = """
DELETE 
FROM temptable_unique_{tempname} t
USING unused_areas u 
WHERE u.city = '{city}' AND ST_Intersects(u.geom, t.geom) AND (NOT ST_Touches(u.geom, t.geom)) 
AND (ST_Contains(u.geom, t.geom) OR ST_AREA(ST_Intersection(t.geom, u.geom))/ST_Area(t.geom) > 0.5);
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

In [89]:
sql = """
INSERT INTO spatial_groups_net_area (sp_id, city, spatial_name, used_area) 
SELECT sp_id, city, spatial_name, SUM(ST_Area(ST_Intersection(s.approx_geom, t.geom)::geography))/1000000.
FROM temptable_unique_{tempname} t
INNER JOIN spatial_groups s ON ST_Intersects(s.approx_geom, t.geom) AND NOT ST_Touches(s.approx_geom, t.geom)
WHERE s.city = '{city}' 
GROUP BY sp_id, city, spatial_name;
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

## Roads

In [5]:
sql = """
create index on planet_osm_line (highway);
INSERT INTO roads (geom, motorway, city) 
SELECT ST_MULTI(ST_LineSubstring(geom, 0.002*n/length,
  CASE
	WHEN 0.002*(n+1) < length THEN 0.002*(n+1)/length
	ELSE 1
  END)) As geom, (t.highway='motorway' OR t.highway='trunk'), city 
FROM
  (SELECT b.city, ST_LineMerge(p.way) AS geom, p.highway, 
  ST_Length(p.way) As length
  FROM planet_osm_line p
  INNER JOIN boundary b ON ST_Intersects(p.way, b.geom) AND NOT ST_Touches(p.way, b.geom)
  WHERE b.city = '{city}' AND p.highway <> 'service' AND p.highway <> 'path' AND p.highway IS NOT NULL
  ) AS t
CROSS JOIN generate_series(0, 50) AS n
WHERE n*0.002/length < 1;
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

## Vacuums

In [79]:
sql = """
REFRESH MATERIALIZED VIEW block_centroids;
"""

result = engine.execute(text(sql))

In [80]:
sql = """
REFRESH MATERIALIZED VIEW pois_requests;
"""

result = engine.execute(text(sql))

In [5]:
from collections import defaultdict
from joblib import Parallel, delayed

In [6]:
def make_trip(lon1, lat1, dest):
    # PERSONALIZE HERE
    r = requests.get(
        'http://localhost:5000/table/v1/foot/{lon1},{lat1};{dest}?annotations=distance&sources=0'.format(lon1=lon1, lat1=lat1, dest=dest))
    distances = r.json()['distances']
    distance = np.array(distances[0][1:], np.float32)
    distance[distance < 0] = 0
    return distance

def walkscore_list(bid, clon, clat, list_dests, ws, straight_distances):
    dists = make_trip(clon, clat, list_dests)
    straight_distances = np.array(straight_distances)
    #good_idxs = np.argwhere(dists < 1600)
    ws = np.array(ws)+ 0.00000001 #[good_idxs] 
    zeros_idxs = np.argwhere(dists == 0)
    dists[zeros_idxs] = 1
    if np.sum(ws) == 0 or len(ws) == 0:
        return bid, -1
    return bid, np.average(straight_distances/dists, weights=ws) #[good_idxs] [good_idxs]

cat_weights = {
    'grocery': [3],
    'Food': [.75,.45,.25,.25,.225,.225,.225,.225,.2,.2],
    'Shops': [.5,.45,.4,.35,.3],
    'Schools': [1],
    'Entertainment': [1],
    'Parks and outside': [1],
    'Coffee': [1.25,.75],
    'Banks': [1],
    'Books': [1]
}


def walkscore(meters):
	max_walk = 1500
	score = np.exp(-5 * (meters / max_walk) ** 2.5)
	score = np.clip(score, 0, 1)
	return score

def walkscore2_list(bid, clon, clat, list_dests, c):
    dists = make_trip(clon, clat, list_dests)
    #good_idxs = np.argwhere(dists < 1600)
    scores = np.sort(walkscore(dists))[::-1]
    n = len(cat_weights[c])
    d = np.zeros(n)
    d[:scores.shape[0]] = scores[:n]
    w = np.sum(np.array(d)*np.array(cat_weights[c]))
    assert w <= np.sum(cat_weights[c]) and w >= 0
    
    return bid, w #[good_idxs] [good_idxs]

In [7]:
sql = """
SELECT bid, lon, lat, dests, parent_cat FROM pois_requests p WHERE p.city = '{city}' 
""".format(city=CITY, tempname=CITY.lower())

blocks_df = pd.read_sql_query(sql, con=engine)
blocks_df.head()

Unnamed: 0,bid,lon,lat,dests,parent_cat
0,506084,-87.695262,42.009909,"-87.694755,42.011589;-87.693826,42.007856;-87....",Coffee
1,506084,-87.696459,42.009658,"-87.694755,42.011589;-87.693826,42.007856;-87....",Coffee
2,506084,-87.698561,42.009614,"-87.694755,42.011589;-87.693826,42.007856;-87....",Coffee
3,506084,-87.696457,42.01011,"-87.694755,42.011589;-87.693826,42.007856;-87....",Coffee
4,506084,-87.696165,42.010567,"-87.694755,42.011589;-87.693826,42.007856;-87....",Coffee


In [8]:
sql = """
SELECT bid, COUNT(*) as size
FROM block_centroids b WHERE b.city = '{city}' 
GROUP BY bid
ORDER BY bid
""".format(city=CITY, tempname=CITY.lower())

n_blocks_df = pd.read_sql_query(sql, con=engine).set_index('bid')
n_blocks_df.head()

Unnamed: 0_level_0,size
bid,Unnamed: 1_level_1
506084,13
506085,12
506086,6
506087,11
506088,29


In [9]:
block_groups = defaultdict(list)
for index, row in blocks_df.iterrows():
    block_groups[row['bid']].append(row.values[1:])

In [10]:
from tqdm import tqdm

results = [(idx, score) for idx, score in Parallel(n_jobs=10)(delayed(walkscore2_list)(bid, req[0], req[1], req[2], req[3]) for bid, reqs in tqdm(block_groups.items()) for req in reqs)]

100%|██████████| 2175/2175 [01:59<00:00, 18.16it/s]


In [11]:
block_vacuum_index = defaultdict(list)
bid2size = {k: v['size'] for k, v in n_blocks_df.iterrows()}

for bid, score in results:
    block_vacuum_index[bid].append(score)
    
sum_cat_weights = np.sum([y for x in cat_weights.values() for y in x])

for bid, score in block_vacuum_index.items():
    if len(score) > 0:
        score = (np.sum(score)/bid2size[bid])/sum_cat_weights
        assert score <= 1.01
        sql = "INSERT INTO walk_index (bid, score, city) VALUES ({}, {}, '{}')".format(bid, score, CITY)
        result = engine.execute(text(sql))

## Crime

In [12]:
df = pd.read_csv('../../data/chicago/crime/Crimes_-_2014.csv')
df.head()

Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,...,Ward,Community Area,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location
0,9470797,HX123824,01/22/2014 04:02:00 PM,031XX S ASHLAND AVE,460,BATTERY,SIMPLE,RESTAURANT,True,False,...,11.0,59,08B,1166178.0,1883833.0,2014,05/24/2017 03:50:24 PM,41.836816,-87.665724,"(41.836816162, -87.665724279)"
1,10954493,JA274635,01/01/2014 12:01:00 AM,007XX W 107TH ST,1753,OFFENSE INVOLVING CHILDREN,SEX ASSLT OF CHILD BY FAM MBR,RESIDENCE,False,True,...,34.0,49,02,,,2014,05/23/2017 03:50:10 PM,,,
2,10953925,JA273316,12/19/2014 08:00:00 AM,085XX S MARQUETTE AVE,820,THEFT,$500 AND UNDER,STREET,False,False,...,7.0,46,06,,,2014,05/23/2017 03:50:10 PM,,,
3,10953348,JA273652,09/01/2014 12:00:00 PM,014XX N LOREL AVE,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,RESIDENCE,False,False,...,37.0,25,11,,,2014,05/22/2017 03:58:12 PM,,,
4,10953070,JA263372,09/01/2014 01:00:00 PM,031XX N ODELL AVE,1150,DECEPTIVE PRACTICE,CREDIT CARD FRAUD,RESIDENCE,False,False,...,36.0,17,11,,,2014,05/22/2017 03:58:12 PM,,,


In [13]:
df = df[['Date', 'IUCR', 'Description', 'Latitude', 'Location', 'Community Area']]
df.head()

Unnamed: 0,Date,IUCR,Description,Latitude,Location,Community Area
0,01/22/2014 04:02:00 PM,460,SIMPLE,41.836816,"(41.836816162, -87.665724279)",59
1,01/01/2014 12:01:00 AM,1753,SEX ASSLT OF CHILD BY FAM MBR,,,49
2,12/19/2014 08:00:00 AM,820,$500 AND UNDER,,,46
3,09/01/2014 12:00:00 PM,1153,FINANCIAL IDENTITY THEFT OVER $ 300,,,25
4,09/01/2014 01:00:00 PM,1150,CREDIT CARD FRAUD,,,17


In [14]:
print(df.count())
df = df.dropna()
print(df.count())
df.head()

Date              274666
IUCR              274666
Description       274666
Latitude          269459
Location          269459
Community Area    274666
dtype: int64
Date              269459
IUCR              269459
Description       269459
Latitude          269459
Location          269459
Community Area    269459
dtype: int64


Unnamed: 0,Date,IUCR,Description,Latitude,Location,Community Area
0,01/22/2014 04:02:00 PM,460,SIMPLE,41.836816,"(41.836816162, -87.665724279)",59
5,08/08/2014 10:50:00 AM,4651,SEX OFFENDER: FAIL REG NEW ADD,41.876949,"(41.876949072, -87.736685059)",26
6,02/26/2014 07:40:00 AM,610,FORCIBLE ENTRY,41.94882,"(41.948820056, -87.718899701)",16
12,03/20/2014 07:25:00 PM,460,SIMPLE,41.692359,"(41.692358646, -87.623096746)",49
15,09/13/2014 02:00:00 PM,1725,CONTRIBUTE CRIM DELINQUENCY JUVENILE,41.767282,"(41.767282129, -87.572663824)",43


In [15]:
df['datetime'] = pd.to_datetime(df['Date'], format='%m/%d/%Y  %I:%M:%S %p')
df.head()

Unnamed: 0,Date,IUCR,Description,Latitude,Location,Community Area,datetime
0,01/22/2014 04:02:00 PM,460,SIMPLE,41.836816,"(41.836816162, -87.665724279)",59,2014-01-22 16:02:00
5,08/08/2014 10:50:00 AM,4651,SEX OFFENDER: FAIL REG NEW ADD,41.876949,"(41.876949072, -87.736685059)",26,2014-08-08 10:50:00
6,02/26/2014 07:40:00 AM,610,FORCIBLE ENTRY,41.94882,"(41.948820056, -87.718899701)",16,2014-02-26 07:40:00
12,03/20/2014 07:25:00 PM,460,SIMPLE,41.692359,"(41.692358646, -87.623096746)",49,2014-03-20 19:25:00
15,09/13/2014 02:00:00 PM,1725,CONTRIBUTE CRIM DELINQUENCY JUVENILE,41.767282,"(41.767282129, -87.572663824)",43,2014-09-13 14:00:00


In [16]:
df['Location'] = df['Location'].str.replace('(', '')
df['Location'] = df['Location'].str.replace(')', '')

In [17]:
df['lng'] = df['Location'].str.split(', ').str[1]
df['lat'] = df['Location'].str.split(', ').str[0]
df['num'] = 1
df.head()

Unnamed: 0,Date,IUCR,Description,Latitude,Location,Community Area,datetime,lng,lat,num
0,01/22/2014 04:02:00 PM,460,SIMPLE,41.836816,"41.836816162, -87.665724279",59,2014-01-22 16:02:00,-87.665724279,41.836816162,1
5,08/08/2014 10:50:00 AM,4651,SEX OFFENDER: FAIL REG NEW ADD,41.876949,"41.876949072, -87.736685059",26,2014-08-08 10:50:00,-87.736685059,41.876949072,1
6,02/26/2014 07:40:00 AM,610,FORCIBLE ENTRY,41.94882,"41.948820056, -87.718899701",16,2014-02-26 07:40:00,-87.718899701,41.948820056,1
12,03/20/2014 07:25:00 PM,460,SIMPLE,41.692359,"41.692358646, -87.623096746",49,2014-03-20 19:25:00,-87.623096746,41.692358646,1
15,09/13/2014 02:00:00 PM,1725,CONTRIBUTE CRIM DELINQUENCY JUVENILE,41.767282,"41.767282129, -87.572663824",43,2014-09-13 14:00:00,-87.572663824,41.767282129,1


### Crime types

In [18]:
crime_types_df = pd.read_csv('../../data/crime_types/Chicago_Police_Department_-_Illinois_Uniform_Crime_Reporting__IUCR__Codes.csv', dtype='str')
crime_types_df['IUCR'] = crime_types_df['IUCR'].str.zfill(4)
crime_types_df.head()

Unnamed: 0,IUCR,PRIMARY DESCRIPTION,SECONDARY DESCRIPTION,INDEX CODE,UCR1
0,110,HOMICIDE,FIRST DEGREE MURDER,I,Criminal homicide
1,130,HOMICIDE,SECOND DEGREE MURDER,I,Criminal homicide
2,141,HOMICIDE,INVOLUNTARY MANSLAUGHTER,N,
3,142,HOMICIDE,RECKLESS HOMICIDE,N,
4,261,CRIM SEXUAL ASSAULT,AGGRAVATED: HANDGUN,I,Rape


In [19]:
crime_types_df.loc[(crime_types_df['INDEX CODE'] == 'I') & (crime_types_df['SECONDARY DESCRIPTION'].str.contains('RECOVERY')), 'INDEX CODE'] = 'N'
crime_types_df[crime_types_df['INDEX CODE'] == 'I']

Unnamed: 0,IUCR,PRIMARY DESCRIPTION,SECONDARY DESCRIPTION,INDEX CODE,UCR1
0,0110,HOMICIDE,FIRST DEGREE MURDER,I,Criminal homicide
1,0130,HOMICIDE,SECOND DEGREE MURDER,I,Criminal homicide
4,0261,CRIM SEXUAL ASSAULT,AGGRAVATED: HANDGUN,I,Rape
5,0262,CRIM SEXUAL ASSAULT,AGGRAVATED: OTHER FIREARM,I,Rape
6,0263,CRIM SEXUAL ASSAULT,AGGRAVATED: KNIFE/CUT INSTR,I,Rape
...,...,...,...,...,...
112,1020,ARSON,BY FIRE,I,Arson
113,1025,ARSON,AGGRAVATED,I,Arson
118,1090,ARSON,ATTEMPT ARSON,I,Arson
256,1753,OFFENSE INVOLVING CHILDREN,SEX ASSLT OF CHILD BY FAM MBR,I,Rape


In [20]:
crime_types_df.loc[(crime_types_df['UCR1'] == 'Larceny-theft (except motor vehicle theft)'), 'INDEX CODE'] = 'N'

In [21]:
print(df['num'].count())
df = pd.merge(df, crime_types_df, on='IUCR')
print(df['num'].count())

269459
268827


In [22]:
df = df[df['INDEX CODE'] == 'I']
print(df['num'].count())

45936


#### Subtypes of crimes

In [23]:
ucr_crimes_df = pd.read_csv('../../data/crime_types/UCR_crimes.csv')
ucr_crimes_df.head()

Unnamed: 0,Name,Category
0,Criminal homicide,Violent crime
1,Rape,Violent crime
2,Robbery,Violent crime
3,Aggravated assault,Violent crime
4,Burglary (breaking or entering),Property crime


In [24]:
df_ucr1 = pd.merge(df, ucr_crimes_df.rename(columns={'Name': 'UCR1'}), on='UCR1')

a = set(df_ucr1['UCR1'].drop_duplicates().values)
b = set(df['UCR1'].drop_duplicates().values)
assert(a.intersection(b) == a)

# Categories not present in crime dataset
df_ucr1[~(df_ucr1['UCR1'].isin(b))]

Unnamed: 0,Date,IUCR,Description,Latitude,Location,Community Area,datetime,lng,lat,num,PRIMARY DESCRIPTION,SECONDARY DESCRIPTION,INDEX CODE,UCR1,Category


In [25]:
df_ucr1['lng'] = df_ucr1['lng'].astype('float32')
df_ucr1['lat'] = df_ucr1['lat'].astype('float32')

In [26]:
df_2014 = df_ucr1[df_ucr1['datetime'].dt.year == 2014][['lng', 'lat', 'Description', 'num', 'UCR1', 'Category']]
df_2014 = df_2014.rename(columns={'Description': 'description'})
df_2014.count()

lng            45936
lat            45936
description    45936
num            45936
UCR1           45936
Category       45936
dtype: int64

In [27]:
from geopandas import GeoDataFrame
from shapely.geometry import Point

geometry = [Point(xy) for xy in zip(df_2014.lng, df_2014.lat)]
df_2014 = df_2014.drop(['lng', 'lng'], axis=1)
crs = {'init': 'epsg:4326'}
gdf = GeoDataFrame(df_2014, crs=crs, geometry=geometry)
gdf.head()

  return _prepare_from_string(" ".join(pjargs))


Unnamed: 0,lat,description,num,UCR1,Category,geometry
0,41.948818,FORCIBLE ENTRY,1,Burglary (breaking or entering),Property crime,POINT (-87.71890 41.94882)
1,41.696712,FORCIBLE ENTRY,1,Burglary (breaking or entering),Property crime,POINT (-87.62801 41.69671)
2,41.966583,FORCIBLE ENTRY,1,Burglary (breaking or entering),Property crime,POINT (-87.78344 41.96658)
3,41.913776,FORCIBLE ENTRY,1,Burglary (breaking or entering),Property crime,POINT (-87.78235 41.91378)
4,41.764996,FORCIBLE ENTRY,1,Burglary (breaking or entering),Property crime,POINT (-87.65816 41.76500)


In [28]:
insert_gdf = process_geometry_SQL_insert(gdf)
insert_gdf.to_sql('temptable_{}'.format(CITY.lower()), engine, if_exists='replace', index=True, dtype={'geom': Geometry('Point', srid=4326)})

In [29]:
sql = """
insert into crime (sp_id, num, city, ucr1, ucr_category) 
select bid, SUM(num), '{city}', "UCR1", "Category" from(
SELECT num, bid, "UCR1", "Category", ROW_NUMBER() OVER (PARTITION BY index) AS r
from (
select c.index, c.num, b.bid, "UCR1", "Category"
from temptable_{tempname} as c
inner join blocks_group as b on ST_Intersects(b.geom, st_buffer(c.geom::geography, 30)::geometry)
where b.city='{city}'
    ) as dtable
) x
group by bid, "UCR1", "Category";
""".format(city=CITY, tempname=CITY.lower())

result = engine.execute(text(sql))

## Refresh materialized views

In [14]:
sql = """
REFRESH MATERIALIZED VIEW join_building_ways;
"""

result = engine.execute(text(sql))

In [50]:
sql = """
REFRESH MATERIALIZED VIEW spatial_groups_unused_areas;
"""

result = engine.execute(text(sql))

In [15]:
sql = """
REFRESH MATERIALIZED VIEW block_building;
"""

result = engine.execute(text(sql))

In [16]:
sql = """
REFRESH MATERIALIZED VIEW blocks_group_with_building;
"""

result = engine.execute(text(sql))

In [None]:
2