In [1]:
import pandas as pd
import os
from shapely.geometry import Point, Polygon
from shapely import geometry
import ast

#### Reading In Relevant Files

In [2]:
directory = '../../data/processed'
void = ['clean_areas.csv', 'clean_disadvantaged_areas.csv', 'clean_police_districts.csv', 'clean_public_healthindicator.csv', 'clean_train_ridership.csv', 'clean_bike_trips.csv']

# read in the code files
def read_in(directory, void):
    data = {}
    for filename in os.listdir(directory):
        if filename not in void and filename.endswith('.csv'):
            file_path = os.path.join(directory, filename)
            data[filename[:-4]] = pd.read_csv(file_path)
            print(f'{filename[:-4]} successfully read in')
    return data

In [3]:
data = read_in(directory, void)

agg_public_healthindicator successfully read in
clean_alleylights successfully read in
clean_bike_stations successfully read in
clean_bus_stops successfully read in
clean_crime successfully read in
clean_police_stations successfully read in
clean_streetlights_allout successfully read in
clean_streetlights_oneout successfully read in
clean_train_stations successfully read in
clean_vacant_buildings successfully read in
disadvantaged_areas_within_areas successfully read in


In [4]:
clean_police_districts = pd.read_csv('../../data/processed/clean_police_districts.csv')
clean_areas = pd.read_csv('../../data/processed/clean_areas.csv')
clean_disadvantaged_areas = pd.read_csv('../../data/processed/clean_disadvantaged_areas.csv')

#### Data Cleaning of Geom Types

In [5]:
def parse_polygon1(polygon_string):
    points = polygon_string.strip('POLYGON ((').strip('))').split(', ')
    points = [tuple(map(float, point.split())) for point in points]
    return Polygon(points)

In [6]:
def parse_polygon2(polygon_string):
    points = ast.literal_eval(polygon_string)
    return Polygon(points)

In [7]:
def swap_coordinates(polygon):
    if polygon.is_empty:
        return polygon
    swapped_coords = [(y, x) for x, y in polygon.exterior.coords]
    return Polygon(swapped_coords)

In [8]:
clean_police_districts['geom'] = clean_police_districts['geom'].apply(parse_polygon1)
clean_police_districts.head()

Unnamed: 0,district,geom,disadvantaged_score,disadvantaged_score.1,centroid,status,areas
0,17,POLYGON ((41.997365655369435 -87.7106708939135...,0.0,0.0,0.0,0.0,0.0
1,20,POLYGON ((41.990916338539776 -87.6602942357235...,4.0,4.0,4.0,4.0,4.0
2,31,POLYGON ((41.98384222028714 -87.82817787271652...,0.0,0.0,0.0,0.0,0.0
3,31,POLYGON ((41.97535481526603 -87.83365455160555...,0.0,0.0,0.0,0.0,0.0
4,19,POLYGON ((41.969727093814825 -87.6449179856867...,2.0,2.0,2.0,2.0,2.0


In [10]:
clean_areas['poly'] = clean_areas['poly'].apply(parse_polygon2)
clean_areas.head()

Unnamed: 0,id,poly,district
0,35,POLYGON ((41.84469250265398 -87.60914087617894...,2
1,36,POLYGON ((41.81692934626684 -87.59215283879394...,2
2,37,POLYGON ((41.80189303368919 -87.62879823733725...,9
3,38,"POLYGON ((41.81681377057218 -87.6067081256125,...",2
4,39,POLYGON ((41.81692934626684 -87.59215283879394...,2


In [11]:
clean_disadvantaged_areas['poly'] = clean_disadvantaged_areas['poly'].apply(parse_polygon1)
clean_disadvantaged_areas['poly'] = clean_disadvantaged_areas['poly'].apply(swap_coordinates)
clean_disadvantaged_areas.head()

ValueError: could not convert string to float: '[(-87.57366299996758'

In [11]:
clean_areas['centroid'] = clean_areas['poly'].apply(lambda poly : poly.centroid)
clean_areas

Unnamed: 0,id,poly,centroid
0,35,POLYGON ((41.84469250265398 -87.60914087617894...,POINT (41.835118342623176 -87.61867772050564)
1,36,POLYGON ((41.81692934626684 -87.59215283879394...,POINT (41.823750345730396 -87.60321641296161)
2,37,POLYGON ((41.80189303368919 -87.62879823733725...,POINT (41.8090854843581 -87.63242456957649)
3,38,"POLYGON ((41.81681377057218 -87.6067081256125,...",POINT (41.81294935894828 -87.6178596907689)
4,39,POLYGON ((41.81692934626684 -87.59215283879394...,POINT (41.808916370166855 -87.59618357877342)
...,...,...,...
72,74,POLYGON ((41.70714491233857 -87.69645961375822...,POINT (41.69487944254784 -87.7131918580596)
73,75,POLYGON ((41.685082119670845 -87.6421520465139...,POINT (41.689729585922855 -87.66905398718522)
74,76,POLYGON ((41.986396111591276 -87.8365808787436...,POINT (41.977778053264366 -87.89466582726284)
75,77,POLYGON ((41.99816614970252 -87.65455590025104...,POINT (41.98671222550283 -87.6634166067842)


In [12]:
clean_disadvantaged_areas['centroid'] = clean_disadvantaged_areas['poly'].apply(lambda poly : poly.centroid)
clean_disadvantaged_areas

Unnamed: 0,poly,centroid
0,POLYGON ((41.707966000031426 -87.5736629999675...,POINT (41.683509683928065 -87.57231914155308)
1,POLYGON ((41.68632499999862 -87.60151899995586...,POINT (41.68931505835441 -87.60536971084375)
2,POLYGON ((41.69284299996604 -87.60036199996543...,POINT (41.70085594037496 -87.60030206564569)
3,POLYGON ((41.69942700001048 -87.62032800000847...,POINT (41.696058183380245 -87.6162676498575)
4,POLYGON ((41.75771799998948 -87.66036699995497...,POINT (41.754048514350146 -87.66110569971308)
...,...,...
275,POLYGON ((41.998169000012545 -87.6624109999806...,POINT (42.00032005328326 -87.66312691441664)
276,POLYGON ((42.00497899998903 -87.66572099996036...,POINT (42.00451710898514 -87.66332746402558)
277,POLYGON ((41.998316999982876 -87.6550000000143...,POINT (41.99727725336176 -87.65564251215129)
278,POLYGON ((41.99662699998114 -87.65029099995624...,POINT (41.99538042311063 -87.65469543980272)


#### Assigning Districts to Each Dataset

In [13]:
def determine_within(df):

    statuses = []
    districts = []
    cent = True if 'centroid' in df.columns else False

    for i in range(len(df)):
        point = df.centroid.loc[i] if cent else geometry.Point(df.long.loc[i], df.lat.loc[i])
        status = 0

        for index, row in clean_police_districts.iterrows():
            district = row['district']
            geom = row['geom']
            if (geom.contains(point)): 
                status = 1
                break
        statuses.append(status)
        districts.append(district)

    df['status'] = statuses
    df['district'] = districts
    df = df[df['status'] == 1].drop('status', axis=1)

    if cent: df.drop('centroid', axis=1, inplace=True)

    return df

In [14]:
def determine_districts(data):
    for df_name, df in data.items():
        data[df_name] = determine_within(df)
        print(f'{df_name} successfully completed')
    return data

In [15]:
clean_disadvantaged_areas = determine_within(clean_disadvantaged_areas)
clean_areas = determine_within(clean_areas)

In [16]:
clean_areas

Unnamed: 0,id,poly,district
0,35,POLYGON ((41.84469250265398 -87.60914087617894...,2
1,36,POLYGON ((41.81692934626684 -87.59215283879394...,2
2,37,POLYGON ((41.80189303368919 -87.62879823733725...,9
3,38,"POLYGON ((41.81681377057218 -87.6067081256125,...",2
4,39,POLYGON ((41.81692934626684 -87.59215283879394...,2
...,...,...,...
72,74,POLYGON ((41.70714491233857 -87.69645961375822...,22
73,75,POLYGON ((41.685082119670845 -87.6421520465139...,22
74,76,POLYGON ((41.986396111591276 -87.8365808787436...,16
75,77,POLYGON ((41.99816614970252 -87.65455590025104...,20


In [100]:
#data = determine_districts(data)

#### Assigning Areas Stats to Each Crime

In [48]:
def determine_area_for_crimes(df):

    statuses = []
    areas = []
    cent = True if 'centroid' in df.columns else False
    for i in range(len(df)):
        point = df.loc[i, 'centroid'] if cent else geometry.Point(df.loc[i, 'long'], df.loc[i, 'lat'])
        status = 0

        for index, row in clean_areas.iterrows():
            area = row['id']
            geom = row['poly']
            if (geom.contains(point)): 
                status = 1
                break
        statuses.append(status)
        areas.append(area)

    df['status'] = statuses
    df['areas'] = areas
    df = df[df['status'] == 1].drop('status', axis=1)

    if cent: df.drop('centroid', axis=1, inplace=True)

    return df

In [101]:
#data['clean_crime'] = determine_area_for_crimes(data['clean_crime'])

In [89]:
clean_bike_trips = pd.read_csv('../../data/processed/clean_bike_trips.csv')
clean_train_ridership = pd.read_csv('../../data/processed/clean_train_ridership.csv')

In [90]:
clean_disadvantaged_areas['centroid'] = clean_disadvantaged_areas['poly'].apply(lambda poly : poly.centroid)
clean_disadvantaged_areas

Unnamed: 0,poly,district,centroid,status,areas
0,POLYGON ((41.707966000031426 -87.5736629999675...,4,POINT (41.683509683928065 -87.57231914155308),1,51
1,POLYGON ((41.68632499999862 -87.60151899995586...,5,POINT (41.68931505835441 -87.60536971084375),1,50
2,POLYGON ((41.69284299996604 -87.60036199996543...,5,POINT (41.70085594037496 -87.60030206564569),1,50
3,POLYGON ((41.69942700001048 -87.62032800000847...,5,POINT (41.696058183380245 -87.6162676498575),1,49
4,POLYGON ((41.75771799998948 -87.66036699995497...,6,POINT (41.754048514350146 -87.66110569971308),1,71
...,...,...,...,...,...
270,POLYGON ((41.998169000012545 -87.6624109999806...,24,POINT (42.00032005328326 -87.66312691441664),1,1
271,POLYGON ((42.00497899998903 -87.66572099996036...,24,POINT (42.00451710898514 -87.66332746402558),1,1
272,POLYGON ((41.998316999982876 -87.6550000000143...,24,POINT (41.99727725336176 -87.65564251215129),1,77
273,POLYGON ((41.99662699998114 -87.65029099995624...,24,POINT (41.99538042311063 -87.65469543980272),1,77


In [91]:
clean_disadvantaged_areas['poly'].drop_duplicates(inplace=True)
clean_disadvantaged_areas.reset_index(drop=True, inplace=True)

In [92]:
trains_with_areas = determine_area_for_crimes(data['clean_train_stations'])
bikes_with_areas = determine_area_for_crimes(data['clean_bike_stations'])
disadvantaged_areas_within_areas = determine_area_for_crimes(clean_disadvantaged_areas)

#### Data Cleaning to Finalize Datasets

In [93]:
clean_bike_trips = clean_bike_trips[['station_id','date','lat','long']].merge(right=bikes_with_areas[['id', 'district', 'areas']], how='left', left_on='station_id', right_on='id').dropna(subset=['district'])
clean_train_ridership = clean_train_ridership[['date','line','station_name','lat','long', 'rides']].merge(right=trains_with_areas[['station_name', 'district', 'areas']], how='left', on='station_name').dropna(subset=['district'])

In [94]:
clean_police_districts = clean_police_districts.merge(right=clean_disadvantaged_areas.groupby('district').agg('count'), how='left', on='district').fillna(0).rename(columns={'poly':'disadvantaged_score'})
clean_police_districts

Unnamed: 0,district,geom,disadvantaged_score,disadvantaged_score.1,centroid,status,areas
0,17,POLYGON ((41.997365655369435 -87.7106708939135...,0.0,0.0,0.0,0.0,0.0
1,20,POLYGON ((41.990916338539776 -87.6602942357235...,4.0,4.0,4.0,4.0,4.0
2,31,POLYGON ((41.98384222028714 -87.82817787271652...,0.0,0.0,0.0,0.0,0.0
3,31,POLYGON ((41.97535481526603 -87.83365455160555...,0.0,0.0,0.0,0.0,0.0
4,19,POLYGON ((41.969727093814825 -87.6449179856867...,2.0,2.0,2.0,2.0,2.0
5,25,POLYGON ((41.93921621331353 -87.71739917372257...,7.0,7.0,7.0,7.0,7.0
6,14,POLYGON ((41.93942656894302 -87.69256666010364...,3.0,3.0,3.0,3.0,3.0
7,31,POLYGON ((41.69898580247497 -87.69123096883077...,0.0,0.0,0.0,0.0,0.0
8,22,"POLYGON ((41.7361768878879 -87.63631615604635,...",9.0,9.0,9.0,9.0,9.0
9,5,POLYGON ((41.72230545260696 -87.58775992207175...,16.0,16.0,16.0,16.0,16.0


In [95]:
clean_public_healthindicator = pd.read_csv('../../data/processed/clean_public_healthindicator.csv')

In [96]:
agg_public_healthindicator = pd.merge(left=clean_public_healthindicator, right=clean_areas[['id','district']], on='id', how='left').drop('id', axis=1)
agg_public_healthindicator = agg_public_healthindicator.groupby('district').agg('mean').reset_index()

#### Save Datasets with District and Areas Data

In [97]:
clean_bike_trips.to_csv('../../data/processed/clean_bike_trips.csv', index=False)
clean_train_ridership.to_csv('../../data/processed/clean_train_ridership.csv', index=False)
clean_police_districts.to_csv('../../data/processed/clean_police_districts.csv', index=False)
agg_public_healthindicator.to_csv('../../data/processed/agg_public_healthindicator.csv', index=False)
disadvantaged_areas_within_areas.to_csv('../../data/processed/disadvantaged_areas_within_areas.csv', index=False)
clean_areas.to_csv('../../data/processed/clean_areas.csv', index=False)

In [98]:
def save_data():
    for df_name, df in data.items():
        df.to_csv(f'../../data/processed/{df_name}.csv', index=False)

In [99]:
save_data()