In [1]:
import pandas as pd
import os
from shapely.geometry import Point, Polygon
from shapely import geometry

In [2]:
# Folder containing the cleaned CSV exports used throughout this notebook.
directory = '../../data/processed'
# Files in that folder which the loop below skips.
# (clean_police_districts.csv is read on its own in a later cell.)
void = ['clean_areas.csv', 'clean_disadvantaged_areas.csv', 'clean_police_districts.csv', 'clean_public_healthindicator.csv', 'clean_police_sentiment.csv', 'clean_ridership.csv']
# Maps dataset name (file name without the .csv extension) -> DataFrame.
data = {}

# os.listdir() returns entries in arbitrary order; sorting keeps the dict order
# (and therefore the processing/printing order downstream) reproducible.
for filename in sorted(os.listdir(directory)):
    if filename not in void and filename.endswith('.csv'):
        file_path = os.path.join(directory, filename)
        # splitext drops the extension without relying on its length.
        dataset_name = os.path.splitext(filename)[0]
        data[dataset_name] = pd.read_csv(file_path)
        print(f'{dataset_name} successfully read in')

clean_311 successfully read in
clean_bike_stations successfully read in
clean_bike_trips successfully read in
clean_bus_stations successfully read in
clean_crime successfully read in
clean_cta_stations successfully read in
clean_police_stations successfully read in


In [3]:
# Sanity check: print each loaded dataset's name, row count and column index.
for df_name in data:
    df = data[df_name]
    print(df_name, len(df), df.columns)

clean_311 468201 Index(['date', 'id', 'type', 'lat', 'long'], dtype='object')
clean_bike_stations 575 Index(['station', 'lat', 'long', 'id'], dtype='object')
clean_bike_trips 37424072 Index(['date', 'station_id', 'station_name', 'lat', 'long', 'id'], dtype='object')
clean_bus_stations 398 Index(['stop_id', 'cta_stop_name', 'lat', 'long'], dtype='object')
clean_crime 7824317 Index(['id', 'date', 'type', 'lat', 'long'], dtype='object')
clean_cta_stations 143 Index(['station_number', 'station_name', 'lat', 'long'], dtype='object')
clean_police_stations 23 Index(['id', 'lat', 'long'], dtype='object')


In [4]:
# The district table sits in the same processed-data folder as the other inputs,
# so derive its path from `directory` rather than repeating the folder literal.
clean_police_districts = pd.read_csv(os.path.join(directory, 'clean_police_districts.csv'))

In [5]:
def parse_polygon(polygon_string):
    """Convert a WKT-style ``POLYGON ((x y, x y, ...))`` string into a shapely Polygon.

    Parameters
    ----------
    polygon_string : str or shapely.geometry.Polygon
        Text such as ``'POLYGON ((1 2, 3 4, 5 6, 1 2))'``. The ``POLYGON`` tag is
        optional. Additional parenthesised rings after the first are treated as
        interior rings (holes). A value that is already a ``Polygon`` is returned
        unchanged, so re-running the parsing cell does not fail.

    Returns
    -------
    shapely.geometry.Polygon

    Raises
    ------
    ValueError
        If the text carries a different geometry tag (e.g. ``MULTIPOLYGON``),
        has no parenthesised coordinate list, or contains a non-numeric value.

    Notes
    -----
    Coordinates are passed through in the order they appear in the text.
    NOTE(review): ``determine_within`` builds its points as (lat, long), so the
    stored geometry presumably uses that same axis order -- TODO confirm.
    """
    if isinstance(polygon_string, Polygon):
        return polygon_string

    text = polygon_string.strip()
    open_at = text.find('(')
    close_at = text.rfind(')')
    if open_at == -1 or close_at <= open_at:
        raise ValueError(f'not a POLYGON string: {polygon_string!r}')

    # str.strip('POLYGON ((') removes a *set of characters*, not a prefix, so
    # check the tag explicitly instead of relying on that.
    tag = text[:open_at].strip().upper()
    if tag not in ('', 'POLYGON'):
        raise ValueError(f'unsupported geometry type {tag!r}')

    # Everything between the outermost parentheses: "(ring), (ring), ..."
    rings = []
    for ring_text in text[open_at + 1:close_at].split(')'):
        # Drop the ring's own "(" plus any separating commas/spaces.
        ring_text = ring_text.strip(' ,(')
        if not ring_text:
            continue
        # Split on "," and let str.split() absorb whitespace, so both
        # "x y, x y" and "x y,x y" are accepted.
        rings.append([tuple(map(float, point.split())) for point in ring_text.split(',')])

    if not rings:
        raise ValueError(f'no coordinates found in: {polygon_string!r}')

    # First ring is the exterior shell; any remaining rings are holes.
    return Polygon(rings[0], rings[1:] or None)

In [6]:
# Replace each district's geometry text with the object returned by parse_polygon.
clean_police_districts['geom'] = clean_police_districts['geom'].map(parse_polygon)

In [7]:
def determine_within(df, districts_df=None):
    """Keep the rows of ``df`` that fall inside a police district and label them.

    Each row's point is built as ``Point(lat, long)`` and tested against the
    district polygons in table order; the first polygon that contains the point
    supplies the row's ``district`` value. Rows that match no polygon are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have ``lat`` and ``long`` columns. Any index is accepted, and the
        frame itself is left unmodified.
    districts_df : pandas.DataFrame, optional
        Table with a ``district`` column and a ``geom`` column of objects that
        provide ``contains(point)``. Defaults to the notebook-level
        ``clean_police_districts``.

    Returns
    -------
    pandas.DataFrame
        A copy of the matching rows (original index labels preserved) with a
        ``district`` column added, or overwritten if one already existed.
    """
    if districts_df is None:
        districts_df = clean_police_districts

    # Materialise the (district, polygon) pairs once instead of rebuilding an
    # iterrows() generator for every single point.
    shapes = list(zip(districts_df['district'], districts_df['geom']))

    no_match = object()   # sentinel: point is outside every polygon
    lookup = {}           # (lat, long) -> district or no_match; repeats are common
    kept_rows = []        # positional indices of rows that matched a district
    kept_districts = []   # district for each kept row, in the same order

    # Iterate by position so a non-default index (e.g. an already-filtered
    # frame) cannot cause label lookups to fail.
    for position, coords in enumerate(zip(df['lat'], df['long'])):
        if coords in lookup:
            found = lookup[coords]
        else:
            point = geometry.Point(coords[0], coords[1])
            found = no_match
            for district, geom in shapes:
                if geom.contains(point):
                    found = district
                    break
            lookup[coords] = found

        if found is not no_match:
            kept_rows.append(position)
            kept_districts.append(found)

    # Copy before adding the column so the caller's frame is not mutated.
    result = df.iloc[kept_rows].copy()
    result['district'] = kept_districts
    return result

In [8]:
def determine_districts(data):
    """Run ``determine_within`` over every DataFrame held in ``data``.

    Each value is replaced, under its existing key, by the frame that
    ``determine_within`` returns, and a progress line is printed per dataset.
    The mapping is updated in place.

    Parameters
    ----------
    data : dict
        Mapping of dataset name -> DataFrame.

    Returns
    -------
    dict
        The same ``data`` object that was passed in.
    """
    for df_name in list(data):
        data[df_name] = determine_within(data[df_name])
        print(f'{df_name} successfully completed')
    return data

In [9]:
# Assign a district to every loaded dataset. determine_districts updates the
# dict in place and hands the same object back, so `data` is simply rebound.
data = determine_districts(data=data)

clean_311 successfully completed
clean_bike_stations successfully completed
clean_bike_trips successfully completed
clean_bus_stations successfully completed
clean_crime successfully completed
clean_cta_stations successfully completed
clean_police_stations successfully completed


In [13]:
# Number of crime rows per assigned district, most frequent first.
# NOTE(review): district 31 has only 1,726 rows in the saved output, far fewer
# than any other district -- TODO confirm this is expected.
data['clean_crime'].loc[:, 'district'].value_counts()

8     521037
11    501613
6     457139
7     448148
4     446771
25    442058
3     397404
12    389342
9     388166
2     365862
19    353093
18    350610
5     344028
10    336419
15    330726
1     322370
14    301590
16    256504
22    254077
24    230903
17    226052
20    139742
31      1726
Name: district, dtype: int64

- Gather district geometry data
- Assign a district to every location-based record
- Create a master dataset with all features
    - based on the crime dataset
    - attach aggregated data from each of the other datasets, matched on the time and district of each crime
    - attach lags and leads of the crime, bike, CTA, and 311 data