In [1]:
import pandas as pd
from geopy.distance import geodesic
import geopandas as gpd
from scipy.spatial import cKDTree
import numpy as np
import os



In [2]:
# Folder (relative to this notebook) that holds the processed CSV files.
directory = '../../data/processed'

# Files in that folder to load with read_in; the crime file is read separately.
target = [
    'clean_bike_stations.csv',
    'clean_bus_stops.csv',
    'clean_police_stations.csv',
    'clean_train_stations.csv',
]

# Creating a Dict to Store Datasets with long,lat columns
def read_in(directory, target):
    """Load the requested CSV files from `directory` into a dict of DataFrames.

    Parameters
    ----------
    directory : str
        Folder that contains the CSV files.
    target : iterable of str
        File names to load. Entries that do not end in '.csv' are ignored.

    Returns
    -------
    dict
        Maps each file name without its '.csv' suffix to the DataFrame read
        from it. Keys follow the order of `target`, so the result does not
        depend on the order in which the filesystem lists the folder.
    """
    data = {}
    # dict.fromkeys drops repeated names while keeping the caller's order.
    for filename in dict.fromkeys(target):
        if not filename.endswith('.csv'):
            continue
        file_path = os.path.join(directory, filename)
        if not os.path.isfile(file_path):
            # Report the gap instead of silently returning fewer datasets.
            print(f'{filename} not found in {directory}, skipped')
            continue
        data[filename[:-4]] = pd.read_csv(file_path)
        print(f'{filename[:-4]} successfully read in')
    return data

In [3]:
# Load every dataset listed in `target`; keys are the file names without '.csv'.
data = read_in(directory=directory, target=target)

clean_bike_stations successfully read in
clean_bus_stops successfully read in
clean_police_stations successfully read in
clean_train_stations successfully read in


In [4]:
# Read the crime records from the same processed-data folder as the other
# datasets, reusing `directory` instead of repeating the path literal.
clean_crime = pd.read_csv(os.path.join(directory, 'clean_crime.csv'))

In [5]:
# Preview the loaded crime records; the rendered table is truncated to the
# first and last rows.
clean_crime

Unnamed: 0,id,date,type,lat,long,district
0,HZ100419,2016-01-01 01:00:00,CRIMINAL DAMAGE,41.910470,-87.751597,25
1,HZ101782,2016-01-01 01:00:00,BURGLARY,41.949534,-87.661660,19
2,HZ100063,2016-01-01 01:00:00,CRIMINAL TRESPASS,41.756605,-87.576096,4
3,HZ382257,2016-01-01 01:00:00,THEFT,41.941525,-87.639650,19
4,HZ101703,2016-01-01 01:00:00,THEFT,41.828313,-87.626498,2
...,...,...,...,...,...,...
1259029,JE100384,2020-12-31 00:00:00,CRIMINAL DAMAGE,41.778384,-87.699467,8
1259030,JE115472,2020-12-31 00:00:00,DECEPTIVE PRACTICE,41.765187,-87.582087,3
1259031,JE100691,2020-12-31 00:00:00,OTHER OFFENSE,42.009145,-87.664753,24
1259032,JE100752,2020-12-31 00:00:00,WEAPONS VIOLATION,41.654384,-87.603630,5


In [6]:
# Store each crime's coordinates as a (lat, long) tuple. Zipping the two
# columns builds the same tuples as a row-wise apply without calling a Python
# function once per row.
clean_crime['point'] = pd.Series(
    list(zip(clean_crime['lat'], clean_crime['long'])),
    index=clean_crime.index,
    dtype=object,
)

In [7]:
# Preview the crime records again, now including the `point` column.
clean_crime

Unnamed: 0,id,date,type,lat,long,district,point
0,HZ100419,2016-01-01 01:00:00,CRIMINAL DAMAGE,41.910470,-87.751597,25,"(41.910469677, -87.751597381)"
1,HZ101782,2016-01-01 01:00:00,BURGLARY,41.949534,-87.661660,19,"(41.949534028, -87.661660031)"
2,HZ100063,2016-01-01 01:00:00,CRIMINAL TRESPASS,41.756605,-87.576096,4,"(41.756604794, -87.576095781)"
3,HZ382257,2016-01-01 01:00:00,THEFT,41.941525,-87.639650,19,"(41.941524519, -87.639649996)"
4,HZ101703,2016-01-01 01:00:00,THEFT,41.828313,-87.626498,2,"(41.828312991, -87.626497936)"
...,...,...,...,...,...,...,...
1259029,JE100384,2020-12-31 00:00:00,CRIMINAL DAMAGE,41.778384,-87.699467,8,"(41.778383988, -87.699466535)"
1259030,JE115472,2020-12-31 00:00:00,DECEPTIVE PRACTICE,41.765187,-87.582087,3,"(41.765187118, -87.582086968)"
1259031,JE100691,2020-12-31 00:00:00,OTHER OFFENSE,42.009145,-87.664753,24,"(42.009145091, -87.664753261)"
1259032,JE100752,2020-12-31 00:00:00,WEAPONS VIOLATION,41.654384,-87.603630,5,"(41.654384025, -87.603629807)"


In [8]:
def proximity_scan(df, distances, verbose=False):
    """Count, for every crime, how many rows of `df` lie within each distance.

    Brute-force version: computes the geodesic distance in miles between each
    row of the notebook-level `clean_crime` frame and each row of `df`.

    NOTE: a later cell defines another `proximity_scan` with the signature
    (df, col_name, distances); when the notebook runs top to bottom that
    definition replaces this one.

    Parameters
    ----------
    df : pandas.DataFrame
        Locations to count; must have 'lat' and 'long' columns.
    distances : sequence of numbers
        Radii in miles. One count column is produced per radius.
    verbose : bool, default False
        When True, print every computed distance and every per-crime count.
        Off by default because that is one line per crime/location pair.

    Returns
    -------
    pandas.DataFrame
        `clean_crime` with one extra `distance_<radius>` column per radius.
        `clean_crime` must already have the 'point' column of (lat, long)
        tuples.
    """
    # The locations are the same for every crime, so build them once.
    comp_pts = list(zip(df.index, df['lat'], df['long']))
    output = []

    for crime_idx, crime_row in clean_crime.iterrows():
        crime_cnt = [0] * len(distances)
        clean_pt = crime_row['point']

        for df_idx, comp_lat, comp_long in comp_pts:
            calculated_distance = geodesic(clean_pt, (comp_lat, comp_long)).miles
            if verbose:
                print(f'row {df_idx} calculated distance of {calculated_distance}')

            for idx, distance in enumerate(distances):
                if calculated_distance <= distance:
                    crime_cnt[idx] += 1

        if verbose:
            print(f'crime {crime_idx} found counts of {crime_cnt}')
        output.append(crime_cnt)

    # `output` has one row per crime, so it belongs next to clean_crime (not
    # `df`); sharing the index keeps concat from misaligning rows.
    temp_df = pd.DataFrame(
        output,
        columns=[f'distance_{str(distance)}' for distance in distances],
        index=clean_crime.index,
    )
    return pd.concat([clean_crime, temp_df], axis=1)

In [9]:
def proximity_scan(df, col_name, distances, crime_df=None):
    """Count, for every crime, how many rows of `df` lie within each distance.

    Coordinates are projected onto a flat, local grid measured in miles
    (longitude is shrunk by cos(latitude)) and searched with a KD-tree. A
    plain "miles / 69" radius in raw degrees is only right north-south; a
    degree of longitude is shorter than a degree of latitude away from the
    equator, so east-west neighbours would be missed.

    Parameters
    ----------
    df : pandas.DataFrame
        Locations to count; must have 'lat' and 'long' columns.
    col_name : str
        Prefix for the generated count columns.
    distances : sequence of numbers
        Radii in miles. One count column is produced per radius.
    crime_df : pandas.DataFrame, optional
        Crime records with 'lat' and 'long' columns. Defaults to the
        notebook-level `clean_crime` frame.

    Returns
    -------
    pandas.DataFrame
        `crime_df` with one extra `<col_name>_distance_<radius>` column per
        radius, each holding the number of `df` rows within that radius.
    """
    if crime_df is None:
        crime_df = clean_crime

    # Approximate length of one degree of latitude, in miles.
    miles_per_degree = 69

    crime_lat = crime_df['lat'].to_numpy(dtype=float)
    crime_lon = crime_df['long'].to_numpy(dtype=float)
    place_lat = df['lat'].to_numpy(dtype=float)
    place_lon = df['long'].to_numpy(dtype=float)

    # One shared reference latitude keeps both point sets on the same grid.
    # This is a local flat approximation, adequate at city scale.
    reference_lat = np.nanmean(np.concatenate([crime_lat, place_lat]))
    lon_scale = np.cos(np.radians(reference_lat))

    place_xy = np.column_stack([place_lon * lon_scale * miles_per_degree,
                                place_lat * miles_per_degree])
    crime_xy = np.column_stack([crime_lon * lon_scale * miles_per_degree,
                                crime_lat * miles_per_degree])

    tree = cKDTree(place_xy)

    # One batched query per radius; return_length gives the neighbour counts
    # directly instead of lists of indices.
    counts = np.empty((len(crime_xy), len(distances)), dtype=np.int64)
    for idx, distance in enumerate(distances):
        counts[:, idx] = tree.query_ball_point(crime_xy, distance, return_length=True)

    # Reuse the crime index so concat lines the counts up row by row.
    temp_df = pd.DataFrame(
        counts,
        columns=[f'{col_name}_distance_{str(distance)}' for distance in distances],
        index=crime_df.index,
    )
    return pd.concat([crime_df, temp_df], axis=1)

In [10]:
def running_proxim(data, distances):
    """Run `proximity_scan` for every dataset and collect all count columns.

    Parameters
    ----------
    data : dict
        Maps a dataset name (e.g. 'clean_bus_stops') to its DataFrame. The
        'clean_' part of the name is removed to form the column prefix.
    distances : sequence of numbers
        Radii passed through to `proximity_scan`.

    Returns
    -------
    pandas.DataFrame or None
        The first scan result extended with the count columns of every later
        scan, or None when `data` is empty.
    """
    combined = None
    for df_name, df in data.items():
        scanned = proximity_scan(df, df_name.replace("clean_", ""), distances)
        if combined is None:
            combined = scanned
        else:
            # Every scan repeats the crime columns; keep only the columns this
            # dataset contributed so nothing is duplicated.
            fresh = [col for col in scanned.columns if col not in combined.columns]
            combined = pd.concat([combined, scanned[fresh]], axis=1)
        print(f'{df_name} proximity scan completed')
    # Return the accumulated frame; assigning to a local name inside the loop
    # would throw every scan away when the function exits.
    return combined

In [11]:
# Scan every loaded dataset at these radii (miles).
running_proxim(data, distances=[0.1, 0.3, 0.5, 1, 3, 5])