In [1]:
import pandas as pd
from geopy.distance import geodesic # deprecated
import geopandas as gpd
from scipy.spatial import cKDTree
import numpy as np
import os



#### Reading in Files

In [2]:
# Folder holding the processed CSVs, and the subset of files that are loaded
# through read_in().
directory = '../../data/processed'
target = [
    'clean_vacant_buildings.csv',
    'clean_bike_stations.csv',
    'clean_bus_stops.csv',
    'clean_police_stations.csv',
    'clean_train_stations.csv',
]

# Creating a Dict to Store Datasets with long,lat columns
def read_in(directory, target):
    """Read the requested CSV files from `directory` into a dict of DataFrames.

    Parameters
    ----------
    directory : str
        Folder to look in. Listing it raises the usual OSError if it does
        not exist.
    target : iterable of str
        File names to load.

    Returns
    -------
    dict
        Maps each file name without its extension to the loaded DataFrame,
        in the order the names appear in `target`. Requested files that are
        not present in `directory` are reported and left out.
    """
    available = set(os.listdir(directory))
    data = {}

    # dict.fromkeys drops duplicate names while keeping the caller's order,
    # so the result no longer depends on the order os.listdir happens to use.
    for filename in dict.fromkeys(target):
        name = os.path.splitext(filename)[0]
        if filename not in available:
            # Say so now instead of failing later with a KeyError on lookup.
            print(f'{name} not found in {directory}; skipped')
            continue
        data[name] = pd.read_csv(os.path.join(directory, filename))
        print(f'{name} successfully read in')

    return data

In [3]:
# Load every target CSV; keys are the file names with the last four
# characters (the ".csv" suffix) removed.
data = read_in(directory, target)

clean_bike_stations successfully read in
clean_bus_stops successfully read in
clean_police_stations successfully read in
clean_train_stations successfully read in
clean_vacant_buildings successfully read in


In [4]:
# Crime records; later cells read its `lat`, `long` and `date` columns.
clean_crime = pd.read_csv('../../data/processed/clean_crime.csv')

In [5]:
# Pair each crime's coordinates as a (lat, long) tuple. Zipping the two
# columns yields the same tuples as a row-wise apply without calling a Python
# lambda once per row.
clean_crime['point'] = list(zip(clean_crime['lat'], clean_crime['long']))

#### Conducting Proximity Analysis

In [6]:
# Deprecated: The following function is no longer in use.
# It has been replaced by a more efficient implementation.
# Retained for reference and documentation purposes.
# NOTE(review): the next cell defines another function with the same name,
# so on a top-to-bottom run this definition is shadowed by that one.

def proximity_scan(df, distances):
    """Brute-force count of `df` rows within each distance of every crime.

    For every row of the module-level `clean_crime` frame, measures the
    geodesic distance (read as `.miles`) from the crime's `point` tuple to the
    (lat, long) of every row of `df`, and counts how many fall within each
    threshold in `distances`. Progress is printed for every pair and every
    crime.

    Parameters
    ----------
    df : DataFrame with `lat` and `long` columns.
    distances : sequence of thresholds compared against the `.miles` value.

    Returns
    -------
    `df` concatenated column-wise with one `distance_<d>` column per
    threshold.
    """
    output = []

    for crime_idx, crime_row in clean_crime.iterrows():
        # One counter per threshold, reset for each crime.
        crime_cnt = [0 for i in range(len(distances))]
        clean_pt = crime_row['point']

        for df_idx, df_row in df.iterrows():
            comp_pt = (df_row['lat'], df_row['long'])
            calculated_distance = geodesic(clean_pt, comp_pt).miles
            print(f'row {df_idx} calculated distance of {calculated_distance}')
            
            # Thresholds are cumulative: a close row counts toward every
            # threshold at least as large as its distance.
            for idx, distance in enumerate(distances):
                if calculated_distance <= distance: crime_cnt[idx] += 1
    
        print(f'crime {crime_idx} found counts of {crime_cnt}')
        output.append(crime_cnt)
    
    temp_df = pd.DataFrame(output, columns=[f'distance_{str(distance)}' for distance in distances])
    # NOTE(review): `output` holds one row per crime, yet it is concatenated
    # with `df` rather than `clean_crime` — confirm which frame was intended.
    return pd.concat([df, temp_df], axis=1)        

In [7]:
def proximity_scan(df, col_name, distances, crime_df=None):
    """Count, for every crime, how many rows of `df` lie within each radius.

    Distances are Euclidean in (long, lat) degree space, with each radius
    converted from miles by dividing by 69.

    Parameters
    ----------
    df : DataFrame
        Locations to count; must have `long` and `lat` columns.
    col_name : str
        Prefix for the generated column names.
    distances : sequence of numbers
        Search radii in miles.
    crime_df : DataFrame, optional
        Crime records with `long` and `lat` columns. Defaults to the
        module-level `clean_crime`, which is what the original always used.

    Returns
    -------
    DataFrame
        One row per crime (fresh RangeIndex) and one
        `<col_name>_distance_<d>` column per radius, holding integer counts.
        All counts are zero when `df` has no rows.
    """
    crimes = clean_crime if crime_df is None else crime_df

    # Only the raw coordinates are needed, so read them straight from the
    # columns instead of building GeoDataFrames.
    crime_xy = crimes[['long', 'lat']].to_numpy(dtype=float)
    place_xy = df[['long', 'lat']].to_numpy(dtype=float)

    columns = [f'{col_name}_distance_{str(distance)}' for distance in distances]
    counts = np.zeros((len(crime_xy), len(distances)), dtype=np.intp)

    # A KD-tree cannot be built from zero points; the counts simply stay zero.
    if len(place_xy) and len(crime_xy):
        tree = cKDTree(place_xy)
        for idx, distance in enumerate(distances):
            # Convert distances from miles to degrees.
            # NOTE(review): 69 miles per degree holds for latitude; a degree
            # of longitude is shorter away from the equator, so the east-west
            # reach is smaller than the nominal radius.
            radius = distance / 69
            # One batched query for all crimes; return_length gives the
            # neighbour counts directly instead of index lists.
            counts[:, idx] = tree.query_ball_point(crime_xy, radius, return_length=True)

    return pd.DataFrame(counts, columns=columns)

In [8]:
# Every static facility type is scanned with the same radii (in miles).
static_radii = [0.1, 0.3, 0.5, 1, 3, 5]

crime_with_police = proximity_scan(data['clean_police_stations'], 'police_stations', static_radii)
crime_with_bikes = proximity_scan(data['clean_bike_stations'], 'bike_stations', static_radii)
crime_with_buses = proximity_scan(data['clean_bus_stops'], 'bus_stops', static_radii)
crime_with_trains = proximity_scan(data['clean_train_stations'], 'train_stations', static_radii)
crime_with_vacant_buildings = proximity_scan(data['clean_vacant_buildings'], 'vacant_buildings', static_radii)

#### Accounting for Unique Time-Related Cases (311 & Bike Ride datasets)

In [None]:
# Alley-light and street-light datasets (the 311 data named in the section
# heading). Later cells read their `start_date`, `end_date`, `lat` and `long`
# columns.
clean_alleylights = pd.read_csv('../../data/processed/clean_alleylights.csv')
clean_streetlights_allout = pd.read_csv('../../data/processed/clean_streetlights_allout.csv')
clean_streetlights_oneout = pd.read_csv('../../data/processed/clean_streetlights_oneout.csv')

In [None]:
# Parse crime dates so they can be compared with the datetime columns of the
# time-dependent datasets below.
clean_crime['date'] = pd.to_datetime(clean_crime['date'])

In [None]:
def patch_datetypes(data):
    """Convert `start_date` and `end_date` to datetimes in each given frame.

    `data` is an iterable of DataFrames; every frame is modified in place and
    nothing is returned.
    """
    for frame in data:
        for column in ('start_date', 'end_date'):
            frame[column] = pd.to_datetime(frame[column])

In [None]:
def proximity_scan_311(df, col_name, distances, crime_df=None):
    """Count, per crime, the rows of `df` that are active and within each radius.

    A row of `df` is active for a crime when `start_date <= crime date` and
    `end_date` is either missing or `>= crime date`. Distances are Euclidean
    in (long, lat) degree space, with each radius converted from miles by
    dividing by 69.

    Parameters
    ----------
    df : DataFrame
        Must have `long`, `lat`, `start_date` and `end_date` columns, the
        date columns already parsed to datetimes.
    col_name : str
        Prefix for the generated column names.
    distances : sequence of numbers
        Search radii in miles.
    crime_df : DataFrame, optional
        Crime records with `long`, `lat` and datetime `date` columns.
        Defaults to the module-level `clean_crime`.

    Returns
    -------
    DataFrame
        One row per crime (fresh RangeIndex) and one
        `<col_name>_distance_<d>` column per radius. A crime with no active
        rows gets zeros instead of the error the empty KD-tree used to raise.
    """
    crimes = clean_crime if crime_df is None else crime_df

    crime_xy = crimes[['long', 'lat']].to_numpy(dtype=float)
    request_xy = df[['long', 'lat']].to_numpy(dtype=float)
    open_ended = df.end_date.isna()

    # Convert distances from miles to degrees.
    # NOTE(review): 69 miles per degree holds for latitude; a degree of
    # longitude is shorter away from the equator.
    distances_in_degrees = [d / 69 for d in distances]

    columns = [f'{col_name}_distance_{str(distance)}' for distance in distances]
    counts = np.zeros((len(crimes), len(distances)), dtype=np.intp)

    # Crimes sharing a timestamp see the same active rows, so filter and build
    # the tree once per distinct timestamp rather than once per crime.
    # `.indices` maps each timestamp to the positional row numbers of its
    # crimes; crimes with a missing date are not grouped and keep zero counts.
    for when, rows in crimes.groupby('date').indices.items():
        when = pd.Timestamp(when)
        active = (df.start_date <= when) & (open_ended | (df.end_date >= when))
        active_xy = request_xy[active.to_numpy()]

        # No active rows: a KD-tree cannot be built, counts stay zero.
        if len(active_xy) == 0:
            continue

        tree = cKDTree(active_xy)
        for idx, distance in enumerate(distances_in_degrees):
            counts[rows, idx] = tree.query_ball_point(crime_xy[rows], distance, return_length=True)

    return pd.DataFrame(counts, columns=columns)

In [None]:
# Parse start_date/end_date in place on all three light datasets before they
# are compared against crime dates.
patch_datetypes([clean_alleylights, clean_streetlights_allout, clean_streetlights_oneout])

In [None]:
# The three light datasets are scanned with the same radii (in miles).
light_radii = [0.1, 0.3, 0.5, 1, 3, 5]

crime_with_alleylights = proximity_scan_311(clean_alleylights, 'alleylights', light_radii)
crime_with_streetlights_allout = proximity_scan_311(clean_streetlights_allout, 'streetlights_allout', light_radii)
crime_with_streetlights_oneout = proximity_scan_311(clean_streetlights_oneout, 'streetlights_oneout', light_radii)

In [None]:
# Bike trip records; later cells read their `date`, `lat` and `long` columns.
clean_bike_trips = pd.read_csv('../../data/processed/clean_bike_trips.csv')

In [None]:
# Parse trip dates so they can be compared with crime dates.
clean_bike_trips['date'] = pd.to_datetime(clean_bike_trips['date'])

In [None]:
def proximity_scan_bike_rides(df, col_name, distances, crime_df=None, windows_minutes=(5, 10, 15)):
    """Count, per crime, the rows of `df` close in both time and space.

    For each time window w (minutes) a row of `df` qualifies when its `date`
    lies in [crime date - w, crime date + w], both ends inclusive. Qualifying
    rows are then counted within each radius, using Euclidean distance in
    (long, lat) degree space with each radius converted from miles by
    dividing by 69.

    Parameters
    ----------
    df : DataFrame
        Must have `long`, `lat` and datetime `date` columns.
    col_name : str
        Prefix for the generated column names.
    distances : sequence of numbers
        Search radii in miles.
    crime_df : DataFrame, optional
        Crime records with `long`, `lat` and datetime `date` columns.
        Defaults to the module-level `clean_crime`.
    windows_minutes : sequence of int, optional
        Half-widths of the time windows; defaults to the original 5, 10 and
        15 minutes.

    Returns
    -------
    DataFrame
        One row per crime (fresh RangeIndex). Columns are named
        `<col_name>_distance_<d>_<w>_min`, grouped by window in the order
        given, with the radii in order inside each group.
    """
    crimes = clean_crime if crime_df is None else crime_df
    crime_xy = crimes[['long', 'lat']].to_numpy(dtype=float)

    # Sort once by time so each window becomes a contiguous slice found by
    # binary search. Rows without a date can never fall in a window, so drop
    # them up front.
    trips = df.dropna(subset=['date']).sort_values('date')
    trip_dates = trips['date'].reset_index(drop=True)
    trip_xy = trips[['long', 'lat']].to_numpy(dtype=float)

    # Convert distances from miles to degrees.
    # NOTE(review): 69 miles per degree holds for latitude; a degree of
    # longitude is shorter away from the equator.
    distances_in_degrees = [d / 69 for d in distances]

    n_dist = len(distances)
    counts = np.zeros((len(crimes), len(windows_minutes) * n_dist), dtype=np.intp)

    # Crimes sharing a timestamp share their windows, so slice and build the
    # tree once per distinct timestamp. `.indices` gives the positional row
    # numbers of each group; crimes with a missing date keep zero counts.
    for when, rows in crimes.groupby('date').indices.items():
        when = pd.Timestamp(when)

        for w_idx, minutes in enumerate(windows_minutes):
            window = pd.Timedelta(minutes=minutes)
            lo = trip_dates.searchsorted(when - window, side='left')
            hi = trip_dates.searchsorted(when + window, side='right')

            # Nothing in this window: counts stay zero.
            if hi <= lo:
                continue

            tree = cKDTree(trip_xy[lo:hi])
            for d_idx, distance in enumerate(distances_in_degrees):
                counts[rows, w_idx * n_dist + d_idx] = tree.query_ball_point(
                    crime_xy[rows], distance, return_length=True
                )

    columns = [
        f'{col_name}_distance_{str(distance)}_{minutes}_min'
        for minutes in windows_minutes
        for distance in distances
    ]
    return pd.DataFrame(counts, columns=columns)

In [None]:
# Bike trips are scanned with only the three smallest radii (in miles).
crime_with_bike_trips = proximity_scan_bike_rides(clean_bike_trips, 'bike_trips', [0.1, 0.3, 0.5])

#### Cleaning and Saving Finalized Dataset

In [None]:
# Crime records first, then each proximity-count frame, joined side by side.
# The helper `point` column is dropped from the result.
# NOTE(review): crime_with_vacant_buildings is computed in an earlier cell but
# is not part of this join — confirm that is intentional.
frames_to_join = [
    clean_crime,
    crime_with_police,
    crime_with_bikes,
    crime_with_buses,
    crime_with_trains,
    crime_with_alleylights,
    crime_with_streetlights_allout,
    crime_with_streetlights_oneout,
    crime_with_bike_trips,
]
crime_with_proximity = pd.concat(frames_to_join, axis=1).drop(columns='point')

In [None]:
# Display the joined frame as the cell output.
crime_with_proximity

Unnamed: 0,id,date,type,lat,long,district,police_stations_distance_0.1,police_stations_distance_0.3,police_stations_distance_0.5,police_stations_distance_1,...,streetlights_allout_distance_0.5,streetlights_allout_distance_1,streetlights_allout_distance_3,streetlights_allout_distance_5,streetlights_oneout_distance_0.1,streetlights_oneout_distance_0.3,streetlights_oneout_distance_0.5,streetlights_oneout_distance_1,streetlights_oneout_distance_3,streetlights_oneout_distance_5
0,HZ100419,2016-01-01 01:00:00,CRIMINAL DAMAGE,41.910470,-87.751597,25,0,0,0,0,...,0,0,0,1,0,0,0,0,2,8
1,HZ101782,2016-01-01 01:00:00,BURGLARY,41.949534,-87.661660,19,0,0,0,1,...,0,0,0,1,0,0,0,0,2,9
2,HZ100063,2016-01-01 01:00:00,CRIMINAL TRESPASS,41.756605,-87.576096,4,0,0,0,0,...,0,0,3,5,0,0,0,0,3,6
3,HZ382257,2016-01-01 01:00:00,THEFT,41.941525,-87.639650,19,0,0,0,1,...,0,0,0,0,0,0,0,0,0,6
4,HZ101703,2016-01-01 01:00:00,THEFT,41.828313,-87.626498,2,0,1,1,1,...,0,0,0,1,0,0,0,0,2,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1259029,JE100384,2020-12-31 00:00:00,CRIMINAL DAMAGE,41.778384,-87.699467,8,0,0,0,1,...,0,0,22,51,0,1,2,20,131,319
1259030,JE115472,2020-12-31 00:00:00,DECEPTIVE PRACTICE,41.765187,-87.582087,3,0,0,0,0,...,2,3,20,40,0,1,5,18,145,317
1259031,JE100691,2020-12-31 00:00:00,OTHER OFFENSE,42.009145,-87.664753,24,0,0,0,1,...,0,2,8,14,0,5,11,24,117,261
1259032,JE100752,2020-12-31 00:00:00,WEAPONS VIOLATION,41.654384,-87.603630,5,0,0,0,0,...,0,2,17,36,0,0,3,7,64,179


In [None]:
# List the column names of the joined frame.
crime_with_proximity.columns

Index(['id', 'date', 'type', 'lat', 'long', 'district',
       'police_stations_distance_0.1', 'police_stations_distance_0.3',
       'police_stations_distance_0.5', 'police_stations_distance_1',
       'police_stations_distance_3', 'police_stations_distance_5',
       'bike_stations_distance_0.1', 'bike_stations_distance_0.3',
       'bike_stations_distance_0.5', 'bike_stations_distance_1',
       'bike_stations_distance_3', 'bike_stations_distance_5',
       'bus_stops_distance_0.1', 'bus_stops_distance_0.3',
       'bus_stops_distance_0.5', 'bus_stops_distance_1',
       'bus_stops_distance_3', 'bus_stops_distance_5',
       'train_stations_distance_0.1', 'train_stations_distance_0.3',
       'train_stations_distance_0.5', 'train_stations_distance_1',
       'train_stations_distance_3', 'train_stations_distance_5',
       'alleylights_distance_0.1', 'alleylights_distance_0.3',
       'alleylights_distance_0.5', 'alleylights_distance_1',
       'alleylights_distance_3', 'alleylights_

In [None]:
# Write the joined frame to CSV; index=False leaves the row index out of the
# file.
crime_with_proximity.to_csv('../../data/pre_training/crime_with_proximity.csv', index=False)