In [49]:
import pandas as pd
from geopy.distance import geodesic # deprecated
from math import radians, cos, sin, asin, sqrt
import geopandas as gpd
from scipy.spatial import cKDTree
import numpy as np
import os
from datetime import timedelta, datetime

#### Reading in Files

In [2]:
directory = '../../data/processed'
target = ['clean_vacant_buildings.csv', 'clean_bike_stations.csv', 'clean_bus_stops.csv', 'clean_police_stations.csv', 'clean_train_stations.csv']

# Creating a Dict to Store Datasets with long,lat columns
def read_in(directory, target):
    data = {}
    for filename in os.listdir(directory):
        if filename in target:
            file_path = os.path.join(directory, filename)
            data[filename[:-4]] = pd.read_csv(file_path)
            print(f'{filename[:-4]} successfully read in')
    return data

In [3]:
data = read_in(directory, target)

clean_bike_stations successfully read in
clean_bus_stops successfully read in
clean_police_stations successfully read in
clean_train_stations successfully read in
clean_vacant_buildings successfully read in


In [4]:
clean_crime = pd.read_csv('../../data/processed/clean_crime.csv')

In [5]:
clean_crime['point'] = clean_crime.apply(lambda row: (row['lat'], row['long']), axis=1)

#### Conducing Proximity Analysis

In [6]:
# Deprecated: The following function is no longer in use.
# It has been replaced by a more efficient implementation.
# Retained for reference and documentation purposes.

def proximity_scan(df, distances):
    output = []

    for crime_idx, crime_row in clean_crime.iterrows():
        crime_cnt = [0 for i in range(len(distances))]
        clean_pt = crime_row['point']

        for df_idx, df_row in df.iterrows():
            comp_pt = (df_row['lat'], df_row['long'])
            calculated_distance = geodesic(clean_pt, comp_pt).miles
            print(f'row {df_idx} calculated distance of {calculated_distance}')
            
            for idx, distance in enumerate(distances):
                if calculated_distance <= distance: crime_cnt[idx] += 1
    
        print(f'crime {crime_idx} found counts of {crime_cnt}')
        output.append(crime_cnt)
    
    temp_df = pd.DataFrame(output, columns=[f'distance_{str(distance)}' for distance in distances])
    return pd.concat([df, temp_df], axis=1)        

In [7]:
def proximity_scan(df, col_name, distances):
    clean_crime_gdf = gpd.GeoDataFrame(clean_crime, geometry=gpd.points_from_xy(clean_crime.long, clean_crime.lat))
    df_gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.long, df.lat))
    
    df_coords = np.array(list(zip(df_gdf.geometry.x, df_gdf.geometry.y)))
    tree = cKDTree(df_coords)
    
    output = []
    distances_in_degrees = [d / 69 for d in distances] # Convert distances from miles to degrees

    for crime_idx, crime_row in clean_crime_gdf.iterrows():            
        crime_cnt = [0 for _ in range(len(distances))]
        crime_point = crime_row.geometry
        
        for idx, distance in enumerate(distances_in_degrees):
            indices = tree.query_ball_point([crime_point.x, crime_point.y], distance)
            crime_cnt[idx] = len(indices)
        
        output.append(crime_cnt)
    
    temp_df = pd.DataFrame(output, columns=[f'{col_name}_distance_{str(distance)}' for distance in distances])
    return temp_df

In [8]:
crime_with_police = proximity_scan(data['clean_police_stations'], 'police_stations', [0.1, 0.3, 0.5, 1, 3, 5])
crime_with_bikes = proximity_scan(data['clean_bike_stations'], 'bike_stations', [0.1, 0.3, 0.5, 1, 3, 5])
crime_with_buses = proximity_scan(data['clean_bus_stops'], 'bus_stops', [0.1, 0.3, 0.5, 1, 3, 5])
crime_with_trains = proximity_scan(data['clean_train_stations'], 'train_stations', [0.1, 0.3, 0.5, 1, 3, 5])
crime_with_vacant_buildings = proximity_scan(data['clean_vacant_buildings'], 'vacant_buildings', [0.1, 0.3, 0.5, 1, 3, 5])

#### Accounting for Unique Time-Related Case (311 & Bike Ride datasets)

In [9]:
clean_alleylights = pd.read_csv('../../data/processed/clean_alleylights.csv')
clean_streetlights_allout = pd.read_csv('../../data/processed/clean_streetlights_allout.csv')
clean_streetlights_oneout = pd.read_csv('../../data/processed/clean_streetlights_oneout.csv')

In [10]:
clean_crime['date'] = pd.to_datetime(clean_crime['date'])

In [11]:
def patch_datetypes(data):
    for df in data:
        df['start_date'] = pd.to_datetime(df['start_date'])
        df['end_date'] = pd.to_datetime(df['end_date'])

In [12]:
def proximity_scan_311(df, col_name, distances):
    clean_crime_gdf = gpd.GeoDataFrame(clean_crime, geometry=gpd.points_from_xy(clean_crime.long, clean_crime.lat))
    
    output = []
    distances_in_degrees = [d / 69 for d in distances] # Convert distances from miles to degrees

    for crime_idx, crime_row in clean_crime_gdf.iterrows():          
        crime_cnt = [0 for _ in range(len(distances))]
        crime_point = crime_row.geometry
        
        df_filtered = df[(df.start_date <= crime_row['date']) & ((df.end_date.isna()) | (df.end_date >= crime_row['date']))]
        df_gdf = gpd.GeoDataFrame(df_filtered, geometry=gpd.points_from_xy(df_filtered.long, df_filtered.lat))
        df_coords = np.array(list(zip(df_gdf.geometry.x, df_gdf.geometry.y)))
        tree = cKDTree(df_coords)

        for idx, distance in enumerate(distances_in_degrees):
            indices = tree.query_ball_point([crime_point.x, crime_point.y], distance)
            crime_cnt[idx] = len(indices)
        
        output.append(crime_cnt)
    
    temp_df = pd.DataFrame(output, columns=[f'{col_name}_distance_{str(distance)}' for distance in distances])
    return temp_df

In [13]:
patch_datetypes([clean_alleylights, clean_streetlights_allout, clean_streetlights_oneout])

In [14]:
crime_with_alleylights = proximity_scan_311(clean_alleylights, 'alleylights', [0.1, 0.3, 0.5, 1, 3, 5])
crime_with_streetlights_allout = proximity_scan_311(clean_streetlights_allout, 'streetlights_allout', [0.1, 0.3, 0.5, 1, 3, 5])
crime_with_streetlights_oneout = proximity_scan_311(clean_streetlights_oneout, 'streetlights_oneout', [0.1, 0.3, 0.5, 1, 3, 5])

In [15]:
clean_bike_trips = pd.read_csv('../../data/processed/clean_bike_trips.csv')

In [16]:
clean_bike_trips['date'] = pd.to_datetime(clean_bike_trips['date'])

In [70]:
def print_timestamped_message(message):
    print(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S')} - {message}")

In [138]:
def convert_to_numpy(df):
    # date, long (x), lat (y)
    base_time = datetime(2016, 1, 1, 0, 0, 0, 0)
    df['cleaned_date'] = (df['date'] - base_time).dt.total_seconds()
    df = df.sort_values(by='cleaned_date').reset_index(drop=True)
    df = df[['cleaned_date', 'long', 'lat']]
    
    df_np = df.to_numpy()
    df_np[:, 1] = np.radians(df_np[:, 1])
    df_np[:, 2] = np.radians(df_np[:, 2])

    return df_np

In [139]:
def binary_search_crime(arr, crime_time, last_row_idx):
    times = [300, 600, 900]  # 5, 10, 15 minutes in seconds
    target_times = [(crime_time - t, crime_time + t) for t in times]
    final_idx = []

    def find_start_index(arr, start_time, last_row_idx):
        left, right = last_row_idx, len(arr) - 1
        while left <= right:
            mid = (left + right) // 2
            curr_time = arr[mid][0]
            if curr_time < start_time:
                left = mid + 1
            else:
                right = mid - 1
        return left

    def find_end_index(arr, end_time, last_row_idx):
        left, right = last_row_idx, len(arr) - 1
        while left <= right:
            mid = (left + right) // 2
            curr_time = arr[mid][0]
            if curr_time > end_time:
                right = mid - 1
            else:
                left = mid + 1
        return right

    for i, (start_time, end_time) in enumerate(target_times):
        start_idx = find_start_index(arr, start_time, last_row_idx[i])
        end_idx = find_end_index(arr, end_time, last_row_idx[i])
        
        if start_idx <= end_idx and start_idx < len(arr) and end_idx >= 0:
            final_idx.append((start_idx, end_idx))
        else:
            final_idx.append((-1, -1))  # if no valid index is found within the time range

    return final_idx

In [150]:
def compute_haversine(arr, distances, target_pnt):
    row_lat = arr[:, 2]
    row_lon = arr[:, 1]
    crime_lat, crime_lon = target_pnt[1], target_pnt[0]
    
    dlon = row_lon - crime_lon
    dlat = row_lat - crime_lat

    a = np.sin(dlat / 2)**2 + np.cos(crime_lat) * np.cos(row_lat) * np.sin(dlon / 2)**2
    c = 2 * np.arcsin(np.sqrt(a))
    r = 3956  # Radius of earth in miles
    distances_in_miles = c * r

    cnt = np.zeros(len(distances))
    for i, distance in enumerate(distances):
        cnt[i] = np.sum(distances_in_miles <= distance)

    return cnt

In [151]:
def proximity_scan_bike_rides(df, distances):
    print_timestamped_message("Starting proximity_scan_bike_rides")
    crime_np = convert_to_numpy(clean_crime)
    df_np = convert_to_numpy(df)

    delta_cnts = [[] for _ in range(3)]

    for i, crime_row in enumerate(crime_np):
        if i % 1000 == 0: print_timestamped_message(f"Processing crime row {i} with time: {crime_row[0]}")
        last_row_idx = [0] * 3  # 5 min, 10 min, 15 min

        target_indicies = binary_search_crime(df_np, crime_row[0], last_row_idx)
        last_row_idx = [start_idx for (start_idx, end_idx) in target_indicies]

        for idx, target_tuple in enumerate(target_indicies):
            if target_tuple[0] != -1 and target_tuple[1] != -1:
                target_arr = df_np[target_tuple[0]:target_tuple[1]]
                crime_pnt = (crime_row[1], crime_row[2])
                counts = compute_haversine(target_arr, distances, crime_pnt)
            else:
                counts = [0, 0, 0]
            delta_cnts[idx].append(counts)
        if i % 1000 == 0: print_timestamped_message(f"Found the following bike ride counts for distance {distances[idx]} {counts}")
    
    print_timestamped_message("Completed proximity_scan_bike_rides")
    return delta_cnts

In [152]:
crime_with_bike_trips_cnts = proximity_scan_bike_rides(clean_bike_trips, [0.1, 0.3, 0.5])

2024-07-18 22:29:59 - Starting proximity_scan_bike_rides
2024-07-18 22:30:02 - Processing crime row 0 with time: 0.0
2024-07-18 22:30:02 - Found the following bike ride counts for distance 0.5 [0. 0. 0.]
2024-07-18 22:30:02 - Processing crime row 1000 with time: 86400.0
2024-07-18 22:30:02 - Found the following bike ride counts for distance 0.5 [0. 0. 0.]
2024-07-18 22:30:02 - Processing crime row 2000 with time: 238680.0
2024-07-18 22:30:02 - Found the following bike ride counts for distance 0.5 [0. 0. 0.]
2024-07-18 22:30:02 - Processing crime row 3000 with time: 385440.0
2024-07-18 22:30:02 - Found the following bike ride counts for distance 0.5 [0. 0. 0.]
2024-07-18 22:30:02 - Processing crime row 4000 with time: 510900.0
2024-07-18 22:30:02 - Found the following bike ride counts for distance 0.5 [0. 0. 0.]
2024-07-18 22:30:02 - Processing crime row 5000 with time: 646200.0
2024-07-18 22:30:02 - Found the following bike ride counts for distance 0.5 [0. 0. 0.]
2024-07-18 22:30:02 - 

In [159]:
crime_with_bike_trips_cnts

[[[0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],
  [0, 0, 0],

In [165]:
crime_with_bike_trips = clean_crime[['id']]

# Loop over time intervals and distances to populate the DataFrame
for i, time in enumerate(['5_min', '10_min', '15_min']):
    for j, distance in enumerate([0.1, 0.3, 0.5]):
        # Extract the list of counts for the given time and distance
        bike_rides_counts = [counts[j] for counts in crime_with_bike_trips_cnts[i]]
        
        # Assign the extracted counts to the corresponding column in the DataFrame
        crime_with_bike_trips[f'bike_rides_within_{distance}_and_{time}'] = bike_rides_counts

crime_with_bike_trips

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  crime_with_bike_trips[f'bike_rides_within_{distance}_and_{time}'] = bike_rides_counts
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  crime_with_bike_trips[f'bike_rides_within_{distance}_and_{time}'] = bike_rides_counts
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  crime_with_bike_trips[f'bike_ride

Unnamed: 0,id,bike_rides_within_0.1_and_5_min,bike_rides_within_0.3_and_5_min,bike_rides_within_0.5_and_5_min,bike_rides_within_0.1_and_10_min,bike_rides_within_0.3_and_10_min,bike_rides_within_0.5_and_10_min,bike_rides_within_0.1_and_15_min,bike_rides_within_0.3_and_15_min,bike_rides_within_0.5_and_15_min
0,HZ100419,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,HZ101782,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,HZ100063,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,HZ382257,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,HZ101703,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...
1258347,JE100384,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1258348,JE115472,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1258349,JE100691,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1258350,JE100752,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Cleaning and Saving Finalized Dataset

In [None]:
crime_with_proximity = pd.concat([clean_crime, crime_with_police, crime_with_bikes, crime_with_buses, crime_with_trains, crime_with_alleylights, crime_with_streetlights_allout, crime_with_streetlights_oneout, crime_with_bike_trips], axis=1).drop('point', axis=1)

In [None]:
crime_with_proximity

Unnamed: 0,id,date,type,lat,long,district,police_stations_distance_0.1,police_stations_distance_0.3,police_stations_distance_0.5,police_stations_distance_1,...,streetlights_allout_distance_0.5,streetlights_allout_distance_1,streetlights_allout_distance_3,streetlights_allout_distance_5,streetlights_oneout_distance_0.1,streetlights_oneout_distance_0.3,streetlights_oneout_distance_0.5,streetlights_oneout_distance_1,streetlights_oneout_distance_3,streetlights_oneout_distance_5
0,HZ100419,2016-01-01 01:00:00,CRIMINAL DAMAGE,41.910470,-87.751597,25,0,0,0,0,...,0,0,0,1,0,0,0,0,2,8
1,HZ101782,2016-01-01 01:00:00,BURGLARY,41.949534,-87.661660,19,0,0,0,1,...,0,0,0,1,0,0,0,0,2,9
2,HZ100063,2016-01-01 01:00:00,CRIMINAL TRESPASS,41.756605,-87.576096,4,0,0,0,0,...,0,0,3,5,0,0,0,0,3,6
3,HZ382257,2016-01-01 01:00:00,THEFT,41.941525,-87.639650,19,0,0,0,1,...,0,0,0,0,0,0,0,0,0,6
4,HZ101703,2016-01-01 01:00:00,THEFT,41.828313,-87.626498,2,0,1,1,1,...,0,0,0,1,0,0,0,0,2,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1259029,JE100384,2020-12-31 00:00:00,CRIMINAL DAMAGE,41.778384,-87.699467,8,0,0,0,1,...,0,0,22,51,0,1,2,20,131,319
1259030,JE115472,2020-12-31 00:00:00,DECEPTIVE PRACTICE,41.765187,-87.582087,3,0,0,0,0,...,2,3,20,40,0,1,5,18,145,317
1259031,JE100691,2020-12-31 00:00:00,OTHER OFFENSE,42.009145,-87.664753,24,0,0,0,1,...,0,2,8,14,0,5,11,24,117,261
1259032,JE100752,2020-12-31 00:00:00,WEAPONS VIOLATION,41.654384,-87.603630,5,0,0,0,0,...,0,2,17,36,0,0,3,7,64,179


In [None]:
crime_with_proximity.columns

Index(['id', 'date', 'type', 'lat', 'long', 'district',
       'police_stations_distance_0.1', 'police_stations_distance_0.3',
       'police_stations_distance_0.5', 'police_stations_distance_1',
       'police_stations_distance_3', 'police_stations_distance_5',
       'bike_stations_distance_0.1', 'bike_stations_distance_0.3',
       'bike_stations_distance_0.5', 'bike_stations_distance_1',
       'bike_stations_distance_3', 'bike_stations_distance_5',
       'bus_stops_distance_0.1', 'bus_stops_distance_0.3',
       'bus_stops_distance_0.5', 'bus_stops_distance_1',
       'bus_stops_distance_3', 'bus_stops_distance_5',
       'train_stations_distance_0.1', 'train_stations_distance_0.3',
       'train_stations_distance_0.5', 'train_stations_distance_1',
       'train_stations_distance_3', 'train_stations_distance_5',
       'alleylights_distance_0.1', 'alleylights_distance_0.3',
       'alleylights_distance_0.5', 'alleylights_distance_1',
       'alleylights_distance_3', 'alleylights_

In [168]:
crime_with_proximity.to_csv('../../data/pre_training/crime_with_proximity.csv', index=False)