In [1]:
# Import all the things...
# import numpy as np
import pandas as pd
from pathlib import Path
from utils.data_processing import (
    clean_data,
    null_strings,
    is_valid_child,
    column_categories,
    geocode_df,
    validate_neighborhood_data,
    prepare_additional_features,
)


In [2]:
# Load every raw CSV export under ./data/raw and stack them into one frame.
# `null_strings` is imported from utils.data_processing (see the import cell) and is
# passed to read_csv so those markers are parsed as NaN.
path = Path('./data/raw')

# Path.glob yields entries in arbitrary, filesystem-dependent order. Sort them so the
# concatenated row order (and the index regenerated by ignore_index=True) is the
# same on every machine and every run.
csv_files = sorted(path.glob('*.csv'))
if not csv_files:
    # Without this guard pd.concat fails with an opaque "No objects to concatenate".
    raise FileNotFoundError(f'No CSV files found in {path.resolve()}')

raw_df = pd.concat(
    (
        pd.read_csv(
            f,
            dtype={
                'STOP_FRISK_DATE': 'object',  # Keep as string
                'STOP_FRISK_TIME': 'object',  # Keep as string
            },
            na_values=null_strings,  # Standardize nulls
            low_memory=False,
        )
        for f in csv_files
    ),
    ignore_index=True,
)
raw_df.head()

Unnamed: 0,STOP_ID,STOP_FRISK_DATE,STOP_FRISK_TIME,YEAR2,MONTH2,DAY2,STOP_WAS_INITIATED,RECORD_STATUS_CODE,ISSUING_OFFICER_RANK,ISSUING_OFFICER_COMMAND_CODE,...,SUSPECT_OTHER_DESCRIPTION,STOP_LOCATION_PRECINCT,STOP_LOCATION_SECTOR_CODE,STOP_LOCATION_APARTMENT,STOP_LOCATION_FULL_ADDRESS,STOP_LOCATION_STREET_NAME,STOP_LOCATION_X,STOP_LOCATION_Y,STOP_LOCATION_PATROL_BORO_NAME,STOP_LOCATION_BORO_NAME
0,1,2021-01-01,01:50:00,2021,January,Friday,Based on C/W on Scene,APP,SGT,42,...,,42.0,B,2C,850 JENNINGS ST,JENNINGS ST,1013737.0,242476.0,PBBX,BRONX
1,2,2021-01-01,10:55:00,2021,January,Friday,Based on Radio Run,APP,POM,105,...,,105.0,B,,219 ST && 139 AVE,219 ST,1052511.0,184460.0,PBQS,QUEENS
2,3,2021-01-01,10:10:00,2021,January,Friday,Based on C/W on Scene,APP,POM,106,...,YELLOW SWEATER,106.0,A,,151 AVE && 84 ST,151 AVE,1025662.0,182632.0,PBQS,QUEENS
3,4,2021-01-01,20:17:00,2021,January,Friday,Based on C/W on Scene,APP,POM,83,...,,73.0,A,,BROADWAY && MAC DOUGAL ST,BROADWAY,1010122.0,187312.0,PBBN,BROOKLYN
4,5,2021-01-01,19:59:00,2021,January,Friday,Based on Radio Run,APP,POM,26,...,UNKNOWN,26.0,C,4D,439 W 125 ST,W 125 ST,996623.0,235311.0,PBMN,MANHATTAN


In [3]:
# Stage 1: run the project's cleaning routine on the raw frame and report how many
# rows are left relative to the raw input.
cleaned_df = clean_data(raw_df, column_categories)
cleaned_share = len(cleaned_df) / len(raw_df) * 100
print(f'Approximately {cleaned_share:.2f}% of the original data after cleaning')

# Stage 2: evaluate is_valid_child once per row (axis=1) and keep only the rows
# where the resulting boolean mask is True.
child_ok = cleaned_df.apply(is_valid_child, axis=1)
plausible_df = cleaned_df[child_ok]
plausible_share = len(plausible_df) / len(cleaned_df) * 100
print(f'{plausible_share:.2f}% of data remains after age-height validation of children')


Approximately 83.02% of the original data after cleaning
99.96% of data remains after age-height validation of children


In [4]:
# Geocode the plausibility-filtered stops; the result is consumed by the neighborhood
# validation and feature-preparation cells that follow.
# NOTE(review): judging by the saved output, geocode_df converts the stop coordinates
# to lat/lon, matches them against NTA polygons and drops unmatched points — confirm
# against utils.data_processing before relying on this description.
geocoded_df = geocode_df(plausible_df)

NTA CRS: EPSG:4326
Stops GDF CRS: EPSG:4326
NTA CRS: EPSG:4326
Sample of transformed coordinates:
         lat        lon
0  40.832167 -73.893445
1  40.672714 -73.753920
3  40.680767 -73.906721
4  40.812541 -73.955302
5  40.833561 -73.895933
Points outside expected NYC bounds: 20
Unmatched points: 26 out of 55073 (0.05%)
Unmatched points remaining after cleaning: 0 out of 55047 (0.00%)


In [5]:
# Run the project's neighborhood/borough consistency checks on the geocoded stops.
# The return value is not captured; the cell is used for its printed report.
# NOTE(review): the saved output contains an "After cleaning" section, so this call
# may modify geocoded_df in place — TODO confirm, because the next cell reads
# geocoded_df and would then depend on this cell having been run.
validate_neighborhood_data(geocoded_df)

All neighborhood-borough mappings validated successfully!
Unique missing NTAs: {'Mount Olivet & All Faiths Cemeteries', "St. Michael's Cemetery", 'Holy Cross Cemetery', 'Rockaway Community Park', 'Ferry Point Park-St. Raymond Cemetery'}
Found 23 borough name mismatches:
      STOP_ID                                ntaname STOP_LOCATION_BORO_NAME  \
590       591                Howard Beach-Lindenwood                BROOKLYN   
1993     1994                             Ozone Park                BROOKLYN   
1994     1995                             Ozone Park                BROOKLYN   
4126     4127                        Bushwick (East)                  QUEENS   
4919     4920  Flatbush (West)-Ditmas Park-Parkville                  QUEENS   

      boroname  
590     Queens  
1993    Queens  
1994    Queens  
4126  Brooklyn  
4919  Brooklyn  
After cleaning: Found 0 borough name mismatches:
Empty DataFrame
Columns: [STOP_ID, ntaname, STOP_LOCATION_BORO_NAME, boroname]
Index: []


In [None]:
# Build the final analysis frame from the geocoded stops.
# NOTE(review): the saved output suggests prepare_additional_features adds
# OUTCOME_OF_STOP, OFFICER_USED_FORCE and FORCE_TYPE and prints their value counts —
# confirm against utils.data_processing.
final_df = prepare_additional_features(geocoded_df)

# Preview the first rows. The saved output of this cell is a head-style table, and the
# export cell already prints final_df.info(), so calling .info() here was a duplicate
# that no longer matched what the cell displays.
final_df.head()

OUTCOME_OF_STOP 
No Charges Filed    34939
Arrested            18501
Summoned             1607
Name: count, dtype: int64
OFFICER_USED_FORCE
False                 41203
True                  13844
Name: count, dtype: int64
FORCE_TYPE          
No Force                41203
Handcuffs                8891
Firearm Drawn            2184
Restraint Used           1241
Other Physical Force     1199
Taser                     303
Weapon Impact              21
Pepper Spray                5
Name: count, dtype: int64


Unnamed: 0,STOP_FRISK_DATE,STOP_ID,OBSERVED_DURATION_MINUTES,STOP_DURATION_MINUTES,SUSPECT_REPORTED_AGE,SUSPECT_WEIGHT,STOP_LOCATION_X,STOP_LOCATION_Y,MONTH2,STOP_WAS_INITIATED,...,boroname,ntatype,nta2020,borocode,countyfips,ntaabbrev,cdta2020,OUTCOME_OF_STOP,OFFICER_USED_FORCE,FORCE_TYPE
0,2021-01-01 00:01:00,1,0,2,40,200,1013737,242476,January,Based on C/W on Scene,...,Bronx,0,BX0303,2,5,CrtnaPkEst,BX03,Arrested,True,Handcuffs
1,2021-01-01 00:01:00,2,1,10,19,160,1052511,184460,January,Based on Radio Run,...,Queens,0,QN1305,4,81,Lrltn,QN13,Arrested,False,No Force
2,2021-01-01 00:01:00,4,1,5,28,180,1010122,187312,January,Based on C/W on Scene,...,Brooklyn,0,BK1601,3,47,OcnHl,BK16,No Charges Filed,False,No Force
3,2021-01-01 00:01:00,5,1,30,19,185,996623,235311,January,Based on Radio Run,...,Manhattan,0,MN0901,1,61,MrngsdHts,MN09,No Charges Filed,True,Handcuffs
4,2021-01-01 00:01:00,6,0,6,32,160,1013048,242983,January,Based on Radio Run,...,Bronx,0,BX0303,2,5,CrtnaPkEst,BX03,Arrested,False,No Force


In [7]:
# Persist the processed dataset and print a column/dtype summary of what was written.
output_path = Path('./data/processed/stop-and-frisk.csv')

# DataFrame.to_csv does not create missing parent directories, so on a fresh checkout
# (where ./data/processed may not exist yet) the export would fail. Create it first.
output_path.parent.mkdir(parents=True, exist_ok=True)

# NOTE(review): the index is still written (pandas default), which shows up as an
# unnamed first column when the file is read back. Pass index=False only after
# checking that no downstream reader relies on that column.
final_df.to_csv(output_path)

# NOTE(review): in the saved summary STOP_FRISK_DATE has fewer non-null values than
# the row count — worth verifying the date parsing upstream.
final_df.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 55047 entries, 0 to 55046
Data columns (total 70 columns):
 #   Column                                  Non-Null Count  Dtype         
---  ------                                  --------------  -----         
 0   STOP_FRISK_DATE                         42029 non-null  datetime64[ns]
 1   STOP_ID                                 55047 non-null  Int64         
 2   OBSERVED_DURATION_MINUTES               55047 non-null  Int64         
 3   STOP_DURATION_MINUTES                   55047 non-null  Int64         
 4   SUSPECT_REPORTED_AGE                    55047 non-null  Int64         
 5   SUSPECT_WEIGHT                          54113 non-null  Int64         
 6   STOP_LOCATION_X                         55047 non-null  Int64         
 7   STOP_LOCATION_Y                         55047 non-null  Int64         
 8   MONTH2                                  55047 non-null  object        
 9   STOP_WAS_INITIATED                      55