In [1]:
# Import all the things...
import pandas as pd
from pathlib import Path
from utils.data_processing import (
    clean_data,
    null_strings,
    is_valid_child,
    column_categories,
    geocode_df,
    validate_neighborhood_data,
    prepare_additional_features,
    standardize_dates
)


In [2]:
# null_strings = ['(null)','#N/A', 'NA', '?', '', ' ', '&&', 'nan']

path = Path('./data/raw')
csv_files = path.glob('*.csv')

raw_df = pd.concat((pd.read_csv(f,dtype={
        'STOP_FRISK_DATE': 'object',  # Keep as string
        'STOP_FRISK_TIME': 'object'   # Keep as string
    },
    na_values=null_strings,  # Standardize nulls
    low_memory=False) for f in csv_files), ignore_index=True)
raw_df.head()
print(f"There are {len(raw_df)} records before cleaning")

There are 66406 records before cleaning


In [3]:
# Usage example:
raw_df['STOP_FRISK_DATE'] = raw_df['STOP_FRISK_DATE'].apply(standardize_dates)
raw_df[raw_df['YEAR2'] == 2022]


Unnamed: 0,STOP_ID,STOP_FRISK_DATE,STOP_FRISK_TIME,YEAR2,MONTH2,DAY2,STOP_WAS_INITIATED,RECORD_STATUS_CODE,ISSUING_OFFICER_RANK,ISSUING_OFFICER_COMMAND_CODE,...,SUSPECT_OTHER_DESCRIPTION,STOP_LOCATION_PRECINCT,STOP_LOCATION_SECTOR_CODE,STOP_LOCATION_APARTMENT,STOP_LOCATION_FULL_ADDRESS,STOP_LOCATION_STREET_NAME,STOP_LOCATION_X,STOP_LOCATION_Y,STOP_LOCATION_PATROL_BORO_NAME,STOP_LOCATION_BORO_NAME
51304,1,2022-01-01,8:40:00,2022,January,Saturday,Based on Self Initiated,APP,POM,73,...,,73.0,C,,LIVONIA AVENUE && THATFORD AVENUE,LIVONIA AVENUE,1008275.0,183622.0,PBBN,BROOKLYN
51305,2,2022-01-01,3:25:00,2022,January,Saturday,Based on Self Initiated,APP,POM,183,...,,42.0,A,,WASHINGTON AVE && E 171 ST,WASHINGTON AVE,1010997.0,244468.0,PBBX,BRONX
51306,3,2022-01-01,0:19:00,2022,January,Saturday,Based on Self Initiated,APP,POF,52,...,,42.0,A,,E 170 ST && PARK AVE,E 170 ST,1010321.0,243768.0,PBBX,BRONX
51307,4,2022-01-01,3:00:00,2022,January,Saturday,Based on Radio Run,APP,POM,9,...,,9.0,A,,AVENUE A && E 1 ST,AVENUE A,988051.0,202409.0,PBMS,MANHATTAN
51308,5,2022-01-01,3:00:00,2022,January,Saturday,Based on Radio Run,APP,POM,9,...,,9.0,A,,AVENUE A && E 1 ST,AVENUE A,988051.0,202409.0,PBMS,MANHATTAN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66401,15098,2022-12-31,17:09:00,2022,December,Saturday,Based on Self Initiated,APP,POM,808,...,,43.0,A,,RANDALL AVE && COMMONWEALTH AVE,RANDALL AVE,1021687.0,236876.0,PBBX,BRONX
66402,15099,2022-12-31,4:00:00,2022,December,Saturday,Based on Radio Run,APP,POM,42,...,,42.0,D,,E 163 ST && EAGLE AVE,E 163 ST,1009784.0,239429.0,PBBX,BRONX
66403,15100,2022-12-31,2:28:00,2022,December,Saturday,Based on Self Initiated,APP,POM,115,...,,115.0,D,,37-11 94 ST,94 ST,1019464.0,212685.0,PBQN,QUEENS
66404,15101,2022-12-31,20:25:00,2022,December,Saturday,Based on Radio Run,APP,POM,44,...,,44.0,E,,1310 SHERIDAN AVE,SHERIDAN AVE,1008017.0,244068.0,PBBX,BRONX


In [4]:
cleaned_df = clean_data(raw_df, column_categories)
print(f'Approximately {(len(cleaned_df)/ len(raw_df)*100):.2f}% of the original data after cleaning')


plausible_df = cleaned_df[
    (cleaned_df.apply(is_valid_child, axis=1))
]
print(f'{(len(plausible_df) / len(cleaned_df)*100):.2f}% of data remains after age-height validation of children')

Approximately 83.02% of the original data after cleaning
99.96% of data remains after age-height validation of children


In [5]:
geocoded_df = geocode_df(plausible_df)

NTA CRS: EPSG:4326
Stops GDF CRS: EPSG:4326
NTA CRS: EPSG:4326
Sample of transformed coordinates:
         lat        lon
0  40.832167 -73.893445
1  40.672714 -73.753920
3  40.680767 -73.906721
4  40.812541 -73.955302
5  40.833561 -73.895933
Points outside expected NYC bounds: 20
Unmatched points: 26 out of 55073 (0.05%)
Unmatched points remaining after cleaning: 0 out of 55047 (0.00%)


In [6]:
validate_neighborhood_data(geocoded_df)

All neighborhood-borough mappings validated successfully!
Unique missing NTAs: {'Rockaway Community Park', 'Mount Olivet & All Faiths Cemeteries', 'Ferry Point Park-St. Raymond Cemetery', 'Holy Cross Cemetery', "St. Michael's Cemetery"}
Found 23 borough name mismatches:
      STOP_ID                                ntaname STOP_LOCATION_BORO_NAME  \
590       591                Howard Beach-Lindenwood                BROOKLYN   
1993     1994                             Ozone Park                BROOKLYN   
1994     1995                             Ozone Park                BROOKLYN   
4126     4127                        Bushwick (East)                  QUEENS   
4919     4920  Flatbush (West)-Ditmas Park-Parkville                  QUEENS   

      boroname  
590     Queens  
1993    Queens  
1994    Queens  
4126  Brooklyn  
4919  Brooklyn  
After cleaning: Found 0 borough name mismatches:
Empty DataFrame
Columns: [STOP_ID, ntaname, STOP_LOCATION_BORO_NAME, boroname]
Index: []


In [7]:
final_df = prepare_additional_features(geocoded_df)
final_df

OUTCOME_OF_STOP 
No Charges Filed    34939
Arrested            18501
Summoned             1607
Name: count, dtype: int64
OFFICER_USED_FORCE
False                 41203
True                  13844
Name: count, dtype: int64
FORCE_TYPE          
No Force                41203
Handcuffs                8891
Firearm Drawn            2184
Restraint Used           1241
Other Physical Force     1199
Taser                     303
Weapon Impact              21
Pepper Spray                5
Name: count, dtype: int64


Unnamed: 0,STOP_FRISK_DATE,YEAR2,STOP_ID,OBSERVED_DURATION_MINUTES,STOP_DURATION_MINUTES,SUSPECT_REPORTED_AGE,SUSPECT_WEIGHT,STOP_LOCATION_X,STOP_LOCATION_Y,MONTH2,...,boroname,ntatype,nta2020,borocode,countyfips,ntaabbrev,cdta2020,OUTCOME_OF_STOP,OFFICER_USED_FORCE,FORCE_TYPE
0,2021-01-01 00:01:00,2021,1,0,2,40,200,1013737,242476,January,...,Bronx,0,BX0303,2,005,CrtnaPkEst,BX03,Arrested,True,Handcuffs
1,2021-01-01 00:01:00,2021,2,1,10,19,160,1052511,184460,January,...,Queens,0,QN1305,4,081,Lrltn,QN13,Arrested,False,No Force
2,2021-01-01 00:01:00,2021,4,1,5,28,180,1010122,187312,January,...,Brooklyn,0,BK1601,3,047,OcnHl,BK16,No Charges Filed,False,No Force
3,2021-01-01 00:01:00,2021,5,1,30,19,185,996623,235311,January,...,Manhattan,0,MN0901,1,061,MrngsdHts,MN09,No Charges Filed,True,Handcuffs
4,2021-01-01 00:01:00,2021,6,0,6,32,160,1013048,242983,January,...,Bronx,0,BX0303,2,005,CrtnaPkEst,BX03,Arrested,False,No Force
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55042,2022-01-31 00:12:00,2022,15098,1,7,45,250,1021687,236876,December,...,Bronx,0,BX0902,2,005,Sdvw_ClsPt,BX09,No Charges Filed,False,No Force
55043,2022-01-31 00:12:00,2022,15099,0,1,36,,1009784,239429,December,...,Bronx,0,BX0301,2,005,Mrrsnia,BX03,No Charges Filed,False,No Force
55044,2022-01-31 00:12:00,2022,15100,1,2,32,160,1019464,212685,December,...,Queens,0,QN0301,4,081,JcksnHts,QN03,No Charges Filed,False,No Force
55045,2022-01-31 00:12:00,2022,15101,1,1,18,150,1008017,244068,December,...,Bronx,0,BX0403,2,005,MtEdn,BX04,No Charges Filed,False,No Force


In [8]:
first_type = type(final_df['STOP_FRISK_DATE'].iloc[0])
mixed_rows = final_df[final_df['STOP_FRISK_DATE'].apply(lambda x: type(x)) != first_type]
print(f"# of mixed rows: {len(mixed_rows)}")
mixed_rows.head()

# of mixed rows: 0


Unnamed: 0,STOP_FRISK_DATE,YEAR2,STOP_ID,OBSERVED_DURATION_MINUTES,STOP_DURATION_MINUTES,SUSPECT_REPORTED_AGE,SUSPECT_WEIGHT,STOP_LOCATION_X,STOP_LOCATION_Y,MONTH2,...,boroname,ntatype,nta2020,borocode,countyfips,ntaabbrev,cdta2020,OUTCOME_OF_STOP,OFFICER_USED_FORCE,FORCE_TYPE


In [9]:
final_df = final_df.dropna(subset=['STOP_FRISK_DATE'])
final_df.to_csv('./data/processed/stop-and-frisk.csv', index=False)
final_df.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 55047 entries, 0 to 55046
Data columns (total 71 columns):
 #   Column                                  Non-Null Count  Dtype         
---  ------                                  --------------  -----         
 0   STOP_FRISK_DATE                         55047 non-null  datetime64[ns]
 1   YEAR2                                   55047 non-null  Int64         
 2   STOP_ID                                 55047 non-null  Int64         
 3   OBSERVED_DURATION_MINUTES               55047 non-null  Int64         
 4   STOP_DURATION_MINUTES                   55047 non-null  Int64         
 5   SUSPECT_REPORTED_AGE                    55047 non-null  Int64         
 6   SUSPECT_WEIGHT                          54113 non-null  Int64         
 7   STOP_LOCATION_X                         55047 non-null  Int64         
 8   STOP_LOCATION_Y                         55047 non-null  Int64         
 9   MONTH2                                  55