In [45]:
from pandas import read_csv, DataFrame
import numpy as np

# Read the arrests table with ARREST_KEY as the index; empty strings and the
# literal "UNKNOWN" are both parsed as missing values.
data: DataFrame = read_csv(
    "class_ny_arrests.csv",
    index_col="ARREST_KEY",
    na_values={"", "UNKNOWN"},
)

In [46]:
# Keep a reproducible random subset of the rows: the fixed seed makes the
# draw repeatable. Only the last expression of a cell is displayed, so a
# single preview at the end is enough.
SAMPLE_SIZE = 10000
SAMPLE_SEED = 1
data = data.sample(n=SAMPLE_SIZE, random_state=SAMPLE_SEED)
data.head()

Unnamed: 0_level_0,ARREST_DATE,PD_CD,PD_DESC,KY_CD,OFNS_DESC,LAW_CODE,LAW_CAT_CD,ARREST_BORO,ARREST_PRECINCT,JURISDICTION_CODE,AGE_GROUP,PERP_SEX,PERP_RACE,X_COORD_CD,Y_COORD_CD,Latitude,Longitude
ARREST_KEY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
173644106,01/15/2018,109.0,"ASSAULT 2,1,UNCLASSIFIED",106.0,FELONY ASSAULT,PL 1200502,F,Q,103,0.0,25-44,M,BLACK,1040260.0,193180.0,40.696742,-73.798014
83480788,03/04/2012,101.0,ASSAULT 3,344.0,ASSAULT 3 & RELATED OFFENSES,PL 1200001,M,M,18,0.0,18-24,M,WHITE,987121.0,217093.0,40.762554,-73.989638
87189586,10/11/2012,744.0,BAIL JUMPING 3,359.0,OFFENSES AGAINST PUBLIC ADMINISTRATION,PL 2155500,M,Q,102,0.0,25-44,M,WHITE,1032428.0,198872.0,40.712411,-73.826217
34021522,08/27/2007,478.0,"THEFT OF SERVICES, UNCLASSIFIED",343.0,OTHER OFFENSES RELATED TO THEFT,PL 1651503,M,B,40,0.0,25-44,M,BLACK,1007288.0,236469.0,40.815707,-73.916772
63162514,06/26/2009,503.0,"CONTROLLED SUBSTANCE,INTENT TO SELL 3",117.0,DANGEROUS DRUGS,PL 2201601,F,S,122,0.0,65+,F,WHITE,954943.0,148360.0,40.57385,-74.105496


In [47]:
def get_variable_types(df: DataFrame):
    """Return the dtype of every column of ``df``.

    Args:
        df: Frame whose columns are inspected.

    Returns:
        The frame's ``dtypes`` Series: indexed by column name, one dtype
        per column.
    """
    column_dtypes = df.dtypes
    return column_dtypes

# Split the columns of `data` by dtype: float64/int64 count as numeric and
# object as symbolic. A column with any other dtype lands in neither list.
variable_types = get_variable_types(data)
is_float = variable_types == 'float64'
is_int = variable_types == 'int64'
is_object = variable_types == 'object'

# Float columns are listed before int columns, each group in column order.
numeric_columns = (
    variable_types[is_float].index.tolist()
    + variable_types[is_int].index.tolist()
)
symbolic_columns = variable_types[is_object].index.tolist()

print("Numeric columns:", numeric_columns)
print("Symbolic columns:", symbolic_columns)

Numeric columns: ['PD_CD', 'KY_CD', 'JURISDICTION_CODE', 'X_COORD_CD', 'Y_COORD_CD', 'Latitude', 'Longitude', 'ARREST_PRECINCT']
Symbolic columns: ['ARREST_DATE', 'PD_DESC', 'OFNS_DESC', 'LAW_CODE', 'LAW_CAT_CD', 'ARREST_BORO', 'AGE_GROUP', 'PERP_SEX', 'PERP_RACE']


In [48]:
# Report how many distinct (non-missing) values each symbolic column holds.
unique_counts = data[symbolic_columns].nunique()
for column, n_unique in unique_counts.items():
    print(f"{column}: {n_unique} unique values")

ARREST_DATE: 4568 unique values
PD_DESC: 199 unique values
OFNS_DESC: 57 unique values
LAW_CODE: 464 unique values
LAW_CAT_CD: 2 unique values
ARREST_BORO: 5 unique values
AGE_GROUP: 5 unique values
PERP_SEX: 2 unique values
PERP_RACE: 7 unique values


In [49]:
# Distinct AGE_GROUP labels present in the sample. The variable is named
# after the column it actually inspects.
unique_age_groups = data['AGE_GROUP'].unique()
print(unique_age_groups)

['25-44' '18-24' '65+' '45-64' '<18']


In [50]:
# Rows without a JURISDICTION_CODE carry no usable label. A comparison of NaN
# with `< 3` evaluates to False, so such rows would otherwise be labelled 1
# by default; drop them before deriving the target.
data.dropna(subset=['JURISDICTION_CODE'], inplace=True)

# Binary target: 0 for jurisdiction codes below 3, 1 for codes 3 and above.
data['CLASS'] = (data['JURISDICTION_CODE'] >= 3).astype('int64')

# Remove the LAW_CODE column.
data.drop(columns=['LAW_CODE'], inplace=True)

In [51]:
# AGE_GROUP labels accepted as valid
valid_age_groups = ['<18', '18-24', '25-44', '45-64', '65+']

# Keep recognised labels; anything else (missing values included) becomes NaN
is_valid_age_group = data['AGE_GROUP'].isin(valid_age_groups)
data['AGE_GROUP'] = data['AGE_GROUP'].where(is_valid_age_group, np.nan)


In [52]:
# Integer codes for each categorical column. Series.map turns every value
# that has no entry in its table (NaN included) into NaN.
# NOTE(review): the cardinality output earlier in the notebook lists 7
# distinct PERP_RACE values, but only 6 are coded here, so one category is
# mapped to NaN — confirm this is intended.
category_codes = {
    'LAW_CAT_CD': {'M': 0, 'F': 1},
    'PERP_SEX': {'M': 0, 'F': 1},
    'PERP_RACE': {
        'BLACK': 0,
        'WHITE': 1,
        'BLACK HISPANIC': 2,
        'WHITE HISPANIC': 3,
        'ASIAN / PACIFIC ISLANDER': 4,
        'AMERICAN INDIAN/ALASKAN NATIVE': 5,
    },
    'ARREST_BORO': {'Q': 0, 'M': 1, 'K': 2, 'B': 3, 'S': 4},
    'AGE_GROUP': {'<18': 0, '18-24': 1, '25-44': 2, '45-64': 3, '65+': 4},
}

# Apply each table to its column, replacing the labels with their codes.
for column_name, codes in category_codes.items():
    data[column_name] = data[column_name].map(codes)

In [53]:
# Recompute the dtype-based column split on the current state of `data`.
variable_types = get_variable_types(data)

# float64 columns first, then int64 columns, each group in column order.
numeric_columns = [
    name
    for wanted in ('float64', 'int64')
    for name, dtype in variable_types.items()
    if dtype == wanted
]
symbolic_columns = [
    name for name, dtype in variable_types.items() if dtype == 'object'
]

print("Numeric columns:", numeric_columns)
print("Symbolic columns:", symbolic_columns)

Numeric columns: ['PD_CD', 'KY_CD', 'JURISDICTION_CODE', 'PERP_RACE', 'X_COORD_CD', 'Y_COORD_CD', 'Latitude', 'Longitude', 'LAW_CAT_CD', 'ARREST_BORO', 'ARREST_PRECINCT', 'AGE_GROUP', 'PERP_SEX', 'CLASS']
Symbolic columns: ['ARREST_DATE', 'PD_DESC', 'OFNS_DESC']


In [54]:
# Remove the two description columns in place.
data.drop(['PD_DESC', 'OFNS_DESC'], axis='columns', inplace=True)

In [55]:
# CLASS was derived from JURISDICTION_CODE earlier; remove the source column.
del data['JURISDICTION_CODE']

In [56]:
from math import sin, cos, pi
import pandas as pd
from pandas import DataFrame

# Convert ARREST_DATE to datetime; values that cannot be parsed become NaT.
data['ARREST_DATE'] = data['ARREST_DATE'].pipe(pd.to_datetime, errors='coerce')

def encode_cyclic_variables(data: DataFrame, vars: list[str]) -> None:
    """Add sine/cosine encodings of cyclic columns to ``data`` in place.

    For every column name ``v`` in ``vars`` two columns are created,
    ``v + "_sin"`` and ``v + "_cos"``, holding ``sin(2*pi*x / x_max)`` and
    ``cos(2*pi*x / x_max)`` rounded to 3 decimals, where ``x_max`` is the
    largest non-missing value found in the column. Missing values stay
    missing in both new columns.

    Args:
        data: Frame to modify; the new columns are written into it.
        vars: Names of the numeric columns to encode.

    Returns:
        None. ``data`` is modified in place.

    Raises:
        ZeroDivisionError: If the largest value of a column is 0.

    Note:
        The period is taken from the data (the column maximum), so frames
        with different maxima are encoded on different scales.
    """
    for v in vars:
        column = data[v]
        # Series.max() skips missing values. The builtin max() compares item
        # by item and returns NaN whenever NaN is the first element, which
        # would turn every encoded value into NaN.
        x_max = column.max()
        if x_max == 0:
            raise ZeroDivisionError(
                f"cannot encode '{v}': its maximum value is 0"
            )
        # Vectorised form of 2*pi*x/x_max, evaluated in the same order.
        angle = 2 * np.pi * column / x_max
        data[v + "_sin"] = np.sin(angle).round(3)
        data[v + "_cos"] = np.cos(angle).round(3)

# Derive the day of the year from the parsed arrest date.
arrest_dates = data['ARREST_DATE']
data['ARREST_DAYOFYEAR'] = arrest_dates.dt.dayofyear

# Add the sine/cosine columns for it; the helper writes into `data` directly
# and returns nothing, so there is no result to assign.
cyclic_columns = ['ARREST_DAYOFYEAR']
encode_cyclic_variables(data, cyclic_columns)

# Preview the first rows with the new columns.
data.head()


Unnamed: 0_level_0,ARREST_DATE,PD_CD,KY_CD,LAW_CAT_CD,ARREST_BORO,ARREST_PRECINCT,AGE_GROUP,PERP_SEX,PERP_RACE,X_COORD_CD,Y_COORD_CD,Latitude,Longitude,CLASS,ARREST_DAYOFYEAR,ARREST_DAYOFYEAR_sin,ARREST_DAYOFYEAR_cos
ARREST_KEY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
173644106,2018-01-15,109.0,106.0,1,0,103,2,0,0.0,1040260.0,193180.0,40.696742,-73.798014,0,15,0.255,0.967
83480788,2012-03-04,101.0,344.0,0,1,18,1,0,1.0,987121.0,217093.0,40.762554,-73.989638,0,64,0.891,0.455
87189586,2012-10-11,744.0,359.0,0,0,102,2,0,1.0,1032428.0,198872.0,40.712411,-73.826217,0,285,-0.984,0.179
34021522,2007-08-27,478.0,343.0,0,3,40,2,0,0.0,1007288.0,236469.0,40.815707,-73.916772,0,239,-0.82,-0.572
63162514,2009-06-26,503.0,117.0,1,4,122,4,1,1.0,954943.0,148360.0,40.57385,-74.105496,0,177,0.103,-0.995


In [57]:
# Remove the raw arrest date and the LAW_CAT_CD column in a single call.
data.drop(columns=['ARREST_DATE', 'LAW_CAT_CD'], inplace=True)

In [58]:
# Replace the ARREST_KEY index with a fresh 0..n-1 range; the keys are
# discarded rather than kept as a column.
data.index = pd.RangeIndex(len(data))

In [59]:
# Preview the first five rows of the final frame.
data.head(n=5)

Unnamed: 0,PD_CD,KY_CD,ARREST_BORO,ARREST_PRECINCT,AGE_GROUP,PERP_SEX,PERP_RACE,X_COORD_CD,Y_COORD_CD,Latitude,Longitude,CLASS,ARREST_DAYOFYEAR,ARREST_DAYOFYEAR_sin,ARREST_DAYOFYEAR_cos
0,109.0,106.0,0,103,2,0,0.0,1040260.0,193180.0,40.696742,-73.798014,0,15,0.255,0.967
1,101.0,344.0,1,18,1,0,1.0,987121.0,217093.0,40.762554,-73.989638,0,64,0.891,0.455
2,744.0,359.0,0,102,2,0,1.0,1032428.0,198872.0,40.712411,-73.826217,0,285,-0.984,0.179
3,478.0,343.0,3,40,2,0,0.0,1007288.0,236469.0,40.815707,-73.916772,0,239,-0.82,-0.572
4,503.0,117.0,4,122,4,1,1.0,954943.0,148360.0,40.57385,-74.105496,0,177,0.103,-0.995


In [60]:
# Write the encoded frame to CSV without the index column.
# NOTE(review): "enoded" in the file name looks like a typo for "encoded";
# it is left unchanged because other code may read this exact path — confirm.
output_path = "ny_arrests_enoded.csv"
data.to_csv(output_path, index=False)