In [57]:
from pandas import read_csv, DataFrame
import numpy as np

# Load the arrests file with ARREST_KEY as the index; empty strings and the
# literal "UNKNOWN" are read as missing values.
data: DataFrame = read_csv(
    "class_ny_arrests.csv",
    index_col="ARREST_KEY",
    na_values={"", "UNKNOWN"},
)

In [None]:
# Draw a reproducible 10,000-row sample (fixed random_state) and preview it.
# A notebook cell renders only its final expression, so the sample preview is
# the single display produced here; a preview of the full frame placed before
# the sampling line would be computed and silently discarded.
sampled_data = data.sample(n=10000, random_state=1)
sampled_data.head()

Unnamed: 0_level_0,ARREST_DATE,PD_CD,PD_DESC,KY_CD,OFNS_DESC,LAW_CODE,LAW_CAT_CD,ARREST_BORO,ARREST_PRECINCT,JURISDICTION_CODE,AGE_GROUP,PERP_SEX,PERP_RACE,X_COORD_CD,Y_COORD_CD,Latitude,Longitude
ARREST_KEY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
32311380,06/18/2007,511.0,"CONTROLLED SUBSTANCE, POSSESSION 7",235.0,DANGEROUS DRUGS,PL 2200300,M,Q,27,1.0,18-24,M,BLACK,,,,
192799737,01/26/2019,177.0,SEXUAL ABUSE,116.0,SEX CRIMES,PL 1306503,F,M,25,0.0,45-64,M,BLACK,1000555.0,230994.0,40.800694,-73.941109
193260691,02/06/2019,,,,,PL 2203400,F,M,14,0.0,25-44,M,,986685.0,215375.0,40.757839,-73.991212
149117452,01/06/2016,153.0,RAPE 3,104.0,RAPE,PL 1302503,F,K,67,0.0,25-44,M,BLACK,998032.0,175598.0,40.64865,-73.950336
190049060,11/15/2018,157.0,RAPE 1,104.0,RAPE,PL 1303501,F,K,77,0.0,25-44,M,BLACK,1003606.0,185050.0,40.674583,-73.930222


In [59]:
def get_variable_types(df):
    """Return the per-column dtypes of ``df``.

    Args:
        df: Object exposing a ``dtypes`` attribute (a pandas DataFrame in
            this notebook).

    Returns:
        The value of ``df.dtypes``, unchanged.
    """
    column_dtypes = df.dtypes
    return column_dtypes

# Classify the columns of 'data' by dtype: float64 and int64 columns are
# numeric (floats listed first, then ints), object columns are symbolic.
variable_types = get_variable_types(data)
float_columns = [name for name, dtype in variable_types.items() if dtype == 'float64']
int_columns = [name for name, dtype in variable_types.items() if dtype == 'int64']
numeric_columns = float_columns + int_columns
symbolic_columns = [name for name, dtype in variable_types.items() if dtype == 'object']

print("Numeric columns:", numeric_columns)
print("Symbolic columns:", symbolic_columns)

Numeric columns: ['PD_CD', 'KY_CD', 'JURISDICTION_CODE', 'X_COORD_CD', 'Y_COORD_CD', 'Latitude', 'Longitude', 'ARREST_PRECINCT']
Symbolic columns: ['ARREST_DATE', 'PD_DESC', 'OFNS_DESC', 'LAW_CODE', 'LAW_CAT_CD', 'ARREST_BORO', 'AGE_GROUP', 'PERP_SEX', 'PERP_RACE']


In [60]:
# Report how many distinct values each symbolic column holds
for column in symbolic_columns:
    unique_count = data[column].nunique()
    print(f"{column}: {unique_count} unique values")

ARREST_DATE: 5844 unique values
PD_DESC: 337 unique values
OFNS_DESC: 72 unique values
LAW_CODE: 2114 unique values
LAW_CAT_CD: 2 unique values
ARREST_BORO: 5 unique values
AGE_GROUP: 87 unique values
PERP_SEX: 2 unique values
PERP_RACE: 7 unique values


In [61]:
# List every distinct AGE_GROUP value so malformed entries become visible.
# The variable is named after the column it actually inspects.
unique_age_groups = data['AGE_GROUP'].unique()
print(unique_age_groups)

['18-24' '45-64' '25-44' '<18' '65+' '929' '944' '945' '320' '932' '894'
 '935' '934' '940' nan '339' '323' '965' '948' '946' '938' '895' '194'
 '928' '352' '943' '956' '920' '951' '708' '816' '316' '912' '237' '939'
 '947' '740' '952' '949' '338' '922' '933' '921' '1042' '2000' '1937'
 '1916' '1928' '2007' '1918' '1947' '1953' '1942' '1024' '1948' '937'
 '1922' '1048' '1018' '918' '927' '926' '931' '942' '924' '941' '930'
 '955' '936' '812' '317' '909' '923' '914' '896' '709' '640' '925' '212'
 '200' '954' '314' '723' '309' '959' '910' '446' '330']


In [62]:
# CLASS is 0 where JURISDICTION_CODE is below 3 and 1 everywhere else.
# NOTE(review): a missing JURISDICTION_CODE fails the `< 3` comparison and
# therefore lands in class 1 — confirm that this is intended.
jurisdiction_below_three = data['JURISDICTION_CODE'] < 3
data['CLASS'] = (~jurisdiction_below_three).astype('int64')

In [63]:
# The only labels accepted for AGE_GROUP
valid_age_groups = ['<18', '18-24', '25-44', '45-64', '65+']

# Keep accepted labels; every other entry (already-missing ones included)
# becomes NaN
is_valid_age_group = data['AGE_GROUP'].isin(valid_age_groups)
data['AGE_GROUP'] = data['AGE_GROUP'].where(is_valid_age_group, np.nan)


In [64]:
# Integer code tables, one per categorical column. Series.map() turns any
# value that is absent from its table into NaN.
# NOTE(review): the cardinality printout earlier in this notebook lists 7
# distinct PERP_RACE values while only 6 are coded below, so one category
# presumably becomes NaN — confirm which one and whether that is intended.
category_codes = {
    'LAW_CAT_CD': {'M': 0, 'F': 1},
    'PERP_SEX': {'M': 0, 'F': 1},
    'PERP_RACE': {
        'BLACK': 0,
        'WHITE': 1,
        'BLACK HISPANIC': 2,
        'WHITE HISPANIC': 3,
        'ASIAN / PACIFIC ISLANDER': 4,
        'AMERICAN INDIAN/ALASKAN NATIVE': 5,
    },
    'ARREST_BORO': {'Q': 0, 'M': 1, 'K': 2, 'B': 3, 'S': 4},
    'AGE_GROUP': {'<18': 0, '18-24': 1, '25-44': 2, '45-64': 3, '65+': 4},
}

# Replace each column with its coded version, in the order listed above
for coded_column, codes in category_codes.items():
    data[coded_column] = data[coded_column].map(codes)

In [65]:
# Recompute the numeric/symbolic split after the categorical encoding:
# float64 columns first, then int64 columns; object columns are symbolic.
variable_types = get_variable_types(data)
numeric_columns = [
    name
    for wanted_dtype in ('float64', 'int64')
    for name, dtype in variable_types.items()
    if dtype == wanted_dtype
]
symbolic_columns = [name for name, dtype in variable_types.items() if dtype == 'object']

print("Numeric columns:", numeric_columns)
print("Symbolic columns:", symbolic_columns)

Numeric columns: ['PD_CD', 'KY_CD', 'ARREST_BORO', 'JURISDICTION_CODE', 'AGE_GROUP', 'PERP_RACE', 'X_COORD_CD', 'Y_COORD_CD', 'Latitude', 'Longitude', 'LAW_CAT_CD', 'ARREST_PRECINCT', 'PERP_SEX', 'CLASS']
Symbolic columns: ['ARREST_DATE', 'PD_DESC', 'OFNS_DESC', 'LAW_CODE']


In [66]:
# Remove the two textual description columns from 'data' in place
description_columns = ['PD_DESC', 'OFNS_DESC']
data.drop(description_columns, axis='columns', inplace=True)

In [67]:
# Remove the LAW_CODE column from 'data' in place
data.drop('LAW_CODE', axis='columns', inplace=True)

In [1]:
from math import sin, cos, pi
import pandas as pd
from pandas import DataFrame

# Convert ARREST_DATE to datetime; entries that cannot be parsed become NaT
data['ARREST_DATE'] = data['ARREST_DATE'].pipe(pd.to_datetime, errors='coerce')

def encode_cyclic_variables(data: DataFrame, vars: list[str]) -> None:
    """Add rounded sine/cosine encodings of cyclic columns to ``data`` in place.

    For every column name ``v`` in ``vars`` two columns are written,
    ``v + "_sin"`` and ``v + "_cos"``. They hold ``sin`` and ``cos`` of
    ``2 * pi * x / x_max`` rounded to 3 decimals, where ``x_max`` is the
    largest non-missing value of column ``v``. Missing entries stay missing
    in both new columns.

    Args:
        data: Frame to extend; it is modified in place.
        vars: Names of the numeric columns to encode.

    Returns:
        None.
    """
    for v in vars:
        # Series.max() skips NaN. The builtin max() compares pairwise and
        # every comparison against NaN is False, so its result depended on
        # where a NaN sat: a leading NaN made x_max NaN and blanked both
        # output columns for every row.
        x_max: float = float(data[v].max())
        angles = data[v].apply(lambda x: 2 * pi * x / x_max)
        data[v + "_sin"] = angles.apply(lambda a: round(sin(a), 3))
        data[v + "_cos"] = angles.apply(lambda a: round(cos(a), 3))

# Day of the year is the cyclic feature derived from the arrest date
arrest_dates = data['ARREST_DATE']
data['ARREST_DAYOFYEAR'] = arrest_dates.dt.dayofyear

# Adds ARREST_DAYOFYEAR_sin and ARREST_DAYOFYEAR_cos to 'data' in place;
# the function returns nothing, so there is no result to assign
encode_cyclic_variables(data, vars=['ARREST_DAYOFYEAR'])

# Preview the first rows with the new columns
data.head()


NameError: name 'data' is not defined

In [69]:
# Remove ARREST_DATE and then LAW_CAT_CD from 'data', one in-place drop each
for dropped_column in ('ARREST_DATE', 'LAW_CAT_CD'):
    data.drop(columns=[dropped_column], inplace=True)

In [70]:
# Replace the current index in place with the default integer index;
# drop=True discards the old index instead of turning it into a column.
data.reset_index(drop=True, inplace=True)

In [71]:
# Preview the first rows of 'data' after the encoding and column drops
data.head()

Unnamed: 0,PD_CD,KY_CD,ARREST_BORO,ARREST_PRECINCT,JURISDICTION_CODE,AGE_GROUP,PERP_SEX,PERP_RACE,X_COORD_CD,Y_COORD_CD,Latitude,Longitude,CLASS,ARREST_DAYOFYEAR,ARREST_DAYOFYEAR_sin,ARREST_DAYOFYEAR_cos
0,511.0,235.0,0.0,27,1.0,1.0,0,0.0,,,,,0,169,0.238,-0.971
1,177.0,116.0,1.0,25,0.0,3.0,0,0.0,1000555.0,230994.0,40.800694,-73.941109,0,26,0.432,0.902
2,,,1.0,14,0.0,2.0,0,,986685.0,215375.0,40.757839,-73.991212,0,37,0.593,0.805
3,153.0,104.0,2.0,67,0.0,2.0,0,0.0,998032.0,175598.0,40.64865,-73.950336,0,6,0.103,0.995
4,157.0,104.0,2.0,77,0.0,2.0,0,0.0,1003606.0,185050.0,40.674583,-73.930222,0,319,-0.722,0.692


In [72]:
# Write 'data' to CSV without the index column.
# NOTE(review): the file name reads "enoded" — presumably a typo for
# "encoded"; confirm before renaming, since other code may load this exact path.
data.to_csv("ny_arrests_enoded.csv", index=False)