# Setup

This section covers the initial setup, including library imports, path definitions, and mounting Google Drive.

## Mount Google Drive

In [6]:
# from google.colab import drive
# drive.mount('/drive', force_remount=True)

## Import Libraries

In [7]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import re
# !pip install openpyxl
import openpyxl
# !pip install holidays
import holidays
from datetime import datetime

# Enhanced ML imports for better preprocessing pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, FunctionTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.base import BaseEstimator, TransformerMixin

warnings.filterwarnings('ignore')

## Define Paths

In [None]:
# Path variables
# base_dir = "/drive/MyDrive/Data Mining and Machine Learning/Progetto"
base_dir = os.path.abspath(os.path.join(os.getcwd(), "..", "..", "JupyterOutputs"))
cleaned_integrated_dir = os.path.join(base_dir, "DataIntegrated")
cleaned_integrated_data_file = os.path.join(cleaned_integrated_dir, "cleaned_integrated_crime_data.csv")
feature_engineering_dir = os.path.join(base_dir, "FeatureEngineered")
feature_engineered_file_path = os.path.join(feature_engineering_dir, "feature_engineered_crime_data.csv")
pd_codes_file = os.path.join(os.path.dirname(os.path.dirname(base_dir)), "Documents", "PDCode_PenalLaw.xlsx")

# Check if pd_codes_file exists
if not os.path.exists(pd_codes_file):
    raise FileNotFoundError(f"PD codes file not found at: {pd_codes_file}")

# Ensure output directory exists
os.makedirs(feature_engineering_dir, exist_ok=True)

print(f"Base directory: {base_dir}")
print(f"Feature engineering output directory: {feature_engineering_dir}")
print(f"Looking for cleaned integrated data at: {cleaned_integrated_data_file}")
print(f"Looking for PD codes file at: {pd_codes_file}")

Base directory: C:\Users\ferdi\Documents\GitHub\crime-analyzer\JupyterOutputs
Feature engineering output directory: C:\Users\ferdi\Documents\GitHub\crime-analyzer\JupyterOutputs\FeatureEngineered
Looking for cleaned integrated data at: C:\Users\ferdi\Documents\GitHub\crime-analyzer\JupyterOutputs\DataIntegrated\cleaned_integrated_crime_data.csv
Looking for PD codes file at: C:\Users\ferdi\Documents\GitHub\crime-analyzer\JupyterOutputs\..\Documents\PDCode_PenalLaw.xlsx


# Load Integrated Data

Load the dataset produced by the Data Integration phase.

In [9]:
# Load dataset
print("=== Loading Integrated Data ===")
try:
    if os.path.exists(cleaned_integrated_data_file):
        df = pd.read_csv(cleaned_integrated_data_file)
        initial_rows = len(df)
        print(f"Dataset loaded successfully: {initial_rows} rows and {df.shape[1]} columns")
        print(f"Columns in the dataset: {df.columns.tolist()}")

        # Basic validation
        if initial_rows == 0:
            raise ValueError("Dataset is empty")
        if df.shape[1] < 5:
            raise ValueError("Dataset has insufficient columns")

    else:
        raise FileNotFoundError(f"Could not find cleaned integrated dataset at: {cleaned_integrated_data_file}")
except Exception as e:
    print(f"Error loading dataset: {e}")
    raise RuntimeError(f"Failed to load required dataset for feature engineering: {e}")

=== Loading Integrated Data ===
Dataset loaded successfully: 2496759 rows and 34 columns
Columns in the dataset: ['BORO_NM', 'CMPLNT_FR_DT', 'CMPLNT_FR_TM', 'KY_CD', 'LAW_CAT_CD', 'LOC_OF_OCCUR_DESC', 'OFNS_DESC', 'PARKS_NM', 'PD_CD', 'PREM_TYP_DESC', 'SUSP_AGE_GROUP', 'SUSP_RACE', 'SUSP_SEX', 'VIC_AGE_GROUP', 'VIC_RACE', 'VIC_SEX', 'Latitude', 'Longitude', 'BAR_DISTANCE', 'NIGHTCLUB_DISTANCE', 'ATM_DISTANCE', 'ATMS_COUNT', 'BARS_COUNT', 'BUS_STOPS_COUNT', 'METROS_COUNT', 'NIGHTCLUBS_COUNT', 'SCHOOLS_COUNT', 'METRO_DISTANCE', 'MIN_POI_DISTANCE', 'AVG_POI_DISTANCE', 'MAX_POI_DISTANCE', 'TOTAL_POI_COUNT', 'POI_DIVERSITY', 'POI_DENSITY_SCORE']


## Initial Data Overview

Display basic information, summary statistics, and a sample of the loaded dataset.

In [10]:
# Display basic dataset overview
print("\n=== Dataset Overview ===")
print(df.info())
print("\n=== Summary Statistics ===")
print(df.describe())
print("\n=== Sample Row ===")
print(df.sample())


=== Dataset Overview ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2496759 entries, 0 to 2496758
Data columns (total 34 columns):
 #   Column              Dtype  
---  ------              -----  
 0   BORO_NM             object 
 1   CMPLNT_FR_DT        object 
 2   CMPLNT_FR_TM        object 
 3   KY_CD               int64  
 4   LAW_CAT_CD          object 
 5   LOC_OF_OCCUR_DESC   object 
 6   OFNS_DESC           object 
 7   PARKS_NM            object 
 8   PD_CD               int64  
 9   PREM_TYP_DESC       object 
 10  SUSP_AGE_GROUP      object 
 11  SUSP_RACE           object 
 12  SUSP_SEX            object 
 13  VIC_AGE_GROUP       object 
 14  VIC_RACE            object 
 15  VIC_SEX             object 
 16  Latitude            float64
 17  Longitude           float64
 18  BAR_DISTANCE        float64
 19  NIGHTCLUB_DISTANCE  float64
 20  ATM_DISTANCE        float64
 21  ATMS_COUNT          float64
 22  BARS_COUNT          float64
 23  BUS_STOPS_COUNT     float64
 24

# Load External PD Codes

Load the external Excel file containing mappings between PD codes and descriptions.

In [11]:
print("\n=== Loading External PD Codes ===")
try:
    if os.path.exists(pd_codes_file):
        df_codes = pd.read_excel(pd_codes_file)
        initial_rows = len(df_codes)
        print(f"PD Codes dataset loaded successfully: {initial_rows} rows and {df_codes.shape[1]} columns")
        print(f"Columns in the codes dataset: {df_codes.columns.tolist()}")
    else:
        raise FileNotFoundError(f"Could not find PD codes dataset at: {pd_codes_file}")
except Exception as e:
    print(f"Error loading PD codes dataset: {e}")
    # Instead of exit(1), print the error and continue
    print(f"Error details: {e}")


=== Loading External PD Codes ===
PD Codes dataset loaded successfully: 4671 rows and 5 columns
Columns in the codes dataset: ['PDCODE_VALUE', 'LAW_NYS', 'CATEGORY', 'LIT_LONG', 'LIT_SHORT']


# Feature Engineering

This section focuses on creating new features and refining existing ones based on domain knowledge and data characteristics.

## 1. Impute Missing `OFNS_DESC`

Fill missing offense descriptions (`OFNS_DESC`) using the mapping from `PD_CD` to `LIT_SHORT` found in the external codes file.

In [12]:
print("\n=== Feature Engineering: Imputing OFNS_DESC ===")
# Ensure both codes are of type string
df['PD_CD'] = df['PD_CD'].astype(str)
df_codes['PDCODE_VALUE'] = df_codes['PDCODE_VALUE'].astype(str)

# Create a mapping dictionary from the first occurrence of each code
code_to_lit_short = df_codes.drop_duplicates(subset='PDCODE_VALUE').set_index('PDCODE_VALUE')['LIT_SHORT'].to_dict()

# Fill OFNS_DESC where it's '(null)' using the mapping
null_mask = df['OFNS_DESC'] == '(null)'
initial_nulls = df['OFNS_DESC'].isin(['(null)', None, np.nan]).sum()
df.loc[null_mask, 'OFNS_DESC'] = (
    df.loc[null_mask, 'PD_CD'].map(code_to_lit_short)
    .str.strip()
    .fillna(df.loc[null_mask, 'OFNS_DESC']) # Keep original if map fails
)

# count missing after imputation
final_nulls = df['OFNS_DESC'].isin(['(null)', None, np.nan]).sum()

# print summary
print(f"OFNS_DESC missing before imputation: {initial_nulls}")
print(f"OFNS_DESC missing after imputation:  {final_nulls}")
print(f"Number of values filled: {initial_nulls - final_nulls}")


=== Feature Engineering: Imputing OFNS_DESC ===
OFNS_DESC missing before imputation: 44
OFNS_DESC missing after imputation:  0
Number of values filled: 44


## 2. Create Temporal Features

Extract time-based features from the complaint date and time:
- `HOUR`: Hour of the day (0-23)
- `WEEKDAY`: Name of the day (e.g., MONDAY)
- `IS_WEEKEND`: Binary flag (1 for Saturday/Sunday, 0 otherwise)
- `MONTH`: Month of the year (1-12)
- `SEASON`: Categorical season (WINTER, SPRING, SUMMER, AUTUMN)
- `TIME_BUCKET`: Categorical time of day (NIGHT, MORNING, AFTERNOON, EVENING)

In [13]:
print("\n=== Feature Engineering: Temporal Features ===")
# 1) Unified timestamp (temporary)
df['TIMESTAMP'] = pd.to_datetime(
    df['CMPLNT_FR_DT'] + ' ' + df['CMPLNT_FR_TM'],
    format='%Y/%m/%d %H:%M:%S.%f', # Adjusted format based on previous steps
    errors='coerce'
)

# Handle potential coercion errors (if any)
invalid_timestamps = df['TIMESTAMP'].isna().sum()
if invalid_timestamps > 0:
    print(f"Warning: {invalid_timestamps} rows had invalid date/time formats and resulted in NaT timestamps.")

# 2) Hour of the day
df['HOUR'] = df['TIMESTAMP'].dt.hour

# 3) Day
df['DAY'] = df['TIMESTAMP'].dt.day

# 4) Weekday
df['WEEKDAY'] = df['TIMESTAMP'].dt.day_name().str.upper()

# 5) Flag weekend (1 = Saturday/Sunday)
df['IS_WEEKEND'] = df['WEEKDAY'].isin(['SATURDAY', 'SUNDAY']).astype(int)

# 6) Month
df['MONTH'] = df['TIMESTAMP'].dt.month

# 7) Year
df['YEAR'] = df['TIMESTAMP'].dt.year

# 8) Season
def map_season(month):
    if pd.isna(month): return 'UNKNOWN' # Handle potential NaNs from NaT timestamps
    if month in [12, 1, 2]: return 'WINTER'
    elif month in [3, 4, 5]: return 'SPRING'
    elif month in [6, 7, 8]: return 'SUMMER'
    else: return 'AUTUMN'
df['SEASON'] = df['MONTH'].apply(map_season)

# 9) Time Bucket
def time_bucket(hour):
    if pd.isna(hour): return 'UNKNOWN' # Handle potential NaNs
    if hour < 6: return 'NIGHT'
    elif hour < 12: return 'MORNING'
    elif hour < 18: return 'AFTERNOON'
    else: return 'EVENING'
df['TIME_BUCKET'] = df['HOUR'].apply(time_bucket)

# Drop intermediate and original time columns
df = df.drop(columns=['CMPLNT_FR_TM', 'CMPLNT_FR_DT', 'TIMESTAMP'])
print("Created temporal features: HOUR, DAY, WEEKDAY, IS_WEEKEND, MONTH, YEAR, SEASON, TIME_BUCKET")
print(f"Columns after adding temporal features: {df.columns.tolist()}")


=== Feature Engineering: Temporal Features ===
Created temporal features: HOUR, DAY, WEEKDAY, IS_WEEKEND, MONTH, YEAR, SEASON, TIME_BUCKET
Columns after adding temporal features: ['BORO_NM', 'KY_CD', 'LAW_CAT_CD', 'LOC_OF_OCCUR_DESC', 'OFNS_DESC', 'PARKS_NM', 'PD_CD', 'PREM_TYP_DESC', 'SUSP_AGE_GROUP', 'SUSP_RACE', 'SUSP_SEX', 'VIC_AGE_GROUP', 'VIC_RACE', 'VIC_SEX', 'Latitude', 'Longitude', 'BAR_DISTANCE', 'NIGHTCLUB_DISTANCE', 'ATM_DISTANCE', 'ATMS_COUNT', 'BARS_COUNT', 'BUS_STOPS_COUNT', 'METROS_COUNT', 'NIGHTCLUBS_COUNT', 'SCHOOLS_COUNT', 'METRO_DISTANCE', 'MIN_POI_DISTANCE', 'AVG_POI_DISTANCE', 'MAX_POI_DISTANCE', 'TOTAL_POI_COUNT', 'POI_DIVERSITY', 'POI_DENSITY_SCORE', 'HOUR', 'DAY', 'WEEKDAY', 'IS_WEEKEND', 'MONTH', 'YEAR', 'SEASON', 'TIME_BUCKET']


## 2.1. Create Holiday and Payday Features

Create binary flags for holidays and assumed paydays to capture potential temporal patterns related to these events.

In [14]:
print("\n=== Feature Engineering: Holiday and Payday Features ===")
# Recreate a datetime series from YEAR, MONTH, DAY for holiday and payday checking
# Ensure YEAR, MONTH, DAY are present and are not NaN before attempting conversion
if 'YEAR' in df.columns and 'MONTH' in df.columns and 'DAY' in df.columns:
    # Create a temporary date column, handling potential errors by setting to NaT
    # This is crucial if YEAR, MONTH, DAY could form invalid dates (e.g., Feb 30)
    # However, pd.to_datetime will handle standard invalid dates by raising errors if not coerced.
    # For safety, let's ensure components are integer and handle NaNs that might arise from prior steps.
    date_components = df[['YEAR', 'MONTH', 'DAY']].dropna()
    df_dates = pd.to_datetime(date_components, errors='coerce')

    # Initialize US holidays
    us_holidays = holidays.US(years=df_dates.dt.year.unique())

    # Create IS_HOLIDAY column
    # Apply to the original df index to ensure alignment
    df['IS_HOLIDAY'] = 0
    df.loc[df_dates.index, 'IS_HOLIDAY'] = df_dates.dt.date.apply(lambda date: 1 if date in us_holidays else 0).astype(int)
    print(f"Created IS_HOLIDAY feature. {df['IS_HOLIDAY'].sum()} instances on a holiday.")

    # Create IS_PAYDAY column (assuming 1st and 15th of the month)
    df['IS_PAYDAY'] = 0
    df.loc[df_dates.index, 'IS_PAYDAY'] = df_dates.apply(lambda x: 1 if x.day == 1 or x.day == 15 else 0).astype(int)
    print(f"Created IS_PAYDAY feature. {df['IS_PAYDAY'].sum()} instances on an assumed payday.")
else:
    print("Warning: YEAR, MONTH, or DAY column not found. Skipping IS_HOLIDAY and IS_PAYDAY creation.")

print(f"Columns after adding holiday/payday features: {df.columns.tolist()}")


=== Feature Engineering: Holiday and Payday Features ===
Created IS_HOLIDAY feature. 82010 instances on a holiday.
Created IS_PAYDAY feature. 180576 instances on an assumed payday.
Columns after adding holiday/payday features: ['BORO_NM', 'KY_CD', 'LAW_CAT_CD', 'LOC_OF_OCCUR_DESC', 'OFNS_DESC', 'PARKS_NM', 'PD_CD', 'PREM_TYP_DESC', 'SUSP_AGE_GROUP', 'SUSP_RACE', 'SUSP_SEX', 'VIC_AGE_GROUP', 'VIC_RACE', 'VIC_SEX', 'Latitude', 'Longitude', 'BAR_DISTANCE', 'NIGHTCLUB_DISTANCE', 'ATM_DISTANCE', 'ATMS_COUNT', 'BARS_COUNT', 'BUS_STOPS_COUNT', 'METROS_COUNT', 'NIGHTCLUBS_COUNT', 'SCHOOLS_COUNT', 'METRO_DISTANCE', 'MIN_POI_DISTANCE', 'AVG_POI_DISTANCE', 'MAX_POI_DISTANCE', 'TOTAL_POI_COUNT', 'POI_DIVERSITY', 'POI_DENSITY_SCORE', 'HOUR', 'DAY', 'WEEKDAY', 'IS_WEEKEND', 'MONTH', 'YEAR', 'SEASON', 'TIME_BUCKET', 'IS_HOLIDAY', 'IS_PAYDAY']


## 3. Refine Location Features

Improve location-related features using imputation and rule-based logic:
- Impute missing `PREM_TYP_DESC` based on proximity to POIs (Bar, Nightclub, ATM, Metro) and offense type (`OFNS_DESC`).
- Group `PARKS_NM` entries into `PREM_TYP_DESC` as 'PARK/PLAYGROUND'.
- Impute missing `LOC_OF_OCCUR_DESC` based on `PREM_TYP_DESC` and `OFNS_DESC`.

### 3.1 Impute `PREM_TYP_DESC`

In [15]:
print("\n=== Feature Engineering: Refining Location Features - Impute PREM_TYP_DESC ===")
# 1) Identify rows where PREM_TYP_DESC is missing or placeholder
missing_prem_placeholders = ['(NULL)', None, np.nan, 'UNKNOWN'] # Include UNKNOWN from previous cleaning
mask_null_prem = df['PREM_TYP_DESC'].isin(missing_prem_placeholders)
initial_missing_prem = mask_null_prem.sum()
print(f"Initial missing/placeholder PREM_TYP_DESC: {initial_missing_prem}")

# 2) Compute nearest POI and its distance (helper columns)
poi_cols = ['BAR_DISTANCE', 'NIGHTCLUB_DISTANCE', 'ATM_DISTANCE', 'METRO_DISTANCE']
df['NEAREST_POI_DIST'] = df[poi_cols].min(axis=1)
df['NEAREST_POI_TYPE'] = df[poi_cols].idxmin(axis=1)

# Mapping from distance column name to PREM_TYP_DESC value
distance_to_prem = {
    'BAR_DISTANCE': 'BAR/NIGHT CLUB',
    'NIGHTCLUB_DISTANCE': 'BAR/NIGHT CLUB',
    'ATM_DISTANCE': 'ATM',
    'METRO_DISTANCE': 'TRANSIT - NYC SUBWAY'
}

# 3) Distance-based imputation: only if nearest POI is within 30 meters
mask_impute_by_dist = mask_null_prem & (df['NEAREST_POI_DIST'] <= 30)
df.loc[mask_impute_by_dist, 'PREM_TYP_DESC'] = df.loc[mask_impute_by_dist, 'NEAREST_POI_TYPE'].map(distance_to_prem)
imputed_by_dist_count = mask_impute_by_dist.sum()
print(f"Imputed {imputed_by_dist_count} PREM_TYP_DESC based on POI proximity (<= 30m).")

# Drop helper columns
df.drop(columns=['NEAREST_POI_DIST', 'NEAREST_POI_TYPE'], inplace=True)

# 4) Fallback using OFNS_DESC for street-related crimes
mask_still_null_prem = df['PREM_TYP_DESC'].isin(missing_prem_placeholders)
street_crime_keywords = ['BURGLARY', 'ROBBERY', 'ASSAULT', 'GRAND LARCENY', 'PETIT LARCENY', 'UNAUTHORIZED USE OF A VEHICLE']
mask_impute_by_offense = mask_still_null_prem & df['OFNS_DESC'].str.contains('|'.join(street_crime_keywords), na=False, case=False)
df.loc[mask_impute_by_offense, 'PREM_TYP_DESC'] = 'STREET'
imputed_by_offense_count = mask_impute_by_offense.sum()
print(f"Imputed {imputed_by_offense_count} PREM_TYP_DESC as 'STREET' based on OFNS_DESC.")

# 5) Final default for any remaining missing values
mask_final_null_prem = df['PREM_TYP_DESC'].isin(missing_prem_placeholders)
final_default_count = mask_final_null_prem.sum()
df.loc[mask_final_null_prem, 'PREM_TYP_DESC'] = 'OTHER'
print(f"Filled remaining {final_default_count} missing PREM_TYP_DESC with 'OTHER'.")

# Final check
final_missing_prem = df['PREM_TYP_DESC'].isin(missing_prem_placeholders).sum()
print(f"PREM_TYP_DESC missing/placeholders after imputation: {final_missing_prem}")


=== Feature Engineering: Refining Location Features - Impute PREM_TYP_DESC ===
Initial missing/placeholder PREM_TYP_DESC: 35590
Imputed 2225 PREM_TYP_DESC based on POI proximity (<= 30m).
Imputed 12816 PREM_TYP_DESC as 'STREET' based on OFNS_DESC.
Filled remaining 20549 missing PREM_TYP_DESC with 'OTHER'.
PREM_TYP_DESC missing/placeholders after imputation: 0


### 3.2 Consolidate Park Information

In [16]:
print("\n=== Feature Engineering: Refining Location Features - Consolidate Park Info ===")
# Group PARKS_NM into PREM_TYP_DESC
if 'PARKS_NM' in df.columns:
    park_mask = df['PARKS_NM'].notna() & (df['PARKS_NM'] != '(NULL)') & (df['PARKS_NM'] != 'UNKNOWN')
    park_update_count = park_mask.sum()
    df.loc[park_mask, 'PREM_TYP_DESC'] = 'PARK/PLAYGROUND'
    print(f"Updated {park_update_count} PREM_TYP_DESC entries to 'PARK/PLAYGROUND' based on PARKS_NM.")
    # Drop the original PARKS_NM column as it's now consolidated
    df.drop(columns=['PARKS_NM'], inplace=True)
    print("Dropped PARKS_NM column.")
else:
    print("PARKS_NM column not found, skipping consolidation.")


=== Feature Engineering: Refining Location Features - Consolidate Park Info ===
Updated 15739 PREM_TYP_DESC entries to 'PARK/PLAYGROUND' based on PARKS_NM.
Dropped PARKS_NM column.


### 3.3 Impute `LOC_OF_OCCUR_DESC`

In [17]:
print("\n=== Feature Engineering: Refining Location Features - Impute LOC_OF_OCCUR_DESC ===")
# 1) Identify missing/placeholder values
missing_loc_placeholders = ['(NULL)', None, np.nan, 'UNKNOWN']
mask_null_loc = df['LOC_OF_OCCUR_DESC'].isin(missing_loc_placeholders)
initial_missing_loc = mask_null_loc.sum()
print(f"Initial missing/placeholder LOC_OF_OCCUR_DESC: {initial_missing_loc}")

# 2) Rule-based mapping from PREM_TYP_DESC to LOC_OF_OCCUR_DESC
prem_to_loc = {
    # indoor locations
    'GROCERY/BODEGA':                     'INSIDE',
    'RESIDENCE - APT. HOUSE':             'INSIDE',
    'RESIDENCE-HOUSE':                    'INSIDE',
    'RESIDENCE - PUBLIC HOUSING':         'INSIDE',
    'DEPARTMENT STORE':                   'INSIDE',
    'CHAIN STORE':                        'INSIDE',
    'DRUG STORE':                         'INSIDE',
    'FOOD SUPERMARKET':                   'INSIDE',
    'COMMERCIAL BUILDING':                'INSIDE',
    'BANK':                               'INSIDE',
    'PUBLIC BUILDING':                    'INSIDE',
    'HOTEL/MOTEL':                        'INSIDE',
    'HOMELESS SHELTER':                   'INSIDE',
    'PUBLIC SCHOOL':                      'INSIDE',
    'PRIVATE/PAROCHIAL SCHOOL':           'INSIDE',
    'COLLEGE/UNIVERSITY':                 'INSIDE',
    'HOSPITAL':                           'INSIDE',
    'DOCTOR/DENTIST OFFICE':              'INSIDE',
    'GYM/FITNESS FACILITY':               'INSIDE',
    'BAR/NIGHT CLUB':                     'INSIDE',
    'RESTAURANT/DINER':                   'INSIDE',
    'FAST FOOD':                          'INSIDE',
    'DRY CLEANER/LAUNDRY':                'INSIDE',
    'BEAUTY & NAIL SALON':                'INSIDE',
    'CLOTHING/BOUTIQUE':                  'INSIDE',
    'JEWELRY':                            'INSIDE',
    'PHOTO/COPY':                         'INSIDE',
    'VIDEO STORE':                        'INSIDE',
    'STORE UNCLASSIFIED':                 'INSIDE',
    'SMALL MERCHANT':                     'INSIDE',
    'CANDY STORE':                        'INSIDE',
    'VARIETY STORE':                      'INSIDE',
    'SHOE':                               'INSIDE',
    'CHECK CASHING BUSINESS':             'INSIDE',
    'STORAGE FACILITY':                   'INSIDE',
    'REAL ESTATE':                        'INSIDE',
    'SOCIAL CLUB/POLICY':                 'INSIDE',
    'OTHER HOUSE OF WORSHIP':             'INSIDE',
    'CHURCH':                             'INSIDE',
    'SYNAGOGUE':                          'INSIDE',
    'MOSQUE':                             'INSIDE',
    'DAYCARE FACILITY':                   'INSIDE',
    'ABANDONED BUILDING':                 'INSIDE',
    'LOAN COMPANY':                       'INSIDE',
    'TAXI (YELLOW LICENSED)':             'INSIDE',
    'TAXI (LIVERY LICENSED)':             'INSIDE',
    'TAXI/LIVERY (UNLICENSED)':           'INSIDE',
    'BUS (NYC TRANSIT)':                  'INSIDE',
    'BUS (OTHER)':                        'INSIDE',
    'TRANSIT FACILITY (OTHER)':           'INSIDE',
    'AIRPORT TERMINAL':                   'INSIDE',
    'FERRY/FERRY TERMINAL':               'INSIDE',

    # outdoor or semi‐outdoor locations
    'STREET':                             'REAR',
    'HIGHWAY/PARKWAY':                    'REAR',
    'TUNNEL':                             'REAR',
    'PARK/PLAYGROUND':                    'REAR',
    'OPEN AREAS (OPEN LOTS)':             'REAR',
    'PARKING LOT/GARAGE (PUBLIC)':        'REAR',
    'PARKING LOT/GARAGE (PRIVATE)':       'REAR',
    'GAS STATION':                        'REAR',
    'ATM':                                'REAR',
    'BUS STOP':                           'REAR',
    'BUS TERMINAL':                       'REAR',
    'TRAMWAY':                            'REAR',
    'BRIDGE':                             'REAR',
    'MARINA/PIER':                        'REAR',
    'CEMETERY':                           'REAR',
    'CONSTRUCTION SITE':                  'REAR',
    'MOBILE FOOD':                        'REAR',
    'FERRY/FERRY TERMINAL':               'REAR',  # also treated as terminal interior above

    # catch-all for others
    'OTHER':                              'UNKNOWN',
    'STORE UNCLASSIFIED':                 'UNKNOWN',
    'TRANSIT - NYC SUBWAY':               'UNKNOWN' # Can be inside station, on platform, or on train
}

# Apply the mapping to fill missing LOC_OF_OCCUR_DESC
df.loc[mask_null_loc, 'LOC_OF_OCCUR_DESC'] = df.loc[mask_null_loc, 'PREM_TYP_DESC'].map(prem_to_loc)
imputed_by_prem_count = mask_null_loc.sum() - df['LOC_OF_OCCUR_DESC'].isin(missing_loc_placeholders).sum()
print(f"Imputed {imputed_by_prem_count} LOC_OF_OCCUR_DESC based on PREM_TYP_DESC mapping.")

# 3) Refinement using OFNS_DESC for potential inside crimes
mask_still_null_loc = df['LOC_OF_OCCUR_DESC'].isin(missing_loc_placeholders)
inside_crime_keywords = ['BURGLARY', 'TRESPASS', 'ROBBERY', 'ASSAULT'] # Keywords suggesting inside location
mask_impute_by_offense_loc = mask_still_null_loc & df['OFNS_DESC'].str.contains('|'.join(inside_crime_keywords), na=False, case=False)
df.loc[mask_impute_by_offense_loc, 'LOC_OF_OCCUR_DESC'] = 'INSIDE'
imputed_by_offense_loc_count = mask_impute_by_offense_loc.sum()
print(f"Imputed {imputed_by_offense_loc_count} LOC_OF_OCCUR_DESC as 'INSIDE' based on OFNS_DESC.")

# 4) Final default for any remaining missing values
mask_final_null_loc = df['LOC_OF_OCCUR_DESC'].isin(missing_loc_placeholders)
final_default_loc_count = mask_final_null_loc.sum()
df.loc[mask_final_null_loc, 'LOC_OF_OCCUR_DESC'] = 'UNKNOWN'
print(f"Filled remaining {final_default_loc_count} missing LOC_OF_OCCUR_DESC with 'UNKNOWN'.")

# Final check
final_missing_loc = df['LOC_OF_OCCUR_DESC'].isin(missing_loc_placeholders).sum()
print(f"LOC_OF_OCCUR_DESC missing/placeholders after imputation: {final_missing_loc}")


=== Feature Engineering: Refining Location Features - Impute LOC_OF_OCCUR_DESC ===
Initial missing/placeholder LOC_OF_OCCUR_DESC: 464374
Imputed 378151 LOC_OF_OCCUR_DESC based on PREM_TYP_DESC mapping.
Imputed 15240 LOC_OF_OCCUR_DESC as 'INSIDE' based on OFNS_DESC.
Filled remaining 70983 missing LOC_OF_OCCUR_DESC with 'UNKNOWN'.
LOC_OF_OCCUR_DESC missing/placeholders after imputation: 70983


## 5. Create Demographic Interaction Features

Generate features indicating whether the suspect and victim share the same demographic characteristics:
- `SAME_AGE_GROUP`: 1 if suspect and victim age groups match, 0 otherwise.
- `SAME_SEX`: 1 if suspect and victim sexes match, 0 otherwise.

In [18]:
print("\n=== Feature Engineering: Demographic Interaction Features ===")
# 1) SAME_AGE_GROUP: 1 if suspect and victim share the same age group, else 0
df['SAME_AGE_GROUP'] = 0
# Define valid (non-null, non-placeholder) age-group rows
valid_age_placeholders = ['UNKNOWN', '(NULL)', None, np.nan]
age_valid = (~df['SUSP_AGE_GROUP'].isin(valid_age_placeholders)) & (~df['VIC_AGE_GROUP'].isin(valid_age_placeholders))
# Set flag where age groups match
df.loc[age_valid & (df['SUSP_AGE_GROUP'] == df['VIC_AGE_GROUP']), 'SAME_AGE_GROUP'] = 1
print(f"Created SAME_AGE_GROUP feature. {df['SAME_AGE_GROUP'].sum()} instances where age groups match.")

# 2) SAME_SEX: 1 if suspect and victim have the same sex, else 0
df['SAME_SEX'] = 0
# Define valid (non-null, non-placeholder) sex rows
valid_sex_placeholders = ['U', 'UNKNOWN', '(NULL)', None, np.nan] # 'U' often means Unknown
sex_valid = (~df['SUSP_SEX'].isin(valid_sex_placeholders)) & (~df['VIC_SEX'].isin(valid_sex_placeholders))
# Set flag where sexes match
df.loc[sex_valid & (df['SUSP_SEX'] == df['VIC_SEX']), 'SAME_SEX'] = 1
print(f"Created SAME_SEX feature. {df['SAME_SEX'].sum()} instances where sexes match.")


=== Feature Engineering: Demographic Interaction Features ===
Created SAME_AGE_GROUP feature. 388596 instances where age groups match.
Created SAME_SEX feature. 515461 instances where sexes match.


In [19]:
print("\n=== Feature Engineering: TO_CHECK_CITIZENS Feature ===")
# List of offense descriptions that trigger the flag
to_check_citizens_list = [
    'CRIMINAL TRESPASS', 'CRIMINAL MISCHIEF & RELATED OF', 'HARRASSMENT 2',
    'GRAND LARCENY OF MOTOR VEHICLE', 'PETIT LARCENY', 'GRAND LARCENY',
    'INTOXICATED & IMPAIRED DRIVING', 'FRAUDS', 'THEFT-FRAUD',
    'OFF. AGNST PUB ORD SENSBLTY &', 'BURGLARY', 'ROBBERY', 'FELONY ASSAULT',
    'ASSAULT 3 & RELATED OFFENSES', 'DANGEROUS DRUGS', 'RAPE', 'SEX CRIMES',
    'DANGEROUS WEAPONS', 'ARSON', 'POSSESSION OF STOLEN PROPERTY',
    'UNAUTHORIZED USE OF A VEHICLE', 'FORGERY', 'ENDAN WELFARE INCOMP',
    'OTHER OFFENSES RELATED TO THEF', 'AGRICULTURE & MRKTS LAW-UNCLASSIFIED',
    'KIDNAPPING & RELATED OFFENSES', 'OTHER OFFENSES RELATED TO THEFT', # Duplicate, keep one
    'OFFENSES RELATED TO CHILDREN', 'BURGLAR\S TOOLS', 'ESCAPE 3',
    'CANNABIS RELATED OFFENSES', 'PETIT LARCENY OF MOTOR VEHICLE',
    'FELONY SEX CRIMES', 'PROSTITUTION & RELATED OFFENSES', 'JOSTLING',
    'KIDNAPPING', 'DISORDERLY CONDUCT', 'INTOXICATED/IMPAIRED DRIVING',
    'DISRUPTION OF A RELIGIOUS SERV', 'FRAUDULENT ACCOSTING', 'THEFT OF SERVICES',
    'UNRSNBLE NOISE', 'OTHER TRAFFIC INFRACTION', 'LOITERING/GAMBLING (CARDS, DIC',
    'UNLAWFUL POSS. WEAP. ON SCHOOL', 'FORTUNE TELLING', 'LOITERING',
    'FAIL RPT WOUNDS'
]
# Remove duplicates from the list
to_check_citizens_list = list(dict.fromkeys(to_check_citizens_list))

# Create the boolean column based on the condition
df['TO_CHECK_CITIZENS'] = df['OFNS_DESC'].isin(to_check_citizens_list).astype(int)
print(f"Created TO_CHECK_CITIZENS feature. {df['TO_CHECK_CITIZENS'].sum()} instances flagged.")


=== Feature Engineering: TO_CHECK_CITIZENS Feature ===
Created TO_CHECK_CITIZENS feature. 2270901 instances flagged.


In [20]:
print("\n=== Dropping Rows with NaN values ===")
initial_rows = len(df)
print(f"Initial number of rows: {initial_rows}")

# Drop rows that contain any NaN value
df_cleaned = df.dropna()

final_rows = len(df_cleaned)
rows_dropped = initial_rows - final_rows

print(f"Number of rows after dropping NaNs: {final_rows}")
print(f"Number of rows dropped: {rows_dropped}")

# Update the main DataFrame reference to the cleaned one
df = df_cleaned

print(f"Current DataFrame shape: {df.shape}")


=== Dropping Rows with NaN values ===
Initial number of rows: 2496759
Number of rows after dropping NaNs: 2496742
Number of rows dropped: 17
Current DataFrame shape: (2496742, 44)


In [21]:
print("\n=== Saving Feature Engineered Dataset (Before Scaling/PCA) ===")
# Ensure feature_engineering_dir and feature_engineered_file_path are defined

if 'df' in globals() and 'feature_engineered_file_path' in globals() and 'os' in globals():
    try:
        df.to_csv(feature_engineered_file_path, index=False)
        print(f"Feature engineered dataset (before scaling/PCA) saved successfully to: {feature_engineered_file_path}")
        print(f"Dataset shape: {df.shape}")
        print(f"Columns in the dataset: {df.columns.tolist()}")
    except NameError as ne:
        print(f"A NameError occurred while trying to save: {ne}. Ensure 'df' and path variables are correctly defined.")
    except Exception as e:
        print(f"Error saving feature engineered dataset (before scaling/PCA): {e}")
else:
    print("Error: 'df' or 'feature_engineered_file_path' or 'os' not found in globals. Cannot save the intermediate DataFrame.")


=== Saving Feature Engineered Dataset (Before Scaling/PCA) ===
Feature engineered dataset (before scaling/PCA) saved successfully to: C:\Users\ferdi\Documents\GitHub\crime-analyzer\JupyterOutputs\FeatureEngineered\feature_engineered_crime_data.csv
Dataset shape: (2496742, 44)
Columns in the dataset: ['BORO_NM', 'KY_CD', 'LAW_CAT_CD', 'LOC_OF_OCCUR_DESC', 'OFNS_DESC', 'PD_CD', 'PREM_TYP_DESC', 'SUSP_AGE_GROUP', 'SUSP_RACE', 'SUSP_SEX', 'VIC_AGE_GROUP', 'VIC_RACE', 'VIC_SEX', 'Latitude', 'Longitude', 'BAR_DISTANCE', 'NIGHTCLUB_DISTANCE', 'ATM_DISTANCE', 'ATMS_COUNT', 'BARS_COUNT', 'BUS_STOPS_COUNT', 'METROS_COUNT', 'NIGHTCLUBS_COUNT', 'SCHOOLS_COUNT', 'METRO_DISTANCE', 'MIN_POI_DISTANCE', 'AVG_POI_DISTANCE', 'MAX_POI_DISTANCE', 'TOTAL_POI_COUNT', 'POI_DIVERSITY', 'POI_DENSITY_SCORE', 'HOUR', 'DAY', 'WEEKDAY', 'IS_WEEKEND', 'MONTH', 'YEAR', 'SEASON', 'TIME_BUCKET', 'IS_HOLIDAY', 'IS_PAYDAY', 'SAME_AGE_GROUP', 'SAME_SEX', 'TO_CHECK_CITIZENS']


## Appendix: Feature Classification and Encoding

### Attribute Type Classification

Below is a classification of each column in the final feature-engineered dataset according to its data type and semantic meaning, *before* encoding and transformation. This helps guide the next steps in the Data Reduction and Transformation phase.

| Column                | Data Type                     | Details/Notes                                  | Transformation Needed | Encoding Type (if needed) |
|-----------------------|-------------------------------|------------------------------------------------|-----------------------|---------------------------|
| BORO_NM               | Categorical (nominal)         | Borough name                                   | Encoding              | One-Hot               |         |
| KY_CD            | Categorical (nominal)         | Crime classification code | Encoding              | One-Hot                   |       |
| LAW_CAT_CD            | Categorical (ordinal)         | Crime level (FELONY > MISDEMEANOR > VIOLATION) | Encoding              | Ordinal                   |       |
| LOC_OF_OCCUR_DESC     | Categorical (nominal)         | Location description (INSIDE, FRONT, etc.)     | Encoding              | One-Hot                   |
| OFNS_DESC             | Categorical (nominal)         | Offense description                            | Encoding              | One-Hot (High Cardinality)|
| PD_CD                 | Categorical (nominal)         | Detailed police code                           | Encoding              | One-Hot (High Cardinality)|
| PREM_TYP_DESC         | Categorical (nominal)         | Premises type description                      | Encoding              | One-Hot (High Cardinality)|
| SUSP_AGE_GROUP        | Categorical (ordinal)         | Age group (e.g., '<18', '18-24', 'UNKNOWN')    | Encoding              | Ordinal                   |
| SUSP_RACE             | Categorical (nominal)         | Suspect race                                   | Encoding              | One-Hot                   |
| SUSP_SEX              | Categorical (nominal)         | Suspect sex (M/F/U)                            | Encoding              | One-Hot                   |
| VIC_AGE_GROUP         | Categorical (ordinal)         | Age group (e.g., '<18', '18-24', 'UNKNOWN')    | Encoding              | Ordinal                   |
| VIC_RACE              | Categorical (nominal)         | Victim race                                    | Encoding              | One-Hot                   |
| VIC_SEX               | Categorical (nominal)         | Victim sex (M/F/D/E/U)                         | Encoding              | One-Hot                   |
| LATITUDE              | Numeric (continuous)          | Geographic coordinate                          | Scaling               | -                         |
| LONGITUDE             | Numeric (continuous)          | Geographic coordinate                          | Scaling               | -                         |
| BAR_DISTANCE          | Numeric (continuous)          | Distance to nearest bar (meters)               | Scaling               | -                         |
| NIGHTCLUB_DISTANCE    | Numeric (continuous)          | Distance to nearest nightclub (meters)         | Scaling               | -                         |
| ATM_DISTANCE          | Numeric (continuous)          | Distance to nearest ATM (meters)               | Scaling               | -                         |
| METRO_DISTANCE        | Numeric (continuous)          | Distance to nearest metro station (meters)     | Scaling               | -                         |
| ATMS_COUNT            | Numeric (discrete)            | Count of ATMs nearby                           | Scaling               | -                         |
| BARS_COUNT            | Numeric (discrete)            | Count of bars nearby                           | Scaling               | -                         |
| BUS_STOPS_COUNT       | Numeric (discrete)            | Count of bus stops nearby                      | Scaling               | -                         |
| METROS_COUNT | Numeric (discrete)            | Count of metro stations nearby                 | Scaling               | -                         |
| NIGHTCLUBS_COUNT      | Numeric (discrete)            | Count of nightclubs nearby                     | Scaling               | -                         |
| SCHOOLS_COUNT         | Numeric (discrete)            | Count of schools nearby                        | Scaling               | -                         |
| HOUR                  | Numeric (cyclical)            | Hour of the day (0-23)                         | Encoding              | Cyclical (Sine/Cosine)    |
| DAY                   | Numeric (cyclical)            | Day of the month (1-31)                        | Encoding              | Cyclical (Sine/Cosine)    |
| WEEKDAY               | Categorical (cyclical)        | Day of the week (MON-SUN)                      | Encoding              | Cyclical (Sine/Cosine)    |
| IS_WEEKEND            | Binary                        | 0 = weekday, 1 = weekend                       | None                  | -                         |
| MONTH                 | Numeric (cyclical)            | Month of the year (1-12)                       | Encoding              | Cyclical (Sine/Cosine)    |
| SEASON                | Categorical (nominal)         | Season (WINTER, SPRING, SUMMER, AUTUMN)        | Encoding              | One-Hot                   |
| TIME_BUCKET           | Categorical (nominal)         | Time of day (NIGHT, MORNING, AFTERNOON, EVENING) | Encoding              | One-Hot                   |
| YEAR                  | Numeric (discrete)            | Year of the event                              | Scaling               | -                         |
| IS_HOLIDAY            | Binary                        | 0 = not a holiday, 1 = holiday                 | None                  | -                         |
| IS_PAYDAY             | Binary                        | 0 = not a payday, 1 = payday (1st or 15th)     | None                  | -                         |
| SAME_AGE_GROUP        | Binary                        | 1 if suspect/victim age groups match, else 0   | None                  | -                         |
| SAME_SEX              | Binary                        | 1 if suspect/victim sexes match, else 0        | None                  | -                         |
| TO_CHECK_CITIZENS     | Binary                        | 1 if offense is relevant for citizens, else 0 | None                  | -                         |
| TOTAL_POI_COUNT       | Numeric (discrete)            | Sum of all POI counts  | Scaling               | -                         |
| POI_DIVERSITY         | Numeric (discrete)            | Number of different POI types present          | Scaling               | -                         |
| POI_DENSITY_SCORE      | Numeric (continuous)           | Density score                           | Scaling                  | -                             |
| MIN_POI_DISTANCE      | Numeric (continuous)          | Distance to closest POI of any type            | Scaling               | -                         |
| AVG_POI_DISTANCE      | Numeric (continuous)          | Average distance to POIs                       | Scaling               | -                         |
| MAX_POI_DISTANCE       | Numeric (continuous)           | Maximum distance to POIs                       | Scaling                  | -                             |

### Reasons for Cyclical/Ordinal Encoding

Cyclical and Ordinal encoding are valuable techniques for representing categorical features while preserving inherent relationships and limiting dimensionality increase compared to One-Hot encoding.
- **Cyclical Encoding:** This is particularly useful for features where the end of the sequence connects back to the beginning, such as time-related attributes (`HOUR`, `DAY`, `WEEKDAY`, `MONTH`). For example, hour 23 is conceptually close to hour 0, December is close to January. Cyclical encoding (often using sine and cosine transformations) captures this wrap-around nature, preventing the model from incorrectly interpreting these values as distant.
- **Ordinal Encoding:** This method is suitable for features with a clear, inherent order or ranking (`LAW_CAT_CD`, `SUSP_AGE_GROUP`, `VIC_AGE_GROUP`). Assigning sequential integers (e.g., 0, 1, 2...) allows the model to understand the relative magnitude or progression between categories (e.g., 'FELONY' > 'MISDEMEANOR', '<18' < '18-24').

### Reasons for One-Hot Encoding

One-Hot encoding is applied to nominal categorical features where there is no inherent order or ranking among the categories (`BORO_NM`, `KY_CD`, `LOC_OF_OCCUR_DESC`, `OFNS_DESC`, `PD_CD`, `PREM_TYP_DESC`, `SUSP_RACE`, `SUSP_SEX`, `VIC_RACE`, `VIC_SEX`, `SEASON`, `TIME_BUCKET`). It creates new binary (0 or 1) columns for each unique category in the original feature. This prevents the model from assuming any ordinal relationship between categories that doesn't exist. While it can significantly increase the number of features (especially for high-cardinality columns like `OFNS_DESC`), it ensures that each category is treated independently.

### Reasons for Scaling

Scaling is essential for numeric features (`Latitude`, `Longitude`, distance columns, count columns, `YEAR`) that have different ranges or units. Many machine learning algorithms (especially those based on distance calculations like k-NN or SVM, or those using gradient descent like linear regression and neural networks) are sensitive to the scale of input features. Features with larger values might disproportionately influence the model's learning process. Scaling methods like Standardization (Z-score normalization) or Min-Max scaling transform the data to a common scale (e.g., mean 0 and standard deviation 1, or range [0, 1]), ensuring that all features contribute more equally to the model's outcome.