# Setup

## Import Libraries and Define Paths

Import the necessary libraries (pandas, numpy, os, warnings) and define the paths for input (historical and current year datasets) and the merged output dataset. Also create the output directory if it does not exist.

In [None]:
import pandas as pd
import numpy as np
import os
import warnings
warnings.filterwarnings('ignore')

# Dataset locations (local environment).
# Everything lives under a "JupyterOutputs" folder two levels above the
# current working directory.
base_dir = os.path.abspath(
    os.path.join(os.getcwd(), os.pardir, os.pardir, "JupyterOutputs")
)

# Raw inputs: the current-year extract and the historic extract
raw_dir = os.path.join(base_dir, "Raw")
year_to_date_dataset = os.path.join(
    raw_dir, "NYPD_Complaint_Data_Current__Year_To_Date__20250410.csv"
)
historic_dataset = os.path.join(raw_dir, "NYPD_Complaint_Data_Historic_20250313.csv")

# Merged output: create its folder up front so the later save cannot fail on
# a missing directory (a no-op when the folder already exists)
merged_dir = os.path.join(base_dir, "Merged")
output_dataset = os.path.join(merged_dir, "NYPD_Complaints_Merged.csv")
os.makedirs(merged_dir, exist_ok=True)

# Data Loading and Merging

Load the two source CSV files, align their columns (adding NaN for columns missing from either file), concatenate them, convert the `CMPLNT_FR_DT` column to datetime, keep only records from 2020 onwards, and save the merged dataset. If merging fails, fall back to a previously saved merged file when one exists; otherwise raise an error.

In [3]:
# Build the working DataFrame `df` for the rest of the notebook.
#
# Primary path: read both raw CSVs, align their columns, concatenate, parse
# CMPLNT_FR_DT as datetime, keep rows dated 2020-01-01 or later, and save the
# result to `output_dataset`.
# Fallback path: if anything in the primary path raises, reload a previously
# saved merged CSV from `output_dataset` instead.
#
# Raises:
#   FileNotFoundError: a raw input is missing, or merging failed and no
#       pre-merged file exists.
#   RuntimeError: merging failed and the pre-merged file could not be read.
print("=== Data Loading and Merging ===")
print(f"Loading historic dataset from {historic_dataset}...")
print(f"Loading year-to-date dataset from {year_to_date_dataset}...")

# Check if datasets exist
historic_exists = os.path.exists(historic_dataset)
ytd_exists = os.path.exists(year_to_date_dataset)

df = None  # Assigned by exactly one of the branches below

if historic_exists and ytd_exists:
    # Load the datasets
    try:
        historic_df = pd.read_csv(historic_dataset)
        year_to_date_df = pd.read_csv(year_to_date_dataset)

        print(f"Historic dataset shape: {historic_df.shape}")
        print(f"Year-to-date dataset shape: {year_to_date_df.shape}")

        print("Year-to-date columns:")
        print(year_to_date_df.columns.tolist())

        print("Historic columns:")
        print(historic_df.columns.tolist())

        # Align columns: reindex both frames on the union of their column
        # names, so a column present in only one file becomes all-NaN in the other
        print("Aligning columns between datasets...")
        common_columns = year_to_date_df.columns.union(historic_df.columns)
        historic_df_aligned = historic_df.reindex(columns=common_columns)
        year_to_date_df_aligned = year_to_date_df.reindex(columns=common_columns)

        # Merge the two datasets
        print("Merging datasets...")
        merged_df = pd.concat([historic_df_aligned, year_to_date_df_aligned], ignore_index=True)

        # Convert CMPLNT_FR_DT to datetime; unparseable values become NaT
        print("Converting CMPLNT_FR_DT to datetime...")
        merged_df['CMPLNT_FR_DT'] = pd.to_datetime(merged_df['CMPLNT_FR_DT'], errors='coerce')

        # Filter out rows before 2020 (NaT rows compare False and are dropped too)
        print("Filtering data from 2020 onwards...")
        merged_df = merged_df[merged_df['CMPLNT_FR_DT'] >= '2020-01-01']

        # Save the merged dataset
        print(f"Saving merged and filtered dataset to {output_dataset}...")
        merged_df.to_csv(output_dataset, index=False)

        print(f"Merge and filtering completed. Total rows from 2020 onwards: {merged_df.shape[0]}")
        print(f"Merged dataset saved to {output_dataset}")

        # Hand the merged frame to the following cells
        df = merged_df
    except Exception as e:
        print(f"Error during dataset merging: {e}")
        print("Attempting to load the pre-merged dataset instead...")
        # Try to load the pre-merged dataset as fallback
        if os.path.exists(output_dataset):
            try:
                df = pd.read_csv(output_dataset)
                # A CSV round-trip stores dates as text. Re-parse the column so
                # the fallback frame has the same datetime dtype as the primary
                # path and downstream cells behave identically on both.
                if 'CMPLNT_FR_DT' in df.columns:
                    df['CMPLNT_FR_DT'] = pd.to_datetime(df['CMPLNT_FR_DT'], errors='coerce')
                print(f"Pre-merged dataset loaded successfully: {df.shape[0]} rows and {df.shape[1]} columns")
            except Exception as load_err:
                print(f"Error loading pre-merged dataset {output_dataset}: {load_err}")
                raise RuntimeError(f"Could not load any dataset: {load_err}") from load_err
        else:
            print(f"Error: Could not load pre-merged dataset {output_dataset}")
            # Chain the original merge error so the root cause stays visible
            raise FileNotFoundError("No datasets available for processing") from e
else:
    print("One or both datasets do not exist.")
    if not historic_exists:
        print(f"Historic dataset not found at: {historic_dataset}")
    if not ytd_exists:
        print(f"Year-to-date dataset not found at: {year_to_date_dataset}")
    raise FileNotFoundError("Required datasets are missing. Please check the file paths.")

# Defensive final check: every branch above either assigns df or raises, so
# this only fires if the logic above is changed incorrectly.
if df is None:
    print("Error: DataFrame 'df' could not be loaded or created.")
    raise RuntimeError("DataFrame loading failed")

=== Data Loading and Merging ===
Loading historic dataset from C:\Users\ferdi\Documents\GitHub\crime-analyzer\JupyterOutputs\Raw\NYPD_Complaint_Data_Historic_20250313.csv...
Loading year-to-date dataset from C:\Users\ferdi\Documents\GitHub\crime-analyzer\JupyterOutputs\Raw\NYPD_Complaint_Data_Current__Year_To_Date__20250410.csv...
Historic dataset shape: (8914838, 35)
Year-to-date dataset shape: (577108, 36)
Year-to-date columns:
['CMPLNT_NUM', 'ADDR_PCT_CD', 'BORO_NM', 'CMPLNT_FR_DT', 'CMPLNT_FR_TM', 'CMPLNT_TO_DT', 'CMPLNT_TO_TM', 'CRM_ATPT_CPTD_CD', 'HADEVELOPT', 'HOUSING_PSA', 'JURISDICTION_CODE', 'JURIS_DESC', 'KY_CD', 'LAW_CAT_CD', 'LOC_OF_OCCUR_DESC', 'OFNS_DESC', 'PARKS_NM', 'PATROL_BORO', 'PD_CD', 'PD_DESC', 'PREM_TYP_DESC', 'RPT_DT', 'STATION_NAME', 'SUSP_AGE_GROUP', 'SUSP_RACE', 'SUSP_SEX', 'TRANSIT_DISTRICT', 'VIC_AGE_GROUP', 'VIC_RACE', 'VIC_SEX', 'X_COORD_CD', 'Y_COORD_CD', 'Latitude', 'Longitude', 'Lat_Lon', 'New Georeferenced Column']
Historic columns:
['CMPLNT_NUM', 'C

# Data Cleaning - Initial Overview

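Display basic information and the list of columns in the loaded (merged and filtered) dataset before cleaning. Check for exact duplicate rows and remove any that are found, then report the date range and the missing-value counts for up to ten columns with the most missing values.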

In [4]:
print("\n=== Data Preprocessing - Initial Overview ===")

# Size and schema of the frame as it arrives from the loading step
print(f"Dataset loaded successfully: {df.shape[0]} rows and {df.shape[1]} columns.")
print(f"Columns in the original dataset: {df.columns.tolist()}")

print("\n=== Data Quality Checks ===")

# Exact duplicate rows: count them first, and only rebuild the frame when
# there is actually something to drop
duplicates = df.duplicated().sum()
if duplicates == 0:
    print("No duplicate rows found.")
else:
    duplicate_share = duplicates/len(df)*100
    print(f"Found {duplicates} duplicate rows ({duplicate_share:.2f}% of data)")
    print("Removing duplicate rows...")
    df = df.drop_duplicates()
    print(f"After removing duplicates: {df.shape[0]} rows")

# Span of complaint start dates covered by the data
earliest_date = df['CMPLNT_FR_DT'].min()
latest_date = df['CMPLNT_FR_DT'].max()
print(f"Date range: {earliest_date} to {latest_date}")

# Missing-value counts, restricted to columns that have any, largest first
print("Missing values per column:")
missing_summary = (
    df.isnull()
    .sum()
    .loc[lambda counts: counts > 0]
    .sort_values(ascending=False)
)
if missing_summary.empty:
    print("  No missing values found.")
else:
    # Only the ten worst columns are listed to keep the output short
    for col, count in missing_summary.iloc[:10].items():
        print(f"  {col}: {count} ({count/len(df)*100:.1f}%)")



=== Data Preprocessing - Initial Overview ===
Dataset loaded successfully: 2512541 rows and 36 columns.
Columns in the original dataset: ['ADDR_PCT_CD', 'BORO_NM', 'CMPLNT_FR_DT', 'CMPLNT_FR_TM', 'CMPLNT_NUM', 'CMPLNT_TO_DT', 'CMPLNT_TO_TM', 'CRM_ATPT_CPTD_CD', 'HADEVELOPT', 'HOUSING_PSA', 'JURISDICTION_CODE', 'JURIS_DESC', 'KY_CD', 'LAW_CAT_CD', 'LOC_OF_OCCUR_DESC', 'Lat_Lon', 'Latitude', 'Longitude', 'New Georeferenced Column', 'OFNS_DESC', 'PARKS_NM', 'PATROL_BORO', 'PD_CD', 'PD_DESC', 'PREM_TYP_DESC', 'RPT_DT', 'STATION_NAME', 'SUSP_AGE_GROUP', 'SUSP_RACE', 'SUSP_SEX', 'TRANSIT_DISTRICT', 'VIC_AGE_GROUP', 'VIC_RACE', 'VIC_SEX', 'X_COORD_CD', 'Y_COORD_CD']

=== Data Quality Checks ===
No duplicate rows found.
Date range: 2020-01-01 00:00:00 to 2024-12-31 00:00:00
Missing values per column:
  TRANSIT_DISTRICT: 2443447 (97.3%)
  New Georeferenced Column: 1936566 (77.1%)
  HOUSING_PSA: 540598 (21.5%)
  CMPLNT_TO_DT: 180993 (7.2%)
  PD_CD: 2090 (0.1%)
  ADDR_PCT_CD: 97 (0.0%)
  Lat_Lon

# Data Cleaning - Column Removal

Remove specified irrelevant, redundant, or non-useful columns from the DataFrame to reduce dimensionality and noise. A warning is printed for any listed column that is not present in the dataset.

In [5]:
# Columns to drop from the merged frame, one per line for easy editing
columns_to_remove = [
    'CMPLNT_NUM',
    'ADDR_PCT_CD',
    'CMPLNT_TO_DT',
    'CMPLNT_TO_TM',
    'CRM_ATPT_CPTD_CD',
    'HADEVELOPT',
    'HOUSING_PSA',
    'JURISDICTION_CODE',
    'JURIS_DESC',
    'PATROL_BORO',
    'PD_DESC',
    'RPT_DT',
    'STATION_NAME',
    'TRANSIT_DISTRICT',
    'X_COORD_CD',
    'Y_COORD_CD',
    'Lat_Lon',
    'New Georeferenced Column',
]

print(f"Attempting to remove columns: {columns_to_remove}")

# Partition the request into columns the frame has and columns it lacks,
# keeping the order of the request list in both results
existing_columns = []
missing_columns = []
for requested in columns_to_remove:
    if requested in df.columns:
        existing_columns.append(requested)
    else:
        missing_columns.append(requested)

if len(missing_columns) > 0:
    print(f"Warning: the following columns intended for removal are not present in the dataset: {missing_columns}")

# Drop only the columns that are really there; `df` itself is left untouched
df_cleaned = df.drop(columns=existing_columns, errors='ignore')
print(f"Removed columns: {existing_columns}")

# Shape and schema of the result
print(f"Final cleaned dataset: {len(df_cleaned)} rows and {len(df_cleaned.columns)} columns")
print(f"Columns in the final dataset: {df_cleaned.columns.tolist()}")

# Preview of the first five rows
print("\n=== First rows of the final cleaned dataset ===")
print(df_cleaned.head(5))


Attempting to remove columns: ['CMPLNT_NUM', 'ADDR_PCT_CD', 'CMPLNT_TO_DT', 'CMPLNT_TO_TM', 'CRM_ATPT_CPTD_CD', 'HADEVELOPT', 'HOUSING_PSA', 'JURISDICTION_CODE', 'JURIS_DESC', 'PATROL_BORO', 'PD_DESC', 'RPT_DT', 'STATION_NAME', 'TRANSIT_DISTRICT', 'X_COORD_CD', 'Y_COORD_CD', 'Lat_Lon', 'New Georeferenced Column']
Removed columns: ['CMPLNT_NUM', 'ADDR_PCT_CD', 'CMPLNT_TO_DT', 'CMPLNT_TO_TM', 'CRM_ATPT_CPTD_CD', 'HADEVELOPT', 'HOUSING_PSA', 'JURISDICTION_CODE', 'JURIS_DESC', 'PATROL_BORO', 'PD_DESC', 'RPT_DT', 'STATION_NAME', 'TRANSIT_DISTRICT', 'X_COORD_CD', 'Y_COORD_CD', 'Lat_Lon', 'New Georeferenced Column']
Final cleaned dataset: 2512541 rows and 18 columns
Columns in the final dataset: ['BORO_NM', 'CMPLNT_FR_DT', 'CMPLNT_FR_TM', 'KY_CD', 'LAW_CAT_CD', 'LOC_OF_OCCUR_DESC', 'Latitude', 'Longitude', 'OFNS_DESC', 'PARKS_NM', 'PD_CD', 'PREM_TYP_DESC', 'SUSP_AGE_GROUP', 'SUSP_RACE', 'SUSP_SEX', 'VIC_AGE_GROUP', 'VIC_RACE', 'VIC_SEX']

=== First rows of the final cleaned dataset ===
     B

# Save Cleaned Dataset

Save the cleaned dataset (with selected columns removed) to the 'PrePreProcessed' directory. Ensures consistency with the global pipeline and provides a standardized input for downstream processing.

In [None]:
# Destination of the cleaned frame: <base_dir>/PrePreProcessed/cleaned_crime_data.csv
output_dir = os.path.join(base_dir, "PrePreProcessed")
cleaned_file_path = os.path.join(output_dir, "cleaned_crime_data.csv")

# Create the folder on first run; a no-op when it already exists
os.makedirs(output_dir, exist_ok=True)
print(f"Saving to output directory: {output_dir}")

# Write without the index so the file holds only the data columns
df_cleaned.to_csv(path_or_buf=cleaned_file_path, index=False)
print(f"Cleaned dataset saved to: {cleaned_file_path}")

print("\n=== Preprocessing completed ===")

Saving to output directory: C:\Users\ferdi\Documents\GitHub\crime-analyzer\JupyterOutputs\PrePreProcessed
Cleaned dataset saved to: C:\Users\ferdi\Documents\GitHub\crime-analyzer\JupyterOutputs\PrePreProcessed\cleaned_crime_data.csv

=== Preprocessing completed ===
