# Setup

This section covers the initial setup, including library imports, path definitions, and mounting Google Drive.

## Mount Google Drive
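Mount Google Drive to access files stored there. This step is only needed when the notebook runs on Google Colab; the cell below is commented out for local runs.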

In [1]:
# from google.colab import drive
# drive.mount('/drive', force_remount=True)

## Import Libraries
Import necessary libraries for data manipulation, file operations, and warnings management.

In [2]:
import pandas as pd
import numpy as np
import os
import warnings
warnings.filterwarnings('ignore')

## Define Paths
Define the base directory and the path of the input file (the cleaned crime dataset). The output paths for the integrated data are defined later, in the Data Integration section.

In [3]:
# Path variables
# The output root can be overridden with the CRIME_ANALYZER_OUTPUT_DIR environment
# variable (for example "/drive/MyDrive/Data Mining and Machine Learning/Progetto"
# on Colab). When the variable is unset, the original local path is used, so existing
# local runs are unaffected.
base_dir = os.environ.get(
    "CRIME_ANALYZER_OUTPUT_DIR",
    r"C:\Users\ferdi\Documents\GitHub\crime-analyzer\JupyterOutputs",
)
processed_dir = os.path.join(base_dir, "Processed")
cleaned_data_file = os.path.join(processed_dir, "cleaned_crime_data_processed.csv")
# Make sure the Processed folder exists (no error if it is already there).
os.makedirs(processed_dir, exist_ok=True)

# Load Cleaned Data

Load the cleaned crime dataset and the cleaned POI dataset produced by the Data Cleaning phase.

## Load Dataset


In [4]:
# Load dataset
# Reads the cleaned crime CSV into `df` and records its row count in `initial_rows`.
# Any failure (missing file, empty frame, too few columns, read error) is reported and
# re-raised as RuntimeError so the notebook stops instead of continuing without data.
print("=== Loading Processed Data ===")
try:
    # Guard first: report a missing file with an explicit message.
    if not os.path.exists(cleaned_data_file):
        raise FileNotFoundError(f"Could not find cleaned dataset at: {cleaned_data_file}")

    df = pd.read_csv(cleaned_data_file)
    initial_rows = len(df)
    print(f"Dataset loaded successfully: {initial_rows} rows and {df.shape[1]} columns")
    print(f"Columns in the dataset: {df.columns.tolist()}")

    # Basic data validation
    if initial_rows == 0:
        raise ValueError("Dataset is empty")
    if df.shape[1] < 5:
        raise ValueError("Dataset has too few columns")
except Exception as e:
    print(f"Error loading dataset: {e}")
    # Chain explicitly so the traceback shows the original failure as the direct cause.
    raise RuntimeError(f"Failed to load required dataset: {e}") from e

# Display basic dataset overview
print("\n=== Dataset Overview ===")
# DataFrame.info() prints its report itself and returns None; wrapping it in print()
# would add a stray "None" line to the output.
df.info()
print("\n=== Summary Statistics ===")
print(df.describe())
print("\n=== Sample Data ===")
# Cap the sample size at the number of rows so small frames do not raise.
print(df.sample(min(5, len(df))))

=== Loading Processed Data ===
Dataset loaded successfully: 2496759 rows and 18 columns
Columns in the dataset: ['BORO_NM', 'CMPLNT_FR_DT', 'CMPLNT_FR_TM', 'KY_CD', 'LAW_CAT_CD', 'LOC_OF_OCCUR_DESC', 'Latitude', 'Longitude', 'OFNS_DESC', 'PARKS_NM', 'PD_CD', 'PREM_TYP_DESC', 'SUSP_AGE_GROUP', 'SUSP_RACE', 'SUSP_SEX', 'VIC_AGE_GROUP', 'VIC_RACE', 'VIC_SEX']

=== Dataset Overview ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2496759 entries, 0 to 2496758
Data columns (total 18 columns):
 #   Column             Dtype  
---  ------             -----  
 0   BORO_NM            object 
 1   CMPLNT_FR_DT       object 
 2   CMPLNT_FR_TM       object 
 3   KY_CD              int64  
 4   LAW_CAT_CD         object 
 5   LOC_OF_OCCUR_DESC  object 
 6   Latitude           float64
 7   Longitude          float64
 8   OFNS_DESC          object 
 9   PARKS_NM           object 
 10  PD_CD              float64
 11  PREM_TYP_DESC      object 
 12  SUSP_AGE_GROUP     object 
 13  SUSP_RACE        

# Data Integration – Enrichment with OpenStreetMap POI

A **data integration task** was carried out to enrich the original dataset of crime records with **contextual geographic information** retrieved from **OpenStreetMap (OSM)**. The goal was to enhance the dataset with spatial features related to urban structure, mobility, and social activity, which are potentially correlated with criminal activity.

The integration was performed in QGIS through the following steps:

####  1. Construction of a spatial grid
A **100m x 100m regular grid** was generated over the entire study area using a projected CRS (EPSG:32618 – UTM zone 18N), to allow metric spatial operations. Each cell acts as a local unit of spatial context for aggregation.

####  2. Extraction of Points of Interest (POIs) from OSM
Using the **QuickOSM plugin**, several types of POIs relevant to criminological analysis were downloaded and filtered:
- Bars (`amenity=bar`)
- Nightclubs (`amenity=nightclub`)
- ATMs (`amenity=atm`)
- Bus stops (`highway=bus_stop`)
- Metro and train stations (`railway=station`, `public_transport=station`)
- Schools (`amenity=school`)

#### 3. Spatial integration and feature generation
For each crime record, two types of features were computed:

- **Distance features**:  
  Euclidean distance (in meters) from the crime location to the **nearest POI** of a given category, using Nearest Neighbor Join (`NNJoin` plugin).  
  These features are:
  - `bar_distance`
  - `nightclubs_distance`
  - `atm_distance`
  - `metro_distance`

- **Density/count features**:  
  Number of POIs of a given type **within the 100x100m grid cell** where the crime occurred. Computed using `Count points in polygon` and spatial joins (`Join attributes by location`).  
  These features are:
  - `bars_count`
  - `nightclubs_count`
  - `atms_count`
  - `bus_stops_count`
  - `metro_and_trains_count`
  - `schools_count`

#### Purpose of this integration
These contextual features are expected to improve downstream analysis tasks such as:
- **Clustering of crime patterns**
- **Crime type classification**
- **Hotspot detection**
- **Interpretability of spatial models**

By embedding urban environmental data directly into the crime dataset, this enrichment step helps move from raw geospatial points to **semantically informed locations**.

In [6]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import re
from datetime import datetime
from collections import Counter
warnings.filterwarnings('ignore')

# Enhanced spatial data quality validation functions
def validate_spatial_data(df):
    """
    Validate spatial data quality for crime dataset.

    The coordinate columns are located by name. An exact, case-insensitive match
    ('latitude'/'lat' and 'longitude'/'lon'/'long') is preferred; only when no exact
    match exists does the function fall back to the first column whose lower-cased
    name contains 'lat' / 'lon'. Preferring the exact name keeps an unrelated column
    such as 'translation' or 'plate' from being mistaken for the latitude.

    Args:
        df: DataFrame expected to hold one latitude and one longitude column.

    Returns:
        dict with the keys 'passed', 'warnings' and 'errors', each a list of
        human-readable messages. 'errors' contains a single message (and the other
        checks are skipped) when a coordinate column cannot be found.
    """
    validation_results = {'passed': [], 'warnings': [], 'errors': []}

    def _find_coordinate_column(exact_names, fragment):
        """Return the best-matching column label, or None when nothing matches."""
        by_lower_name = {}
        for col in df.columns:
            # setdefault keeps the first column when two names differ only by case
            by_lower_name.setdefault(str(col).lower(), col)
        for name in exact_names:
            if name in by_lower_name:
                return by_lower_name[name]
        # Fallback: original substring heuristic, first match wins
        for col in df.columns:
            if fragment in str(col).lower():
                return col
        return None

    # Check for coordinate columns
    lat_col = _find_coordinate_column(('latitude', 'lat'), 'lat')
    lon_col = _find_coordinate_column(('longitude', 'lon', 'long'), 'lon')

    if lat_col is None or lon_col is None:
        validation_results['errors'].append("Missing latitude or longitude columns")
        return validation_results

    # Validate coordinate ranges (NYC bounds approximately)
    valid_lat_range = (40.4, 41.0)
    valid_lon_range = (-74.3, -73.7)

    # NaN compares False on both sides, so missing values are not counted here;
    # they are reported by the missing-coordinates check below.
    invalid_lat = ((df[lat_col] < valid_lat_range[0]) | (df[lat_col] > valid_lat_range[1])).sum()
    invalid_lon = ((df[lon_col] < valid_lon_range[0]) | (df[lon_col] > valid_lon_range[1])).sum()

    if invalid_lat > 0:
        validation_results['warnings'].append(f"{invalid_lat} records with latitude outside NYC bounds")
    else:
        validation_results['passed'].append("No latitude values outside NYC bounds")
    if invalid_lon > 0:
        validation_results['warnings'].append(f"{invalid_lon} records with longitude outside NYC bounds")
    else:
        validation_results['passed'].append("No longitude values outside NYC bounds")

    # Check for missing coordinates (a row counts once even if both values are missing)
    missing_coords = df[[lat_col, lon_col]].isna().any(axis=1).sum()
    if missing_coords > 0:
        validation_results['warnings'].append(f"{missing_coords} records with missing coordinates")
    else:
        validation_results['passed'].append("No missing coordinates")

    return validation_results

def validate_poi_features(df):
    """
    Check the POI enrichment columns of the integrated dataset.

    For both feature kinds ('distance' and 'count') the function
      * adds a warning listing the expected columns that are absent, and
      * adds an error for every present column that contains negative values.

    Args:
        df: DataFrame to inspect.

    Returns:
        dict with the keys 'passed', 'warnings' and 'errors' (lists of messages).
        'passed' is returned empty by this function.
    """
    report = {'passed': [], 'warnings': [], 'errors': []}

    # Expected POI columns per feature kind; the tuple order fixes the message order
    expected_by_kind = (
        ('distance', ['atm_distance', 'bar_distance', 'nightclubs_distance', 'metro_distance']),
        ('count', ['atms_count', 'bars_count', 'nightclubs_count', 'metro_and_trains_count',
                   'bus_stops_count', 'schools_count']),
    )

    # Presence check: one warning per kind, naming every absent column
    for kind, expected in expected_by_kind:
        absent = [name for name in expected if name not in df.columns]
        if absent:
            report['warnings'].append(f"Missing {kind} columns: {absent}")

    # Value check: distances and counts must both be non-negative
    for kind, expected in expected_by_kind:
        for name in expected:
            if name not in df.columns:
                continue
            n_negative = (df[name] < 0).sum()
            if n_negative > 0:
                report['errors'].append(f"Column {name} has {n_negative} negative {kind} values")

    return report

# Path variables and data loading
# The output root can be overridden with the CRIME_ANALYZER_OUTPUT_DIR environment
# variable (for example "/drive/MyDrive/Data Mining and Machine Learning/Progetto"
# on Colab). When the variable is unset, the original local path is used.
base_dir = os.environ.get(
    "CRIME_ANALYZER_OUTPUT_DIR",
    r"C:\Users\ferdi\Documents\GitHub\crime-analyzer\JupyterOutputs",
)
integrated_dir = os.path.join(base_dir, "DataIntegrated")
integrated_data_file = os.path.join(integrated_dir, "integrated_crime_data.csv")
# Make sure the DataIntegrated folder exists (no error if it is already there).
os.makedirs(integrated_dir, exist_ok=True)

# Load integrated dataset
# NOTE: this rebinds `df` and `initial_rows`, which the following cells read.
print("=== Loading Integrated Data ===")
try:
    # Guard first: report a missing file with an explicit message.
    if not os.path.exists(integrated_data_file):
        raise FileNotFoundError(f"Could not find integrated dataset at: {integrated_data_file}")

    df = pd.read_csv(integrated_data_file)
    initial_rows = len(df)
    print(f"Integrated dataset loaded: {initial_rows} rows and {df.shape[1]} columns")
    print(f"Columns: {df.columns.tolist()}")

    # Validate data quality
    if initial_rows == 0:
        raise ValueError("Integrated dataset is empty")
except Exception as e:
    print(f"Error loading integrated dataset: {e}")
    # Chain explicitly so the traceback shows the original failure as the direct cause.
    raise RuntimeError(f"Failed to load integrated dataset: {e}") from e

# Perform spatial data validation
print("\n=== Spatial Data Quality Validation ===")
spatial_validation = validate_spatial_data(df)
poi_validation = validate_poi_features(df)

# Display validation results
for validation_name, results in [('Spatial', spatial_validation), ('POI', poi_validation)]:
    print(f"\n--- {validation_name} Validation Results ---")
    if results['errors']:
        print("ERRORS:")
        for error in results['errors']:
            print(f"  {error}")
    if results['warnings']:
        print("WARNINGS:")
        for warning in results['warnings']:
            print(f"  {warning}")
    if not results['errors'] and not results['warnings']:
        print("All checks passed successfully.")

# Display dataset overview
print("\n=== Integrated Dataset Overview ===")
# DataFrame.info() prints its report itself and returns None; wrapping it in print()
# would add a stray "None" line to the output.
df.info()
print("\n=== Summary Statistics ===")
print(df.describe())
print("\n=== Sample Data ===")
# Cap the sample size at the number of rows so small frames do not raise.
print(df.sample(min(3, len(df))))

=== Loading Integrated Data ===
Integrated dataset loaded: 2496759 rows and 32 columns
Columns: ['fid', 'BORO_NM', 'CMPLNT_FR_', 'CMPLNT_F_1', 'KY_CD', 'LAW_CAT_CD', 'LOC_OF_OCC', 'OFNS_DESC', 'PARKS_NM', 'PD_CD', 'PREM_TYP_D', 'SUSP_AGE_G', 'SUSP_RACE', 'SUSP_SEX', 'VIC_AGE_GR', 'VIC_RACE', 'VIC_SEX', 'Latitude', 'Longitude', 'bar_distance', 'nightclubs_distance', 'atm_distance', 'atms_count', 'bars_count', 'bus_stops_count', 'metro_and_trains_count', 'nightclubs_count', 'schools_count', 'join_fid', 'join_full_id', 'join_osm_id', 'metro_distance']

=== Spatial Data Quality Validation ===

--- Spatial Validation Results ---
  17 records with latitude outside NYC bounds
  17 records with longitude outside NYC bounds

--- POI Validation Results ---
All checks passed successfully.

=== Integrated Dataset Overview ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2496759 entries, 0 to 2496758
Data columns (total 32 columns):
 #   Column                  Dtype  
---  ------             

## Data Processing and Feature Engineering

### 1. Intelligent Column Detection
- **Existing column verification**: Only processes columns that actually exist in the dataset
- **Safe column removal**: Only removes technical columns if they are present
- **Robust POI detection**: Automatically finds POI-related columns by looking for `count` or `distance` in the column name (case-insensitive)

### 2. Basic Feature Engineering
- **Distance aggregations**: MIN_POI_DISTANCE, AVG_POI_DISTANCE, MAX_POI_DISTANCE (only if distance columns exist)
- **Density metrics**: TOTAL_POI_COUNT, POI_DIVERSITY, POI_DENSITY_SCORE (TOTAL_POI_COUNT divided by its maximum; only if count columns exist)

### 3. Safe Column Standardization
- **Conditional renaming**: Only renames columns that actually exist in the dataset
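- **POI standardization**: Converts POI column names to uppercase for consistency (two names are also shortened: `metro_and_trains_count` → `METROS_COUNT`, `nightclubs_distance` → `NIGHTCLUB_DISTANCE`)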
- **No assumptions**: Does not attempt to rename columns that don't exist

### 4. Robust Data Validation
- **Comprehensive logging**: Shows exactly which columns are found and processed
- **Missing data handling**: Only processes POI columns that actually exist, and drops a row only when all of its POI features are missing
- **Final verification**: Reports actual column names and shapes

## Save Cleaned Data

Finally, the cleaned and integrated dataset is saved as a new CSV file. The code ensures that only valid, existing data is processed and exported.

In [None]:
# Simple and robust column standardization
def create_basic_features(df):
    """
    Create only basic derived features from existing POI data.

    Source columns are detected by name: every column whose lower-cased name contains
    'distance' feeds the distance aggregates, every column containing 'count' feeds
    the count aggregates. Columns produced by this function are excluded from that
    detection, so calling it again on its own output (e.g. re-running the cell)
    recomputes the same values instead of aggregating the aggregates.

    Added columns (each group only when matching source columns exist):
      - MIN_POI_DISTANCE / AVG_POI_DISTANCE / MAX_POI_DISTANCE: row-wise min, mean, max
        over the distance columns.
      - TOTAL_POI_COUNT: row-wise sum of the count columns.
      - POI_DIVERSITY: number of count columns that are > 0 in the row.
      - POI_DENSITY_SCORE: TOTAL_POI_COUNT divided by its maximum (by 1 when the
        maximum is not positive).

    Args:
        df: DataFrame with string column names. It is modified in place.

    Returns:
        The same DataFrame object, with the derived columns added.
    """
    print("Creating basic derived features...")

    # The derived names themselves contain 'distance' / 'count'; without this
    # exclusion a second call would, for example, double TOTAL_POI_COUNT.
    derived_cols = {
        'MIN_POI_DISTANCE', 'AVG_POI_DISTANCE', 'MAX_POI_DISTANCE',
        'TOTAL_POI_COUNT', 'POI_DIVERSITY', 'POI_DENSITY_SCORE',
    }
    source_cols = [col for col in df.columns if col not in derived_cols]

    # Distance-based features - only if distance columns exist
    distance_cols = [col for col in source_cols if 'distance' in col.lower()]
    if distance_cols:
        print(f"Found distance columns: {distance_cols}")
        distances = df[distance_cols]
        # Create aggregated distance features
        df['MIN_POI_DISTANCE'] = distances.min(axis=1)
        df['AVG_POI_DISTANCE'] = distances.mean(axis=1)
        df['MAX_POI_DISTANCE'] = distances.max(axis=1)

    # Count-based features - only if count columns exist
    count_cols = [col for col in source_cols if 'count' in col.lower()]
    if count_cols:
        print(f"Found count columns: {count_cols}")
        counts = df[count_cols]

        # Total POI density
        total_count = counts.sum(axis=1)
        df['TOTAL_POI_COUNT'] = total_count

        # Create POI diversity index (number of different POI types present)
        df['POI_DIVERSITY'] = (counts > 0).sum(axis=1)

        # Normalized POI counts; fall back to 1 so an all-zero (or empty) total
        # does not cause a division by zero
        max_total = total_count.max()
        df['POI_DENSITY_SCORE'] = total_count / (max_total if max_total > 0 else 1)

    return df

# Technical / join-related columns left behind by the QGIS export
columns_to_remove = ['fid', 'join_fid', 'join_full_id', 'join_osm_id', 'layer', 'path']

# Split the list into columns that are present in `df` and columns that are not
existing_columns_to_remove = []
missing_columns = []
for technical_col in columns_to_remove:
    if technical_col in df.columns:
        existing_columns_to_remove.append(technical_col)
    else:
        missing_columns.append(technical_col)

print("=== Column Cleaning ===")
print(
    f"Removing technical columns: {existing_columns_to_remove}"
    if existing_columns_to_remove
    else "No technical columns found to remove"
)

# Drop the technical columns into a new frame; `df` itself is left untouched
df_cleaned = df.drop(columns=existing_columns_to_remove, errors='ignore')
print(f"Removed {len(existing_columns_to_remove)} technical columns")

# Show what is left after the drop
print(f"\nCurrent columns in dataset: {list(df_cleaned.columns)}")

# POI columns are recognised by 'count' or 'distance' in the (lower-cased) name
poi_columns = [
    name for name in df_cleaned.columns
    if any(token in name.lower() for token in ('count', 'distance'))
]
print(f"\nFound POI columns: {poi_columns}")

if not poi_columns:
    print("Warning: No POI columns found for missing data filtering")
else:
    rows_before = len(df_cleaned)
    # how='all': a row is dropped only when every POI feature is missing
    # (likely an integration failure); partially filled rows are kept
    df_cleaned = df_cleaned.dropna(subset=poi_columns, how='all')
    rows_after = len(df_cleaned)
    print(f"Removed {rows_before - rows_after} rows with missing POI data")

# Column renaming - restricted to names that are really present
print("\n=== Column Renaming ===")
actual_columns = set(df_cleaned.columns)
print(f"Checking {len(actual_columns)} columns for renaming...")

# Full old -> new mapping; entries whose old name is absent are filtered out below
column_rename_map = dict([
    # Date/Time columns
    ('CMPLNT_FR_', 'CMPLNT_FR_DT'),
    ('CMPLNT_F_1', 'CMPLNT_FR_TM'),

    # Description columns
    ('LOC_OF_OCC', 'LOC_OF_OCCUR_DESC'),
    ('PREM_TYP_D', 'PREM_TYP_DESC'),
    ('PD_DESC', 'PD_DESCRIPTION'),

    # Age group columns
    ('SUSP_AGE_G', 'SUSP_AGE_GROUP'),
    ('VIC_AGE_GR', 'VIC_AGE_GROUP'),

    # POI count columns (standardize to uppercase)
    ('atms_count', 'ATMS_COUNT'),
    ('bars_count', 'BARS_COUNT'),
    ('bus_stops_count', 'BUS_STOPS_COUNT'),
    ('metro_and_trains_count', 'METROS_COUNT'),
    ('nightclubs_count', 'NIGHTCLUBS_COUNT'),
    ('schools_count', 'SCHOOLS_COUNT'),

    # POI distance columns (standardize to uppercase)
    ('atm_distance', 'ATM_DISTANCE'),
    ('bar_distance', 'BAR_DISTANCE'),
    ('nightclubs_distance', 'NIGHTCLUB_DISTANCE'),
    ('metro_distance', 'METRO_DISTANCE'),
])

# Keep only the entries whose source column exists (map order is preserved)
existing_renames = {}
for old_name, new_name in column_rename_map.items():
    if old_name in actual_columns:
        existing_renames[old_name] = new_name

if not existing_renames:
    print("No columns found for renaming")
else:
    df_cleaned = df_cleaned.rename(columns=existing_renames)
    print(f"Renamed {len(existing_renames)} columns: {existing_renames}")

# Add the aggregated POI features
df_cleaned = create_basic_features(df_cleaned)

# Final data quality check
print("\n=== Final Dataset Summary ===")
print(f"Final shape: {df_cleaned.shape}")
print(f"Columns: {len(df_cleaned.columns)}")
print(f"Memory usage: {df_cleaned.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

# Per-column count of missing values; report only columns that have any
missing_data_summary = df_cleaned.isnull().sum()
columns_with_gaps = missing_data_summary[missing_data_summary > 0]
if columns_with_gaps.empty:
    print("\nNo missing data in final dataset")
else:
    print("\nColumns with missing data:")
    for col, missing_count in columns_with_gaps.items():
        print(f"  {col}: {missing_count} ({missing_count/len(df_cleaned)*100:.1f}%)")

=== Column Cleaning ===
Removing technical columns: ['fid', 'join_fid', 'join_full_id', 'join_osm_id']
Removed 4 technical columns

Current columns in dataset: ['BORO_NM', 'CMPLNT_FR_', 'CMPLNT_F_1', 'KY_CD', 'LAW_CAT_CD', 'LOC_OF_OCC', 'OFNS_DESC', 'PARKS_NM', 'PD_CD', 'PREM_TYP_D', 'SUSP_AGE_G', 'SUSP_RACE', 'SUSP_SEX', 'VIC_AGE_GR', 'VIC_RACE', 'VIC_SEX', 'Latitude', 'Longitude', 'bar_distance', 'nightclubs_distance', 'atm_distance', 'atms_count', 'bars_count', 'bus_stops_count', 'metro_and_trains_count', 'nightclubs_count', 'schools_count', 'metro_distance']

Found POI columns: ['bar_distance', 'nightclubs_distance', 'atm_distance', 'atms_count', 'bars_count', 'bus_stops_count', 'metro_and_trains_count', 'nightclubs_count', 'schools_count', 'metro_distance']
Removed 0 rows with missing POI data

=== Column Renaming ===
Checking 28 columns for renaming...
Renamed 16 columns: {'CMPLNT_FR_': 'CMPLNT_FR_DT', 'CMPLNT_F_1': 'CMPLNT_FR_TM', 'LOC_OF_OCC': 'LOC_OF_OCCUR_DESC', 'PREM_TYP_D':

In [9]:
# Final validation before saving
# Checks essential columns and duplicates, then writes the cleaned dataset (CSV) and a
# JSON processing log into `integrated_dir`.
# json is imported here rather than inside the try block below, so an import problem
# is not reported as a file-saving error.
import json

print("\n=== Final Validation Before Export ===")

# Validate essential columns are present (reported as a warning only; export continues)
essential_columns = ['BORO_NM', 'LAW_CAT_CD', 'Latitude', 'Longitude']
missing_essential = [col for col in essential_columns if col not in df_cleaned.columns]

if missing_essential:
    print(f"Warning: Missing essential columns: {missing_essential}")
else:
    print("All essential columns present")

# Check for duplicate records (fully identical rows) and drop them
duplicates = df_cleaned.duplicated().sum()
if duplicates > 0:
    # The rows are removed right below, so the message says so instead of only
    # suggesting deduplication.
    print(f"Found {duplicates} duplicate records - removing them")
    df_cleaned = df_cleaned.drop_duplicates()
    print(f"Removed duplicates. Final shape: {df_cleaned.shape}")
else:
    print("No duplicate records found")

# Validate data types
print("\n--- Data Type Summary ---")
print(df_cleaned.dtypes.value_counts())

# Create processing log (built after deduplication so the final counts are accurate)
processing_log = {
    'timestamp': pd.Timestamp.now().isoformat(),
    'original_rows': initial_rows,
    'final_rows': len(df_cleaned),
    'original_columns': len(df.columns),
    'final_columns': len(df_cleaned.columns),
    'rows_removed': initial_rows - len(df_cleaned),
    'columns_in_final_dataset': list(df_cleaned.columns),
    'processing_successful': True,
    # Validation outcomes, recorded so the log is self-explanatory.
    # int() converts the numpy integer into a JSON-serialisable Python int.
    'missing_essential_columns': missing_essential,
    'duplicates_removed': int(duplicates),
}

print("\n=== Processing Summary ===")
print(f"Original rows: {processing_log['original_rows']}")
print(f"Final rows: {processing_log['final_rows']}")
print(f"Original columns: {processing_log['original_columns']}")
print(f"Final columns: {processing_log['final_columns']}")
print(f"Rows removed: {processing_log['rows_removed']}")

# Save the cleaned integrated dataset
cleaned_integrated_file_path = os.path.join(integrated_dir, "cleaned_integrated_crime_data.csv")
try:
    df_cleaned.to_csv(cleaned_integrated_file_path, index=False)
    print(f"\nSuccessfully saved cleaned integrated dataset to: {cleaned_integrated_file_path}")
    print(f"Dataset shape: {df_cleaned.shape}")
    print(f"File size: {os.path.getsize(cleaned_integrated_file_path) / (1024*1024):.2f} MB")

    # Save processing log; explicit encoding keeps the file independent of the
    # platform's default encoding
    log_file_path = os.path.join(integrated_dir, "integration_processing_log.json")
    with open(log_file_path, 'w', encoding='utf-8') as f:
        json.dump(processing_log, f, indent=2)
    print(f"Processing log saved to: {log_file_path}")

except Exception as e:
    print(f"Error saving files: {e}")
    raise

print("\nDATA INTEGRATION COMPLETED SUCCESSFULLY")
print(f"Final dataset contains {len(df_cleaned)} records with {len(df_cleaned.columns)} columns")


=== Final Validation Before Export ===
All essential columns present
No duplicate records found

--- Data Type Summary ---
float64    17
object     14
int64       3
Name: count, dtype: int64

=== Processing Summary ===
Original rows: 2496759
Final rows: 2496759
Original columns: 32
Final columns: 34
Rows removed: 0

Successfully saved cleaned integrated dataset to: C:\Users\ferdi\Documents\GitHub\crime-analyzer\JupyterOutputs\DataIntegrated\cleaned_integrated_crime_data.csv
Dataset shape: (2496759, 34)
File size: 756.06 MB
Processing log saved to: C:\Users\ferdi\Documents\GitHub\crime-analyzer\JupyterOutputs\DataIntegrated\integration_processing_log.json

DATA INTEGRATION COMPLETED SUCCESSFULLY
Final dataset contains 2496759 records with 34 columns
