In [1]:
import json
import os

import pandas as pd

In [2]:
# Load the two raw CSV inputs into DataFrames.
# Paths are relative, so both files must sit in the current working directory.
socio_df = pd.read_csv(
    'Census_Data_-_Selected_socioeconomic_indicators_in_Chicago__2008___2012_20250704.csv'
)
crime_df = pd.read_csv(
    'Crimes_-_2001_to_Present_20250205.csv'
)

In [4]:
# Columns treated as irrelevant to this analysis and removed from the crime table
columns_to_drop = ['ID', 'Case Number', 'Updated On', 'Location']

# errors='ignore' skips any listed name that is not present in the frame,
# so running this cell again on an already-trimmed frame does not raise.
crime_df.drop(labels=columns_to_drop, axis='columns', errors='ignore', inplace=True)

In [7]:
# Convert 'Year' to a numeric dtype and 'Date' to datetime.
# errors='coerce' replaces values that cannot be parsed with NaN / NaT
# instead of raising.
crime_df['Year'], crime_df['Date'] = (
    pd.to_numeric(crime_df['Year'], errors='coerce'),
    pd.to_datetime(crime_df['Date'], errors='coerce'),
)

In [8]:
# --- Configuration -----------------------------------------------------------
# Earliest crime year kept. The filter below has no upper bound, so the most
# recent year present in the extract is retained as well.
MIN_YEAR = 2012
MERGE_KEY = 'COMMUNITY_AREA'
output_dir = 'cleaned_data_with_socio'

# --- Filter crimes and align the join key ------------------------------------
# Rows whose Year is NaN compare False against MIN_YEAR and are dropped too.
# The key column is renamed to the name shared with socio_df. A non-inplace
# rename on the filtered result avoids mutating a slice of the original frame.
crime_df = (
    crime_df[crime_df['Year'] >= MIN_YEAR]
    .rename(columns={'Community Area': MERGE_KEY})
)

# --- Standardize socio_df headers --------------------------------------------
# Trim whitespace, upper-case, and replace spaces with underscores so that
# 'Community Area Number' becomes 'COMMUNITY_AREA_NUMBER', then rename it to
# the shared key.
socio_df.columns = (
    socio_df.columns.str.strip().str.upper().str.replace(' ', '_', regex=False)
)
socio_df = socio_df.rename(columns={'COMMUNITY_AREA_NUMBER': MERGE_KEY})

# --- Merge --------------------------------------------------------------------
# pandas matches null join keys against each other. Any socio_df row without an
# area number (e.g. a city-wide summary row -- TODO confirm whether this extract
# has one) would therefore be attached to every crime that lacks a community
# area. Such rows are excluded from the lookup side only; socio_df itself is
# left unchanged.
# validate='many_to_one' makes the merge raise if an area number appears more
# than once in the socio table, instead of silently duplicating crime rows.
merged_df = crime_df.merge(
    socio_df.dropna(subset=[MERGE_KEY]),
    on=MERGE_KEY,
    how='left',
    validate='many_to_one',
)

# --- Export merged dataset ----------------------------------------------------
os.makedirs(output_dir, exist_ok=True)
merged_df.to_csv(
    os.path.join(output_dir, 'chicago_crime_with_socio.csv'),
    index=False,
)

# --- Document what was cleaned ------------------------------------------------
# Report the year range actually present after filtering rather than a
# hard-coded label, so the documentation cannot drift from the data.
if crime_df.empty:
    filtered_years = f'{MIN_YEAR}–'
else:
    filtered_years = f"{int(crime_df['Year'].min())}–{int(crime_df['Year'].max())}"

doc = {
    'dropped_columns': columns_to_drop,
    'filtered_years': filtered_years,
    'merge_key': MERGE_KEY,
    'socio_cols_added': [col for col in socio_df.columns if col != MERGE_KEY],
    'final_row_count': len(merged_df),
    'final_column_count': merged_df.shape[1]
}

with open(os.path.join(output_dir, 'cleaning_documentation.json'), 'w', encoding='utf-8') as f:
    json.dump(doc, f, indent=2)

# Name the directory that was actually written to.
print(f"Merge complete. Files saved in `{output_dir}/` folder.")


Merge complete. Files saved in `cleaned_data/` folder.
