In [1]:
import pandas as pd
import numpy as np

In [2]:
crime_df = pd.read_csv("socio_data/chicago_crime_with_socio.csv")
business_df = pd.read_csv("business_licenses_datasets/business_licenses.csv")
population_df = pd.read_csv("population_datasets/Chicago_Population_Counts_Cleaned.csv")
homicides_df = pd.read_csv("cleaned_data/Chicago_Homicides_Cleaned.csv")

In [8]:
business_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Data columns (total 39 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   ID                                 0 non-null      object 
 1   LICENSE ID                         0 non-null      int64  
 2   ACCOUNT NUMBER                     0 non-null      int64  
 3   SITE NUMBER                        0 non-null      int64  
 4   LEGAL NAME                         0 non-null      object 
 5   DOING BUSINESS AS NAME             0 non-null      object 
 6   ADDRESS                            0 non-null      object 
 7   CITY                               0 non-null      object 
 8   STATE                              0 non-null      object 
 9   ZIP CODE                           0 non-null      object 
 10  WARD                               0 non-null      float64
 11  PRECINCT                           0 non-null      float64
 12  WARD PRECIN

In [5]:
# === Feature Engineering ===

## 1) Crime Socio-Demographics Features (already partly cleaned)
# Drop records without community area name
crime_df_clean = crime_df.dropna(subset=["COMMUNITY_AREA_NAME"])

# Group by community area and average the socio-demographic values
socio_features = crime_df_clean.groupby("COMMUNITY_AREA_NAME").agg({
    "PERCENT_OF_HOUSING_CROWDED": "mean",
    "PERCENT_HOUSEHOLDS_BELOW_POVERTY": "mean",
    "PERCENT_AGED_16+_UNEMPLOYED": "mean",
    "PERCENT_AGED_25+_WITHOUT_HIGH_SCHOOL_DIPLOMA": "mean",
    "PER_CAPITA_INCOME": "mean",
    "HARDSHIP_INDEX": "mean"
}).reset_index()

# Rename for clarity
socio_features.columns = [
    "Community Area", "Pct_Housing_Crowded", "Pct_Below_Poverty",
    "Pct_Unemployed", "Pct_Without_HS_Diploma",
    "Per_Capita_Income", "Hardship_Index"
]

In [6]:
## 2) Population-based Features
# Calculate new features from population_df
population_df["Pct_Under_18"] = population_df["Population - Age 0-17"] / population_df["Population - Total"] * 100
population_df["Pct_Seniors_65plus"] = population_df["Population - Age 65+"] / population_df["Population - Total"] * 100
population_df["Gender_Ratio_Female_Male"] = population_df["Population - Female"] / population_df["Population - Male"]
population_df["Pct_Black_Non_Latinx"] = population_df["Population - Black Non-Latinx"] / population_df["Population - Total"] * 100
population_df["Pct_Latinx"] = population_df["Population - Latinx"] / population_df["Population - Total"] * 100

# Calculate Dependency Ratio
working_age = population_df["Population - Age 18+"] - population_df["Population - Age 65+"]
dependent = population_df["Population - Age 0-17"] + population_df["Population - Age 65+"]
population_df["Dependency_Ratio"] = dependent / working_age

# Population density
population_df["Pop_Density"] = population_df["Population - Total"] / 2.5  # if land area unavailable, use a placeholder

# Select engineered features
pop_features = population_df[[
    "community", "Population - Total", "Pct_Under_18", "Pct_Seniors_65plus", "Gender_Ratio_Female_Male",
    "Pct_Black_Non_Latinx", "Pct_Latinx", "Dependency_Ratio", "Pop_Density"
]].rename(columns={"community": "Community Area"})

In [10]:
## 3. Business Density Features
# === Filter active business licenses ===
business_df = business_df.copy()
business_df = business_df[
    (business_df["LICENSE TERM EXPIRATION DATE"] >= "2022-01-01") &
    (business_df["COMMUNITY AREA"].notna())
]

# Drop duplicates
business_df = business_df.drop_duplicates(subset=["LICENSE ID"])

# Clean license descriptions
business_df["LICENSE DESCRIPTION"] = business_df["LICENSE DESCRIPTION"].str.lower()

# Add business category flags
business_df["is_retail"] = business_df["LICENSE DESCRIPTION"].str.contains("retail")
business_df["is_food"] = business_df["LICENSE DESCRIPTION"].str.contains("restaurant|food|catering|grocery")

# Group by community area
business_features = business_df.groupby("COMMUNITY AREA").agg({
    "LICENSE ID": "count",
    "is_retail": "sum",
    "is_food": "sum",
    "LICENSE CODE": pd.Series.nunique
}).reset_index()

# Rename columns
business_features.columns = [
    "Community_Area_Code", "Active_Business_Count",
    "Retail_Business_Count", "Restaurant_Count", "License_Type_Diversity"
]

# merge with area name
area_map = crime_df[["COMMUNITY_AREA", "COMMUNITY_AREA_NAME"]].dropna().drop_duplicates()
business_features = pd.merge(business_features, area_map,
                             left_on="Community_Area_Code", right_on="COMMUNITY_AREA", how="left")

business_features = business_features.rename(columns={"COMMUNITY_AREA_NAME": "Community Area"})

In [11]:
# === Merge All Features ===

# Merge socio + population
merged_df = pd.merge(socio_features, pop_features, on="Community Area", how="left")

# Merge business features
final_df = pd.merge(merged_df, business_features[[
    "Community Area", "Active_Business_Count",
    "Retail_Business_Count", "Restaurant_Count",
    "License_Type_Diversity"
]], on="Community Area", how="left")

# Business per 1000 residents
final_df["Business_Per_1000"] = final_df["Active_Business_Count"] / final_df["Population - Total"] * 1000


In [12]:
final_df.to_csv("sociodemographic_features_by_area.csv", index=False)

In [13]:
# === Document Features ===
feature_dict = pd.DataFrame({
    "Feature": final_df.columns,
    "Description": [
        "Name of community area",
        "% Housing crowded", "% Below poverty", "% Unemployed",
        "% without high school", "Per capita income", "Hardship index",
        "Population", "% Under 18", "% 65+", "Female to male ratio",
        "% Black Non-Latinx", "% Latinx", "Age dependency ratio", "Population density",
        "Active business licenses", "# of retail businesses", "# of restaurants",
        "Unique license types", "Businesses per 1000 residents"
    ],
    "Data Type": ["str"] + ["float"] * (len(final_df.columns) - 1)
})

feature_dict.to_csv("feature_documentation.csv", index=False)