In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression

In [2]:
# Read the workbook raw_data_1985-2023.xlsx; the path is relative to the
# notebook's working directory.
df= pd.read_excel("../processed_data/raw_data_1985-2023.xlsx")
# Preview the first rows to confirm the load worked.
df.head()

Unnamed: 0,County,Year,Violent_sum,Homicide_sum,ForRape_sum,Robbery_sum,AggAssault_sum,ViolentClr_sum,HomicideClr_sum,ForRapeClr_sum,...,Category_Rural,Category_Suburban,Category_Urban,No health insurance coverage,Civilian noninstitutionalized population,high_school_rate,dropout_rate,public_school_rate,adherent_rate,religion_diversity
0,Alameda,1985,11628,143,791,5427,5267,5429,91,445,...,0,0,1,,,,,,,
1,Alameda,1986,12495,174,820,5971,5530,5570,114,532,...,0,0,1,,,,,,,
2,Alameda,1987,11703,147,770,5019,5767,6303,91,511,...,0,0,1,,,,,,,
3,Alameda,1988,10963,159,722,4863,5219,5708,100,498,...,0,0,1,,,,,,,
4,Alameda,1989,10563,172,670,4879,4842,5250,98,453,...,0,0,1,,,,,,,


In [3]:
# List every column of the raw frame; the next cell selects a subset of them.
df.columns

Index(['County', 'Year', 'Violent_sum', 'Homicide_sum', 'ForRape_sum',
       'Robbery_sum', 'AggAssault_sum', 'ViolentClr_sum', 'HomicideClr_sum',
       'ForRapeClr_sum', 'RobberyClr_sum', 'AggAssaultClr_sum', 'Property_sum',
       'Burglary_sum', 'VehicleTheft_sum', 'LTtotal_sum', 'PropertyClr_sum',
       'BurglaryClr_sum', 'VehicleTheftClr_sum', 'LTtotalClr_sum',
       'Population', 'Area_sq_mi', 'unemployment_rate',
       'median_household_income', 'CPI', 'poverty_rate', 'Label',
       'Total_Housing_Units', 'Mobile_Home', 'Occupied_Housing_Units',
       'Vacant_Housing_Units', 'Owner_Occupied', 'Renter_Occupied',
       'Avg_Hsehld_Size_Owner_Occupied', 'Avg_HseHld_Size_Renter_Occupied',
       'rent_burden', 'Median_Age', 'entity_ID', 'police_budget',
       'education_budget', 'welfare_budget', 'mental_health_budget',
       'rehab_budget', 'health_budget', 'judiciary_budget', 'prison_budget',
       'median_house_value', 'Category', 'Category_encoded', 'Category_Rural',


In [4]:
# Raw columns carried forward into feature engineering. The feature functions
# defined in the next cell raise ValueError when a column they read is absent,
# so every input they use must be listed here.
initial_columns = [
    "County",
    "Year",
    "Violent_sum",
    "ViolentClr_sum",
    "Population",
    "Area_sq_mi",
    "unemployment_rate",
    "median_household_income",
    "CPI",

    'poverty_rate',
    "Total_Housing_Units",
    "Mobile_Home",
    "Occupied_Housing_Units",
    "Vacant_Housing_Units",
    "Owner_Occupied",
    "Renter_Occupied",
    "Avg_Hsehld_Size_Owner_Occupied",
    "Avg_HseHld_Size_Renter_Occupied",
    "rent_burden",
    "Median_Age",
    "police_budget",
    "education_budget",
    "welfare_budget",
    "mental_health_budget",
    "rehab_budget",
    "health_budget",
    "judiciary_budget",
    "prison_budget",
    "median_house_value",
    "Category",
    "Category_encoded",
    "Category_Rural",
    "Category_Suburban",
    "Category_Urban",
    'No health insurance coverage',
    'Civilian noninstitutionalized population',
    "high_school_rate",
    'dropout_rate',
    'public_school_rate',
    'adherent_rate',
    'religion_diversity'
]

# Column subset of the raw frame; the feature functions copy their input, so
# this selection is never written to directly.
dff = df[initial_columns]
dff.head()

Unnamed: 0,County,Year,Violent_sum,ViolentClr_sum,Population,Area_sq_mi,unemployment_rate,median_household_income,CPI,poverty_rate,...,Category_Rural,Category_Suburban,Category_Urban,No health insurance coverage,Civilian noninstitutionalized population,high_school_rate,dropout_rate,public_school_rate,adherent_rate,religion_diversity
0,Alameda,1985,11628,5429,1185500,738,,,108.6,,...,0,0,1,,,,,,,
1,Alameda,1986,12495,5570,1206900,738,,,112.0,,...,0,0,1,,,,,,,
2,Alameda,1987,11703,6303,1220600,738,,,116.5,,...,0,0,1,,,,,,,
3,Alameda,1988,10963,5708,1242300,738,,,121.9,,...,0,0,1,,,,,,,
4,Alameda,1989,10563,5250,1261200,738,,,128.0,,...,0,0,1,,,,,,,


In [5]:
def compute_crime_rate(df):
    """
    Add a ``crime_rate`` column holding violent crimes per resident.

    The value is the plain ratio ``Violent_sum / Population`` (per capita);
    it is NOT scaled to "per 1,000 people".

    Parameters:
        df (pd.DataFrame): The input dataset. It is copied, not modified.
    Returns:
        pd.DataFrame: A copy of the input with the ``crime_rate`` column added.
    Raises:
        ValueError: If 'Violent_sum' or 'Population' is not a column of ``df``.
    """
    needed = ("Violent_sum", "Population")
    if not all(name in df.columns for name in needed):
        raise ValueError(
            "Columns 'Violent_sum' and 'Population' are required to compute the crime rate."
        )

    result = df.copy()
    result["crime_rate"] = result["Violent_sum"] / result["Population"]
    return result


def compute_clearance_rate(df):
    """
    Add a ``clearance_rate`` column equal to ViolentClr_sum / Violent_sum.

    Parameters:
        df (pd.DataFrame): The input dataset. It is copied, not modified.
    Returns:
        pd.DataFrame: A copy of the input with ``clearance_rate`` added.
    Raises:
        ValueError: If 'ViolentClr_sum' or 'Violent_sum' is missing.
    """
    if "ViolentClr_sum" not in df.columns or "Violent_sum" not in df.columns:
        raise ValueError(
            "Columns 'ViolentClr_sum' and 'Violent_sum' are required for clearance rate."
        )

    result = df.copy()
    cleared = result["ViolentClr_sum"]
    reported = result["Violent_sum"]
    result["clearance_rate"] = cleared / reported
    return result

def compute_uninsured_rate(df):
    """
    Add an ``uninsured_rate`` column.

    The rate is 'No health insurance coverage' divided by
    'Civilian noninstitutionalized population'.

    Parameters:
        df (pd.DataFrame): The input dataset. It is copied, not modified.
    Returns:
        pd.DataFrame: A copy of the input with ``uninsured_rate`` added.
    Raises:
        ValueError: If either input column is missing.
    """
    uninsured_col = "No health insurance coverage"
    population_col = "Civilian noninstitutionalized population"

    if not (uninsured_col in df.columns and population_col in df.columns):
        raise ValueError(
            "Columns 'No health insurance coverage' and 'Civilian noninstitutionalized population' are required for uninsured_rate."
        )

    result = df.copy()
    result["uninsured_rate"] = result[uninsured_col] / result[population_col]
    return result


# Feature Function: Compute Population Density
def compute_population_density(df):
    """
    Add a ``population_density`` column equal to Population / Area_sq_mi.

    Parameters:
        df (pd.DataFrame): The input dataset. It is copied, not modified.
    Returns:
        pd.DataFrame: A copy of the input with ``population_density`` added.
    Raises:
        ValueError: If 'Population' or 'Area_sq_mi' is missing.
    """
    for needed in ("Population", "Area_sq_mi"):
        if needed not in df.columns:
            raise ValueError(
                "Columns 'Population' and 'Area_sq_mi' are required for population density."
            )

    result = df.copy()
    result["population_density"] = result["Population"] / result["Area_sq_mi"]
    return result


# Dynamically Add Adjusted Expenditure Columns


def add_adjusted_expenditures(df):
    """
    Add CPI- and population-adjusted budget columns plus two spending ratios.

    A helper column ``CPI_Population`` (CPI * Population) is added, then every
    ``<name>_budget`` column is divided by it to produce ``adj_<name>_budget``.
    Two ratio columns follow:

    * ``social_vs_security`` = (education + welfare + health)
      / (police + judiciary + prison), using the adjusted columns;
    * ``security_vs_social`` = the reciprocal grouping.

    The mental-health and rehab budgets are adjusted but belong to neither
    ratio group.

    Parameters:
        df (pd.DataFrame): The input dataset. It is copied, not modified.
    Returns:
        pd.DataFrame: A copy of the input with the adjusted expenditure columns.
    Raises:
        ValueError: If any budget column, 'Population' or 'CPI' is missing.
    """
    # Order matters: it fixes the order in which the adj_* columns are appended.
    budget_columns = [
        "police_budget",
        "education_budget",
        "welfare_budget",
        "mental_health_budget",
        "rehab_budget",
        "health_budget",
        "judiciary_budget",
        "prison_budget",
    ]
    if any(name not in df.columns for name in budget_columns + ["Population", "CPI"]):
        raise ValueError("Required columns for adjusted_expenditures are missing.")

    result = df.copy()
    result["CPI_Population"] = result["CPI"] * result["Population"]
    for name in budget_columns:
        result["adj_" + name] = result[name] / result["CPI_Population"]

    social_spend = (
        result["adj_education_budget"]
        + result["adj_welfare_budget"]
        + result["adj_health_budget"]
    )
    security_spend = (
        result["adj_police_budget"]
        + result["adj_judiciary_budget"]
        + result["adj_prison_budget"]
    )
    result["social_vs_security"] = social_spend / security_spend
    result["security_vs_social"] = security_spend / social_spend
    return result


# Feature Function: Adjusted Income
def compute_adjusted_income(df):
    """
    Add an ``adjusted_income`` column: median_household_income / CPI.

    Parameters:
        df (pd.DataFrame): The input dataset. It is copied, not modified.
    Returns:
        pd.DataFrame: A copy of the input with ``adjusted_income`` added.
    Raises:
        ValueError: If 'median_household_income' or 'CPI' is missing.
    """
    inputs_present = "median_household_income" in df.columns and "CPI" in df.columns
    if not inputs_present:
        raise ValueError(
            "Columns 'median_household_income' and 'CPI' are required for adjusted income."
        )

    result = df.copy()
    income = result["median_household_income"]
    price_index = result["CPI"]
    result["adjusted_income"] = income / price_index
    return result


# Feature Function: House Affordability
def compute_house_affordability(df):
    """
    Add a ``house_affordability`` column: median_house_value / median_household_income.

    This is a price-to-income ratio, so a larger value means the median house
    costs more multiples of the median household income.

    Parameters:
        df (pd.DataFrame): The input dataset. It is copied, not modified.
    Returns:
        pd.DataFrame: A copy of the input with ``house_affordability`` added.
    Raises:
        ValueError: If 'median_house_value' or 'median_household_income' is missing.
    """
    needed = ["median_house_value", "median_household_income"]
    absent = [name for name in needed if name not in df.columns]
    if absent:
        raise ValueError(
            "Columns 'median_house_value' and 'median_household_income' are required for house affordability."
        )

    result = df.copy()
    result["house_affordability"] = (
        result["median_house_value"] / result["median_household_income"]
    )
    return result


# Feature Function: Home Ownership Rate
def compute_home_ownership_rate(df):
    """
    Add a ``home_ownership_rate`` column: Owner_Occupied / Occupied_Housing_Units.

    Parameters:
        df (pd.DataFrame): The input dataset. It is copied, not modified.
    Returns:
        pd.DataFrame: A copy of the input with ``home_ownership_rate`` added.
    Raises:
        ValueError: If 'Owner_Occupied' or 'Occupied_Housing_Units' is missing.
    """
    if "Owner_Occupied" not in df.columns or "Occupied_Housing_Units" not in df.columns:
        raise ValueError(
            "Columns 'Owner_Occupied' and 'Occupied_Housing_Units' are required for home ownership rate."
        )

    result = df.copy()
    owner_units = result["Owner_Occupied"]
    occupied_units = result["Occupied_Housing_Units"]
    result["home_ownership_rate"] = owner_units / occupied_units
    return result


# Feature Function: Total Persons and Household Metrics
def compute_persons_and_household_metrics(df):
    """
    Add housing-occupancy and household-size features.

    Columns added, in this order:

    * ``vacancy_rate`` = Vacant_Housing_Units / Total_Housing_Units
    * ``Total_Persons_Owner`` = Owner_Occupied * Avg_Hsehld_Size_Owner_Occupied
    * ``Total_Persons_Renter`` = Renter_Occupied * Avg_HseHld_Size_Renter_Occupied
    * ``Total_Persons`` = Total_Persons_Owner + Total_Persons_Renter
    * ``Number_of_Persons_per_HseHld`` = Total_Persons / Occupied_Housing_Units
    * ``renter_ratio`` = Total_Persons_Renter / Total_Persons
    * ``mobile_home_ratio`` = Mobile_Home / Total_Housing_Units

    Parameters:
        df (pd.DataFrame): The input dataset. It is copied, not modified.
    Returns:
        pd.DataFrame: A copy of the input with the columns above added.
    Raises:
        ValueError: If any of the eight input columns is missing.
    """
    needed = (
        "Vacant_Housing_Units",
        "Total_Housing_Units",
        "Owner_Occupied",
        "Avg_Hsehld_Size_Owner_Occupied",
        "Renter_Occupied",
        "Avg_HseHld_Size_Renter_Occupied",
        "Occupied_Housing_Units",
        "Mobile_Home",
    )
    if any(name not in df.columns for name in needed):
        raise ValueError(
            "Required columns for total persons or household metrics are missing."
        )

    result = df.copy()
    housing_units = result["Total_Housing_Units"]

    # Head counts are estimated as (occupied units) x (average household size).
    persons_owner = result["Owner_Occupied"] * result["Avg_Hsehld_Size_Owner_Occupied"]
    persons_renter = result["Renter_Occupied"] * result["Avg_HseHld_Size_Renter_Occupied"]
    persons_total = persons_owner + persons_renter

    result["vacancy_rate"] = result["Vacant_Housing_Units"] / housing_units
    result["Total_Persons_Owner"] = persons_owner
    result["Total_Persons_Renter"] = persons_renter
    result["Total_Persons"] = persons_total
    result["Number_of_Persons_per_HseHld"] = (
        persons_total / result["Occupied_Housing_Units"]
    )
    result["renter_ratio"] = persons_renter / persons_total
    result["mobile_home_ratio"] = result["Mobile_Home"] / housing_units
    return result

In [6]:
# Feature-engineering steps, applied in this order. Each function takes a
# DataFrame and returns a copy with new columns; none of them reads a column
# produced by another step, only raw input columns.
features_funcs = [
    compute_crime_rate,
    compute_clearance_rate,
    compute_population_density,
    add_adjusted_expenditures,
    compute_adjusted_income,
    compute_house_affordability,
    compute_home_ownership_rate,
    compute_uninsured_rate,
    compute_persons_and_household_metrics,
]
def apply_features(df, funcs):
    """
    Run a sequence of feature functions over a DataFrame.

    Each function receives the output of the previous one, so the result
    carries every column added along the way.

    Parameters:
        df (pd.DataFrame): The starting dataset. It is copied, not modified.
        funcs (iterable of callables): Functions mapping DataFrame -> DataFrame,
            applied in iteration order.
    Returns:
        pd.DataFrame: The frame returned by the last function, or a copy of
        ``df`` when ``funcs`` is empty.
    """
    result = df.copy()
    for step in funcs:
        result = step(result)
    return result
# Build the engineered frame: the selected raw columns plus every derived feature.
test_df = apply_features(df=dff, funcs=features_funcs)

In [7]:
# Numeric predictor columns kept for the final feature table: a mix of raw
# columns and columns derived by the feature functions.
feature_0 = [
    'Population',
    'clearance_rate',
    'population_density',
    'unemployment_rate',
    'adjusted_income',
    'poverty_rate',
    'rent_burden',
    'home_ownership_rate',
    'mobile_home_ratio',
    'vacancy_rate',
    'Number_of_Persons_per_HseHld',
    'renter_ratio',
    'Median_Age',
    'adj_police_budget',
    'adj_education_budget',
    'adj_welfare_budget',
    'adj_mental_health_budget',
    'adj_rehab_budget',
    'adj_health_budget',
    'adj_judiciary_budget',
    'adj_prison_budget',
    'house_affordability',
    'uninsured_rate',
    'high_school_rate',
    'dropout_rate',
    'public_school_rate',
    "social_vs_security",
    "security_vs_social",
    'adherent_rate',
    'religion_diversity']
# Indicator columns for the county category (Rural / Suburban / Urban).
feature_cat=['Category_Rural', 'Category_Suburban', 'Category_Urban']

In [8]:
# Keep the identifiers, crime_rate, and the selected predictor columns.
Feature_df = test_df[["County", "Year", "crime_rate"] + feature_0 + feature_cat]
# Move County and Year into a MultiIndex; the gap-filling step later groups by
# the 'County' level and sorts by the 'Year' level.
Feature_df = Feature_df.set_index(["County", "Year"])
Feature_df.tail()

Unnamed: 0_level_0,Unnamed: 1_level_0,crime_rate,Population,clearance_rate,population_density,unemployment_rate,adjusted_income,poverty_rate,rent_burden,home_ownership_rate,mobile_home_ratio,...,high_school_rate,dropout_rate,public_school_rate,social_vs_security,security_vs_social,adherent_rate,religion_diversity,Category_Rural,Category_Suburban,Category_Urban
County,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Yuba,2019,0.004095,79619,0.411043,126.379365,6.2,201.70825,15.2,41.5,0.605145,0.082943,...,82.3,0.055995,0.985,1.237238,0.808252,0.347593,2.0,1,0,0
Yuba,2020,0.004977,81178,0.398515,128.853968,10.6,197.248655,16.3,37.5,0.608648,0.093497,...,82.2,0.049287,0.984,1.257962,0.794937,0.346609,2.0,1,0,0
Yuba,2021,0.004312,82091,0.420904,130.303175,8.4,204.337343,15.6,39.3,0.615739,0.087198,...,81.2,,,1.186006,0.843166,0.342754,2.0,1,0,0
Yuba,2022,0.003379,82563,0.433692,131.052381,5.6,199.314588,14.9,39.5,0.614068,0.083845,...,82.7,0.022527,0.937,1.165807,0.857775,0.340794,2.0,1,0,0
Yuba,2023,0.004005,83405,0.365269,132.388889,6.7,215.467565,15.4,38.8,0.633182,0.080927,...,84.7,0.058615,0.93,1.221294,0.818804,0.337354,2.0,1,0,0


In [9]:
# Confirm which columns made it into the feature table.
Feature_df.columns

Index(['crime_rate', 'Population', 'clearance_rate', 'population_density',
       'unemployment_rate', 'adjusted_income', 'poverty_rate', 'rent_burden',
       'home_ownership_rate', 'mobile_home_ratio', 'vacancy_rate',
       'Number_of_Persons_per_HseHld', 'renter_ratio', 'Median_Age',
       'adj_police_budget', 'adj_education_budget', 'adj_welfare_budget',
       'adj_mental_health_budget', 'adj_rehab_budget', 'adj_health_budget',
       'adj_judiciary_budget', 'adj_prison_budget', 'house_affordability',
       'uninsured_rate', 'high_school_rate', 'dropout_rate',
       'public_school_rate', 'social_vs_security', 'security_vs_social',
       'adherent_rate', 'religion_diversity', 'Category_Rural',
       'Category_Suburban', 'Category_Urban'],
      dtype='object')

In [10]:
# Non-null counts per column show how much of each feature is missing.
Feature_df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 2262 entries, ('Alameda', 1985) to ('Yuba', 2023)
Data columns (total 34 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   crime_rate                    2262 non-null   float64
 1   Population                    2262 non-null   int64  
 2   clearance_rate                2262 non-null   float64
 3   population_density            2262 non-null   float64
 4   unemployment_rate             1914 non-null   float64
 5   adjusted_income               928 non-null    float64
 6   poverty_rate                  870 non-null    float64
 7   rent_burden                   812 non-null    float64
 8   home_ownership_rate           812 non-null    float64
 9   mobile_home_ratio             812 non-null    float64
 10  vacancy_rate                  812 non-null    float64
 11  Number_of_Persons_per_HseHld  812 non-null    float64
 12  renter_ratio                  812 no

In [11]:
def fill_missing_with_linear_regression(group):
    """
    Fill gaps in one county's numeric columns with a per-column linear trend.

    For every numeric column that has missing values, a simple linear
    regression (column value ~ Year) is fitted on the observed rows, and its
    predictions are written into the missing rows only. Observed values are
    never overwritten.

    A column is left untouched when it is non-numeric (a message is printed),
    has nothing missing (no model is fitted), or has fewer than two observed
    points (a line cannot be fitted, so its gaps stay NaN).

    Note: filled values are raw model predictions. For years outside the
    observed range they are extrapolations and are not clipped to any valid
    range (e.g. rates are not forced to stay non-negative).

    Parameters:
        group (pd.DataFrame): Rows for a single county. Its index must have a
            level named 'Year'.
    Returns:
        pd.DataFrame: The Year-sorted frame with gaps filled where possible.
    """
    # Sort by Year for clarity; sort_index returns a new frame to work on.
    group = group.sort_index(level='Year')

    # The prediction grid is the same for every column, so build it once.
    X_all = group.index.get_level_values('Year').values.reshape(-1, 1)

    for col in group.columns:
        # Only process numeric columns
        if not pd.api.types.is_numeric_dtype(group[col]):
            print(f"Skipping non-numeric column: {col}")
            continue

        # Plain boolean array so masking below is positional, not label-aligned.
        missing_mask = group[col].isna().to_numpy()

        # Nothing to fill: skip the fit entirely instead of training a model
        # whose predictions would all be discarded.
        if not missing_mask.any():
            continue

        # Extract the known data points (drop missing)
        valid_data = group[col].dropna()

        # If there aren't at least two valid points, we can't fit a regression
        if len(valid_data) < 2:
            continue

        # Prepare X (Year) and y (column values)
        X = valid_data.index.get_level_values('Year').values.reshape(-1, 1)
        y = valid_data.values

        # Fit the trend line and predict for all years in this county
        model = LinearRegression().fit(X, y)
        y_pred = model.predict(X_all)

        # Fill only missing values with the predictions
        group.loc[missing_mask, col] = y_pred[missing_mask]

    return group

In [12]:
# 1. Work on a copy of Feature_df, which is already indexed by (County, Year),
#    so no set_index call is needed here.
# NOTE(review): this rebinds `df`, which earlier held the raw data read from
# Excel. Re-running the earlier `dff = df[initial_columns]` cell after this one
# would select from the feature table instead (and fail on the missing raw
# columns). Consider a distinct name once all downstream uses are known.
df = Feature_df.copy()
# 2. Group by County and apply the regression-based filling
df_reg_filled = (
    df.groupby(level='County', group_keys=False)
      .apply(fill_missing_with_linear_regression)
)

df_reg_filled.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,crime_rate,Population,clearance_rate,population_density,unemployment_rate,adjusted_income,poverty_rate,rent_burden,home_ownership_rate,mobile_home_ratio,...,high_school_rate,dropout_rate,public_school_rate,social_vs_security,security_vs_social,adherent_rate,religion_diversity,Category_Rural,Category_Suburban,Category_Urban
County,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Alameda,1985,0.009809,1185500,0.46689,1606.368564,5.890887,198.249497,20.61619,49.471868,0.541511,0.013777,...,53.839341,0.038731,0.928594,2.199568,0.420023,0.167093,2.195604,0,0,1
Alameda,1986,0.010353,1206900,0.445778,1635.365854,5.887444,202.918528,20.301905,49.215165,0.541334,0.013726,...,54.807253,0.038225,0.927948,2.200612,0.421325,0.173917,2.248352,0,0,1
Alameda,1987,0.009588,1220600,0.53858,1653.929539,5.884002,207.587558,19.987619,48.958462,0.541157,0.013674,...,55.775165,0.037719,0.927301,2.201655,0.422627,0.180741,2.301099,0,0,1
Alameda,1988,0.008825,1242300,0.52066,1683.333333,5.880559,212.256589,19.673333,48.701758,0.540979,0.013623,...,56.743077,0.037213,0.926655,2.202699,0.423928,0.187565,2.353846,0,0,1
Alameda,1989,0.008375,1261200,0.497018,1708.943089,5.877117,216.92562,19.359048,48.445055,0.540802,0.013572,...,57.710989,0.036707,0.926009,2.203742,0.42523,0.194389,2.406593,0,0,1


In [13]:
# Show the columns of the filled frame for comparison with Feature_df.
df_reg_filled.columns

Index(['crime_rate', 'Population', 'clearance_rate', 'population_density',
       'unemployment_rate', 'adjusted_income', 'poverty_rate', 'rent_burden',
       'home_ownership_rate', 'mobile_home_ratio', 'vacancy_rate',
       'Number_of_Persons_per_HseHld', 'renter_ratio', 'Median_Age',
       'adj_police_budget', 'adj_education_budget', 'adj_welfare_budget',
       'adj_mental_health_budget', 'adj_rehab_budget', 'adj_health_budget',
       'adj_judiciary_budget', 'adj_prison_budget', 'house_affordability',
       'uninsured_rate', 'high_school_rate', 'dropout_rate',
       'public_school_rate', 'social_vs_security', 'security_vs_social',
       'adherent_rate', 'religion_diversity', 'Category_Rural',
       'Category_Suburban', 'Category_Urban'],
      dtype='object')

In [14]:
# Remaining gaps per column: the fill skips a column within a county when it
# has fewer than two observed values there.
df_reg_filled.isna().sum()

crime_rate                       0
Population                       0
clearance_rate                   0
population_density               0
unemployment_rate                0
adjusted_income                  0
poverty_rate                     0
rent_burden                      0
home_ownership_rate              0
mobile_home_ratio                0
vacancy_rate                     0
Number_of_Persons_per_HseHld     0
renter_ratio                     0
Median_Age                       0
adj_police_budget               39
adj_education_budget            39
adj_welfare_budget              39
adj_mental_health_budget        77
adj_rehab_budget                39
adj_health_budget               39
adj_judiciary_budget            39
adj_prison_budget               39
house_affordability             39
uninsured_rate                   0
high_school_rate                 0
dropout_rate                     0
public_school_rate               0
social_vs_security              39
security_vs_social  

In [15]:
# Shape before dropping incomplete rows.
print(df_reg_filled.shape)
# Keep only rows that have no missing value left after the regression fill.
Final_df = df_reg_filled.dropna()
# Shape after the drop.
print(Final_df.shape)

(2262, 34)
(2068, 34)


In [16]:
# Save the complete-case rows of the *unfilled* feature table to .xlsx.
# NOTE(review): the "2010_2023" in the name and path is not enforced by any
# year filter here; the rows are simply those of Feature_df with no missing
# values. Confirm that they really span 2010-2023.
Feature_2010_2023 = Feature_df.dropna().reset_index()
print(Feature_2010_2023.shape)
Feature_2010_2023.to_excel(
    "../processed_data/Features_crime_data_2010-2023.xlsx", sheet_name="Crime_Data", index=False
)

# Save the regression-filled table (rows still incomplete after filling were
# dropped when Final_df was built) to .xlsx.
Feature_1985_2023 = Final_df.reset_index()
print(Feature_1985_2023.shape)
Feature_1985_2023.to_excel(
    "../processed_data/Features_crime_data_1985-2023.xlsx", sheet_name="Crime_Data", index=False
)

(602, 36)
(2068, 36)
