In [1]:
# import modules
import pandas as pd
import numpy as np
import os

# Work on Crime and Clearances, and Pop and Area

In [2]:
# DataFrame for Crimes and clearances with Arson
crime_clearance_df = pd.read_csv(
    "../data/Crimes_and_Clearances_with_Arson-1985-2023.csv"
)

# DataFrame for Pop and area by county
pop_area = pd.read_csv("../data/Pop_and_area_by_county_1980_to_2024.csv")

  crime_clearance_df = pd.read_csv(


The message above indicates that some of the columns have mixed data type. We can resolve this below:
- First check which columns have non-numeric data types.
- Check which columns have mixed data types. 

In [3]:
## Columns with Non numeric dtypes
non_numeric_cols = crime_clearance_df.select_dtypes(include=["object"]).columns
print(non_numeric_cols)

Index(['County', 'NCICCode', 'TotalStructural_sum', 'TotalMobile_sum',
       'TotalOther_sum', 'GrandTotal_sum', 'GrandTotClr_sum'],
      dtype='object')


In [4]:
def mixed_type_columns(df: pd.DataFrame) -> list:
    """
    Look for columns with mixed types

    Parameters:
    df(pd.DataFrame)

    Returns:
    list: A list of columes with mixed dtypes
    """
    mixed_columns = []
    for column in df.columns:
        types_in_column = df[column].map(type).unique()
        if len(types_in_column) > 1:
            mixed_columns.append(column)
    return mixed_columns

In [5]:
# The columns with mixed dtypes
mixed_columns = mixed_type_columns(crime_clearance_df)
print(mixed_columns)

['TotalStructural_sum', 'TotalMobile_sum', 'TotalOther_sum', 'GrandTotal_sum', 'GrandTotClr_sum']


In [6]:
cca_df = crime_clearance_df.copy()  # A copy of crime_clearance_df

# Resolve the issue with mixed dtypes
cca_df[mixed_columns] = cca_df[mixed_columns].apply(pd.to_numeric, errors="coerce")

In [7]:
# No mixed types in the copy of the dataframe.
mixed_columns = mixed_type_columns(cca_df)
print(mixed_columns == [])

True


In [8]:
# Remove the column
cca_df = cca_df.drop(["NCICCode"], axis=1)
cca_df.head()

Unnamed: 0,Year,County,Violent_sum,Homicide_sum,ForRape_sum,Robbery_sum,AggAssault_sum,Property_sum,Burglary_sum,VehicleTheft_sum,...,MVPLARnao_sum,BILARnao_sum,FBLARnao_sum,COMLARnao_sum,AOLARnao_sum,LT400nao_sum,LT200400nao_sum,LT200nao_sum,LT50200nao_sum,LT50nao_sum
0,1985,Alameda County,427,3,27,166,231,3964,1483,353,...,109,205,44,11,475,753.0,437.0,,440,498
1,1985,Alameda County,405,7,15,220,163,4486,989,260,...,673,516,183,53,559,540.0,622.0,,916,1159
2,1985,Alameda County,101,1,4,58,38,634,161,55,...,62,39,46,17,37,84.0,68.0,,128,138
3,1985,Alameda County,1164,11,43,660,450,12035,2930,869,...,508,611,1877,18,496,533.0,636.0,,2793,4274
4,1985,Alameda County,146,0,5,82,59,971,205,102,...,153,16,85,24,169,217.0,122.0,,161,164


In [9]:
# Custom function to remove County from the values in the column County


def remove_county(text: str) -> str:
    """
    Remove County from string
    """
    return text.replace(" County", "")


assert remove_county("Hello County") == "Hello"
assert remove_county("Hello World County") == "Hello World"

In [10]:
# Apply the remove_county to the dataframe cca_df
cca_df["County"] = cca_df["County"].apply(remove_county)
cca_df["Year"] = cca_df["Year"].astype(str)

# Group by 'County' and 'Year'
cca_grouped_df = cca_df.groupby(["County", "Year"]).sum()

In [11]:
cca_grouped_df.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Violent_sum,Homicide_sum,ForRape_sum,Robbery_sum,AggAssault_sum,Property_sum,Burglary_sum,VehicleTheft_sum,LTtotal_sum,ViolentClr_sum,...,MVPLARnao_sum,BILARnao_sum,FBLARnao_sum,COMLARnao_sum,AOLARnao_sum,LT400nao_sum,LT200400nao_sum,LT200nao_sum,LT50200nao_sum,LT50nao_sum
County,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Alameda,1985,11628,143,791,5427,5267,89297,24997,7142,57158,5429,...,5728,3926,10664,406,9163,7744.0,7787.0,0.0,14473,27154
Alameda,1986,12495,174,820,5971,5530,90167,24392,7896,57879,5570,...,5449,3380,9575,465,9552,9048.0,7482.0,0.0,13459,27890
Alameda,1987,11703,147,770,5019,5767,88306,22399,8909,56998,6303,...,5445,2954,8687,256,8817,11437.0,8132.0,0.0,10845,26584


In [12]:
# Create a new feature 'crime_rate' for each county and year: crime_rate = Violentsum/population
violent = [
    "Violent_sum",
    "Homicide_sum",
    "ForRape_sum",
    "Robbery_sum",
    "AggAssault_sum",
    "ViolentClr_sum",
    "HomicideClr_sum",
    "ForRapeClr_sum",
    "RobberyClr_sum",
    "AggAssaultClr_sum",
]

property = [
    "Property_sum",
    "Burglary_sum",
    "VehicleTheft_sum",
    "LTtotal_sum",
    "PropertyClr_sum",
    "BurglaryClr_sum",
    "VehicleTheftClr_sum",
    "LTtotalClr_sum",
]

In [13]:
crime_data = cca_grouped_df[violent + property]
crime_data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Violent_sum,Homicide_sum,ForRape_sum,Robbery_sum,AggAssault_sum,ViolentClr_sum,HomicideClr_sum,ForRapeClr_sum,RobberyClr_sum,AggAssaultClr_sum,Property_sum,Burglary_sum,VehicleTheft_sum,LTtotal_sum,PropertyClr_sum,BurglaryClr_sum,VehicleTheftClr_sum,LTtotalClr_sum
County,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
Alameda,1985,11628,143,791,5427,5267,5429,91,445,1517,3376,89297,24997,7142,57158,15409,3117,1607,10685
Alameda,1986,12495,174,820,5971,5530,5570,114,532,1545,3379,90167,24392,7896,57879,15121,2899,1698,10524
Alameda,1987,11703,147,770,5019,5767,6303,91,511,1569,4132,88306,22399,8909,56998,16380,2848,2189,11343
Alameda,1988,10963,159,722,4863,5219,5708,100,498,1545,3565,92745,22308,11080,59357,16747,2671,2533,11543
Alameda,1989,10563,172,670,4879,4842,5250,98,453,1496,3203,92888,21311,12556,59021,16171,2539,2560,11072


In [14]:
# Columns for new dataframe
list_of_cols = [
    "year",
    "county",
    "population",
    "crime_rate",
    "clearance_rate",
    "population_density",
    "vacancy_rate",
    "number_of_person_in_household",
    "mobile_home_ratio",
    "percent_in_poverty",
    "adjusted_median_income",
    "unemployment_rate",
    "dropout_rate",
    "public_school_rate",
    "no_highschool_rate",
    "uninsured_rate",
    "house_affordability",
    "adj_police_budget",
    "adj_education_budget",
    "adj_welfare_budget",
    "adj_mental_health_budget",
    "adj_rehab_budget",
    "adj_health_budget",
    "adj_judiciary_budget",
    "adj_prison_budget",
    "median_age",
    "home_ownership_rate",
    "rent_burden",
]

In [15]:
pop_area = pop_area.rename(columns={"COUNTY": "County", "Area (sq mi)": "Area_sq_mi"})
pop_area = pop_area.fillna(0)

pop_area_copy = pop_area.copy()  # Make a copy
pop_area_copy.head(3)

Unnamed: 0,County,Area_sq_mi,1981,1982,1983,1984,1985,1986,1987,1988,...,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
0,Alameda,738.0,1117800,1134000,1151800,1170400,1185500,1206900,1220600,1242300,...,1622205,1641983,1656919,1666247,1675964,1681337,1655767,1645265,1644199,1644569
1,Alpine,739.0,1090,1100,1120,1080,1100,1140,1130,1100,...,1190,1196,1201,1205,1201,1204,1181,1177,1166,1163
2,Amador,606.0,19800,20250,20600,21050,21800,22450,23300,25750,...,37453,37663,38807,39708,40227,40426,40224,40073,40028,39893


In [16]:
pop_area_copy = pop_area_copy.drop(["Area_sq_mi"], axis=1)
pop_area_copy["County"] = pop_area_copy["County"].apply(lambda x: str(x.strip()))
pop_area_copy.head()

Unnamed: 0,County,1981,1982,1983,1984,1985,1986,1987,1988,1989,...,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
0,Alameda,1117800,1134000,1151800,1170400,1185500,1206900,1220600,1242300,1261200,...,1622205,1641983,1656919,1666247,1675964,1681337,1655767,1645265,1644199,1644569
1,Alpine,1090,1100,1120,1080,1100,1140,1130,1100,1090,...,1190,1196,1201,1205,1201,1204,1181,1177,1166,1163
2,Amador,19800,20250,20600,21050,21800,22450,23300,25750,27600,...,37453,37663,38807,39708,40227,40426,40224,40073,40028,39893
3,Butte,146800,150700,153800,156600,159700,163000,166200,170800,175200,...,227400,228198,230412,231774,227263,216090,206058,206183,205741,206194
4,Calaveras,21350,22250,23200,23850,24650,25550,26800,28200,29700,...,45395,45402,45355,45367,45324,45290,45013,44771,44616,44436


In [17]:
# Convert pop_area_copy to the structure of crime_data
pop_index_county = pop_area_copy.set_index("County")  # Index dataframe by County
pop_stacked = pop_index_county.stack().to_frame(name="popupation")
pop_stacked["popupation"] = pop_stacked["popupation"].apply(
    lambda x: int(str(x.replace(",", "")))
)
pop_stacked.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 2596 entries, ('Alameda', '1981') to ('State Total', '2024')
Data columns (total 1 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   popupation  2596 non-null   int64
dtypes: int64(1)
memory usage: 29.3+ KB


In [18]:
# Create Column for Population and Area

crime_data_df = crime_data.copy()
crime_data_df["Population"] = pop_stacked[
    "popupation"
]  # Add column 'Population' to crime_data_df

area_df = pop_area.copy()
area_df = area_df[["County", "Area_sq_mi"]]
area_df["County"] = area_df["County"].apply(lambda x: str(x.strip()))

crime_data_df = crime_data_df.reset_index()  # Reset the index
crime_data_df = crime_data_df.merge(area_df, on="County", how="left")
crime_data_df = crime_data_df.set_index(["County", "Year"])

In [19]:
crime_data_df.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Violent_sum,Homicide_sum,ForRape_sum,Robbery_sum,AggAssault_sum,ViolentClr_sum,HomicideClr_sum,ForRapeClr_sum,RobberyClr_sum,AggAssaultClr_sum,Property_sum,Burglary_sum,VehicleTheft_sum,LTtotal_sum,PropertyClr_sum,BurglaryClr_sum,VehicleTheftClr_sum,LTtotalClr_sum,Population,Area_sq_mi
County,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alameda,1985,11628,143,791,5427,5267,5429,91,445,1517,3376,89297,24997,7142,57158,15409,3117,1607,10685,1185500,738.0
Alameda,1986,12495,174,820,5971,5530,5570,114,532,1545,3379,90167,24392,7896,57879,15121,2899,1698,10524,1206900,738.0
Alameda,1987,11703,147,770,5019,5767,6303,91,511,1569,4132,88306,22399,8909,56998,16380,2848,2189,11343,1220600,738.0


## Some features
- Crime rate
- Claerance rate
- Population density

In [20]:
crime_data_df["crime_rate"] = (
    crime_data_df["Violent_sum"] / crime_data_df["Population"]
)  # Crime rate
crime_data_df["clearance_rate"] = (
    crime_data_df["ViolentClr_sum"] / crime_data_df["Population"]
)  # Clearance rate
crime_data_df["population_density"] = (
    crime_data_df["Population"] / crime_data_df["Area_sq_mi"]
)

In [21]:
crime_data_df.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Violent_sum,Homicide_sum,ForRape_sum,Robbery_sum,AggAssault_sum,ViolentClr_sum,HomicideClr_sum,ForRapeClr_sum,RobberyClr_sum,AggAssaultClr_sum,...,LTtotal_sum,PropertyClr_sum,BurglaryClr_sum,VehicleTheftClr_sum,LTtotalClr_sum,Population,Area_sq_mi,crime_rate,clearance_rate,population_density
County,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Alameda,1985,11628,143,791,5427,5267,5429,91,445,1517,3376,...,57158,15409,3117,1607,10685,1185500,738.0,0.009809,0.00458,1606.368564
Alameda,1986,12495,174,820,5971,5530,5570,114,532,1545,3379,...,57879,15121,2899,1698,10524,1206900,738.0,0.010353,0.004615,1635.365854
Alameda,1987,11703,147,770,5019,5767,6303,91,511,1569,4132,...,56998,16380,2848,2189,11343,1220600,738.0,0.009588,0.005164,1653.929539


# Work on Unemployment 

In [22]:
# DataFrame for Unemployment rate
unemployment_rate_df = pd.read_excel("../data/Unemployment_rate_1990-2023.xlsx")

In [23]:
unemployment_rate_df = unemployment_rate_df.rename(
    columns={
        "County Name/State Abbreviation": "County",
        "unemployment rate(%)": "unemployment_rate",
    }
)
unemployment_rate_df.head()

Unnamed: 0,Code,Code.1,Code.2,County,Year,Unnamed: 5,Laber Force,Employed,Unemployed,unemployment_rate
0,CN0600100000000,6,1,"Alameda County, CA",2023,,826102,792439,33663,4.1
1,CN0600300000000,6,3,"Alpine County, CA",2023,,540,505,35,6.5
2,CN0600500000000,6,5,"Amador County, CA",2023,,14404,13673,731,5.1
3,CN0600700000000,6,7,"Butte County, CA",2023,,91910,87088,4822,5.2
4,CN0600900000000,6,9,"Calaveras County, CA",2023,,21956,21030,926,4.2


In [24]:
unemployment_rate_df["County"] = unemployment_rate_df["County"].apply(
    lambda x: x.replace(" County, CA", "").replace(" County/city, CA", "")
)
unemp_rate_df = unemployment_rate_df.copy()
unemp_rate_df = unemp_rate_df.fillna(0)
unemp_rate_df = unemp_rate_df[["County", "Year", "unemployment_rate"]]
unemp_rate_df["Year"] = unemp_rate_df["Year"].astype(str)
unemp_rate_df.set_index(["County", "Year"], inplace=True)

In [25]:
crime_data_df = crime_data_df.merge(unemp_rate_df, on=["County", "Year"], how="left")

In [26]:
crime_data_df.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Violent_sum,Homicide_sum,ForRape_sum,Robbery_sum,AggAssault_sum,ViolentClr_sum,HomicideClr_sum,ForRapeClr_sum,RobberyClr_sum,AggAssaultClr_sum,...,PropertyClr_sum,BurglaryClr_sum,VehicleTheftClr_sum,LTtotalClr_sum,Population,Area_sq_mi,crime_rate,clearance_rate,population_density,unemployment_rate
County,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Alameda,1985,11628,143,791,5427,5267,5429,91,445,1517,3376,...,15409,3117,1607,10685,1185500,738.0,0.009809,0.00458,1606.368564,
Alameda,1986,12495,174,820,5971,5530,5570,114,532,1545,3379,...,15121,2899,1698,10524,1206900,738.0,0.010353,0.004615,1635.365854,
Alameda,1987,11703,147,770,5019,5767,6303,91,511,1569,4132,...,16380,2848,2189,11343,1220600,738.0,0.009588,0.005164,1653.929539,
Alameda,1988,10963,159,722,4863,5219,5708,100,498,1545,3565,...,16747,2671,2533,11543,1242300,738.0,0.008825,0.004595,1683.333333,
Alameda,1989,10563,172,670,4879,4842,5250,98,453,1496,3203,...,16171,2539,2560,11072,1261200,738.0,0.008375,0.004163,1708.943089,


# Work on Median House and CPI

In [27]:
# DataFrame for Califonia_CPI
califonia_cpi_df = pd.read_excel("../data/California_CPI_1985_to_2023.xlsx")

# DataFrame for median household income
median_house_income = pd.read_excel("../data/Median_income_2000_and_2009_to_2023.xlsx")

In [28]:
califonia_cpi_df["Year"] = califonia_cpi_df["Year"].astype(str)

In [29]:
median_house_income["Year"] = median_house_income["Year"].astype(str)

In [30]:
median_house_cpi = median_house_income.merge(califonia_cpi_df, on="Year", how="left")
median_house_cpi["County"] = median_house_cpi["County"].apply(
    lambda x: x.replace(" County", "").strip()
)
median_house_cpi = median_house_cpi.rename(
    columns={"Median Household Income": "median_hse_income"}
)
median_house_cpi = median_house_cpi.set_index(["County", "Year"])
median_house_cpi["median_hse_income"] = median_house_cpi["median_hse_income"].astype(
    "float64"
)

In [31]:
median_house_cpi["adjusted_income"] = (
    median_house_cpi["median_hse_income"] / median_house_cpi.CPI
)
median_house_cpi.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,median_hse_income,CPI,adjusted_income
County,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
California,2023,95473.0,331.804,287.739147
Alameda,2023,119230.0,331.804,359.338646
Alpine,2023,83265.0,331.804,250.946342


In [32]:
crime_data_df = crime_data_df.merge(median_house_cpi, on=["County", "Year"], how="left")

In [33]:
crime_data_df.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Violent_sum,Homicide_sum,ForRape_sum,Robbery_sum,AggAssault_sum,ViolentClr_sum,HomicideClr_sum,ForRapeClr_sum,RobberyClr_sum,AggAssaultClr_sum,...,LTtotalClr_sum,Population,Area_sq_mi,crime_rate,clearance_rate,population_density,unemployment_rate,median_hse_income,CPI,adjusted_income
County,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Alameda,1985,11628,143,791,5427,5267,5429,91,445,1517,3376,...,10685,1185500,738.0,0.009809,0.00458,1606.368564,,,,
Alameda,1986,12495,174,820,5971,5530,5570,114,532,1545,3379,...,10524,1206900,738.0,0.010353,0.004615,1635.365854,,,,
Alameda,1987,11703,147,770,5019,5767,6303,91,511,1569,4132,...,11343,1220600,738.0,0.009588,0.005164,1653.929539,,,,
Alameda,1988,10963,159,722,4863,5219,5708,100,498,1545,3565,...,11543,1242300,738.0,0.008825,0.004595,1683.333333,,,,
Alameda,1989,10563,172,670,4879,4842,5250,98,453,1496,3203,...,11072,1261200,738.0,0.008375,0.004163,1708.943089,,,,


# Work on Poverty

In [34]:
poverty_rate_df = pd.read_excel("../data/Poverty_rate_2009_2023.xlsx")

In [35]:
poverty_rate_df.head()

Unnamed: 0,Year,ID,Name,Poverty Universe,Number in Poverty,90% Confidence Interval,Percent in Poverty,90% Confidence Interval.1
0,2023,6000,California,38249913,4597732,"4,546,196 to 4,649,268",12.0,11.9 to 12.1
1,2023,6001,Alameda County,1594026,151872,"138,959 to 164,785",9.5,8.7 to 10.3
2,2023,6003,Alpine County,1136,177,134 to 220,15.6,11.8 to 19.4
3,2023,6005,Amador County,37700,4400,"3,493 to 5,307",11.7,9.3 to 14.1
4,2023,6007,Butte County,203267,40532,"36,792 to 44,272",19.9,18.1 to 21.7


# Work on health Insurance

In [36]:
health_insurance_df = pd.read_excel("../data/Health_Insurance_2010_to_2023.xlsx") # Single file

In [37]:
# Combine all files in Extra-HI

extra_hi_dir = "../data/Extra-HI"

hi_dfs = {} # Dictionary to store data frames

# Loop over a list of files in the directory: Extra-HI
for filename in os.listdir(extra_hi_dir):
    # print(filename)
    file_path = os.path.join(extra_hi_dir, filename)
    raw_data = pd.read_excel(file_path, sheet_name="Data", header=[0,1]) 
    year = filename.strip("HI_, .xlsx")
    trans = raw_data.T
    trans["Year"] = year
    hi_dfs[year] = trans

### Combine all the dataframes into one dataframe 

In [39]:
#Combine all the dataframes into one dataframe

combine_his = pd.concat(hi_dfs.values(), axis=0) 
combine_his.columns= combine_his.iloc[0]
combine_his=combine_his[1:]
combine_his = combine_his.reset_index()
combine_his=combine_his.rename(columns={'2010':'Year', 'level_0':'County', 'level_1':'Label'}) 
combine_his["County"] = combine_his["County"].apply(
    lambda x: x.replace(" County, California", "").strip()
)
combine_his = combine_his.set_index(['County','Year'])


KeyError: "None of ['Year'] are in the columns"

In [45]:
combine_his.head(3)

"(Unnamed: 0_level_0, Label)",County,Label,EMPLOYMENT STATUS,Population 16 years and over,In labor force,Civilian labor force,Employed,Unemployed,Armed Forces,Not in labor force,...,Under 18 years,Related children of the householder under 18 years,Related children of the householder under 5 years,Related children of the householder 5 to 17 years,18 years and over,18 to 64 years,65 years and over,People in families,Unrelated individuals 15 years and over,2017
0,Alameda,Estimate,,1321464,880970,879485,826310,53175,1485,440494,...,(X),(X),(X),(X),(X),(X),(X),(X),(X),2017
1,Alameda,Margin of Error,,±738,"±3,225","±3,241","±3,553","±1,567",±231,"±3,429",...,(X),(X),(X),(X),(X),(X),(X),(X),(X),2017
2,Alameda,Percent,,1321464,66.7%,66.6%,62.5%,4.0%,0.1%,33.3%,...,13.0%,12.8%,12.6%,12.8%,10.8%,11.1%,9.4%,7.9%,24.9%,2017


In [45]:
health_insurance_df.head()

Unnamed: 0,Year,County,Label,Civilian noninstitutionalized population,With health insurance coverage,With private health insurance,With public coverage,No health insurance coverage
0,2023,"Alameda County, California",Estimate,1641321,1574283,1228986,502254,67038
1,2023,"Alpine County, California",Estimate,1695,1610,1119,734,85
2,2023,"Amador County, California",Estimate,37789,35568,26408,17708,2221
3,2023,"Butte County, California",Estimate,207385,194212,127086,98182,13173
4,2023,"Calaveras County, California",Estimate,45670,43263,28952,22699,2407


In [46]:
health_insurance_df.tail()

Unnamed: 0,Year,County,Label,Civilian noninstitutionalized population,With health insurance coverage,With private health insurance,With public coverage,No health insurance coverage
1547,2010,"Sutter County, California",Percent,92664,79.3%,51.1%,38.3%,20.7%
1548,2010,"Tulare County, California",Percent,440539,78.1%,46.6%,39.4%,21.9%
1549,2010,"Ventura County, California",Percent,816034,83.7%,68.3%,25.0%,16.3%
1550,2010,"Yolo County, California",Percent,199916,88.2%,73.7%,23.0%,11.8%
1551,2010,"Yuba County, California",Percent,68357,83.2%,49.4%,44.7%,16.8%


# Work on House Ownership data

In [48]:
hse_ownership_df = pd.read_excel(
    "../data/House_Ownership_rent_2010_to_2023.xlsx", header=None
)

In [49]:
hse_ownership_df.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3239,3240,3241,3242,3243,3244,3245,3246,3247,3248
0,Year,2023,2023,2023,2023,2023,2023,2023,2023,2023,...,2010,2010,2010,2010,2010,2010,2010,2010,2010,2010
1,County,"Alameda County, California","Alameda County, California","Alameda County, California","Alameda County, California","Alpine County, California","Alpine County, California","Alpine County, California","Alpine County, California","Amador County, California",...,"Ventura County, California","Ventura County, California","Yolo County, California","Yolo County, California","Yolo County, California","Yolo County, California","Yuba County, California","Yuba County, California","Yuba County, California","Yuba County, California"
2,Label,Estimate,Margin of Error,Percent,Percent Margin of Error,Estimate,Margin of Error,Percent,Percent Margin of Error,Estimate,...,Percent,Percent Margin of Error,Estimate,Estimate Margin of Error,Percent,Percent Margin of Error,Estimate,Estimate Margin of Error,Percent,Percent Margin of Error


In [50]:
hse_own_df = hse_ownership_df.copy()
hse_own_df = hse_own_df.T
hse_own_df.columns = hse_own_df.iloc[0]
hse_own_df = hse_own_df[1:].reset_index(drop=True)

In [51]:
hse_own_df["County"] = hse_own_df["County"].apply(
    lambda x: x.replace(" County, California", "").strip()
)
hse_own_df = hse_own_df.set_index(["County", "Year"])

In [52]:
hse_own_df.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Label,Total housing units,Occupied housing units,Owner-occupied,Renter-occupied,Less than 15.0 percent,15.0 to 19.9 percent,20.0 to 24.9 percent,25.0 to 29.9 percent,30.0 to 34.9 percent,35.0 percent or more
County,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Alameda,2023,Estimate,630726,593117,320712,272405,32899,34476,33678,29869,23770,106282
Alameda,2023,Margin of Error,±295,"±1,676","±2,874","±2,581","±1,393","±1,609","±1,269","±1,279","±1,155","±2,542"
Alameda,2023,Percent,630726,94.0%,54.1%,45.9%,12.6%,13.2%,12.9%,11.4%,9.1%,40.7%
Alameda,2023,Percent Margin of Error,(X),±0.3,±0.4,±0.4,±0.5,±0.6,±0.5,±0.5,±0.5,±0.8
Alpine,2023,Estimate,1587,473,383,90,35,2,10,22,0,5
Alpine,2023,Margin of Error,±96,±103,±92,±52,±33,±4,±14,±25,±14,±6
Alpine,2023,Percent,1587,29.8%,81.0%,19.0%,47.3%,2.7%,13.5%,29.7%,0.0%,6.8%
Alpine,2023,Percent Margin of Error,(X),±5.8,±9.7,±9.7,±31.2,±4.9,±17.0,±26.5,±39.4,±10.3
Amador,2023,Estimate,18919,16066,12863,3203,252,472,415,310,333,1064
Amador,2023,Margin of Error,±47,±341,±457,±472,±117,±227,±176,±132,±202,±333


In [54]:
hse_own_df = hse_own_df.map(lambda x: x.replace(",", "").strip("±, %"))
hse_own_df["Total housing units"] = hse_own_df["Total housing units"].apply(
    lambda x: x.replace("(X)", "0"))

AttributeError: 'DataFrame' object has no attribute 'map'

In [448]:
hse_own_df[hse_own_df.columns[1:]] = hse_own_df[hse_own_df.columns[1:]].astype(
    "float64"
)

In [449]:
hse_own_df["home_ownership_rate"] = (
    hse_own_df["Owner-occupied"] / hse_own_df["Occupied housing units"]
)

In [450]:
hse_own_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Label,Total housing units,Occupied housing units,Owner-occupied,Renter-occupied,Less than 15.0 percent,15.0 to 19.9 percent,20.0 to 24.9 percent,25.0 to 29.9 percent,30.0 to 34.9 percent,35.0 percent or more,home_ownership_rate
County,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Alameda,2023,Estimate,630726.0,593117.0,320712.0,272405.0,32899.0,34476.0,33678.0,29869.0,23770.0,106282.0,0.540723
Alameda,2023,Margin of Error,295.0,1676.0,2874.0,2581.0,1393.0,1609.0,1269.0,1279.0,1155.0,2542.0,1.714797
Alameda,2023,Percent,630726.0,94.0,54.1,45.9,12.6,13.2,12.9,11.4,9.1,40.7,0.575532
Alameda,2023,Percent Margin of Error,0.0,0.3,0.4,0.4,0.5,0.6,0.5,0.5,0.5,0.8,1.333333
Alpine,2023,Estimate,1587.0,473.0,383.0,90.0,35.0,2.0,10.0,22.0,0.0,5.0,0.809725
...,...,...,...,...,...,...,...,...,...,...,...,...,...
Yolo,2010,Percent Margin of Error,0.0,0.8,1.1,1.1,1.4,1.2,1.5,1.3,1.3,2.1,1.375000
Yuba,2010,Estimate,27351.0,23750.0,14214.0,9536.0,603.0,1037.0,1029.0,1092.0,955.0,3680.0,0.598484
Yuba,2010,Estimate Margin of Error,220.0,494.0,525.0,540.0,142.0,290.0,243.0,247.0,221.0,407.0,1.062753
Yuba,2010,Percent,27351.0,86.8,59.8,40.2,7.2,12.4,12.3,13.0,11.4,43.8,0.688940


# Work on category: "Urban", "Suburban", "Rural"

In [55]:
# Define the criteria for Urban, Suburban, and Rural
conditions = [
    (crime_data_df["population_density"] > 1500),
    (crime_data_df["population_density"] <= 1500)
    & (crime_data_df["population_density"] >= 500),
    (crime_data_df["population_density"] < 500),
]

# The Categories: Index must much index of conditions
categories = ["Urban", "Suburban", "Rural"]

In [56]:
# Create the categorical variable 'area_type'
crime_data_df["county_category"] = np.select(conditions, categories, default="unknown")

In [57]:
crime_data_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Violent_sum,Homicide_sum,ForRape_sum,Robbery_sum,AggAssault_sum,ViolentClr_sum,HomicideClr_sum,ForRapeClr_sum,RobberyClr_sum,AggAssaultClr_sum,...,Population,Area_sq_mi,crime_rate,clearance_rate,population_density,unemployment_rate,median_hse_income,CPI,adjusted_income,county_category
County,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Alameda,1985,11628,143,791,5427,5267,5429,91,445,1517,3376,...,1185500,738.0,0.009809,0.004580,1606.368564,,,,,Urban
Alameda,1986,12495,174,820,5971,5530,5570,114,532,1545,3379,...,1206900,738.0,0.010353,0.004615,1635.365854,,,,,Urban
Alameda,1987,11703,147,770,5019,5767,6303,91,511,1569,4132,...,1220600,738.0,0.009588,0.005164,1653.929539,,,,,Urban
Alameda,1988,10963,159,722,4863,5219,5708,100,498,1545,3565,...,1242300,738.0,0.008825,0.004595,1683.333333,,,,,Urban
Alameda,1989,10563,172,670,4879,4842,5250,98,453,1496,3203,...,1261200,738.0,0.008375,0.004163,1708.943089,,,,,Urban
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Yuba,2019,326,7,41,61,217,134,8,6,13,107,...,79619,630.0,0.004095,0.001683,126.379365,6.2,56607.0,280.638,201.708250,Rural
Yuba,2020,404,2,37,57,308,161,2,5,26,128,...,81178,630.0,0.004977,0.001983,128.853968,10.6,56278.0,285.315,197.248655,Rural
Yuba,2021,354,6,35,70,243,149,7,9,21,112,...,82091,630.0,0.004312,0.001815,130.303175,8.4,60764.0,297.371,204.337343,Rural
Yuba,2022,279,5,33,40,201,121,4,7,18,92,...,82563,630.0,0.003379,0.001466,131.052381,5.6,63626.0,319.224,199.314588,Rural


# Work on Age folder to get the median age.

Conbine the Excels sheets

In [58]:
# Define the directory with the excel files

age_dir = "../data/Age"

age_dfs = {} # Dictionary to store dataframes

# Loop over a list of files in the directory: Age
for filename in os.listdir(age_dir):
    # print(filename)
    file_path = os.path.join(age_dir, filename)
    raw_data = pd.read_excel(file_path, sheet_name="Data", header=[0, 1])
    tranpose = raw_data.T
    tranpose.columns = tranpose.iloc[0]
    tranpose = tranpose[1:]
    tranpose = tranpose.reset_index()
    tranpose = tranpose.rename(columns={"level_0": "County"})
    tranpose["County"] = tranpose["County"].apply(
        lambda x: x.replace(" County, California", "").strip()
    )
    year = filename.strip("Age_, .xlsx")
    tranpose["Year"] = year
    tranpose.set_index(["County", "Year"], inplace=True)
    # tranpose = tranpose.drop['SUMMARY INDICATORS','PERCENT ALLOCATED','Sex', 'Age']
    age_dfs[year] = tranpose

### Combine all the dataframes into one dataframe 

In [59]:
combined_age_dfs = pd.concat(age_dfs.values(), axis=0)
combined_age_dfs.head()

Unnamed: 0_level_0,"(Unnamed: 0_level_0, Unnamed: 0_level_1)",level_1,Label,Total population,AGE,Under 5 years,5 to 9 years,10 to 14 years,15 to 19 years,20 to 24 years,25 to 29 years,...,Age dependency ratio,Old-Age dependency ratio,Child dependency ratio,PERCENT IMPUTED,Sex,Age,Under 18 years,21 years and over,Old-age dependency ratio,PERCENT ALLOCATED
County,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Alameda,2011,Total,Estimate,1494876,,6.6%,6.3%,6.0%,6.6%,7.0%,7.7%,...,50.9,16.6,34.3,,0.2%,1.5%,,,,
Alameda,2011,Total.1,Margin of Error,*****,,*****,±0.1,±0.1,±0.1,±0.1,*****,...,*****,*****,*****,,(X),(X),,,,
Alameda,2011,Male,Estimate,733297,,6.9%,6.6%,6.1%,6.9%,7.3%,7.8%,...,(X),(X),(X),,(X),(X),,,,
Alameda,2011,Male.1,Margin of Error,*****,,*****,±0.1,±0.1,±0.1,±0.1,*****,...,(X),(X),(X),,(X),(X),,,,
Alameda,2011,Female,Estimate,761579,,6.3%,5.9%,5.9%,6.3%,6.8%,7.6%,...,(X),(X),(X),,(X),(X),,,,


In [60]:
crime_data_df["median_age"] = combined_age_dfs[combined_age_dfs["level_1"] == "Total"][
    "Median age (years)"
]

In [61]:
crime_data_df.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Violent_sum,Homicide_sum,ForRape_sum,Robbery_sum,AggAssault_sum,ViolentClr_sum,HomicideClr_sum,ForRapeClr_sum,RobberyClr_sum,AggAssaultClr_sum,...,Area_sq_mi,crime_rate,clearance_rate,population_density,unemployment_rate,median_hse_income,CPI,adjusted_income,county_category,median_age
County,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Alameda,1985,11628,143,791,5427,5267,5429,91,445,1517,3376,...,738.0,0.009809,0.00458,1606.368564,,,,,Urban,
Alameda,1986,12495,174,820,5971,5530,5570,114,532,1545,3379,...,738.0,0.010353,0.004615,1635.365854,,,,,Urban,
Alameda,1987,11703,147,770,5019,5767,6303,91,511,1569,4132,...,738.0,0.009588,0.005164,1653.929539,,,,,Urban,
Alameda,1988,10963,159,722,4863,5219,5708,100,498,1545,3565,...,738.0,0.008825,0.004595,1683.333333,,,,,Urban,
Alameda,1989,10563,172,670,4879,4842,5250,98,453,1496,3203,...,738.0,0.008375,0.004163,1708.943089,,,,,Urban,


# Work on School Enrollment

In [62]:
from pathlib import Path

In [63]:

sch_enrol_dir = "../data/School_Enrollment"

sch_enrol_dfs = {} # A dictionary to store the Dataframes
raw_data = pd.DataFrame()

# Loop over a list of files in the directory: School_Enrollment
for filename in os.listdir(sch_enrol_dir):
    # print(filename)
    file_path = os.path.join(sch_enrol_dir, filename)
    # Convert the string to a Path object
    file_path = Path(file_path)
    if file_path.suffix == '.xlsx':
        raw_data = pd.read_excel(file_path, sheet_name="Data", header=[0,1,2,3]) 
    if file_path.suffix == '.csv':
        raw_data = pd.read_csv(file_path, header=[0,1,2,3]) 
    year = filename.replace('School_enrollment_', '').replace('.xlsx', '').replace('.csv', '')
    trans = raw_data.T
    trans["Year"] = year
    sch_enrol_dfs[year] = trans

In [64]:
sch_enrol_dfs['2010']

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,0,1,2,3,4,5,6,7,8,9,...,24,25,26,27,28,29,30,31,32,Year
Unnamed: 0_level_0,Unnamed: 0_level_1,Unnamed: 0_level_2,Label,Population 3 years and over enrolled in school,"Nursery school, preschool",Kindergarten to 12th grade,Kindergarten,Elementary: grade 1 to grade 4,Elementary: grade 5 to grade 8,High school: grade 9 to grade 12,"College, undergraduate","Graduate, professional school",Percent of age group enrolled in school --,...,Population 18 to 24 years,Enrolled in college or graduate school,Males 18 to 24 years,Enrolled in college or graduate school,Females 18 to 24 years,Enrolled in college or graduate school,PERCENT IMPUTED,School enrollment,Grade enrolled,2010
"Alameda County, California",Total,Unnamed: 1_level_2,Estimate,407491,26241,244741,19247,73625,71806,80063,106146,30363,,...,144102,51.3%,73360,46.4%,70742,56.4%,,3.1%,5.5%,2010
"Alameda County, California",Total,Unnamed: 2_level_2,Margin of Error,"±3,036",±950,"±1,355","±1,180","±1,357","±1,478","±1,360","±2,469","±1,489",,...,*****,±1.1,*****,±1.6,*****,±1.5,,(X),(X),2010
"Alameda County, California",Percent of enrolled population,In public school,Estimate,84.4%,44.0%,89.0%,86.2%,87.8%,88.7%,90.9%,88.5%,68.2%,,...,(X),90.0%,(X),91.4%,(X),88.9%,,(X),(X),2010
"Alameda County, California",Percent of enrolled population,In public school,Margin of Error,±0.5,±2.5,±0.5,±1.6,±0.9,±0.9,±0.7,±0.9,±2.4,,...,(X),±1.0,(X),±1.2,(X),±1.5,,(X),(X),2010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Yuba County, California",Total,Unnamed: 344_level_2,Margin of Error,±484,±214,±267,±243,±369,±387,±267,±426,±130,,...,±138,±4.1,±89,±5.6,±84,±6.2,,(X),(X),2010
"Yuba County, California",Percent of enrolled population,In public school,Estimate,91.9%,82.0%,93.8%,96.1%,94.4%,94.0%,92.5%,91.1%,55.1%,,...,(X),95.1%,(X),93.9%,(X),95.8%,,(X),(X),2010
"Yuba County, California",Percent of enrolled population,In public school,Margin of Error,±1.7,±7.7,±1.8,±3.9,±2.6,±3.0,±3.3,±3.3,±14.3,,...,(X),±2.8,(X),±5.4,(X),±3.5,,(X),(X),2010
"Yuba County, California",Percent of enrolled population,In private school,Estimate,8.1%,18.0%,6.2%,3.9%,5.6%,6.0%,7.5%,8.9%,44.9%,,...,(X),4.9%,(X),6.1%,(X),4.2%,,(X),(X),2010


### Combine all the dataframes into one dataframe 

In [65]:
combine_sch_enrol = pd.concat(sch_enrol_dfs.values(), axis=0)
combine_sch_enrol.columns= combine_sch_enrol.iloc[0]
combine_sch_enrol=combine_sch_enrol[1:]
combine_sch_enrol = combine_sch_enrol.reset_index()
combine_sch_enrol=combine_sch_enrol.rename(columns={'2010':'Year', 'level_0':'County', 'level_3':'Label'}) 
combine_sch_enrol["County"] = combine_sch_enrol["County"].apply(
    lambda x: x.replace(" County, California", "").strip()
)
combine_sch_enrol = combine_sch_enrol.set_index(['County', 'Year'])


KeyError: "None of ['Year'] are in the columns"

In [66]:
combine_sch_enrol.head()

"(Unnamed: 0_level_0, Unnamed: 0_level_1, Label, Population 3 years and over enrolled in school)",County,level_1,level_2,Label,"Nursery school, preschool",Kindergarten to 12th grade,Kindergarten,Elementary: grade 1 to grade 4,Elementary: grade 5 to grade 8,High school: grade 9 to grade 12,...,25 to 34 year olds enrolled in school,Population 35 years and over,35 years and over enrolled in school,Population 18 to 24 years,Enrolled in college or graduate school,Males 18 to 24 years,Enrolled in college or graduate school.1,Females 18 to 24 years,Enrolled in college or graduate school.2,2020
0,Alameda,Total,Estimate,411713,29302,243123,20675,73055,74162,75231,...,38994,905488,24906,136997,75907,68351,34971,68646,40936,2020
1,Alameda,Total,Margin of Error,"±3,188","±1,191","±1,069","±1,010","±1,549","±1,561",±962,...,"±1,757",*****,"±1,333",*****,"±1,756",*****,"±1,123",*****,"±1,183",2020
2,Alameda,Percent,Estimate,(X),7.1%,59.1%,5.0%,17.7%,18.0%,18.3%,...,14.1%,(X),2.8%,(X),55.4%,(X),51.2%,(X),59.6%,2020
3,Alameda,Percent,Margin of Error,(X),±0.3,±0.5,±0.2,±0.4,±0.4,±0.3,...,±0.6,(X),±0.1,(X),±1.3,(X),±1.6,(X),±1.7,2020
4,Alameda,In public school,Estimate,(X),12713,218540,18178,65510,66145,68707,...,30176,(X),17686,(X),68772,(X),32252,(X),36520,2020


# Work on Expenditure

In [67]:
expenditure_df = pd.read_excel("../data/Expenditure_2003_to_2023.xlsx")

In [68]:
expenditure_df.head(4)

Unnamed: 0,Entity Name,Entity ID,Fiscal Year,Police Protection_Total Governmental Funds_,Total Education_Total Governmental Funds,Total Public Assistance_Total Governmental Funds,Mental Health_Total Governmental Funds_Health,Drug and Alcohol Abuse Services_Total Governmental Funds_Health,Total Health_Total Governmental Funds,Total Judicial_Total Governmental Funds,Total Detention and Correction_Total Governmental Funds
0,Alameda,1,2023,136000000,39064698.0,1044985620,503588373.0,59132457.0,1100494826,253504410,433901696
1,Alpine,2,2023,3803427,684938.0,3104497,1997691.0,267679.0,3897922,573927,704258
2,Amador,3,2023,13793355,1087355.0,16887389,9895140.0,936709.0,14857378,7175470,9139719
3,Butte,4,2023,23968804,4206603.0,156340270,68213421.0,6190888.0,98478974,33032555,52562192


In [69]:
expenditure_df.columns

Index(['Entity Name', 'Entity ID', 'Fiscal Year',
       'Police Protection_Total Governmental Funds_',
       'Total Education_Total Governmental Funds',
       'Total Public Assistance_Total Governmental Funds',
       'Mental Health_Total Governmental Funds_Health',
       'Drug and Alcohol Abuse Services_Total Governmental Funds_Health',
       'Total Health_Total Governmental Funds',
       'Total Judicial_Total Governmental Funds',
       'Total Detention and Correction_Total Governmental Funds'],
      dtype='object')

In [70]:
expenditure_df = expenditure_df.rename(columns={'Entity Name': 'County', 'Fiscal Year':'Year'})

In [71]:
expenditure_df = expenditure_df.set_index(['County', 'Year'])

In [72]:
expenditure_df.head(50)

Unnamed: 0_level_0,Unnamed: 1_level_0,Entity ID,Police Protection_Total Governmental Funds_,Total Education_Total Governmental Funds,Total Public Assistance_Total Governmental Funds,Mental Health_Total Governmental Funds_Health,Drug and Alcohol Abuse Services_Total Governmental Funds_Health,Total Health_Total Governmental Funds,Total Judicial_Total Governmental Funds,Total Detention and Correction_Total Governmental Funds
County,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Alameda,2023,1,136000000,39064698.0,1044985620,503588400.0,59132457.0,1100494826,253504410,433901696
Alpine,2023,2,3803427,684938.0,3104497,1997691.0,267679.0,3897922,573927,704258
Amador,2023,3,13793355,1087355.0,16887389,9895140.0,936709.0,14857378,7175470,9139719
Butte,2023,4,23968804,4206603.0,156340270,68213420.0,6190888.0,98478974,33032555,52562192
Calaveras,2023,5,13471436,1054402.0,26934694,13070810.0,1200171.0,21193409,5183073,9579066
Colusa,2023,6,7829504,1193397.0,13693497,12151220.0,752539.0,20288594,3986775,8736098
Contra Costa,2023,7,232000000,38418100.0,616905208,205768100.0,18283736.0,368830012,119753157,188893018
Del Norte,2023,8,4780030,90797.0,30288024,9971184.0,1127830.0,14999338,4148599,8540026
El Dorado,2023,9,53438607,4172017.0,83673581,30919460.0,4936155.0,72701186,29761470,41183923
Fresno,2023,10,154000000,32455348.0,802852571,351151200.0,,436316794,139964827,232744450


# Work on Religion

In [231]:
# Specify the path
path = '../data/Religion'

# Get the list of all files in the directory
file_names = [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]
file_names.remove('.DS_Store')
file_names = sorted(file_names)

# Create an empty dataframe
religion_df = pd.DataFrame()

# For each file get the counts of adherents belonging to each tradition
for file in file_names:
    data = pd.read_csv(path + '/' + file, encoding='latin-1')
    max_index = data.shape[0]-1
    county = file[:-9].replace('_',' ')
    year = file[-8:-4]
    religion_dict = {'County' : county, 'Year': year}
    data.drop(max_index-1, inplace=True)
    
    try:
        data['Adherents'] = data['Adherents'].str.replace(',','')
        data['Adherents'] = pd.to_numeric(data['Adherents'])
    except AttributeError:
        continue
    
    counts = data.groupby('Tradition')['Adherents'].sum()
    for tradition in counts.index:
        religion_dict[tradition] = counts[tradition]
    
    # Convert the dictionary to a DataFrame
    religion_dict_df = pd.DataFrame([religion_dict])

    # Use pd.concat() to append the new row
    religion_df = pd.concat([religion_df, religion_dict_df], ignore_index=True)
    
religion_df = religion_df.set_index(['County', 'Year'])
religion_df = religion_df.drop(columns= {'Â\xa0'})
traditions = religion_df.columns
religion_df = religion_df.fillna(0)
religion_df.head()
    

Unnamed: 0_level_0,Unnamed: 1_level_0,Black Protestant,Buddhism,Catholic,Evangelical Protestant,Hinduism,Islam,Jehovah's Witnesses,Judaism,Latter-day Saints,Mainline Protestant,Orthodox,Other,Other Christians
County,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Alameda,2010,20716.0,14379.0,231500.0,123462.0,6339.0,29941.0,0.0,9214.0,24929.0,40185.0,6640.0,1125.0,1025.0
Alameda,2020,43178.0,19118.0,322607.0,118752.0,33209.0,57322.0,12886.0,8445.0,26502.0,33200.0,13362.0,1097.0,835.0
Amador,2010,0.0,0.0,3887.0,3288.0,16.0,0.0,0.0,0.0,1196.0,435.0,150.0,4.0,0.0
Amador,2020,0.0,0.0,5139.0,3670.0,0.0,0.0,630.0,0.0,1164.0,162.0,160.0,4.0,0.0
Butte,2010,475.0,236.0,34101.0,22822.0,0.0,1109.0,0.0,32.0,9357.0,6272.0,280.0,218.0,79.0


In [232]:
# Create values for the years from 2010 to 2024 by using linear interpolation

# Get unique counties
counties = religion_df.index.get_level_values('County').unique()

# Create a new MultiIndex with all years from 2010 to 2024
years = [str(year) for year in range(2010, 2025) ]
new_index = pd.MultiIndex.from_product([counties, years], names=['County', 'Year'])

# Reindex the DataFrame
religion_df = religion_df.reindex(new_index)

# Fill missing values with 0 (optional)
#religion_df = religion_df.fillna(0)

# Fill NaN values by interpolate method

columns = religion_df.columns.to_list()

# Convert year index to integer for interpolation
religion_df = religion_df.reset_index()
religion_df['Year'] = religion_df['Year'].astype(int)  # Convert year to integer for interpolation

from scipy import interpolate

def interp_func(x):
    known_x = x.dropna().index.to_numpy(dtype=float)  # Convert to float
    known_y = x.dropna().values
    
    if len(known_x) < 2:
        return x  # Not enough points to interpolate
    f = interpolate.interp1d(known_x, known_y, kind='linear')
    return pd.Series(f(x.index.to_numpy(dtype=float)), index=x.index)

for column in columns:
    religion_df[column] = religion_df.groupby('County', group_keys=False)[column].apply(lambda x: x.interpolate(method='linear', limit_direction='both').round().astype('Int64'))
    religion_df[column] = religion_df.groupby('County', group_keys=False)[column].apply(interp_func)
    
# Set the MultiIndex back
religion_df = religion_df.set_index(['County', 'Year'])

# Replace negative values with 0
religion_df = religion_df.applymap(lambda x: 0 if x < 0 else x)

# Add a new column for the total number of adherents
religion_df['total_adherents'] = religion_df.sum(axis=1)

# Reshape population_df
population_melted = pop_area_copy.set_index('County').melt(ignore_index=False, var_name='Year', value_name='Population').reset_index()
population_melted['Year'] = population_melted['Year'].astype(int)
population_melted = population_melted.set_index(['County', 'Year'])

# Merge with religion_df
religion_df = religion_df.merge(population_melted, left_index=True, right_index=True, how='left')

# Make the data type of Population float
religion_df['Population'] = religion_df['Population'].str.replace(',', '').astype(float)

# Define a new variable adherent_rate = total_adherents / Population
religion_df['adherent_rate'] = religion_df['total_adherents'] / religion_df['Population']
religion_df.head()


Unnamed: 0_level_0,Unnamed: 1_level_0,Black Protestant,Buddhism,Catholic,Evangelical Protestant,Hinduism,Islam,Jehovah's Witnesses,Judaism,Latter-day Saints,Mainline Protestant,Orthodox,Other,Other Christians,total_adherents,Population,adherent_rate
County,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
Alameda,2010,20716.0,14379.0,231500.0,123462.0,6339.0,29941.0,0.0,9214.0,24929.0,40185.0,6640.0,1125.0,1025.0,509455.0,1510271.0,0.337327
Alameda,2011,22962.0,14853.0,240611.0,122991.0,9026.0,32679.0,1289.0,9137.0,25086.0,39486.0,7312.0,1122.0,1006.0,527560.0,1527169.0,0.34545
Alameda,2012,25208.0,15327.0,249721.0,122520.0,11713.0,35417.0,2577.0,9060.0,25244.0,38788.0,7984.0,1119.0,987.0,545665.0,1549193.0,0.352225
Alameda,2013,27455.0,15801.0,258832.0,122049.0,14400.0,38155.0,3866.0,8983.0,25401.0,38090.0,8657.0,1117.0,968.0,563774.0,1575139.0,0.35792
Alameda,2014,29701.0,16275.0,267943.0,121578.0,17087.0,40893.0,5154.0,8906.0,25558.0,37391.0,9329.0,1114.0,949.0,581878.0,1597747.0,0.364187


In [224]:
# Present basic statistics of adherent_rate

religion_df['adherent_rate'].mean()

0.391942085881901

In [275]:
# Define a new variable rdm = religion_diversity_measure '1: poor, 2:average, 3:good, 4:very good'

def diversity_measure(x):
    if x <5: return 1
    elif x == 5: return 2
    elif x < 8: return 3
    else: return 4

religion_df['rdm'] = religion_df.apply(lambda row: sum(row[tradition] > row['total_adherents'] * 0.02 for tradition in traditions), axis=1)

religion_df['rdm'] = religion_df['rdm'].apply(diversity_measure)

religion_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Black Protestant,Buddhism,Catholic,Evangelical Protestant,Hinduism,Islam,Jehovah's Witnesses,Judaism,Latter-day Saints,Mainline Protestant,Orthodox,Other,Other Christians,total_adherents,Population,adherent_rate,rdm,rdm_pure
County,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
Alameda,2010,20716.0,14379.0,231500.0,123462.0,6339.0,29941.0,0.0,9214.0,24929.0,40185.0,6640.0,1125.0,1025.0,509455.0,1510271.0,0.337327,3,7
Alameda,2011,22962.0,14853.0,240611.0,122991.0,9026.0,32679.0,1289.0,9137.0,25086.0,39486.0,7312.0,1122.0,1006.0,527560.0,1527169.0,0.34545,3,7
Alameda,2012,25208.0,15327.0,249721.0,122520.0,11713.0,35417.0,2577.0,9060.0,25244.0,38788.0,7984.0,1119.0,987.0,545665.0,1549193.0,0.352225,4,8
Alameda,2013,27455.0,15801.0,258832.0,122049.0,14400.0,38155.0,3866.0,8983.0,25401.0,38090.0,8657.0,1117.0,968.0,563774.0,1575139.0,0.35792,4,8
Alameda,2014,29701.0,16275.0,267943.0,121578.0,17087.0,40893.0,5154.0,8906.0,25558.0,37391.0,9329.0,1114.0,949.0,581878.0,1597747.0,0.364187,4,8
