In [257]:
# import modules
import pandas as pd
import numpy as np

# Work on Crime and Clearances, and Pop and Area

In [3]:
# DataFrame for Crimes and clearances with Arson
crime_clearance_df = pd.read_csv(
    "../data/Crimes_and_Clearances_with_Arson-1985-2023.csv"
)

# DataFrame for Pop and area by county
pop_area = pd.read_csv("../data/Pop_and_area_by_county_1980_to_2024.csv")

  crime_clearance_df = pd.read_csv("../data/Crimes_and_Clearances_with_Arson-1985-2023.csv")


The message above indicates that some of the columns have mixed data type. We can resolve this below:
- First check which columns have non-numeric data types.
- Check which columns have mixed data types. 

In [4]:
## Columns with Non numeric dtypes
non_numeric_cols = crime_clearance_df.select_dtypes(include=["object"]).columns
print(non_numeric_cols)

Index(['County', 'NCICCode', 'TotalStructural_sum', 'TotalMobile_sum',
       'TotalOther_sum', 'GrandTotal_sum', 'GrandTotClr_sum'],
      dtype='object')


In [5]:
def mixed_type_columns(df: pd.DataFrame) -> list:
    """
    Look for columns with mixed types

    Parameters:
    df(pd.DataFrame)

    Returns:
    list: A list of columes with mixed dtypes
    """
    mixed_columns = []
    for column in df.columns:
        types_in_column = df[column].map(type).unique()
        if len(types_in_column) > 1:
            mixed_columns.append(column)
    return mixed_columns

In [6]:
# The columns with mixed dtypes
mixed_columns = mixed_type_columns(crime_clearance_df)
print(mixed_columns)

['TotalStructural_sum', 'TotalMobile_sum', 'TotalOther_sum', 'GrandTotal_sum', 'GrandTotClr_sum']


In [7]:
cca_df = crime_clearance_df.copy()  # A copy of rime_clearance_df

# Resolve the issue with mixed dtypes
cca_df[mixed_columns] = cca_df[mixed_columns].apply(pd.to_numeric, errors="coerce")

In [8]:
# No mixed types in the copy of the dataframe.
mixed_columns = mixed_type_columns(cca_df)
print(mixed_columns == [])

True


In [9]:
# Remove the column
cca_df = cca_df.drop(["NCICCode"], axis=1)
cca_df.head()

Unnamed: 0,Year,County,Violent_sum,Homicide_sum,ForRape_sum,Robbery_sum,AggAssault_sum,Property_sum,Burglary_sum,VehicleTheft_sum,...,MVPLARnao_sum,BILARnao_sum,FBLARnao_sum,COMLARnao_sum,AOLARnao_sum,LT400nao_sum,LT200400nao_sum,LT200nao_sum,LT50200nao_sum,LT50nao_sum
0,1985,Alameda County,427,3,27,166,231,3964,1483,353,...,109,205,44,11,475,753.0,437.0,,440,498
1,1985,Alameda County,405,7,15,220,163,4486,989,260,...,673,516,183,53,559,540.0,622.0,,916,1159
2,1985,Alameda County,101,1,4,58,38,634,161,55,...,62,39,46,17,37,84.0,68.0,,128,138
3,1985,Alameda County,1164,11,43,660,450,12035,2930,869,...,508,611,1877,18,496,533.0,636.0,,2793,4274
4,1985,Alameda County,146,0,5,82,59,971,205,102,...,153,16,85,24,169,217.0,122.0,,161,164


In [10]:
# Custom function to remove County from the values in the column County


def remove_county(text: str) -> str:
    """
    Remove County from string
    """
    return text.replace(" County", "")


assert remove_county("Hello County") == "Hello"
assert remove_county("Hello World County") == "Hello World"

In [11]:
# Apply the remove_county to the dataframe cca_df
cca_df["County"] = cca_df["County"].apply(remove_county)
cca_df["Year"] = cca_df["Year"].astype(str)

# Group by 'County' and 'Year'
cca_grouped_df = cca_df.groupby(["County", "Year"]).sum()

In [12]:
cca_grouped_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Violent_sum,Homicide_sum,ForRape_sum,Robbery_sum,AggAssault_sum,Property_sum,Burglary_sum,VehicleTheft_sum,LTtotal_sum,ViolentClr_sum,...,MVPLARnao_sum,BILARnao_sum,FBLARnao_sum,COMLARnao_sum,AOLARnao_sum,LT400nao_sum,LT200400nao_sum,LT200nao_sum,LT50200nao_sum,LT50nao_sum
County,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Alameda,1985,11628,143,791,5427,5267,89297,24997,7142,57158,5429,...,5728,3926,10664,406,9163,7744.0,7787.0,0.0,14473,27154
Alameda,1986,12495,174,820,5971,5530,90167,24392,7896,57879,5570,...,5449,3380,9575,465,9552,9048.0,7482.0,0.0,13459,27890
Alameda,1987,11703,147,770,5019,5767,88306,22399,8909,56998,6303,...,5445,2954,8687,256,8817,11437.0,8132.0,0.0,10845,26584
Alameda,1988,10963,159,722,4863,5219,92745,22308,11080,59357,5708,...,4971,3183,8103,367,10482,12588.0,8538.0,0.0,11133,27098
Alameda,1989,10563,172,670,4879,4842,92888,21311,12556,59021,5250,...,4998,3702,7386,322,8284,13458.0,8758.0,0.0,11590,25215


In [13]:
# Create a new feature 'crime_rate' for each county and year: crime_rate = Violentsum/poplation
violent = [
    "Violent_sum",
    "Homicide_sum",
    "ForRape_sum",
    "Robbery_sum",
    "AggAssault_sum",
    "ViolentClr_sum",
    "HomicideClr_sum",
    "ForRapeClr_sum",
    "RobberyClr_sum",
    "AggAssaultClr_sum",
]

property = [
    "Property_sum",
    "Burglary_sum",
    "VehicleTheft_sum",
    "LTtotal_sum",
    "PropertyClr_sum",
    "BurglaryClr_sum",
    "VehicleTheftClr_sum",
    "LTtotalClr_sum",
]

In [14]:
crime_data = cca_grouped_df[violent + property]
crime_data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Violent_sum,Homicide_sum,ForRape_sum,Robbery_sum,AggAssault_sum,ViolentClr_sum,HomicideClr_sum,ForRapeClr_sum,RobberyClr_sum,AggAssaultClr_sum,Property_sum,Burglary_sum,VehicleTheft_sum,LTtotal_sum,PropertyClr_sum,BurglaryClr_sum,VehicleTheftClr_sum,LTtotalClr_sum
County,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
Alameda,1985,11628,143,791,5427,5267,5429,91,445,1517,3376,89297,24997,7142,57158,15409,3117,1607,10685
Alameda,1986,12495,174,820,5971,5530,5570,114,532,1545,3379,90167,24392,7896,57879,15121,2899,1698,10524
Alameda,1987,11703,147,770,5019,5767,6303,91,511,1569,4132,88306,22399,8909,56998,16380,2848,2189,11343
Alameda,1988,10963,159,722,4863,5219,5708,100,498,1545,3565,92745,22308,11080,59357,16747,2671,2533,11543
Alameda,1989,10563,172,670,4879,4842,5250,98,453,1496,3203,92888,21311,12556,59021,16171,2539,2560,11072


In [15]:
# Columns for new dataframe
list_of_cols = [
    "year",
    "county",
    "population",
    "crime_rate",
    "clearance_rate",
    "population_density",
    "vacancy_rate",
    "number_of_person_in_household",
    "mobile_home_ratio",
    "percent_in_poverty",
    "adjusted_median_income",
    "unemployment_rate",
    "dropout_rate",
    "public_school_rate",
    "no_highschool_rate",
    "uninsured_rate",
    "house_affordability",
    "adj_police_budget",
    "adj_education_budget",
    "adj_welfare_budget",
    "adj_mental_health_budget",
    "adj_rehab_budget",
    "adj_health_budget",
    "adj_judiciary_budget",
    "adj_prison_budget",
    "median_age",
    "home_ownership_rate",
    "rent_burden",
]

In [16]:
pop_area = pop_area.rename(columns={"COUNTY": "County", "Area (sq mi)": "Area_sq_mi"})
pop_area = pop_area.fillna(0)

pop_area_copy = pop_area.copy()  # Make a copy
pop_area_copy.head()

Unnamed: 0,County,Area_sq_mi,1981,1982,1983,1984,1985,1986,1987,1988,...,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
0,Alameda,738.0,1117800,1134000,1151800,1170400,1185500,1206900,1220600,1242300,...,1622205,1641983,1656919,1666247,1675964,1681337,1655767,1645265,1644199,1644569
1,Alpine,739.0,1090,1100,1120,1080,1100,1140,1130,1100,...,1190,1196,1201,1205,1201,1204,1181,1177,1166,1163
2,Amador,606.0,19800,20250,20600,21050,21800,22450,23300,25750,...,37453,37663,38807,39708,40227,40426,40224,40073,40028,39893
3,Butte,1604.0,146800,150700,153800,156600,159700,163000,166200,170800,...,227400,228198,230412,231774,227263,216090,206058,206183,205741,206194
4,Calaveras,1020.0,21350,22250,23200,23850,24650,25550,26800,28200,...,45395,45402,45355,45367,45324,45290,45013,44771,44616,44436


In [17]:
pop_area_copy = pop_area_copy.drop(["Area_sq_mi"], axis=1)
pop_area_copy["County"] = pop_area_copy["County"].apply(lambda x: str(x.strip()))
pop_area_copy.head()

Unnamed: 0,County,1981,1982,1983,1984,1985,1986,1987,1988,1989,...,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
0,Alameda,1117800,1134000,1151800,1170400,1185500,1206900,1220600,1242300,1261200,...,1622205,1641983,1656919,1666247,1675964,1681337,1655767,1645265,1644199,1644569
1,Alpine,1090,1100,1120,1080,1100,1140,1130,1100,1090,...,1190,1196,1201,1205,1201,1204,1181,1177,1166,1163
2,Amador,19800,20250,20600,21050,21800,22450,23300,25750,27600,...,37453,37663,38807,39708,40227,40426,40224,40073,40028,39893
3,Butte,146800,150700,153800,156600,159700,163000,166200,170800,175200,...,227400,228198,230412,231774,227263,216090,206058,206183,205741,206194
4,Calaveras,21350,22250,23200,23850,24650,25550,26800,28200,29700,...,45395,45402,45355,45367,45324,45290,45013,44771,44616,44436


In [18]:
# Convert pop_area_copy to the structure of crime_data
pop_index_county = pop_area_copy.set_index("County")  # Index dataframe by County
pop_stacked = pop_index_county.stack().to_frame(name="popupation")

pop_stacked["popupation"] = pop_stacked["popupation"].apply(
    lambda x: int(str(x.replace(",", "")))
)
pop_stacked.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 2596 entries, ('Alameda', '1981') to ('State Total', '2024')
Data columns (total 1 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   popupation  2596 non-null   int64
dtypes: int64(1)
memory usage: 29.3+ KB


In [19]:
# Create Column for Population and Area

crime_data_df = crime_data.copy()
crime_data_df["Population"] = pop_stacked[
    "popupation"
]  # Add column 'Population' to crime_data_df

area_df = pop_area.copy()
area_df = area_df[["County", "Area_sq_mi"]]
area_df["County"] = area_df["County"].apply(lambda x: str(x.strip()))

crime_data_df = crime_data_df.reset_index()  # Reset the index
crime_data_df = crime_data_df.merge(area_df, on="County", how="left")
crime_data_df = crime_data_df.set_index(["County", "Year"])

In [20]:
crime_data_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Violent_sum,Homicide_sum,ForRape_sum,Robbery_sum,AggAssault_sum,ViolentClr_sum,HomicideClr_sum,ForRapeClr_sum,RobberyClr_sum,AggAssaultClr_sum,Property_sum,Burglary_sum,VehicleTheft_sum,LTtotal_sum,PropertyClr_sum,BurglaryClr_sum,VehicleTheftClr_sum,LTtotalClr_sum,Population,Area_sq_mi
County,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alameda,1985,11628,143,791,5427,5267,5429,91,445,1517,3376,89297,24997,7142,57158,15409,3117,1607,10685,1185500,738.0
Alameda,1986,12495,174,820,5971,5530,5570,114,532,1545,3379,90167,24392,7896,57879,15121,2899,1698,10524,1206900,738.0
Alameda,1987,11703,147,770,5019,5767,6303,91,511,1569,4132,88306,22399,8909,56998,16380,2848,2189,11343,1220600,738.0
Alameda,1988,10963,159,722,4863,5219,5708,100,498,1545,3565,92745,22308,11080,59357,16747,2671,2533,11543,1242300,738.0
Alameda,1989,10563,172,670,4879,4842,5250,98,453,1496,3203,92888,21311,12556,59021,16171,2539,2560,11072,1261200,738.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Yuba,2019,326,7,41,61,217,134,8,6,13,107,1844,405,475,964,121,45,15,61,79619,630.0
Yuba,2020,404,2,37,57,308,161,2,5,26,128,1841,318,623,900,101,31,23,47,81178,630.0
Yuba,2021,354,6,35,70,243,149,7,9,21,112,1284,242,293,749,110,28,42,40,82091,630.0
Yuba,2022,279,5,33,40,201,121,4,7,18,92,1227,302,131,794,84,19,25,40,82563,630.0


Compute Rates

In [21]:
crime_data_df["crime_rate"] = (
    crime_data_df["Violent_sum"] / crime_data_df["Population"]
)  # Crime rate
crime_data_df["clearance_rate"] = (
    crime_data_df["ViolentClr_sum"] / crime_data_df["Population"]
)  # Clearance rate
crime_data_df["population_density"] = (
    crime_data_df["Population"] / crime_data_df["Area_sq_mi"]
)

# Work on Unemployment 

In [22]:
# DataFrame for Unemployment rate
unemployment_rate_df = pd.read_excel("../data/Unemployment_rate_1990-2023.xlsx")

In [23]:
unemployment_rate_df = unemployment_rate_df.rename(
    columns={
        "County Name/State Abbreviation": "County",
        "unemployment rate(%)": "unemployment_rate",
    }
)

In [24]:
unemp_rate_df = unemployment_rate_df.copy()
unemp_rate_df["County"] = unemp_rate_df["County"].apply(
    lambda x: x.replace(" County, CA", "").strip()
)
unemp_rate_rev_df = unemp_rate_df.fillna(0)
unemp_df = unemp_rate_rev_df.copy()
unemp_df = unemp_df.set_index(["County", "Year"])

In [25]:
unemp_df.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Code,Code.1,Code.2,Unnamed: 5,Laber Force,Employed,Unemployed,unemployment_rate
County,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alameda,2023,CN0600100000000,6,1,0.0,826102,792439,33663,4.1
Alpine,2023,CN0600300000000,6,3,0.0,540,505,35,6.5
Amador,2023,CN0600500000000,6,5,0.0,14404,13673,731,5.1


In [26]:
crime_data_df.loc[:, "unemployment_rate"] = unemp_df.loc[:, "unemployment_rate"]
crime_data_df.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Violent_sum,Homicide_sum,ForRape_sum,Robbery_sum,AggAssault_sum,ViolentClr_sum,HomicideClr_sum,ForRapeClr_sum,RobberyClr_sum,AggAssaultClr_sum,...,PropertyClr_sum,BurglaryClr_sum,VehicleTheftClr_sum,LTtotalClr_sum,Population,Area_sq_mi,crime_rate,clearance_rate,population_density,unemployment_rate
County,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Alameda,1985,11628,143,791,5427,5267,5429,91,445,1517,3376,...,15409,3117,1607,10685,1185500,738.0,0.009809,0.00458,1606.368564,
Alameda,1986,12495,174,820,5971,5530,5570,114,532,1545,3379,...,15121,2899,1698,10524,1206900,738.0,0.010353,0.004615,1635.365854,
Alameda,1987,11703,147,770,5019,5767,6303,91,511,1569,4132,...,16380,2848,2189,11343,1220600,738.0,0.009588,0.005164,1653.929539,


# Work on Median House and CPI

In [27]:
# DataFrame for Califonia_CPI
califonia_cpi_df = pd.read_excel("../data/California_CPI_1985_to_2023.xlsx")

# DataFrame for median household income
median_house_income = pd.read_excel("../data/Median_income_2000_and_2009_to_2023.xlsx")

In [28]:
califonia_cpi_df.head()

Unnamed: 0,Year,CPI
0,2024,341.951
1,2023,331.804
2,2022,319.224
3,2021,297.371
4,2020,285.315


In [29]:
median_house_income.head()

Unnamed: 0,Year,County,Median Household Income
0,2023,California,95473
1,2023,Alameda County,119230
2,2023,Alpine County,83265
3,2023,Amador County,80767
4,2023,Butte County,63084


In [30]:
median_house_cpi = median_house_income.merge(califonia_cpi_df, on="Year", how="left")
median_house_cpi["County"] = median_house_cpi["County"].apply(
    lambda x: x.replace(" County", "").strip()
)
median_house_cpi = median_house_cpi.rename(
    columns={"Median Household Income": "median_hse_income"}
)
median_house_cpi = median_house_cpi.set_index(["County", "Year"])
median_house_cpi["median_hse_income"] = median_house_cpi["median_hse_income"].astype(
    "float64"
)

In [31]:
median_house_cpi["adjusted_income"] = (
    median_house_cpi["median_hse_income"] / median_house_cpi.CPI
)
median_house_cpi.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,median_hse_income,CPI,adjusted_income
County,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
California,2023,95473.0,331.804,287.739147
Alameda,2023,119230.0,331.804,359.338646
Alpine,2023,83265.0,331.804,250.946342


In [32]:
crime_data_df.loc[:, "adjusted_income"] = median_house_cpi.loc[:, "adjusted_income"]

In [39]:
crime_data_df.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Violent_sum,Homicide_sum,ForRape_sum,Robbery_sum,AggAssault_sum,ViolentClr_sum,HomicideClr_sum,ForRapeClr_sum,RobberyClr_sum,AggAssaultClr_sum,...,BurglaryClr_sum,VehicleTheftClr_sum,LTtotalClr_sum,Population,Area_sq_mi,crime_rate,clearance_rate,population_density,unemployment_rate,adjusted_income
County,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Alameda,1985,11628,143,791,5427,5267,5429,91,445,1517,3376,...,3117,1607,10685,1185500,738.0,0.009809,0.00458,1606.368564,,
Alameda,1986,12495,174,820,5971,5530,5570,114,532,1545,3379,...,2899,1698,10524,1206900,738.0,0.010353,0.004615,1635.365854,,
Alameda,1987,11703,147,770,5019,5767,6303,91,511,1569,4132,...,2848,2189,11343,1220600,738.0,0.009588,0.005164,1653.929539,,


# Work on Poverty

In [34]:
poverty_rate_df = pd.read_excel("../data/Poverty_rate_2009_2023.xlsx")

In [35]:
poverty_rate_df.head()

Unnamed: 0,Year,ID,Name,Poverty Universe,Number in Poverty,90% Confidence Interval,Percent in Poverty,90% Confidence Interval.1
0,2023,6000,California,38249913,4597732,"4,546,196 to 4,649,268",12.0,11.9 to 12.1
1,2023,6001,Alameda County,1594026,151872,"138,959 to 164,785",9.5,8.7 to 10.3
2,2023,6003,Alpine County,1136,177,134 to 220,15.6,11.8 to 19.4
3,2023,6005,Amador County,37700,4400,"3,493 to 5,307",11.7,9.3 to 14.1
4,2023,6007,Butte County,203267,40532,"36,792 to 44,272",19.9,18.1 to 21.7


# Work on health Insurance

In [36]:
health_insurance_df = pd.read_excel("../data/Health_Insurance_2010_to_2023.xlsx")

In [37]:
health_insurance_df.head()

Unnamed: 0,Year,County,Label,Civilian noninstitutionalized population,With health insurance coverage,With private health insurance,With public coverage,No health insurance coverage
0,2023,"Alameda County, California",Estimate,1641321,1574283,1228986,502254,67038
1,2023,"Alpine County, California",Estimate,1695,1610,1119,734,85
2,2023,"Amador County, California",Estimate,37789,35568,26408,17708,2221
3,2023,"Butte County, California",Estimate,207385,194212,127086,98182,13173
4,2023,"Calaveras County, California",Estimate,45670,43263,28952,22699,2407


In [40]:
health_insurance_df.tail()

Unnamed: 0,Year,County,Label,Civilian noninstitutionalized population,With health insurance coverage,With private health insurance,With public coverage,No health insurance coverage
1547,2010,"Sutter County, California",Percent,92664,79.3%,51.1%,38.3%,20.7%
1548,2010,"Tulare County, California",Percent,440539,78.1%,46.6%,39.4%,21.9%
1549,2010,"Ventura County, California",Percent,816034,83.7%,68.3%,25.0%,16.3%
1550,2010,"Yolo County, California",Percent,199916,88.2%,73.7%,23.0%,11.8%
1551,2010,"Yuba County, California",Percent,68357,83.2%,49.4%,44.7%,16.8%


# Work on House Ownership data

In [None]:
hse_ownership_df = pd.read_excel(
    "../data/House_Ownership_rent_2010_to_2023.xlsx", header=None
)

In [234]:
hse_own_df = hse_ownership_df.copy()
hse_own_df = hse_own_df.T
hse_own_df.columns = hse_own_df.iloc[0]
hse_own_df = hse_own_df[1:].reset_index(drop=True)

In [235]:
hse_own_df["County"] = hse_own_df["County"].apply(
    lambda x: x.replace(" County, California", "").strip()
)
hse_own_df = hse_own_df.set_index(["County", "Year"])

In [236]:
hse_own_df.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Label,Total housing units,Occupied housing units,Owner-occupied,Renter-occupied,Less than 15.0 percent,15.0 to 19.9 percent,20.0 to 24.9 percent,25.0 to 29.9 percent,30.0 to 34.9 percent,35.0 percent or more
County,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Alameda,2023,Estimate,630726,593117,320712,272405,32899,34476,33678,29869,23770,106282
Alameda,2023,Margin of Error,±295,"±1,676","±2,874","±2,581","±1,393","±1,609","±1,269","±1,279","±1,155","±2,542"
Alameda,2023,Percent,630726,94.0%,54.1%,45.9%,12.6%,13.2%,12.9%,11.4%,9.1%,40.7%
Alameda,2023,Percent Margin of Error,(X),±0.3,±0.4,±0.4,±0.5,±0.6,±0.5,±0.5,±0.5,±0.8
Alpine,2023,Estimate,1587,473,383,90,35,2,10,22,0,5
Alpine,2023,Margin of Error,±96,±103,±92,±52,±33,±4,±14,±25,±14,±6
Alpine,2023,Percent,1587,29.8%,81.0%,19.0%,47.3%,2.7%,13.5%,29.7%,0.0%,6.8%
Alpine,2023,Percent Margin of Error,(X),±5.8,±9.7,±9.7,±31.2,±4.9,±17.0,±26.5,±39.4,±10.3
Amador,2023,Estimate,18919,16066,12863,3203,252,472,415,310,333,1064
Amador,2023,Margin of Error,±47,±341,±457,±472,±117,±227,±176,±132,±202,±333


In [240]:
hse_own_df = hse_own_df.map(lambda x: x.replace(",", "").strip("±, %"))
hse_own_df["Total housing units"] = hse_own_df["Total housing units"].apply(
    lambda x: x.replace("(X)", "0")
)

In [None]:
hse_own_df[hse_own_df.columns[1:]] = hse_own_df[hse_own_df.columns[1:]].astype(
    "float64"
)

In [245]:
hse_own_df["home_ownership_rate"] = (
    hse_own_df["Owner-occupied"] / hse_own_df["Occupied housing units"]
)

In [246]:
hse_own_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Label,Total housing units,Occupied housing units,Owner-occupied,Renter-occupied,Less than 15.0 percent,15.0 to 19.9 percent,20.0 to 24.9 percent,25.0 to 29.9 percent,30.0 to 34.9 percent,35.0 percent or more,home_ownership_rate
County,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Alameda,2023,Estimate,630726.0,593117.0,320712.0,272405.0,32899.0,34476.0,33678.0,29869.0,23770.0,106282.0,0.540723
Alameda,2023,Margin of Error,295.0,1676.0,2874.0,2581.0,1393.0,1609.0,1269.0,1279.0,1155.0,2542.0,1.714797
Alameda,2023,Percent,630726.0,94.0,54.1,45.9,12.6,13.2,12.9,11.4,9.1,40.7,0.575532
Alameda,2023,Percent Margin of Error,0.0,0.3,0.4,0.4,0.5,0.6,0.5,0.5,0.5,0.8,1.333333
Alpine,2023,Estimate,1587.0,473.0,383.0,90.0,35.0,2.0,10.0,22.0,0.0,5.0,0.809725
...,...,...,...,...,...,...,...,...,...,...,...,...,...
Yolo,2010,Percent Margin of Error,0.0,0.8,1.1,1.1,1.4,1.2,1.5,1.3,1.3,2.1,1.375000
Yuba,2010,Estimate,27351.0,23750.0,14214.0,9536.0,603.0,1037.0,1029.0,1092.0,955.0,3680.0,0.598484
Yuba,2010,Estimate Margin of Error,220.0,494.0,525.0,540.0,142.0,290.0,243.0,247.0,221.0,407.0,1.062753
Yuba,2010,Percent,27351.0,86.8,59.8,40.2,7.2,12.4,12.3,13.0,11.4,43.8,0.688940


In [267]:
# Define the criteria for Urban, Suburban, and Rural
conditions = [
    (crime_data_df["population_density"] > 1500),
    (crime_data_df["population_density"] <= 1500)
    & (crime_data_df["population_density"] >= 500),
    (crime_data_df["population_density"] < 500),
]

# The Categories: Index must much index of conditions
categories = ["Urban", "Suburban", "Rural"]

In [268]:
# Create the categorical variable 'area_type'
crime_data_df["county_category"] = np.select(conditions, categories, default="unknown")

In [269]:
crime_data_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Violent_sum,Homicide_sum,ForRape_sum,Robbery_sum,AggAssault_sum,ViolentClr_sum,HomicideClr_sum,ForRapeClr_sum,RobberyClr_sum,AggAssaultClr_sum,...,VehicleTheftClr_sum,LTtotalClr_sum,Population,Area_sq_mi,crime_rate,clearance_rate,population_density,unemployment_rate,adjusted_income,county_category
County,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Alameda,1985,11628,143,791,5427,5267,5429,91,445,1517,3376,...,1607,10685,1185500,738.0,0.009809,0.004580,1606.368564,,,Urban
Alameda,1986,12495,174,820,5971,5530,5570,114,532,1545,3379,...,1698,10524,1206900,738.0,0.010353,0.004615,1635.365854,,,Urban
Alameda,1987,11703,147,770,5019,5767,6303,91,511,1569,4132,...,2189,11343,1220600,738.0,0.009588,0.005164,1653.929539,,,Urban
Alameda,1988,10963,159,722,4863,5219,5708,100,498,1545,3565,...,2533,11543,1242300,738.0,0.008825,0.004595,1683.333333,,,Urban
Alameda,1989,10563,172,670,4879,4842,5250,98,453,1496,3203,...,2560,11072,1261200,738.0,0.008375,0.004163,1708.943089,,,Urban
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Yuba,2019,326,7,41,61,217,134,8,6,13,107,...,15,61,79619,630.0,0.004095,0.001683,126.379365,,,Rural
Yuba,2020,404,2,37,57,308,161,2,5,26,128,...,23,47,81178,630.0,0.004977,0.001983,128.853968,,,Rural
Yuba,2021,354,6,35,70,243,149,7,9,21,112,...,42,40,82091,630.0,0.004312,0.001815,130.303175,,,Rural
Yuba,2022,279,5,33,40,201,121,4,7,18,92,...,25,40,82563,630.0,0.003379,0.001466,131.052381,,,Rural
