# **Section: Load all packages**

In [26]:
# Importing all of the required python packages
import sys
import os
import pandas as pd

In [2]:
# Reload the autoreload extension (safe to run even if it is already loaded).
# Re-run this cell every time you update the project's .py files.
%reload_ext autoreload
# Mode 2: automatically reload imported modules before executing each cell, so edits to the .py files are picked up without restarting the kernel.
%autoreload 2

In [15]:
# Make the project's helper modules in ./lib importable.
# Guarded so that re-running this cell does not keep appending duplicate entries to sys.path.
# NOTE: './lib' is a relative path, so it is resolved against the kernel's working directory.
if './lib' not in sys.path:
    sys.path.append('./lib')

# ---- Import functions within .py files.
# From data_loader.py
from data_loader import load_csv, save_df_to_csv

# From data_wrangle.py
# ---- Section 1: Modular Functions ----
from data_wrangle import add_cols, remove_cols, drop_columns, filter_dataframe, remove_rows, remove_nan_cols, remove_nan_rows, col_name_changer, rename_columns, column_value_changer, remove_leading_wspace, df_split, df_combo, grab_cols_for_visual, select_columns, df_transpose
# ---- Section 2: Specific Functions for Census Data ----
from data_wrangle import df_formater, df_split_state_city, remove_percent, remove_symbol, census_filter_cols, census_rename_cols, numeric_converter
# ---- Section 3: Specific Functions for Chronic Disease Data----
from data_wrangle import stratify_dataframe, pivot_questions

# **Section: Loading and converting datasets into DataFrames**

In [5]:
# Load the raw U.S. Chronic Disease Indicators CSV with the project's load_csv helper.
df_indicators_raw = load_csv('./data/raw/U.S._Chronic_Disease_Indicators.csv')

In [61]:
# Load the raw 2022 U.S. Census extract (the file name indicates a transposed layout).
df_census_raw = load_csv('./data/raw/US_Census_Data_2022_v04_transpose.csv')
# Sanity check on the loaded dimensions (rows, columns).
print(df_census_raw.shape)

(784, 346)


In [None]:
# Load the already-processed chronic disease dataset from ./data/processed.
# NOTE(review): this reads a processed artifact rather than the `chronic_disease_final`
# frame that is built further down in this notebook - confirm which one is the intended source.
df_chronic_raw = load_csv('./data/processed/Chronic_Disease_Final.csv')

# **Section: Data Wrangling**

Census DataFrame: df_census
1. Adding the 'State' column to the dataframe.
2. Adding the correct state name to the newly created State column.
3. Removing blank columns.
4. Removing blank rows.
5. Split the DataFrame into 2 separate DataFrames: one with state-level rows and one with "City, State" rows. The City, State DataFrame is no longer needed.
6. Split the state DataFrame into 2 separate DataFrames, one for Estimate and one for Margin of Error.
7. Keep only the Estimate DataFrame.
8. Filter down to only the needed columns.
9. Convert all values to a numeric data type.
10. Combine the Census and Chronic Disease datasets on State.
11. Save both DataFrames into csv files in /data/processed folder.

In [63]:
# Step 1: add a new 'State' column to the raw census frame; its values are filled in by the next step.
col_names = ['State']
df_census_temp0 = add_cols(df_census_raw, col_names)
# display(df_census_temp0)    # For debugging only - comment out when not needed.
# The column count should grow by one (346 -> 347 in the recorded run).
print(df_census_temp0.shape)

(784, 347)


In [65]:
# Step 2: populate the newly created 'State' column with the correct state name.
df_census_temp1 = df_formater(df_census_temp0)
# display(df_census_temp1)    # For debugging only - comment out when not needed.
# The shape is unchanged by this step in the recorded run (784 x 347).
print(df_census_temp1.shape)

(784, 347)


In [67]:
# Step 3: remove the blank columns.
df_census_temp2 = remove_nan_cols(df_census_temp1)
# display(df_census_temp2)    # For debugging only - comment out when not needed.
# The column count drops here (347 -> 310 in the recorded run).
print(df_census_temp2.shape)

(784, 310)


In [69]:
# Step 4: remove the blank rows.
df_census_temp3 = remove_nan_rows(df_census_temp2)
# display(df_census_temp3)    # For debugging only - comment out when not needed.
# The row count drops here (784 -> 392 in the recorded run).
print(df_census_temp3.shape)

(392, 310)


In [73]:
# Step 5: split the census frame into state-level rows and "City, State" rows, based on the 'State' column.
# Only df_state_only is used in the cells that follow; df_city_state is no longer needed.
df_state_only, df_city_state = df_split_state_city(df_census_temp3, 'State')
# display(df_state_only)    # For debugging only - comment out when not needed.
# display(df_city_state)    # For debugging only - comment out when not needed.

In [75]:
# Remove the states/locations that are out of scope for this analysis.
# The same list of locations is used as `values_exclude` when filtering the chronic disease data.
state_remove = ['Guam', 'District of Columbia', 'Puerto Rico', 'United States', 'Virgin Islands']
df_state_only_updated = remove_rows(df_state_only, 'State', state_remove)
display(df_state_only_updated)

Unnamed: 0,State,Label (Grouping),TOTAL NUMBER OF RACES REPORTED!!Total population,TOTAL NUMBER OF RACES REPORTED!!Total population!!One race,TOTAL NUMBER OF RACES REPORTED!!Total population!!Two races,TOTAL NUMBER OF RACES REPORTED!!Total population!!Three races,TOTAL NUMBER OF RACES REPORTED!!Total population!!Four or more races,SEX AND AGE!!Total population,SEX AND AGE!!Total population!!Male,SEX AND AGE!!Total population!!Female,...,OWNER CHARACTERISTICS!!Owner-occupied housing units!!Median selected monthly owner costs with a mortgage (dollars),OWNER CHARACTERISTICS!!Owner-occupied housing units!!Median selected monthly owner costs without a mortgage (dollars),GROSS RENT AS A PERCENTAGE OF HOUSEHOLD INCOME IN THE PAST 12 MONTHS!!Occupied units paying rent (excluding units where GRAPI cannot be computed),GROSS RENT AS A PERCENTAGE OF HOUSEHOLD INCOME IN THE PAST 12 MONTHS!!Occupied units paying rent (excluding units where GRAPI cannot be computed)!!Less than 30 percent,GROSS RENT AS A PERCENTAGE OF HOUSEHOLD INCOME IN THE PAST 12 MONTHS!!Occupied units paying rent (excluding units where GRAPI cannot be computed)!!30 percent or more,GROSS RENT!!Occupied units paying rent,GROSS RENT!!Occupied units paying rent!!Median gross rent (dollars),COMPUTERS AND INTERNET USE!!Total households,COMPUTERS AND INTERNET USE!!Total households!!With a computer,COMPUTERS AND INTERNET USE!!Total households!!With a broadband Internet subscription
2,Alabama,Estimate,5074296,94.90%,4.70%,0.30%,0.10%,5074296,48.50%,51.50%,...,1293,395,517883,50.50%,49.50%,535960,913,2016448,94.00%,87.40%
3,Alabama,Margin of Error,*****,±0.2,±0.2,±0.1,±0.1,*****,±0.1,±0.1,...,±15,±5,"±11,892",±1.5,±1.5,"±11,920",±14,"±11,475",±0.3,±0.4
4,Alaska,Estimate,733583,85.80%,12.80%,1.30%,0.20%,733583,52.60%,47.40%,...,2019,629,84809,56.90%,43.10%,85226,1329,274574,97.30%,91.60%
5,Alaska,Margin of Error,*****,±1.0,±0.9,±0.3,±0.1,*****,±0.3,±0.3,...,±56,±18,"±4,489",±3.3,±3.3,"±4,502",±31,"±3,261",±0.5,±0.6
6,Arizona,Estimate,7359197,80.60%,18.50%,0.80%,0.10%,7359197,50.00%,50.00%,...,1616,468,867455,47.20%,52.80%,887809,1450,2850377,96.40%,91.10%
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99,West Virginia,Margin of Error,*****,±0.3,±0.3,±0.1,±0.1,*****,±0.2,±0.2,...,±17,±6,"±6,456",±2.1,±2.1,"±6,256",±17,"±6,960",±0.6,±0.7
100,Wisconsin,Estimate,5892539,92.10%,7.50%,0.40%,0.00%,5892539,50.20%,49.80%,...,1545,616,748787,54.60%,45.40%,763110,992,2491121,94.90%,90.60%
101,Wisconsin,Margin of Error,*****,±0.2,±0.2,±0.1,±0.1,*****,±0.1,±0.1,...,±9,±5,"±11,064",±1.0,±1.0,"±11,107",±9,"±7,766",±0.2,±0.3
102,Wyoming,Estimate,581381,91.70%,7.70%,0.60%,0.00%,581381,51.20%,48.80%,...,1564,484,58178,55.90%,44.10%,59616,895,243321,95.70%,89.50%


In [77]:
# Keep only the census columns we care about; the column selection itself is defined inside census_filter_cols.
df_state_only_cols = census_filter_cols(df_state_only_updated)
display(df_state_only_cols)
# print(df_state_only_cols.columns)    # For debugging only - lists the retained column labels.

Unnamed: 0,State,Label (Grouping),SEX AND AGE!!Total population,SEX AND AGE!!Total population!!Male,SEX AND AGE!!Total population!!Female,SEX AND AGE!!Total population!!18 years and over,SEX AND AGE!!Total population!!18 years and over!!Male,SEX AND AGE!!Total population!!18 years and over!!Female,EDUCATIONAL ATTAINMENT!!Population 25 years and over,EDUCATIONAL ATTAINMENT!!Population 25 years and over!!High school graduate or higher,...,HEALTH INSURANCE COVERAGE!!Civilian noninstitutionalized population!!With private health insurance,HEALTH INSURANCE COVERAGE!!Civilian noninstitutionalized population!!With public coverage,HEALTH INSURANCE COVERAGE!!Civilian noninstitutionalized population!!No health insurance coverage,POVERTY RATES FOR FAMILIES AND PEOPLE FOR WHOM POVERTY STATUS IS DETERMINED!!All people!!18 years and over,VEHICLES AVAILABLE!!Occupied housing units,VEHICLES AVAILABLE!!Occupied housing units!!None,VEHICLES AVAILABLE!!Occupied housing units!!1 or more,SELECTED MONTHLY OWNER COSTS AS A PERCENTAGE OF HOUSEHOLD INCOME IN THE PAST 12 MONTHS!!Housing units with a mortgage (excluding units where SMOC cannot be computed),SELECTED MONTHLY OWNER COSTS AS A PERCENTAGE OF HOUSEHOLD INCOME IN THE PAST 12 MONTHS!!Housing units with a mortgage (excluding units where SMOC cannot be computed)!!Less than 30 percent,SELECTED MONTHLY OWNER COSTS AS A PERCENTAGE OF HOUSEHOLD INCOME IN THE PAST 12 MONTHS!!Housing units with a mortgage (excluding units where SMOC cannot be computed)!!30 percent or more
2,Alabama,Estimate,5074296,48.50%,51.50%,78.10%,47.80%,52.20%,3474924,88.80%,...,67.00%,38.40%,8.80%,14.60%,2016448,5.20%,94.80%,768014,77.00%,23.00%
3,Alabama,Margin of Error,*****,±0.1,±0.1,±0.1,±0.1,±0.1,"±5,963",±0.3,...,±0.5,±0.4,±0.3,±0.4,"±11,475",±0.3,±0.3,"±12,780",±0.8,±0.8
4,Alaska,Estimate,733583,52.60%,47.40%,76.00%,53.10%,46.90%,489218,93.30%,...,66.40%,37.20%,11.00%,10.10%,274574,9.10%,90.90%,110040,70.90%,29.10%
5,Alaska,Margin of Error,*****,±0.3,±0.3,±0.2,±0.2,±0.2,"±1,732",±0.7,...,±1.2,±1.2,±0.8,±0.8,"±3,261",±0.8,±0.8,"±4,022",±2.3,±2.3
6,Arizona,Estimate,7359197,50.00%,50.00%,78.40%,49.70%,50.30%,5053656,89.20%,...,63.90%,38.30%,10.30%,11.50%,2850377,5.10%,94.90%,1180749,73.70%,26.30%
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99,West Virginia,Margin of Error,*****,±0.2,±0.2,±0.1,±0.1,±0.1,"±3,566",±0.5,...,±1.0,±0.9,±0.4,±0.7,"±6,960",±0.6,±0.6,"±7,604",±1.3,±1.3
100,Wisconsin,Estimate,5892539,50.20%,49.80%,78.90%,49.80%,50.20%,4088500,93.50%,...,73.30%,35.70%,5.20%,10.20%,2491121,6.50%,93.50%,1031942,76.50%,23.50%
101,Wisconsin,Margin of Error,*****,±0.1,±0.1,±0.1,±0.1,±0.1,"±3,825",±0.2,...,±0.4,±0.4,±0.2,±0.3,"±7,766",±0.3,±0.3,"±11,026",±0.5,±0.5
102,Wyoming,Estimate,581381,51.20%,48.80%,77.70%,51.00%,49.00%,399796,93.70%,...,71.00%,31.20%,11.50%,11.10%,243321,4.10%,95.90%,101037,73.70%,26.30%


In [79]:
# Replace the long census column labels with shorter, friendlier names
# (e.g. 'SEX AND AGE!!Total population' becomes 'Total Pop' in the recorded output).
df_state_only_final = census_rename_cols(df_state_only_cols)
display(df_state_only_final)

Unnamed: 0,State,Label (Grouping),Total Pop,Total Pop - Male,Total Pop - Female,Total Pop 18 and Over,Total Pop 18 and Over – Male,Total Pop 18 and Over – Female,Pop 25 and Over - Educated,Pop 25 and Over – HS Graduate or Higher,...,Pop With Private Health Insurance,Pop With Public Health Insurance,Pop Uninsured,Pop 18 and Over Below Poverty,Occupied Housing Units,Households With No Vehicles,Households With 1 and Over Vehicles,Owner-Occupied With Mortgage,Mortgage Costs less than 30 percent Income,Mortgage Costs 30 precent or more of Income
2,Alabama,Estimate,5074296,48.50%,51.50%,78.10%,47.80%,52.20%,3474924,88.80%,...,67.00%,38.40%,8.80%,14.60%,2016448,5.20%,94.80%,768014,77.00%,23.00%
3,Alabama,Margin of Error,*****,±0.1,±0.1,±0.1,±0.1,±0.1,"±5,963",±0.3,...,±0.5,±0.4,±0.3,±0.4,"±11,475",±0.3,±0.3,"±12,780",±0.8,±0.8
4,Alaska,Estimate,733583,52.60%,47.40%,76.00%,53.10%,46.90%,489218,93.30%,...,66.40%,37.20%,11.00%,10.10%,274574,9.10%,90.90%,110040,70.90%,29.10%
5,Alaska,Margin of Error,*****,±0.3,±0.3,±0.2,±0.2,±0.2,"±1,732",±0.7,...,±1.2,±1.2,±0.8,±0.8,"±3,261",±0.8,±0.8,"±4,022",±2.3,±2.3
6,Arizona,Estimate,7359197,50.00%,50.00%,78.40%,49.70%,50.30%,5053656,89.20%,...,63.90%,38.30%,10.30%,11.50%,2850377,5.10%,94.90%,1180749,73.70%,26.30%
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99,West Virginia,Margin of Error,*****,±0.2,±0.2,±0.1,±0.1,±0.1,"±3,566",±0.5,...,±1.0,±0.9,±0.4,±0.7,"±6,960",±0.6,±0.6,"±7,604",±1.3,±1.3
100,Wisconsin,Estimate,5892539,50.20%,49.80%,78.90%,49.80%,50.20%,4088500,93.50%,...,73.30%,35.70%,5.20%,10.20%,2491121,6.50%,93.50%,1031942,76.50%,23.50%
101,Wisconsin,Margin of Error,*****,±0.1,±0.1,±0.1,±0.1,±0.1,"±3,825",±0.2,...,±0.4,±0.4,±0.2,±0.3,"±7,766",±0.3,±0.3,"±11,026",±0.5,±0.5
102,Wyoming,Estimate,581381,51.20%,48.80%,77.70%,51.00%,49.00%,399796,93.70%,...,71.00%,31.20%,11.50%,11.10%,243321,4.10%,95.90%,101037,73.70%,26.30%


In [81]:
# Split the dataset in two based on the 'Label (Grouping)' column:
# one frame with the 'Estimate' rows and one with the 'Margin of Error' (MOE) rows.
df_state_only_estimate, df_state_only_moe = df_split(df_state_only_final, 'Label (Grouping)', 'Estimate', 'Margin of Error')
# display(df_state_only_estimate)         # For debugging only - comment out when not needed.
# display(df_state_only_moe)              # For debugging only - comment out when not needed.

In [83]:
# Strip the '%' sign from the percentage values in the Estimate frame.
# In the recorded output the affected columns also get ' - %' appended to their labels.
df_state_only_estimate_no_percent = remove_percent(df_state_only_estimate)
display(df_state_only_estimate_no_percent)

Unnamed: 0,State,Label (Grouping),Total Pop,Total Pop - Male - %,Total Pop - Female - %,Total Pop 18 and Over - %,Total Pop 18 and Over – Male - %,Total Pop 18 and Over – Female - %,Pop 25 and Over - Educated,Pop 25 and Over – HS Graduate or Higher - %,...,Pop With Private Health Insurance - %,Pop With Public Health Insurance - %,Pop Uninsured - %,Pop 18 and Over Below Poverty - %,Occupied Housing Units,Households With No Vehicles - %,Households With 1 and Over Vehicles - %,Owner-Occupied With Mortgage,Mortgage Costs less than 30 percent Income - %,Mortgage Costs 30 precent or more of Income - %
2,Alabama,Estimate,5074296,48.5,51.5,78.1,47.8,52.2,3474924,88.8,...,67.0,38.4,8.8,14.6,2016448,5.2,94.8,768014,77.0,23.0
4,Alaska,Estimate,733583,52.6,47.4,76.0,53.1,46.9,489218,93.3,...,66.4,37.2,11.0,10.1,274574,9.1,90.9,110040,70.9,29.1
6,Arizona,Estimate,7359197,50.0,50.0,78.4,49.7,50.3,5053656,89.2,...,63.9,38.3,10.3,11.5,2850377,5.1,94.9,1180749,73.7,26.3
8,Arkansas,Estimate,3045637,49.4,50.6,77.2,48.8,51.2,2057624,89.1,...,60.0,45.0,8.4,15.2,1216207,6.5,93.5,424171,77.3,22.7
10,California,Estimate,39029342,50.1,49.9,78.2,49.7,50.3,26866773,84.7,...,63.8,40.0,6.5,11.3,13550586,6.9,93.1,5044333,62.2,37.8
12,Colorado,Estimate,5839926,50.7,49.3,79.3,50.6,49.4,4084004,93.0,...,70.5,33.3,7.1,9.0,2384584,5.0,95.0,1089369,70.6,29.4
14,Connecticut,Estimate,3626205,49.0,51.0,79.9,48.4,51.6,2545188,91.5,...,68.9,37.6,5.2,9.2,1433635,9.0,91.0,610624,70.2,29.8
16,Delaware,Estimate,1018396,48.6,51.4,79.6,48.0,52.0,724041,92.0,...,71.8,39.5,5.6,8.5,402334,5.9,94.1,187123,75.8,24.2
20,Florida,Estimate,22244823,49.2,50.8,80.7,48.8,51.2,16104410,89.9,...,63.6,37.5,11.2,11.6,8826394,6.0,94.0,3315723,65.8,34.2
22,Georgia,Estimate,10912876,48.8,51.2,77.0,48.1,51.9,7332980,89.5,...,66.6,33.0,11.7,11.5,4092467,5.7,94.3,1705941,75.0,25.0


In [85]:
# Tag every census estimate column with an 'est - ' prefix, then restore the plain
# 'State' join key and drop the now-constant 'Label (Grouping)' column.
df_state_only_estimate_updated = (
    df_state_only_estimate_no_percent
    .add_prefix('est - ')
    .rename(columns={'est - State': 'State'})
    .drop(columns=['est - Label (Grouping)'])
)
display(df_state_only_estimate_updated)

Unnamed: 0,State,est - Total Pop,est - Total Pop - Male - %,est - Total Pop - Female - %,est - Total Pop 18 and Over - %,est - Total Pop 18 and Over – Male - %,est - Total Pop 18 and Over – Female - %,est - Pop 25 and Over - Educated,est - Pop 25 and Over – HS Graduate or Higher - %,est - Pop 25 and Over – Male HS and Over - %,...,est - Pop With Private Health Insurance - %,est - Pop With Public Health Insurance - %,est - Pop Uninsured - %,est - Pop 18 and Over Below Poverty - %,est - Occupied Housing Units,est - Households With No Vehicles - %,est - Households With 1 and Over Vehicles - %,est - Owner-Occupied With Mortgage,est - Mortgage Costs less than 30 percent Income - %,est - Mortgage Costs 30 precent or more of Income - %
2,Alabama,5074296,48.5,51.5,78.1,47.8,52.2,3474924,88.8,87.3,...,67.0,38.4,8.8,14.6,2016448,5.2,94.8,768014,77.0,23.0
4,Alaska,733583,52.6,47.4,76.0,53.1,46.9,489218,93.3,93.3,...,66.4,37.2,11.0,10.1,274574,9.1,90.9,110040,70.9,29.1
6,Arizona,7359197,50.0,50.0,78.4,49.7,50.3,5053656,89.2,88.9,...,63.9,38.3,10.3,11.5,2850377,5.1,94.9,1180749,73.7,26.3
8,Arkansas,3045637,49.4,50.6,77.2,48.8,51.2,2057624,89.1,88.1,...,60.0,45.0,8.4,15.2,1216207,6.5,93.5,424171,77.3,22.7
10,California,39029342,50.1,49.9,78.2,49.7,50.3,26866773,84.7,84.2,...,63.8,40.0,6.5,11.3,13550586,6.9,93.1,5044333,62.2,37.8
12,Colorado,5839926,50.7,49.3,79.3,50.6,49.4,4084004,93.0,92.7,...,70.5,33.3,7.1,9.0,2384584,5.0,95.0,1089369,70.6,29.4
14,Connecticut,3626205,49.0,51.0,79.9,48.4,51.6,2545188,91.5,90.7,...,68.9,37.6,5.2,9.2,1433635,9.0,91.0,610624,70.2,29.8
16,Delaware,1018396,48.6,51.4,79.6,48.0,52.0,724041,92.0,90.3,...,71.8,39.5,5.6,8.5,402334,5.9,94.1,187123,75.8,24.2
20,Florida,22244823,49.2,50.8,80.7,48.8,51.2,16104410,89.9,89.2,...,63.6,37.5,11.2,11.6,8826394,6.0,94.0,3315723,65.8,34.2
22,Georgia,10912876,48.8,51.2,77.0,48.1,51.9,7332980,89.5,88.6,...,66.6,33.0,11.7,11.5,4092467,5.7,94.3,1705941,75.0,25.0


In [87]:
# Convert all values within the census DataFrame to numeric values, except for the 'State' column.
# NOTE(review): the second argument (1) presumably marks the first column position to convert,
# skipping 'State' at position 0 - confirm against numeric_converter in data_wrangle.py.
df_census_final = numeric_converter(df_state_only_estimate_updated, 1)
display(df_census_final)

Unnamed: 0,State,est - Total Pop,est - Total Pop - Male - %,est - Total Pop - Female - %,est - Total Pop 18 and Over - %,est - Total Pop 18 and Over – Male - %,est - Total Pop 18 and Over – Female - %,est - Pop 25 and Over - Educated,est - Pop 25 and Over – HS Graduate or Higher - %,est - Pop 25 and Over – Male HS and Over - %,...,est - Pop With Private Health Insurance - %,est - Pop With Public Health Insurance - %,est - Pop Uninsured - %,est - Pop 18 and Over Below Poverty - %,est - Occupied Housing Units,est - Households With No Vehicles - %,est - Households With 1 and Over Vehicles - %,est - Owner-Occupied With Mortgage,est - Mortgage Costs less than 30 percent Income - %,est - Mortgage Costs 30 precent or more of Income - %
2,Alabama,5074296,48.5,51.5,78.1,47.8,52.2,3474924,88.8,87.3,...,67.0,38.4,8.8,14.6,2016448,5.2,94.8,768014,77.0,23.0
4,Alaska,733583,52.6,47.4,76.0,53.1,46.9,489218,93.3,93.3,...,66.4,37.2,11.0,10.1,274574,9.1,90.9,110040,70.9,29.1
6,Arizona,7359197,50.0,50.0,78.4,49.7,50.3,5053656,89.2,88.9,...,63.9,38.3,10.3,11.5,2850377,5.1,94.9,1180749,73.7,26.3
8,Arkansas,3045637,49.4,50.6,77.2,48.8,51.2,2057624,89.1,88.1,...,60.0,45.0,8.4,15.2,1216207,6.5,93.5,424171,77.3,22.7
10,California,39029342,50.1,49.9,78.2,49.7,50.3,26866773,84.7,84.2,...,63.8,40.0,6.5,11.3,13550586,6.9,93.1,5044333,62.2,37.8
12,Colorado,5839926,50.7,49.3,79.3,50.6,49.4,4084004,93.0,92.7,...,70.5,33.3,7.1,9.0,2384584,5.0,95.0,1089369,70.6,29.4
14,Connecticut,3626205,49.0,51.0,79.9,48.4,51.6,2545188,91.5,90.7,...,68.9,37.6,5.2,9.2,1433635,9.0,91.0,610624,70.2,29.8
16,Delaware,1018396,48.6,51.4,79.6,48.0,52.0,724041,92.0,90.3,...,71.8,39.5,5.6,8.5,402334,5.9,94.1,187123,75.8,24.2
20,Florida,22244823,49.2,50.8,80.7,48.8,51.2,16104410,89.9,89.2,...,63.6,37.5,11.2,11.6,8826394,6.0,94.0,3315723,65.8,34.2
22,Georgia,10912876,48.8,51.2,77.0,48.1,51.9,7332980,89.5,88.6,...,66.6,33.0,11.7,11.5,4092467,5.7,94.3,1705941,75.0,25.0


Chronic Disease DataFrame: df_chronic_raw
1. Remove all columns whose names contain "ConfidenceLimit".

## Chronic Disease Data Wrangling
Dataset Name: _df_indicators_raw_

1. Filter values in specified columns - year, data type, question, and state
2. Update the values in the 'Question' column to readable names
3. Select the columns of interest
4. Rename the 'State' column for the future join
5. Process each stratification (overall, sex, race/ethnicity)
6. Merge processed dataframes together

In [20]:
# Restrict the raw chronic disease data to the year, data type and questions of interest,
# and exclude the locations that are out of scope.
# Each column name appears to be paired by position with its list of values.
columns_include = ['YearStart', 'DataValueType', 'Question']
values_include = [
    [2022],
    ['Crude Prevalence'],
    [
        'Diabetes among adults',
        'Obesity among adults',
        'Arthritis among adults',
        'Food insecure in the past 12 months among households',
        'Chronic obstructive pulmonary disease among adults',
        'Lack of health insurance among adults aged 18-64',
        'Lack of reliable transportation in the past 12 months among adults',
        'Unable to pay mortgage, rent, or utility bills in the past 12 months among adults',
        'Current asthma among adults',
    ],
]
columns_exclude = ['LocationDesc']
values_exclude = [
    ['Guam', 'District of Columbia', 'Puerto Rico', 'United States', 'Virgin Islands'],
]

cd_filtered_df = filter_dataframe(
    df=df_indicators_raw,
    columns_with_include=columns_include,
    values_to_include=values_include,
    columns_with_exclude=columns_exclude,
    values_to_exclude=values_exclude,
)

In [21]:
# Map the verbose 'Question' values to short, readable names.
# NOTE(review): two questions kept by the filter step have no entry here -
# 'Food insecure in the past 12 months among households' and
# 'Lack of health insurance among adults aged 18-64' - confirm whether that is intended.
cd_rename_mapping_dict = {
    'Arthritis among adults': 'Arthritis',
    'Current asthma among adults': 'Asthma',
    'Unable to pay mortgage, rent, or utility bills in the past 12 months among adults': 'Bill Payment Instability',
    'Chronic obstructive pulmonary disease among adults': 'COPD',
    'Diabetes among adults': 'Diabetes',
    'Obesity among adults': 'Obesity',
    'Lack of reliable transportation in the past 12 months among adults': 'Transportation Instability',
}

cd_renamed_df = column_value_changer(cd_filtered_df, 'Question', cd_rename_mapping_dict)

In [22]:
# Narrow the renamed frame down to the columns used in the rest of the chronic disease pipeline.
cd_column_name_list = [
    'LocationDesc',
    'Question',
    'DataValueUnit',
    'DataValue',
    'Stratification1',
    'LowConfidenceLimit',
    'HighConfidenceLimit',
    'Geolocation',
]

cd_selected_columns = select_columns(cd_renamed_df, cd_column_name_list)

In [23]:
# Rename 'LocationDesc' to 'State' so the chronic disease data shares its join key
# with the census data, which is later combined on 'State'.
cd_state_rename = {'LocationDesc': 'State'}
cd_state_rename_df = rename_columns(cd_selected_columns, cd_state_rename)

In [28]:
# Build one wide frame per stratification (overall, sex, race/ethnicity), then join them all on 'State'.

cd_processed_dfs = []
stratifications = [
    'Overall',
    'Male',
    'Female',
    'Hispanic',
    'White, non-Hispanic',
    'Black, non-Hispanic',
    'Hawaiian or Pacific Islander, non-Hispanic',
    'American Indian or Alaska Native, non-Hispanic',
    'Asian, non-Hispanic',
    'Multiracial, non-Hispanic',
]

for strat in stratifications:
    prefix = f'{strat} - '
    # Keep only this stratification, pivot so each question becomes its own column,
    # prefix every column with the stratification name, then restore the plain 'State' key.
    temp_df = (
        pivot_questions(stratify_dataframe(cd_state_rename_df, 'Stratification1', strat))
        .add_prefix(prefix)
        .rename(columns={f'{prefix}State': 'State'})
    )
    cd_processed_dfs.append(temp_df)

# Fold the per-stratification frames into a single frame with successive outer joins on 'State'.
chronic_disease_final = cd_processed_dfs[0]
for next_df in cd_processed_dfs[1:]:
    chronic_disease_final = chronic_disease_final.merge(next_df, on='State', how='outer')

In [None]:
# Drop columns that have limited data or are not important to the project.
# First pass: remove the columns whose names contain "ConfidenceLimit".
cols_to_drop_confidence = ['ConfidenceLimit']
df_chronic1 = drop_columns(df_chronic_raw, cols_to_drop_confidence)
display(df_chronic1)

# Second pass: remove the race/ethnicity stratification columns.
# "ConfidenceLimit" is not repeated here because those columns were already removed in the first pass.
cols_to_drop_race = ["White", "Black", "Hispanic", "Hawaiian or Pacific Islander",
                     "American Indian or Alaska Native", "Multiracial", "Asian"]
df_chronic_final = drop_columns(df_chronic1, cols_to_drop_race)
display(df_chronic_final)

In [None]:
# Combine the chronic disease and census DataFrames on the shared 'State' column.
# The 'outer' option is passed through to df_combo; presumably it keeps states present in either frame - confirm against df_combo.
df_final = df_combo(df_chronic_final, df_census_final, 'State', 'outer')
display(df_final)

In [None]:
# Show summary statistics for the combined dataset as a quick sanity check.
display(df_final.describe())

In [None]:
# Save the combined dataset to the processed-data folder.
save_df_to_csv(df_final, './data/processed/Final_dataset.csv')

# **Section: Visualization**

## Part 1: Distribution of Numeric Variables
Create boxplots and histograms for each variable in the dataset to view and understand its distribution and outliers.

### Diabetes Distribution

### Other Chronic Diseases

### Census Data