# **Section: Python Packages**

In [72]:
# Importing all of the required python packages
import sys
import os
# import pandas as pd

In [90]:
# (Re)load the autoreload extension; safe to run even if it is already loaded.
# Re-run this cell whenever the helper .py files are updated.
%reload_ext autoreload
# Mode 2: modules are automatically reloaded before executing code, so edits to the .py files are picked up on the next cell run.
%autoreload 2

In [91]:
# Setting up the folder paths: make the helper modules in ./lib importable.
# Guarded so that re-running this cell does not append duplicate entries to sys.path.
# NOTE(review): './lib' is resolved relative to the kernel's working directory - confirm the notebook is always launched from the project root.
if './lib' not in sys.path:
    sys.path.append('./lib')

# ---- Import functions within .py files.
# From data_loader.py
from data_loader import load_csv, save_df_to_csv

# From data_wrangle.py
# ---- Section 1: Modular Functions ----
from data_wrangle import add_cols, remove_cols, drop_columns, remove_rows, remove_nan_cols, remove_nan_rows, col_name_changer, remove_leading_wspace, df_split, df_combo, select_columns, df_transpose
# ---- Section 2: Specific Functions ----
from data_wrangle import df_formater, df_split_state_city, remove_percent, remove_symbol, census_filter_cols, census_rename_cols, numeric_converter, diabete_metrics

# **Section: Loading and converting datasets into DataFrames**

In [16]:
# Load the raw U.S. Chronic Disease Indicators CSV.
# NOTE(review): df_indicators_raw is not referenced again in the visible part of this notebook - confirm it is still needed.
df_indicators_raw = load_csv('./data/raw/U.S._Chronic_Disease_Indicators.csv')

In [17]:
# Load the raw 2022 US Census CSV (the "transpose" version of the file, per its name) and report its shape.
df_census_raw = load_csv('./data/raw/US_Census_Data_2022_v04_transpose.csv')
print(df_census_raw.shape)

(784, 346)


In [24]:
# Load the chronic disease dataset used for the merge further below.
# NOTE(review): despite the "_raw" suffix, this file is read from ./data/processed, so it appears to be an already
# pre-processed table - confirm which step produces it.
df_chronic_raw = load_csv('./data/processed/Chronic_Disease_Final.csv')

# **Section: Data Wrangling**

Census DataFrame: df_census
1. Adding the 'State' column to the dataframe.
2. Adding the correct state name to the newly created State column.
3. Removing blank columns.
4. Removing blank rows.
5. Split the DataFrame into 2 separate DataFrames: one with state-level rows and one with "City, State" rows. The "City, State" DataFrame is no longer needed.
6. Split the state-level DataFrame into 2 separate DataFrames: one for Estimate and one for Margin of Error.
7. Keep only the Estimate DataFrame.
8. Filter down to only the needed columns.
9. Convert all values to a numeric data type.
10. Combine the Census and Chronic Disease datasets on State.
11. Save both DataFrames to CSV files in the /data/processed folder.

In [26]:
# Step 1: add a new 'State' column to the census data.
col_names = ['State']
df_census_temp0 = add_cols(df_census_raw, col_names)
# display(df_census_temp0)    # For debugging only - comment out when not needed.
# Shape check: the recorded output shows one more column than df_census_raw (347 vs 346).
print(df_census_temp0.shape)

(784, 347)


In [28]:
# Step 2: format the census data.
# NOTE(review): presumably this is where the new 'State' column gets populated with the state names (step 2 of the
# list above) - confirm against df_formater in data_wrangle.py.
df_census_temp1 = df_formater(df_census_temp0)
# display(df_census_temp1)    # For debugging only - comment out when not needed.
# Shape check: the recorded output shows the shape is unchanged by this step.
print(df_census_temp1.shape)

(784, 347)


In [30]:
# Step 3: remove blank (NaN) columns.
df_census_temp2 = remove_nan_cols(df_census_temp1)
# display(df_census_temp2)    # For debugging only - comment out when not needed.
# Shape check: the recorded output shows the column count dropping from 347 to 310.
print(df_census_temp2.shape)

(784, 310)


In [32]:
# Step 4: remove blank (NaN) rows.
df_census_temp3 = remove_nan_rows(df_census_temp2)
# display(df_census_temp3)    # For debugging only - comment out when not needed.
# Shape check: the recorded output shows the row count dropping from 784 to 392.
print(df_census_temp3.shape)

(392, 310)


In [34]:
# Step 5: split the census data on the 'State' column into two frames (state-only and city/state).
# Only df_state_only is used in the cells visible below.
df_state_only, df_city_state = df_split_state_city(df_census_temp3, 'State')
# display(df_state_only)    # For debugging only - comment out when not needed.
# display(df_city_state)    # For debugging only - comment out when not needed.

In [36]:
# Remove the states/locations that are out of scope for this analysis (territories, D.C., and the national total),
# filtering on the 'State' column.
state_remove = ['Guam', 'District of Columbia', 'Puerto Rico', 'United States', 'Virgin Islands']
df_state_only_updated = remove_rows(df_state_only, 'State', state_remove)
display(df_state_only_updated.head())

Unnamed: 0,State,Label (Grouping),TOTAL NUMBER OF RACES REPORTED!!Total population,TOTAL NUMBER OF RACES REPORTED!!Total population!!One race,TOTAL NUMBER OF RACES REPORTED!!Total population!!Two races,TOTAL NUMBER OF RACES REPORTED!!Total population!!Three races,TOTAL NUMBER OF RACES REPORTED!!Total population!!Four or more races,SEX AND AGE!!Total population,SEX AND AGE!!Total population!!Male,SEX AND AGE!!Total population!!Female,...,OWNER CHARACTERISTICS!!Owner-occupied housing units!!Median selected monthly owner costs with a mortgage (dollars),OWNER CHARACTERISTICS!!Owner-occupied housing units!!Median selected monthly owner costs without a mortgage (dollars),GROSS RENT AS A PERCENTAGE OF HOUSEHOLD INCOME IN THE PAST 12 MONTHS!!Occupied units paying rent (excluding units where GRAPI cannot be computed),GROSS RENT AS A PERCENTAGE OF HOUSEHOLD INCOME IN THE PAST 12 MONTHS!!Occupied units paying rent (excluding units where GRAPI cannot be computed)!!Less than 30 percent,GROSS RENT AS A PERCENTAGE OF HOUSEHOLD INCOME IN THE PAST 12 MONTHS!!Occupied units paying rent (excluding units where GRAPI cannot be computed)!!30 percent or more,GROSS RENT!!Occupied units paying rent,GROSS RENT!!Occupied units paying rent!!Median gross rent (dollars),COMPUTERS AND INTERNET USE!!Total households,COMPUTERS AND INTERNET USE!!Total households!!With a computer,COMPUTERS AND INTERNET USE!!Total households!!With a broadband Internet subscription
2,Alabama,Estimate,5074296,94.90%,4.70%,0.30%,0.10%,5074296,48.50%,51.50%,...,1293,395,517883,50.50%,49.50%,535960,913,2016448,94.00%,87.40%
3,Alabama,Margin of Error,*****,±0.2,±0.2,±0.1,±0.1,*****,±0.1,±0.1,...,±15,±5,"±11,892",±1.5,±1.5,"±11,920",±14,"±11,475",±0.3,±0.4
4,Alaska,Estimate,733583,85.80%,12.80%,1.30%,0.20%,733583,52.60%,47.40%,...,2019,629,84809,56.90%,43.10%,85226,1329,274574,97.30%,91.60%
5,Alaska,Margin of Error,*****,±1.0,±0.9,±0.3,±0.1,*****,±0.3,±0.3,...,±56,±18,"±4,489",±3.3,±3.3,"±4,502",±31,"±3,261",±0.5,±0.6
6,Arizona,Estimate,7359197,80.60%,18.50%,0.80%,0.10%,7359197,50.00%,50.00%,...,1616,468,867455,47.20%,52.80%,887809,1450,2850377,96.40%,91.10%


In [38]:
# Keep only the census columns needed for this project.
# NOTE(review): the list of retained columns is defined inside census_filter_cols (data_wrangle.py), not in this notebook.
df_state_only_cols = census_filter_cols(df_state_only_updated)
display(df_state_only_cols.head())
# print(df_state_only_cols.columns)    # For debugging only - comment out when not needed.

Unnamed: 0,State,Label (Grouping),SEX AND AGE!!Total population,SEX AND AGE!!Total population!!Male,SEX AND AGE!!Total population!!Female,SEX AND AGE!!Total population!!18 years and over,SEX AND AGE!!Total population!!18 years and over!!Male,SEX AND AGE!!Total population!!18 years and over!!Female,EDUCATIONAL ATTAINMENT!!Population 25 years and over,EDUCATIONAL ATTAINMENT!!Population 25 years and over!!High school graduate or higher,...,HEALTH INSURANCE COVERAGE!!Civilian noninstitutionalized population!!With private health insurance,HEALTH INSURANCE COVERAGE!!Civilian noninstitutionalized population!!With public coverage,HEALTH INSURANCE COVERAGE!!Civilian noninstitutionalized population!!No health insurance coverage,POVERTY RATES FOR FAMILIES AND PEOPLE FOR WHOM POVERTY STATUS IS DETERMINED!!All people!!18 years and over,VEHICLES AVAILABLE!!Occupied housing units,VEHICLES AVAILABLE!!Occupied housing units!!None,VEHICLES AVAILABLE!!Occupied housing units!!1 or more,SELECTED MONTHLY OWNER COSTS AS A PERCENTAGE OF HOUSEHOLD INCOME IN THE PAST 12 MONTHS!!Housing units with a mortgage (excluding units where SMOC cannot be computed),SELECTED MONTHLY OWNER COSTS AS A PERCENTAGE OF HOUSEHOLD INCOME IN THE PAST 12 MONTHS!!Housing units with a mortgage (excluding units where SMOC cannot be computed)!!Less than 30 percent,SELECTED MONTHLY OWNER COSTS AS A PERCENTAGE OF HOUSEHOLD INCOME IN THE PAST 12 MONTHS!!Housing units with a mortgage (excluding units where SMOC cannot be computed)!!30 percent or more
2,Alabama,Estimate,5074296,48.50%,51.50%,78.10%,47.80%,52.20%,3474924,88.80%,...,67.00%,38.40%,8.80%,14.60%,2016448,5.20%,94.80%,768014,77.00%,23.00%
3,Alabama,Margin of Error,*****,±0.1,±0.1,±0.1,±0.1,±0.1,"±5,963",±0.3,...,±0.5,±0.4,±0.3,±0.4,"±11,475",±0.3,±0.3,"±12,780",±0.8,±0.8
4,Alaska,Estimate,733583,52.60%,47.40%,76.00%,53.10%,46.90%,489218,93.30%,...,66.40%,37.20%,11.00%,10.10%,274574,9.10%,90.90%,110040,70.90%,29.10%
5,Alaska,Margin of Error,*****,±0.3,±0.3,±0.2,±0.2,±0.2,"±1,732",±0.7,...,±1.2,±1.2,±0.8,±0.8,"±3,261",±0.8,±0.8,"±4,022",±2.3,±2.3
6,Arizona,Estimate,7359197,50.00%,50.00%,78.40%,49.70%,50.30%,5053656,89.20%,...,63.90%,38.30%,10.30%,11.50%,2850377,5.10%,94.90%,1180749,73.70%,26.30%


In [40]:
# Rename the long census column labels to shorter, friendlier names
# (e.g. 'SEX AND AGE!!Total population' becomes 'Total Pop' in the recorded output).
df_state_only_final = census_rename_cols(df_state_only_cols)
display(df_state_only_final.head())

Unnamed: 0,State,Label (Grouping),Total Pop,Total Pop - Male,Total Pop - Female,Total Pop 18 and Over,Total Pop 18 and Over – Male,Total Pop 18 and Over – Female,Pop 25 and Over - Educated,Pop 25 and Over – HS Graduate or Higher,...,Pop With Private Health Insurance,Pop With Public Health Insurance,Pop Uninsured,Pop 18 and Over Below Poverty,Occupied Housing Units,Households With No Vehicles,Households With 1 and Over Vehicles,Owner-Occupied With Mortgage,Mortgage Costs less than 30 percent Income,Mortgage Costs 30 precent or more of Income
2,Alabama,Estimate,5074296,48.50%,51.50%,78.10%,47.80%,52.20%,3474924,88.80%,...,67.00%,38.40%,8.80%,14.60%,2016448,5.20%,94.80%,768014,77.00%,23.00%
3,Alabama,Margin of Error,*****,±0.1,±0.1,±0.1,±0.1,±0.1,"±5,963",±0.3,...,±0.5,±0.4,±0.3,±0.4,"±11,475",±0.3,±0.3,"±12,780",±0.8,±0.8
4,Alaska,Estimate,733583,52.60%,47.40%,76.00%,53.10%,46.90%,489218,93.30%,...,66.40%,37.20%,11.00%,10.10%,274574,9.10%,90.90%,110040,70.90%,29.10%
5,Alaska,Margin of Error,*****,±0.3,±0.3,±0.2,±0.2,±0.2,"±1,732",±0.7,...,±1.2,±1.2,±0.8,±0.8,"±3,261",±0.8,±0.8,"±4,022",±2.3,±2.3
6,Arizona,Estimate,7359197,50.00%,50.00%,78.40%,49.70%,50.30%,5053656,89.20%,...,63.90%,38.30%,10.30%,11.50%,2850377,5.10%,94.90%,1180749,73.70%,26.30%


In [42]:
# Split the dataset into 2 DataFrames based on the 'Label (Grouping)' column:
# one with the 'Estimate' rows and one with the 'Margin of Error' (MOE) rows.
# Only the estimate frame is used in the cells visible below.
df_state_only_estimate, df_state_only_moe = df_split(df_state_only_final, 'Label (Grouping)', 'Estimate', 'Margin of Error')
# display(df_state_only_estimate)         # For debugging only - comment out when not needed.
# display(df_state_only_moe)              # For debugging only - comment out when not needed.

In [44]:
# Strip the '%' sign from the percentage values in the estimate data.
# Per the recorded output, the affected columns also get a ' - %' suffix added to their names.
df_state_only_estimate_no_percent = remove_percent(df_state_only_estimate)
display(df_state_only_estimate_no_percent.head())

Unnamed: 0,State,Label (Grouping),Total Pop,Total Pop - Male - %,Total Pop - Female - %,Total Pop 18 and Over - %,Total Pop 18 and Over – Male - %,Total Pop 18 and Over – Female - %,Pop 25 and Over - Educated,Pop 25 and Over – HS Graduate or Higher - %,...,Pop With Private Health Insurance - %,Pop With Public Health Insurance - %,Pop Uninsured - %,Pop 18 and Over Below Poverty - %,Occupied Housing Units,Households With No Vehicles - %,Households With 1 and Over Vehicles - %,Owner-Occupied With Mortgage,Mortgage Costs less than 30 percent Income - %,Mortgage Costs 30 precent or more of Income - %
2,Alabama,Estimate,5074296,48.5,51.5,78.1,47.8,52.2,3474924,88.8,...,67.0,38.4,8.8,14.6,2016448,5.2,94.8,768014,77.0,23.0
4,Alaska,Estimate,733583,52.6,47.4,76.0,53.1,46.9,489218,93.3,...,66.4,37.2,11.0,10.1,274574,9.1,90.9,110040,70.9,29.1
6,Arizona,Estimate,7359197,50.0,50.0,78.4,49.7,50.3,5053656,89.2,...,63.9,38.3,10.3,11.5,2850377,5.1,94.9,1180749,73.7,26.3
8,Arkansas,Estimate,3045637,49.4,50.6,77.2,48.8,51.2,2057624,89.1,...,60.0,45.0,8.4,15.2,1216207,6.5,93.5,424171,77.3,22.7
10,California,Estimate,39029342,50.1,49.9,78.2,49.7,50.3,26866773,84.7,...,63.8,40.0,6.5,11.3,13550586,6.9,93.1,5044333,62.2,37.8


In [46]:
# Tag every estimate column with an 'est - ' prefix, then restore the plain 'State' key
# (needed for the later merge on 'State') and drop the now-constant label column.
df_state_only_estimate_updated = (
    df_state_only_estimate_no_percent
    .add_prefix('est - ')
    .rename(columns={'est - State': 'State'})
    .drop(columns=['est - Label (Grouping)'])
)
display(df_state_only_estimate_updated.head())

Unnamed: 0,State,est - Total Pop,est - Total Pop - Male - %,est - Total Pop - Female - %,est - Total Pop 18 and Over - %,est - Total Pop 18 and Over – Male - %,est - Total Pop 18 and Over – Female - %,est - Pop 25 and Over - Educated,est - Pop 25 and Over – HS Graduate or Higher - %,est - Pop 25 and Over – Male HS and Over - %,...,est - Pop With Private Health Insurance - %,est - Pop With Public Health Insurance - %,est - Pop Uninsured - %,est - Pop 18 and Over Below Poverty - %,est - Occupied Housing Units,est - Households With No Vehicles - %,est - Households With 1 and Over Vehicles - %,est - Owner-Occupied With Mortgage,est - Mortgage Costs less than 30 percent Income - %,est - Mortgage Costs 30 precent or more of Income - %
2,Alabama,5074296,48.5,51.5,78.1,47.8,52.2,3474924,88.8,87.3,...,67.0,38.4,8.8,14.6,2016448,5.2,94.8,768014,77.0,23.0
4,Alaska,733583,52.6,47.4,76.0,53.1,46.9,489218,93.3,93.3,...,66.4,37.2,11.0,10.1,274574,9.1,90.9,110040,70.9,29.1
6,Arizona,7359197,50.0,50.0,78.4,49.7,50.3,5053656,89.2,88.9,...,63.9,38.3,10.3,11.5,2850377,5.1,94.9,1180749,73.7,26.3
8,Arkansas,3045637,49.4,50.6,77.2,48.8,51.2,2057624,89.1,88.1,...,60.0,45.0,8.4,15.2,1216207,6.5,93.5,424171,77.3,22.7
10,California,39029342,50.1,49.9,78.2,49.7,50.3,26866773,84.7,84.2,...,63.8,40.0,6.5,11.3,13550586,6.9,93.1,5044333,62.2,37.8


In [48]:
# Convert all values within the census DataFrame to numeric values, except for the "State" column.
# NOTE(review): the second argument (1) presumably is the position of the first column to convert, so that the
# leading 'State' column is skipped - confirm against numeric_converter in data_wrangle.py.
df_census_final = numeric_converter(df_state_only_estimate_updated, 1)
display(df_census_final.head())

Unnamed: 0,State,est - Total Pop,est - Total Pop - Male - %,est - Total Pop - Female - %,est - Total Pop 18 and Over - %,est - Total Pop 18 and Over – Male - %,est - Total Pop 18 and Over – Female - %,est - Pop 25 and Over - Educated,est - Pop 25 and Over – HS Graduate or Higher - %,est - Pop 25 and Over – Male HS and Over - %,...,est - Pop With Private Health Insurance - %,est - Pop With Public Health Insurance - %,est - Pop Uninsured - %,est - Pop 18 and Over Below Poverty - %,est - Occupied Housing Units,est - Households With No Vehicles - %,est - Households With 1 and Over Vehicles - %,est - Owner-Occupied With Mortgage,est - Mortgage Costs less than 30 percent Income - %,est - Mortgage Costs 30 precent or more of Income - %
2,Alabama,5074296,48.5,51.5,78.1,47.8,52.2,3474924,88.8,87.3,...,67.0,38.4,8.8,14.6,2016448,5.2,94.8,768014,77.0,23.0
4,Alaska,733583,52.6,47.4,76.0,53.1,46.9,489218,93.3,93.3,...,66.4,37.2,11.0,10.1,274574,9.1,90.9,110040,70.9,29.1
6,Arizona,7359197,50.0,50.0,78.4,49.7,50.3,5053656,89.2,88.9,...,63.9,38.3,10.3,11.5,2850377,5.1,94.9,1180749,73.7,26.3
8,Arkansas,3045637,49.4,50.6,77.2,48.8,51.2,2057624,89.1,88.1,...,60.0,45.0,8.4,15.2,1216207,6.5,93.5,424171,77.3,22.7
10,California,39029342,50.1,49.9,78.2,49.7,50.3,26866773,84.7,84.2,...,63.8,40.0,6.5,11.3,13550586,6.9,93.1,5044333,62.2,37.8


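Chronic Disease DataFrame: df_chronic_raw
1. Remove all columns with "ConfidenceLimit" in their names.
2. Remove the race-specific columns (White, Black, Hispanic, Hawaiian or Pacific Islander, American Indian or Alaska Native, Multiracial, Asian).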

In [50]:
# Drop columns that have limited data or are not important to the project.
# NOTE(review): drop_columns appears to match column names by substring (the recorded outputs show every
# '... - <metric>-DataValue' column of a listed group disappearing) - confirm against data_wrangle.py.

# Pass 1: remove the confidence-limit columns.
cols_to_drop_confidence = ['ConfidenceLimit']
df_chronic1 = drop_columns(df_chronic_raw, cols_to_drop_confidence)
display(df_chronic1.head())

# Pass 2: remove the race-specific columns; the recorded output keeps the Males / Females / Overall groups.
# 'ConfidenceLimit' is deliberately not repeated here: those columns were already removed in pass 1.
cols_to_drop_race = [
    "White", "Black", "Hispanic", "Hawaiian or Pacific Islander",
    "American Indian or Alaska Native", "Multiracial", "Asian",
]
df_chronic_final = drop_columns(df_chronic1, cols_to_drop_race)
display(df_chronic_final.head())

Unnamed: 0,State,Males - Arthritis-DataValue,Males - Asthma-DataValue,Males - Bill Payment Instability-DataValue,Males - COPD-DataValue,Males - Diabetes-DataValue,Males - Obesity-DataValue,Males - Transportation Instability-DataValue,Females - Arthritis-DataValue,Females - Asthma-DataValue,...,Multiracial - Diabetes-DataValue,Multiracial - Obesity-DataValue,Multiracial - Transportation Instability-DataValue,Asian - Arthritis-DataValue,Asian - Asthma-DataValue,Asian - Bill Payment Instability-DataValue,Asian - COPD-DataValue,Asian - Diabetes-DataValue,Asian - Obesity-DataValue,Asian - Transportation Instability-DataValue
0,Alabama,30.3,7.4,11.1,8.1,14.8,35.8,9.7,37.7,11.7,...,,30.8,,,,,,,,
1,Alaska,22.0,7.5,10.2,4.9,8.7,31.2,10.1,25.6,14.1,...,6.6,36.6,16.4,13.2,,,,17.0,24.2,
2,Arizona,21.9,7.1,8.4,5.5,14.0,32.7,7.4,29.1,12.3,...,,39.6,,,,,,,,
3,Arkansas,28.6,8.1,,8.3,15.6,35.3,,38.8,12.7,...,9.0,40.7,,,,,,,,
4,California,18.0,5.9,12.6,4.2,12.0,28.4,8.9,22.8,11.4,...,11.4,26.6,11.2,10.2,5.8,6.3,1.9,11.4,12.5,4.1


Unnamed: 0,State,Males - Arthritis-DataValue,Males - Asthma-DataValue,Males - Bill Payment Instability-DataValue,Males - COPD-DataValue,Males - Diabetes-DataValue,Males - Obesity-DataValue,Males - Transportation Instability-DataValue,Females - Arthritis-DataValue,Females - Asthma-DataValue,...,Females - Diabetes-DataValue,Females - Obesity-DataValue,Females - Transportation Instability-DataValue,Overall - Arthritis-DataValue,Overall - Asthma-DataValue,Overall - Bill Payment Instability-DataValue,Overall - COPD-DataValue,Overall - Diabetes-DataValue,Overall - Obesity-DataValue,Overall - Transportation Instability-DataValue
0,Alabama,30.3,7.4,11.1,8.1,14.8,35.8,9.7,37.7,11.7,...,16.3,40.6,10.8,34.2,9.6,13.5,9.4,15.5,38.3,10.3
1,Alaska,22.0,7.5,10.2,4.9,8.7,31.2,10.1,25.6,14.1,...,8.7,33.3,10.7,23.7,10.7,12.3,5.6,8.7,32.1,10.4
2,Arizona,21.9,7.1,8.4,5.5,14.0,32.7,7.4,29.1,12.3,...,11.5,33.8,10.3,25.6,9.7,11.8,6.7,12.7,33.2,8.9
3,Arkansas,28.6,8.1,,8.3,15.6,35.3,,38.8,12.7,...,15.8,39.5,,33.9,10.5,,9.8,15.7,37.4,
4,California,18.0,5.9,12.6,4.2,12.0,28.4,8.9,22.8,11.4,...,11.1,27.7,8.3,20.4,8.7,13.2,4.7,11.5,28.1,8.6


In [56]:
# Combine the chronic disease and census DataFrames on the shared 'State' key.
# NOTE(review): 'outer' presumably maps to an outer join, which keeps states present in only one input and fills
# the missing side with NaN - confirm against df_combo in data_wrangle.py.
df_final = df_combo(df_chronic_final, df_census_final, 'State', 'outer')

# Sanity check: each state must appear exactly once after the merge; duplicates would indicate a bad join key.
assert df_final['State'].is_unique, "Duplicate 'State' rows after merging the chronic disease and census data"

# Show the shape plus a short preview instead of dumping the entire DataFrame into the notebook output.
print(df_final.shape)
display(df_final.head(10))

Unnamed: 0,State,Males - Arthritis-DataValue,Males - Asthma-DataValue,Males - Bill Payment Instability-DataValue,Males - COPD-DataValue,Males - Diabetes-DataValue,Males - Obesity-DataValue,Males - Transportation Instability-DataValue,Females - Arthritis-DataValue,Females - Asthma-DataValue,...,est - Pop With Private Health Insurance - %,est - Pop With Public Health Insurance - %,est - Pop Uninsured - %,est - Pop 18 and Over Below Poverty - %,est - Occupied Housing Units,est - Households With No Vehicles - %,est - Households With 1 and Over Vehicles - %,est - Owner-Occupied With Mortgage,est - Mortgage Costs less than 30 percent Income - %,est - Mortgage Costs 30 precent or more of Income - %
0,Alabama,30.3,7.4,11.1,8.1,14.8,35.8,9.7,37.7,11.7,...,67.0,38.4,8.8,14.6,2016448,5.2,94.8,768014,77.0,23.0
1,Alaska,22.0,7.5,10.2,4.9,8.7,31.2,10.1,25.6,14.1,...,66.4,37.2,11.0,10.1,274574,9.1,90.9,110040,70.9,29.1
2,Arizona,21.9,7.1,8.4,5.5,14.0,32.7,7.4,29.1,12.3,...,63.9,38.3,10.3,11.5,2850377,5.1,94.9,1180749,73.7,26.3
3,Arkansas,28.6,8.1,,8.3,15.6,35.3,,38.8,12.7,...,60.0,45.0,8.4,15.2,1216207,6.5,93.5,424171,77.3,22.7
4,California,18.0,5.9,12.6,4.2,12.0,28.4,8.9,22.8,11.4,...,63.8,40.0,6.5,11.3,13550586,6.9,93.1,5044333,62.2,37.8
5,Colorado,21.3,7.9,,4.9,8.8,24.9,,26.3,13.7,...,70.5,33.3,7.1,9.0,2384584,5.0,95.0,1089369,70.6,29.4
6,Connecticut,23.0,8.9,10.0,4.6,11.6,30.4,6.9,30.6,15.7,...,68.9,37.6,5.2,9.2,1433635,9.0,91.0,610624,70.2,29.8
7,Delaware,21.8,7.0,9.1,5.6,15.0,35.3,6.8,33.1,12.5,...,71.8,39.5,5.6,8.5,402334,5.9,94.1,187123,75.8,24.2
8,Florida,24.8,5.1,9.4,6.7,14.0,32.4,7.8,31.8,13.2,...,63.6,37.5,11.2,11.6,8826394,6.0,94.0,3315723,65.8,34.2
9,Georgia,20.9,6.2,9.9,5.3,12.6,34.1,8.2,29.8,12.7,...,66.6,33.0,11.7,11.5,4092467,5.7,94.3,1705941,75.0,25.0


In [54]:
# Summary statistics for the numeric columns of the merged dataset.
# NOTE(review): this cell's execution count (54) is lower than that of the merge cell above (56), so the recorded
# output may predate the latest df_final - re-run with Restart Kernel -> Run All.
display(df_final.describe())

Unnamed: 0,Males - Arthritis-DataValue,Males - Asthma-DataValue,Males - Bill Payment Instability-DataValue,Males - COPD-DataValue,Males - Diabetes-DataValue,Males - Obesity-DataValue,Males - Transportation Instability-DataValue,Females - Arthritis-DataValue,Females - Asthma-DataValue,Females - Bill Payment Instability-DataValue,...,est - Pop With Private Health Insurance - %,est - Pop With Public Health Insurance - %,est - Pop Uninsured - %,est - Pop 18 and Over Below Poverty - %,est - Occupied Housing Units,est - Households With No Vehicles - %,est - Households With 1 and Over Vehicles - %,est - Owner-Occupied With Mortgage,est - Mortgage Costs less than 30 percent Income - %,est - Mortgage Costs 30 precent or more of Income - %
count,50.0,50.0,34.0,50.0,50.0,50.0,34.0,50.0,50.0,34.0,...,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0
mean,24.156,7.616,9.441176,6.29,12.174,33.238,7.217647,31.754,13.266,12.564706,...,68.476,37.474,7.404,11.474,2590879.0,7.034,92.966,1020545.0,73.31,26.69
std,3.451954,1.214262,2.003046,1.819537,1.957071,3.416819,1.484765,4.060824,1.685788,2.490939,...,5.109198,5.112003,2.690115,2.221951,2742351.0,3.714484,3.714484,1006118.0,4.530813,4.530813
min,18.0,4.8,5.7,3.3,8.5,24.9,3.2,22.8,9.8,8.7,...,54.4,22.2,2.4,7.3,243321.0,3.7,70.9,101037.0,58.2,18.7
25%,21.825,6.825,8.0,4.9,10.7,31.05,6.0,29.725,12.325,10.6,...,65.25,34.525,5.6,10.1,753045.0,5.225,92.8,315084.8,70.525,23.55
50%,23.7,7.7,9.25,6.0,12.0,33.1,7.2,31.65,13.0,12.85,...,68.75,37.35,6.75,11.35,1822791.0,6.25,93.75,706623.0,73.7,26.3
75%,26.3,8.25,10.725,7.025,13.3,35.3,8.15,33.275,14.1,14.6,...,72.55,39.9,8.75,12.1,3022559.0,7.2,94.775,1276155.0,76.45,29.475
max,35.7,11.4,13.2,12.9,17.3,41.6,10.1,44.3,17.0,17.6,...,78.4,51.2,16.6,16.9,13550590.0,29.1,96.3,5044333.0,81.3,41.8


In [None]:
# Save the merged dataset to the processed-data folder.
# NOTE(review): this cell has no execution count in the saved notebook, so the CSV on disk may be stale or missing -
# re-run to regenerate it.
save_df_to_csv(df_final, './data/processed/Final_dataset.csv')

In [99]:
# List the columns of the merged dataset, as a reference for building the
# Census Categories vs Diabetes visualization dataset.
print(df_final.columns)
# NOTE(review): the diabete_metrics step below is commented out, so no visualization dataset is built here yet -
# enable it or remove it.
# dia_met_df = diabete_metrics(df_final)

Index(['State', 'Males - Arthritis-DataValue', 'Males - Asthma-DataValue',
       'Males - Bill Payment Instability-DataValue', 'Males - COPD-DataValue',
       'Males - Diabetes-DataValue', 'Males - Obesity-DataValue',
       'Males - Transportation Instability-DataValue',
       'Females - Arthritis-DataValue', 'Females - Asthma-DataValue',
       'Females - Bill Payment Instability-DataValue',
       'Females - COPD-DataValue', 'Females - Diabetes-DataValue',
       'Females - Obesity-DataValue',
       'Females - Transportation Instability-DataValue',
       'Overall - Arthritis-DataValue', 'Overall - Asthma-DataValue',
       'Overall - Bill Payment Instability-DataValue',
       'Overall - COPD-DataValue', 'Overall - Diabetes-DataValue',
       'Overall - Obesity-DataValue',
       'Overall - Transportation Instability-DataValue', 'est - Total Pop',
       'est - Total Pop - Male - %', 'est - Total Pop - Female - %',
       'est - Total Pop 18 and Over - %',
       'est - Total P

# **Section: Visualization**