In [None]:
# This reloads the extension if it is already loaded - every time you update the .py files, re-run this cell!
%reload_ext autoreload
# Automatically reloads all modules before executing code, i.e. Jupyter picks up changes to your .py files whenever you run a cell.
%autoreload 2

In [None]:
# Importing all of the required python packages
import sys
import os
# import pandas as pd

In [None]:
# Setting up the folder paths: make the helper modules in ./lib importable.
# The membership check keeps this cell idempotent - re-running it does not
# pile duplicate './lib' entries onto sys.path.
if './lib' not in sys.path:
    sys.path.append('./lib')

# ---- Import functions within .py files.
# From data_loader.py
from data_loader import load_csv, save_df_to_csv

# From data_wrangle.py
# ---- Section 1: Modular Functions ----
from data_wrangle import add_cols, remove_cols, remove_rows, remove_nan_cols, remove_nan_rows, col_name_changer, remove_leading_wspace, df_split, df_combo, grab_cols_for_visual, df_transpose
# ---- Section 2: Specific Functions ----
from data_wrangle import df_formater, df_split_state_city, census_filter_cols, census_rename_cols

Section: Loading and converting data sets into DataFrames.

In [None]:
# Load the raw U.S. Chronic Disease Indicators CSV with the project's load_csv helper.
# NOTE(review): df_indicators is not referenced by any later cell visible in this
# section - confirm it is used further down (e.g. Visualization) or drop the load.
df_indicators = load_csv('./data/raw/U.S._Chronic_Disease_Indicators.csv')

In [None]:
# Load the raw 2022 U.S. Census CSV; df_census is the input to the Data Wrangling cells.
df_census = load_csv('./data/raw/US_Census_Data_2022_v04_transpose.csv')

In [None]:
# Load the chronic disease dataset from data/processed; it is merged with the
# census data on 'State' to build the final dataset.
# NOTE(review): no cell visible here writes this file - presumably an earlier
# step produces it; confirm it exists before a fresh Run All.
df_chronic = load_csv('./data/processed/Chronic_Disease_Final.csv')

Section: Data Wrangling

Census DataFrame: df_census
1. Adding the 'State' column to the dataframe.
2. Adding the correct state name to the newly created State column.
3. Removing blank columns.
4. Removing blank rows.
5. Split the DataFrame into 2 separate DataFrames: one with state-level rows and one with "city, state" rows. The city/state DataFrame is no longer needed.
6. Split the state DataFrame into 2 separate DataFrames (both keyed by State): one for Estimate and one for Margin of Error.
7. Combine the Estimate and Margin of Error DataFrames back into one and save it as a csv file in the /data/processed folder.

In [None]:
# Step 1: add the (new) 'State' column to the census data; it is populated in the next cell.
col_names = ['State']
df_census_temp0 = add_cols(df_census, col_names)
# display(df_census_temp0)    # For debugging only - comment out when not needed.

In [None]:
# Step 2: fill the newly created 'State' column with the correct state name.
df_census_temp1 = df_formater(df_census_temp0)
# display(df_census_temp1)    # For debugging only - comment out when not needed.

In [None]:
# Step 3: remove blank columns.
df_census_temp2 = remove_nan_cols(df_census_temp1)
# display(df_census_temp2)    # For debugging only - comment out when not needed.

In [None]:
# Step 4: remove blank rows.
df_census_temp3 = remove_nan_rows(df_census_temp2)
# display(df_census_temp3)    # For debugging only - comment out when not needed.

In [None]:
# Step 5: split the census data, using the 'State' column, into a state-level
# DataFrame and a "city, state" DataFrame. Only df_state_only is used by the
# later cells shown here.
df_state_only, df_city_state = df_split_state_city(df_census_temp3, 'State')
# display(df_state_only)    # For debugging only - comment out when not needed.
# display(df_city_state)    # For debugging only - comment out when not needed.

In [None]:
# Remove the rows whose 'State' value is a location we do not need.
state_remove = ['Guam', 'District of Columbia', 'Puerto Rico', 'United States', 'Virgin Islands']
df_state_only_updated = remove_rows(df_state_only, 'State', state_remove)
display(df_state_only_updated)

In [None]:
# Keep only the columns we care about (the selection itself lives in census_filter_cols).
df_state_only_cols = census_filter_cols(df_state_only_updated)
display(df_state_only_cols)
# print(df_state_only_cols.columns)    # For debugging only - lists the retained column labels.

In [None]:
# Rename the column labels to friendlier versions (the mapping lives in census_rename_cols).
df_state_only_final = census_rename_cols(df_state_only_cols)
display(df_state_only_final)

In [None]:
# Step 6: split the dataset into 2 DataFrames - one with the estimate values,
# one with the margin of error (moe) values.
# Presumably df_split selects rows by their 'Label (Grouping)' value
# ('Estimate' vs 'Margin of Error') - confirm against data_wrangle.py.
df_state_only_estimate, df_state_only_moe = df_split(df_state_only_final, 'Label (Grouping)', 'Estimate', 'Margin of Error')
# display(df_state_only_estimate)         # For debugging only - comment out when not needed.
# display(df_state_only_moe)              # For debugging only - comment out when not needed.

In [None]:
# Prepare the two halves for merging: tag every column with its origin
# ('est - ' / 'moe - '), restore the shared 'State' key to its plain name, and
# drop the 'Label (Grouping)' column. The merged result will carry double the
# columns (one estimate and one moe column per measure).
df_state_only_estimate_updated = (
    df_state_only_estimate
    .add_prefix('est - ')
    .rename(columns={'est - State': 'State'})
    .drop(columns=['est - Label (Grouping)'])
)
df_state_only_moe_updated = (
    df_state_only_moe
    .add_prefix('moe - ')
    .rename(columns={'moe - State': 'State'})
    .drop(columns=['moe - Label (Grouping)'])
)
# display(df_state_only_estimate_updated)    # For debugging only - comment out when not needed.
# display(df_state_only_moe_updated)         # For debugging only - comment out when not needed.

In [None]:
# Combine the est and moe DataFrames into one, matched on 'State'.
# Presumably df_combo performs a join on the given key with the given
# method ('outer') - confirm against data_wrangle.py.
df_state_only_combined = df_combo(df_state_only_estimate_updated, df_state_only_moe_updated, 'State', 'outer')
display(df_state_only_combined)

In [None]:
# Step 7: save the combined state-level census data to the processed folder.
save_df_to_csv(df_state_only_combined, './data/processed/US_Census_Data_2022_state_only_combined.csv')

In [None]:
# Join the chronic disease data with the combined census data on 'State'.
# NOTE(review): if df_combo wraps a pandas merge, 'outer' keeps states that
# appear in only one of the two frames (with NaN for the other side). Any
# location removed from the census data earlier (e.g. 'Guam', 'United States')
# that still exists in df_chronic would come back as a row with empty census
# columns - confirm 'outer' is intended rather than 'inner'/'left'.
df_final = df_combo(df_chronic, df_state_only_combined, 'State', 'outer')
display(df_final)

In [None]:
# Save the merged chronic disease + census dataset to the processed folder.
save_df_to_csv(df_final, './data/processed/Final_dataset.csv')

Section: Visualization