In [25]:
# (Re)load the autoreload extension; %reload_ext is safe to run even when the extension is already loaded.
%reload_ext autoreload
# Mode 2: reload imported modules before each cell executes, so edits to the project's .py files are picked up without restarting the kernel.
%autoreload 2

In [26]:
# Importing all of the required python packages
import sys
import os

In [27]:
# Make the project's ./lib folder importable.
# Guarded so that re-running this cell does not keep appending duplicate entries to sys.path.
if './lib' not in sys.path:
    sys.path.append('./lib')

# Import reusable helper functions from the .py modules in ./lib.
from data_loader import load_csv, save_df_to_csv
from data_wrangle import add_cols, df_formater, remove_cols, remove_rows, col_name_changer, df_split

Section: Loading and converting data sets into DataFrames.

In [28]:
# Load the raw U.S. Chronic Disease Indicators CSV through the project's load_csv helper.
df_indicators = load_csv('./data/raw/U.S._Chronic_Disease_Indicators.csv')
# display(df_indicators.head())    # For debugging only - comment out when not needed.

In [29]:
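# NOTE(review): disabled load of the earlier census file (v01); the v03 transpose file is loaded in the next cell.
# Delete this cell if the v01 file is no longer needed.
# df2 = load_csv('./data/raw/US_Census_Data_2022_v01.csv')
# print(df2.head())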

In [30]:
# Load the transposed 2022 U.S. Census CSV (v03) through the project's load_csv helper.
df_census = load_csv('./data/raw/US_Census_Data_2022_v03_transpose.csv')
# display(df_census.head())    # For debugging only - comment out when not needed.

Section: Data Wrangling

Census DataFrame: df_census
1. Adding the 'State' column to the dataframe.
2. Adding the correct state name to the newly created State column.
3. Removing blank columns
4. Removing blank rows
5. Splitting the DataFrame into 2 separate DataFrames: 1 with state and 1 with city, state
6. Saving both DataFrames as 2 CSV files under data/processed/

In [31]:
# Step 1 of the list above: add the 'State' column to the census DataFrame via add_cols.
col_names = ['State']
df_census_temp0 = add_cols(df_census, col_names)
# display(df_census_temp0)    # For debugging only - comment out when not needed.

In [32]:
# Step 2 of the list above: df_formater is expected to fill the new State column
# with the state name - TODO confirm against lib/data_wrangle.py.
df_census_temp1 = df_formater(df_census_temp0)
# display(df_census_temp1)    # For debugging only - comment out when not needed.

In [36]:
# Step 3 of the list above: remove blank columns (per the notebook's step list).
# The assignment must stay active: with it commented out, df_census_temp2 is
# undefined on a fresh kernel and this cell fails with NameError.
df_census_temp2 = remove_cols(df_census_temp1)
# display(df_census_temp2)    # For debugging only - comment out when not needed.

In [37]:
# Step 4 of the list above: remove blank rows (per the notebook's step list).
# The assignment must stay active: with it commented out, df_census_temp3 is
# undefined on a fresh kernel, and both this cell and the column-renaming cell
# that reads df_census_temp3 fail with NameError.
df_census_temp3 = remove_rows(df_census_temp2)
# display(df_census_temp3)    # For debugging only - comment out when not needed.

In [None]:
# Presumably swaps the "!!" separator for " -- " in the census column names -
# TODO confirm against col_name_changer in lib/data_wrangle.py.
og_string, new_string = "!!", " -- "
df_census_temp4 = col_name_changer(df_census_temp3, og_string, new_string)
# display(df_census_temp4)     # For debugging only - comment out when not needed.

In [None]:
# Step 5 of the list above: df_split returns two DataFrames, unpacked here as the
# state-only table and the city, state table.
df_state_only, df_state_city = df_split(df_census_temp4)
# display(df_state_only)    # For debugging only - comment out when not needed.
# display(df_state_city)    # For debugging only - comment out when not needed.

In [None]:
# Step 6 of the list above: write the state-only DataFrame to data/processed/.
save_df_to_csv(df_state_only, './data/processed/US_Census_Data_2022_state_only.csv')

In [None]:
# Step 6 of the list above: write the city, state DataFrame to data/processed/.
save_df_to_csv(df_state_city, './data/processed/US_Census_Data_2022_city_state.csv')

**Section: Data Pre-processing and Exploratory Data Analysis**

In [39]:
# Importing visual.py for creating the EDA visualizations

from lib import visual as vis
import pandas as pd

In [24]:
# Load the final dataset used for the EDA below.
# NOTE(review): no cell in the visible part of this notebook writes Final_dataset.csv -
# confirm where it is produced.

df = pd.read_csv('./data/processed/Final_dataset.csv')

In [22]:
# Confirm the dimensions of the final dataset as (rows, columns).

df.shape

(50, 327)

In [40]:
# Descriptive statistics for the numeric variables.
# .T transposes the result so each variable is a row and each statistic a column.

df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Males - Arthritis-DataValue,50.0,24.156000,3.451954,18.0,21.825,23.70,26.300,35.7
Males - Asthma-DataValue,50.0,7.616000,1.214262,4.8,6.825,7.70,8.250,11.4
Males - Bill Payment Instability-DataValue,34.0,9.441176,2.003046,5.7,8.000,9.25,10.725,13.2
Males - COPD-DataValue,50.0,6.290000,1.819537,3.3,4.900,6.00,7.025,12.9
Males - Diabetes-DataValue,50.0,12.174000,1.957071,8.5,10.700,12.00,13.300,17.3
...,...,...,...,...,...,...,...,...
Asian - COPD-HighConfidenceLimit,3.0,3.066667,0.404145,2.6,2.950,3.30,3.300,3.3
Asian - Diabetes-HighConfidenceLimit,15.0,16.233333,4.942334,9.8,12.050,16.60,18.850,26.7
Asian - Obesity-HighConfidenceLimit,22.0,22.045455,6.707475,15.1,16.925,21.05,23.525,41.8
Asian - Transportation Instability-HighConfidenceLimit,4.0,8.200000,2.333809,6.1,6.250,8.05,10.000,10.6


In [41]:
# Summarise the DataFrame: index range, column count, dtype breakdown and memory usage.

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Columns: 327 entries, State to moe - Mortgage Costs 30 precent or more of Income
dtypes: float64(211), object(116)
memory usage: 127.9+ KB


In [43]:
# Count fully duplicated rows (duplicated() compares all columns by default); 0 means every row is unique.

df.duplicated().sum()

np.int64(0)

In [46]:
# Overall share of missing cells: total missing-value count divided by total cell count.

df.isna().sum().sum() / df.size

np.float64(0.23669724770642203)

**Observation:**
- Approximately 24% (23.7%) of all values in the dataset are missing

In [47]:
# Fraction of missing values per column, showing the 20 columns with the most missing data first.

(
    df.isna()
    .mean()
    .sort_values(ascending=False)
    .head(20)
)

Hawaiian or Pacific Islander - Transportation Instability-HighConfidenceLimit    1.00
Hawaiian or Pacific Islander - Transportation Instability-DataValue              1.00
Hawaiian or Pacific Islander - Transportation Instability-LowConfidenceLimit     1.00
Hawaiian or Pacific Islander - COPD-LowConfidenceLimit                           0.98
Hawaiian or Pacific Islander - Bill Payment Instability-LowConfidenceLimit       0.98
Hawaiian or Pacific Islander - Asthma-LowConfidenceLimit                         0.98
Hawaiian or Pacific Islander - COPD-HighConfidenceLimit                          0.98
Hawaiian or Pacific Islander - Bill Payment Instability-HighConfidenceLimit      0.98
Hawaiian or Pacific Islander - Asthma-HighConfidenceLimit                        0.98
Hawaiian or Pacific Islander - COPD-DataValue                                    0.98
Hawaiian or Pacific Islander - Bill Payment Instability-DataValue                0.98
Hawaiian or Pacific Islander - Asthma-DataValue       