Import packages first.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

Read CSV and create pandas DataFrame.

In [5]:
# Load the filtered project CSV into the main working DataFrame and display it.
csv_path = r'../project-filtered-data.csv'
data = pd.read_csv(csv_path)
data

Unnamed: 0,case_month,res_state,state_fips_code,res_county,county_fips_code,age_group,sex,race,ethnicity,case_positive_specimen_interval,case_onset_interval,process,exposure_yn,current_status,symptom_status,hosp_yn,icu_yn,death_yn,underlying_conditions_yn
0,2020-05,IL,17,LAKE,17097.0,65+ years,Female,White,Non-Hispanic/Latino,,0.0,Missing,Yes,Laboratory-confirmed case,Missing,Yes,Yes,Yes,Yes
1,2020-07,KS,20,SHAWNEE,20177.0,18 to 49 years,Male,White,Non-Hispanic/Latino,1.0,0.0,Multiple,Yes,Laboratory-confirmed case,Symptomatic,No,No,No,Yes
2,2020-05,UT,49,DAVIS,49011.0,18 to 49 years,Male,White,Hispanic/Latino,1.0,0.0,Clinical evaluation,Yes,Laboratory-confirmed case,Symptomatic,No,No,No,Yes
3,2021-09,OH,39,DEFIANCE,39039.0,0 - 17 years,Female,White,Non-Hispanic/Latino,1.0,0.0,Multiple,Yes,Laboratory-confirmed case,Symptomatic,No,No,No,No
4,2021-12,PA,42,SUSQUEHANNA,42115.0,50 to 64 years,Male,White,Non-Hispanic/Latino,0.0,0.0,Missing,Missing,Laboratory-confirmed case,Symptomatic,No,No,No,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
483782,2020-11,OH,39,MADISON,39097.0,18 to 49 years,Female,White,Non-Hispanic/Latino,,0.0,Missing,Unknown,Laboratory-confirmed case,Symptomatic,No,No,No,Yes
483783,2021-12,OH,39,LUCAS,39095.0,65+ years,Male,White,Non-Hispanic/Latino,0.0,0.0,Multiple,Yes,Laboratory-confirmed case,Symptomatic,No,No,No,Yes
483784,2021-12,OH,39,JACKSON,39079.0,65+ years,Female,White,Non-Hispanic/Latino,1.0,0.0,Multiple,Yes,Laboratory-confirmed case,Symptomatic,No,No,No,Yes
483785,2021-09,OH,39,FAYETTE,39047.0,0 - 17 years,Male,White,Non-Hispanic/Latino,0.0,0.0,Clinical evaluation,Yes,Laboratory-confirmed case,Symptomatic,No,No,No,No


Check for any missing values.

In [8]:
# One row per column of `data`; True where that column holds at least one NA value.
data.isna().any().to_frame('any_na')

Unnamed: 0,any_na
case_month,False
res_state,False
state_fips_code,False
res_county,True
county_fips_code,True
age_group,False
sex,False
race,True
ethnicity,True
case_positive_specimen_interval,True


Determine the percentage of NA values relative to the total number of observations for each attribute of the data set.

In [18]:
# Per-column NA count, plus that count expressed as a percentage of all rows in `data`.
(
    data.isna().sum()
    .to_frame('na_sum')
    .assign(percent_na=lambda counts: (counts['na_sum'] / data.shape[0]) * 100)
)

Unnamed: 0,na_sum,percent_na
case_month,0,0.0
res_state,0,0.0
state_fips_code,0,0.0
res_county,30572,6.31931
county_fips_code,30572,6.31931
age_group,0,0.0
sex,0,0.0
race,39143,8.090957
ethnicity,43783,9.050057
case_positive_specimen_interval,117983,24.387385


Check data types for each attribute of the data.

In [19]:
# One row per column of `data`, showing the dtype pandas assigned to it.
data.dtypes.to_frame('data_types')

Unnamed: 0,data_types
case_month,object
res_state,object
state_fips_code,int64
res_county,object
county_fips_code,float64
age_group,object
sex,object
race,object
ethnicity,object
case_positive_specimen_interval,float64


Determine unique values for _yn attributes of the data.

In [20]:
# Distinct values present in the exposure_yn column, in order of first appearance.
exposure_values = data['exposure_yn'].unique()
pd.DataFrame({'exposure_yn_unique_values': exposure_values})

Unnamed: 0,exposure_yn_unique_values
0,Yes
1,Missing
2,Unknown


In [21]:
# Distinct values present in the hosp_yn column, in order of first appearance.
hosp_values = data['hosp_yn'].unique()
pd.DataFrame({'hosp_yn': hosp_values})

Unnamed: 0,hosp_yn
0,Yes
1,No


In [22]:
# Distinct values present in the icu_yn column, in order of first appearance.
icu_values = data['icu_yn'].unique()
pd.DataFrame({'icu_yn_unique_values': icu_values})

Unnamed: 0,icu_yn_unique_values
0,Yes
1,No
2,nul


In [23]:
# Distinct values present in the death_yn column, in order of first appearance.
death_values = data['death_yn'].unique()
pd.DataFrame({'death_yn_unique_values': death_values})

Unnamed: 0,death_yn_unique_values
0,Yes
1,No


In [24]:
# Distinct values present in the underlying_conditions_yn column, in order of first appearance.
underlying_values = data['underlying_conditions_yn'].unique()
pd.DataFrame({'underlying_conditions_yn_unique_values': underlying_values})

Unnamed: 0,underlying_conditions_yn_unique_values
0,Yes
1,No


It looks like some of these _yn features have missing observations as well; they didn't appear in our search for na values because they aren't na/null, but are actual string/object values indicating a missing observation, such as "Missing" and "nul." 

Let's filter the data further so these missing/nul values are excluded.

In [28]:
# Build a filtered copy of the data, leaving `data` itself untouched: keep only
# rows whose exposure_yn is not the string 'Missing' and whose icu_yn is not the
# string 'nul'.
rows_to_keep = (data['exposure_yn'] != 'Missing') & (data['icu_yn'] != 'nul')
data2 = data.copy(deep=True)[rows_to_keep]

# Report the row counts before and after filtering, and how many rows were removed.
print(f'data number of observations: {data.shape[0]}')
print(f'data2 number of observations: {data2.shape[0]}')
print(f'difference between data and data2 number of observations: {data.shape[0] - data2.shape[0]}')

data number of observations: 483787
data2 number of observations: 314877
difference between data and data2 number of observations: 168910


Removing missing/nul values in the _yn attributes from the data set seems to have cut out 168,910 observations. We should assign ```data2``` to our main ```data``` variable.

In [29]:
# Promote the filtered frame to the main working variable. No copy is made here:
# after this assignment `data` and `data2` refer to the same DataFrame object.
data = data2
# Confirm the new row count of the working frame.
print(f'data number of observations: {data.shape[0]}')

data number of observations: 314877


Now, let's check the other attributes in the data, to make sure we didn't miss any nul/missing values in those too.

In [46]:
# One small frame per column of `data`: the column's name (repeated on every row)
# in 'col', next to that column's distinct values in 'unique_values'.
data_frames = [
    pd.DataFrame({'col': column, 'unique_values': data[column].unique()})
    for column in data.columns
]

In [47]:
# Collect the per-column frames whose distinct values include one of the string
# placeholders used for missing observations ('Missing' or 'nul').
#
# The placeholders are looked up directly in the 'unique_values' column. The
# previous dtype guard inspected the first column of each frame ('col', which
# holds the column label rather than the data), so it never actually tested the
# data column, and the values were sorted even though a membership test does not
# need ordering. Numeric columns simply produce no match here.
placeholder_values = ['Missing', 'nul']
missing_nul_dfs = []
for df in data_frames:
    if df['unique_values'].isin(placeholder_values).any():
        missing_nul_dfs.append(df)

In [48]:
# Independent (deep) copies of the flagged frames; DataFrame.copy() is deep by default.
missing_nul_dfs2 = [flagged.copy() for flagged in missing_nul_dfs]

In [60]:
# For every flagged column, drop the rows of data2 whose value in that column is
# the string 'Missing' or the string 'nul'. Each flagged frame carries its column
# name in the first row of 'col'.
for df2 in missing_nul_dfs2:
    flagged_column = df2['col'].iloc[0]
    is_placeholder = data2[flagged_column].isin(['Missing', 'nul'])
    data2 = data2[~is_placeholder]

# Report the row counts of both frames and how many rows the filter removed.
print(f'data number of observations: {data.shape[0]}')
print(f'data2 number of observations: {data2.shape[0]}')
print(f'difference between data and data2 number fo observations: {data.shape[0] - data2.shape[0]}')

data number of observations: 314877
data2 number of observations: 190872
difference between data and data2 number fo observations: 124005


Cutting the 'Missing' and 'nul' values from the data2 variable seems to have decreased the number of observations by 124,005 records. Note that this only considers the attributes that contain 'object'-type data.

We should assign data2 to data to make sure our main data file is consistent.

In [61]:
# Promote the filtered frame to the main working variable.
# Bind `data` to an independent copy rather than to the same object as `data2`:
# with a plain `data = data2`, any later in-place change to `data` would silently
# change `data2` as well, and in-place operations on a frame produced by boolean
# filtering can trigger pandas' SettingWithCopyWarning.
data = data2.copy()
# Confirm the new row count of the working frame.
print(f'data number of observations: {data.shape[0]}')

data number of observations: 190872


Now, let's recount the number of nans in each attribute.

In [62]:
# Recount NA values per column after filtering, with the count also expressed
# as a percentage of the rows currently in `data`.
(
    data.isna().sum()
    .to_frame('num_nas')
    .assign(percent_na=lambda counts: (counts['num_nas'] / data.shape[0]) * 100)
)

Unnamed: 0,num_nas,percent_na
case_month,0,0.0
res_state,0,0.0
state_fips_code,0,0.0
res_county,9972,5.224444
county_fips_code,9972,5.224444
age_group,0,0.0
sex,0,0.0
race,14677,7.689446
ethnicity,16184,8.478981
case_positive_specimen_interval,64680,33.886584


Now, let's drop all these nan records from the data.

In [63]:
# Drop every row that still contains at least one NA value.
# Reassign the result instead of using inplace=True: `data` was produced by
# boolean filtering, where in-place mutation can trigger SettingWithCopyWarning,
# and reassignment leaves any other name bound to the same frame untouched.
data = data.dropna()
# Confirm the row count after removing NA rows.
print(f'data number of observations: {data.shape[0]}')

data number of observations: 97697


Lastly, let's make sure there aren't any duplicates.

In [70]:
# Flag rows that exactly repeat an earlier row, then count the non-null entries
# per column among those flagged rows.
duplicate_mask = data.duplicated()
data.loc[duplicate_mask].count()

case_month                         48085
res_state                          48085
state_fips_code                    48085
res_county                         48085
county_fips_code                   48085
age_group                          48085
sex                                48085
race                               48085
ethnicity                          48085
case_positive_specimen_interval    48085
case_onset_interval                48085
process                            48085
exposure_yn                        48085
current_status                     48085
symptom_status                     48085
hosp_yn                            48085
icu_yn                             48085
death_yn                           48085
underlying_conditions_yn           48085
dtype: int64

We should drop the duplicate data.