In [1]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import statsmodels.api as sm
from statsmodels.formula.api import ols
import seaborn as sns
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
import warnings
# NOTE(review): called with only the "ignore" action, this filter suppresses
# every warning category for the rest of the session, including any raised by
# the pandas calls in later cells. Consider restricting it to the specific
# categories that are known to be noise.
warnings.filterwarnings("ignore")

In [2]:
# Read the merged dataset; the relative path is resolved against the current
# working directory.
covid_df = pd.read_csv(filepath_or_buffer='merged_df_ashv5.csv')

# Echo the (rows, columns) tuple, then leave the 20-row preview as the cell's
# final expression so the notebook renders it as a table.
print(covid_df.shape)
covid_df.head(n=20)

(2555890, 181)


Unnamed: 0,Date,Completeness_pct,Administered_Dose1_Recip,Administered_Dose1_Recip_5Plus,Administered_Dose1_Recip_12Plus,Administered_Dose1_Recip_18Plus,Administered_Dose1_Recip_65Plus,Series_Complete_Yes,Series_Complete_5Plus,Series_Complete_5to17,...,AGE8084_MALE,AGE8084_FEM,AGE85PLUS_TOT,AGE85PLUS_MALE,AGE85PLUS_FEM,MEDIAN_AGE_TOT,MEDIAN_AGE_MALE,MEDIAN_AGE_FEM,GDP_current_dollar,personal_income
0,2020-01-21,,,,,,,,,,...,112,122,169,80,89,36.9,36.1,37.5,1391880,49571
1,2020-01-21,,,,,,,,,,...,232,246,444,136,308,53.0,56.4,44.7,387976,30660
2,2020-01-21,,,,,,,,,,...,65,105,107,41,66,46.0,45.7,46.6,141553,45270
3,2020-01-21,,,,,,,,,,...,2586,2657,4582,2129,2453,38.8,37.9,39.8,7743796,45360
4,2020-01-21,,,,,,,,,,...,291,337,529,206,323,42.2,42.0,42.6,750765,38304
5,2020-01-21,,,,,,,,,,...,69,113,143,44,99,33.9,31.4,37.8,265515,40074
6,2020-01-21,,,,,,,,,,...,115,108,200,85,115,40.7,40.2,41.8,782349,53517
7,2020-01-21,,,,,,,,,,...,360,496,849,309,540,38.0,38.0,38.1,4553170,62252
8,2020-01-21,,,,,,,,,,...,907,1243,2118,795,1323,40.5,39.8,41.3,7007576,45759
9,2020-01-21,,,,,,,,,,...,43,53,162,51,111,37.2,37.7,36.4,297666,50025


In [3]:
# List every column name of the merged frame.
# No `sep` argument is passed: print() inserts `sep` only between multiple
# positional arguments, and a single array is printed here.
print(covid_df.columns.values)

['Date' 'Completeness_pct' 'Administered_Dose1_Recip'
 'Administered_Dose1_Recip_5Plus' 'Administered_Dose1_Recip_12Plus'
 'Administered_Dose1_Recip_18Plus' 'Administered_Dose1_Recip_65Plus'
 'Series_Complete_Yes' 'Series_Complete_5Plus' 'Series_Complete_5to17'
 'Series_Complete_12Plus' 'Series_Complete_18Plus'
 'Series_Complete_65Plus' 'Booster_Doses' 'Booster_Doses_5Plus'
 'Booster_Doses_12Plus' 'Booster_Doses_18Plus' 'Booster_Doses_50Plus'
 'Booster_Doses_65Plus' 'Second_Booster_50Plus' 'Second_Booster_65Plus'
 'SVI_CTGY' 'Metro_status' 'Series_Complete_Pop_Pct_UR_Equity'
 'Series_Complete_5PlusPop_Pct_UR_Equity'
 'Series_Complete_5to17Pop_Pct_UR_Equity'
 'Series_Complete_12PlusPop_Pct_UR_Equity'
 'Series_Complete_18PlusPop_Pct_UR_Equity'
 'Series_Complete_65PlusPop_Pct_UR_Equity' 'Booster_Doses_Vax_Pct_SVI'
 'Booster_Doses_12PlusVax_Pct_SVI' 'Booster_Doses_18PlusVax_Pct_SVI'
 'Booster_Doses_65PlusVax_Pct_SVI' 'Booster_Doses_Vax_Pct_UR_Equity'
 'Booster_Doses_12PlusVax_Pct_UR_Equity

In [4]:
# Per-column count of missing entries across the merged frame.
missing = covid_df.isna().sum()

# Print the counts via to_string() instead of the default Series repr.
print(missing.to_string())

Date                                             0
Completeness_pct                            963040
Administered_Dose1_Recip                   1007876
Administered_Dose1_Recip_5Plus             2121838
Administered_Dose1_Recip_12Plus            1057706
Administered_Dose1_Recip_18Plus            1039754
Administered_Dose1_Recip_65Plus            1039728
Series_Complete_Yes                         964414
Series_Complete_5Plus                      2105566
Series_Complete_5to17                      2438822
Series_Complete_12Plus                      982366
Series_Complete_18Plus                      964414
Series_Complete_65Plus                      964414
Booster_Doses                              2125602
Booster_Doses_5Plus                        2555890
Booster_Doses_12Plus                       2261654
Booster_Doses_18Plus                       2125602
Booster_Doses_50Plus                       2125602
Booster_Doses_65Plus                       2125602
Second_Booster_50Plus          

In [5]:
# Sub-frame holding only the numeric columns of the merged data.
# Uses the public select_dtypes() API rather than the private
# DataFrame._get_numeric_data() helper, which is not part of the documented
# interface. 'bool' is listed next to 'number' so boolean columns are kept too.
# NOTE(review): the private helper is understood to treat bool as numeric as
# well; confirm against the installed pandas version.
num_cols = covid_df.select_dtypes(include=['number', 'bool'])

# No `sep` argument is passed: print() inserts `sep` only between multiple
# positional arguments, and a single array is printed here.
print(num_cols.columns.values)

['Completeness_pct' 'Administered_Dose1_Recip'
 'Administered_Dose1_Recip_5Plus' 'Administered_Dose1_Recip_12Plus'
 'Administered_Dose1_Recip_18Plus' 'Administered_Dose1_Recip_65Plus'
 'Series_Complete_Yes' 'Series_Complete_5Plus' 'Series_Complete_5to17'
 'Series_Complete_12Plus' 'Series_Complete_18Plus'
 'Series_Complete_65Plus' 'Booster_Doses' 'Booster_Doses_5Plus'
 'Booster_Doses_12Plus' 'Booster_Doses_18Plus' 'Booster_Doses_50Plus'
 'Booster_Doses_65Plus' 'Second_Booster_50Plus' 'Second_Booster_65Plus'
 'Series_Complete_Pop_Pct_UR_Equity'
 'Series_Complete_5PlusPop_Pct_UR_Equity'
 'Series_Complete_5to17Pop_Pct_UR_Equity'
 'Series_Complete_12PlusPop_Pct_UR_Equity'
 'Series_Complete_18PlusPop_Pct_UR_Equity'
 'Series_Complete_65PlusPop_Pct_UR_Equity' 'Booster_Doses_Vax_Pct_SVI'
 'Booster_Doses_12PlusVax_Pct_SVI' 'Booster_Doses_18PlusVax_Pct_SVI'
 'Booster_Doses_65PlusVax_Pct_SVI' 'Booster_Doses_Vax_Pct_UR_Equity'
 'Booster_Doses_12PlusVax_Pct_UR_Equity'
 'Booster_Doses_18PlusVax_Pct_U

In [6]:
# Keep only the rows whose STATE value is one of the ten codes below; every
# other state is dropped.
# NOTE(review): the values look like FIPS state codes for the ten most
# populous states; confirm what "Top 10" is ranked by.
top10_state_codes = [6, 48, 12, 36, 42, 17, 39, 13, 37, 26]
in_top10 = covid_df['STATE'].isin(top10_state_codes)
covid_df_uni_pre = covid_df.loc[in_top10]

In [7]:
#covid_df['Date'] = pd.to_datetime(covid_df['Date'])

In [8]:
#covid_df = covid_df.set_index('Date')
#covid_df.index = pd.to_datetime(covid_df.index)

In [9]:
#covid_df

In [10]:
# Project the state-filtered frame down to the date, the state code and the
# new-positive-cases count, then display the result.
univariate_columns = ['Date', 'STATE', 'PEOPLE_POSITIVE_NEW_CASES_COUNT']
covid_univariate_df = covid_df_uni_pre[univariate_columns]
covid_univariate_df

Unnamed: 0,Date,STATE,PEOPLE_POSITIVE_NEW_CASES_COUNT
1,2020-01-21,39,0
2,2020-01-21,17,0
5,2020-01-21,48,0
7,2020-01-21,39,0
8,2020-01-21,36,0
...,...,...,...
2555873,2022-04-29,37,0
2555876,2022-04-29,12,0
2555878,2022-04-29,37,0
2555884,2022-04-29,36,65


In [11]:
# Write the three-column extract to a CSV in the working directory;
# index=False keeps the row index out of the file.
covid_univariate_df.to_csv(path_or_buf='covid_univariate_df.csv', index=False)