# Import Data and Libraries

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

In [2]:
# Create path to folder
path = r'/Users/caitlin/iCloud/Caitlin/COVID Data'

In [3]:
path

'/Users/caitlin/iCloud/Caitlin/COVID Data'

# Upload Data - COVID Cases and Deaths

In [6]:
# Import data file
COVID_cases = pd.read_csv(os.path.join(path, 'COVID_cases_deaths.csv'), index_col = False)

In [7]:
# Show all columns
pd.options.display.max_columns = None

In [8]:
# Examine shape
COVID_cases.shape

(47100, 15)

In [9]:
# Check info
COVID_cases.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47100 entries, 0 to 47099
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   submission_date  47100 non-null  object 
 1   state            47100 non-null  object 
 2   tot_cases        47100 non-null  int64  
 3   conf_cases       25784 non-null  float64
 4   prob_cases       25712 non-null  float64
 5   new_case         47100 non-null  int64  
 6   pnew_case        43316 non-null  float64
 7   tot_death        47100 non-null  int64  
 8   conf_death       25408 non-null  float64
 9   prob_death       25408 non-null  float64
 10  new_death        47100 non-null  int64  
 11  pnew_death       43301 non-null  float64
 12  created_at       47100 non-null  object 
 13  consent_cases    39245 non-null  object 
 14  consent_deaths   40035 non-null  object 
dtypes: float64(6), int64(4), object(5)
memory usage: 5.4+ MB


In [10]:
# Examine columns for covid cases
COVID_cases.head()

Unnamed: 0,submission_date,state,tot_cases,conf_cases,prob_cases,new_case,pnew_case,tot_death,conf_death,prob_death,new_death,pnew_death,created_at,consent_cases,consent_deaths
0,09/09/2021,NC,1277752,1085076.0,192676.0,7785,1730.0,15703,13811.0,1892.0,69,13.0,09/09/2021 12:00:00 AM,Agree,Agree
1,09/01/2021,ND,118491,107475.0,11016.0,536,66.0,1562,,,1,0.0,09/02/2021 01:49:05 PM,Agree,Not agree
2,03/18/2020,ME,44,44.0,0.0,12,0.0,0,0.0,0.0,0,0.0,03/20/2020 12:00:00 AM,Agree,Agree
3,02/06/2020,NE,0,,,0,,0,,,0,,03/26/2020 04:22:39 PM,Agree,Agree
4,02/02/2021,IL,1130917,1130917.0,0.0,2304,0.0,21336,19306.0,2030.0,63,16.0,02/03/2021 02:55:58 PM,Agree,Agree


# Consistency Checks and Cleaning - Stay at Home Orders

## Drop Columns

In [11]:
# Drop unnecessary columns
## Only using total for each, not presumed or probable.
COVID_cases_2 = COVID_cases.drop(columns = ['created_at', 'conf_cases', 'prob_cases', 'pnew_case', 'conf_death','prob_death', 'pnew_death', 'consent_cases', 'consent_deaths'])

In [12]:
COVID_cases_2.head()

Unnamed: 0,submission_date,state,tot_cases,new_case,tot_death,new_death
0,09/09/2021,NC,1277752,7785,15703,69
1,09/01/2021,ND,118491,536,1562,1
2,03/18/2020,ME,44,12,0,0
3,02/06/2020,NE,0,0,0,0
4,02/02/2021,IL,1130917,2304,21336,63


## Find Duplicates

In [13]:
# Find duplicates
COVID_cases_dups = COVID_cases_2[COVID_cases_2.duplicated()]

In [14]:
COVID_cases_dups

Unnamed: 0,submission_date,state,tot_cases,new_case,tot_death,new_death


No duplicates

I am merging this dataset with mask mandate and stay at home orders data. That dataset starts 10 April. I am also only interested in the time before vaccines because I do not want to adjust for vaccines as another variable. The first vaccine was issued in the US on 14 Dec 2020, so I am ending on that date.

## Change date column format

In [15]:
# Change date column to datetime format so it behaves as a number
COVID_cases_2['submission_date'] = pd.to_datetime(COVID_cases_2['submission_date'])

In [16]:
COVID_cases_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47100 entries, 0 to 47099
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   submission_date  47100 non-null  datetime64[ns]
 1   state            47100 non-null  object        
 2   tot_cases        47100 non-null  int64         
 3   new_case         47100 non-null  int64         
 4   tot_death        47100 non-null  int64         
 5   new_death        47100 non-null  int64         
dtypes: datetime64[ns](1), int64(4), object(1)
memory usage: 2.2+ MB


In [17]:
# Rename date column for later merge
COVID_cases_3 = COVID_cases_2.rename(columns = {'submission_date' : 'date'})

In [18]:
COVID_cases_3.head(2)

Unnamed: 0,date,state,tot_cases,new_case,tot_death,new_death
0,2021-09-09,NC,1277752,7785,15703,69
1,2021-09-01,ND,118491,536,1562,1


## Remove unwanted values

In [19]:
# Check min and max values to see which ones to eliminate
COVID_cases_3['date'].max()

Timestamp('2022-03-16 00:00:00')

In [20]:
# Check min and max values to see which ones to eliminate
COVID_cases_3['date'].min()

Timestamp('2020-01-22 00:00:00')

I am only interested in the time before the vaccine was administered - 14 December 2020. I will include 14 days after, however, because that is the incubation period for COVID.

In [21]:
# Make subset of desired date values
COVID_cases_4 = COVID_cases_3[(COVID_cases_3['date'] >= '2020-04-10') & (COVID_cases_3['date'] <= '2020-12-28')]

In [22]:
# Check min and max values to check subset
COVID_cases_4['date'].max()

Timestamp('2020-12-28 00:00:00')

In [23]:
COVID_cases_4['date'].min()

Timestamp('2020-04-10 00:00:00')

In [24]:
# Value counts to see states included
COVID_cases_4['state'].value_counts(dropna = False)

VT     263
MI     263
OR     263
VA     263
WY     263
AR     263
MP     263
AS     263
HI     263
PR     263
AK     263
TX     263
MA     263
GA     263
FL     263
WV     263
PW     263
OK     263
CO     263
NYC    263
KS     263
UT     263
PA     263
IA     263
NM     263
OH     263
NY     263
SD     263
TN     263
RI     263
DC     263
KY     263
IL     263
NH     263
ND     263
GU     263
AL     263
CT     263
NE     263
MO     263
MS     263
NC     263
MD     263
WI     263
ID     263
ME     263
MN     263
CA     263
DE     263
IN     263
VI     263
MT     263
NV     263
WA     263
LA     263
FSM    263
AZ     263
NJ     263
SC     263
RMI    263
Name: state, dtype: int64

Only interested in 50 states and DC, take out others. 

In [25]:
COVID_cases_5 = COVID_cases_4[(COVID_cases_4.state != "RMI") & (COVID_cases_4.state != "FSM") & (COVID_cases_4.state != "GU") & (COVID_cases_4.state != "PW") & (COVID_cases_4.state != "VI")]

In [26]:
# Check omissions
COVID_cases_5['state'].value_counts(dropna = False)

VT     263
FL     263
VA     263
WY     263
AR     263
MP     263
AS     263
HI     263
PR     263
AK     263
TX     263
MA     263
GA     263
WV     263
RI     263
OK     263
CO     263
NYC    263
KS     263
UT     263
PA     263
IA     263
NM     263
OH     263
NY     263
SD     263
OR     263
DC     263
MI     263
IL     263
NH     263
ND     263
AL     263
CT     263
NE     263
MO     263
MS     263
NC     263
MD     263
WI     263
ID     263
ME     263
KY     263
CA     263
DE     263
IN     263
MT     263
NV     263
WA     263
LA     263
AZ     263
NJ     263
SC     263
MN     263
TN     263
Name: state, dtype: int64

## Check for Missing Values

In [27]:
# Check for missing values in home order dataframe
COVID_cases_5.isnull().sum()

date         0
state        0
tot_cases    0
new_case     0
tot_death    0
new_death    0
dtype: int64

No missing values.

## Check for mixed types

In [28]:
# Check for mixed types
for col in COVID_cases_5.columns.tolist():
    weird = (COVID_cases_5[[col]].applymap(type) != COVID_cases_5[[col]].iloc[0].apply(type)).any(axis = 1) 
    if len (COVID_cases_5[weird]) > 0:
        print (col)

No mixed types.

# Download cleaned data

In [29]:
#Download cleaned data
COVID_cases_5.to_pickle(os.path.join(path, 'COVID_cases_clean.pkl'))