## Import Libraries, Create Folder Path

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

In [2]:
# Create path to folder
path = r'/Users/caitlin/iCloud/Caitlin/COVID Data'

In [3]:
path

'/Users/caitlin/iCloud/Caitlin/COVID Data'

## Upload Data - Stay at Home Orders

In [4]:
# Import data file
COVID_home_order = pd.read_csv(os.path.join(path, 'COVID_home_orders.csv'), index_col = False)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [5]:
# Show all columns
pd.options.display.max_columns = None

In [6]:
# Examine shape
COVID_home_order.shape

(1677927, 11)

In [7]:
# Check info
COVID_home_order.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1677927 entries, 0 to 1677926
Data columns (total 11 columns):
 #   Column                             Non-Null Count    Dtype 
---  ------                             --------------    ----- 
 0   State_Tribe_Territory              1677927 non-null  object
 1   County_Name                        1677927 non-null  object
 2   FIPS_State                         1677927 non-null  int64 
 3   FIPS_County                        1677927 non-null  int64 
 4   date                               1677927 non-null  object
 5   Order_code                         1677927 non-null  int64 
 6   Stay_at_Home_Order_Recommendation  1461933 non-null  object
 7   Express_Preemption                 1448788 non-null  object
 8   Source_of_Action                   1461223 non-null  object
 9   URL                                721717 non-null   object
 10  Citation                           1461933 non-null  object
dtypes: int64(3), object(8)
memory usage: 

In [8]:
# Examine columns for home order data
COVID_home_order.head()

Unnamed: 0,State_Tribe_Territory,County_Name,FIPS_State,FIPS_County,date,Order_code,Stay_at_Home_Order_Recommendation,Express_Preemption,Source_of_Action,URL,Citation
0,AL,Autauga County,1,1,3/15/2020,7,,,,,
1,AL,Autauga County,1,1,3/16/2020,7,,,,,
2,AL,Autauga County,1,1,3/17/2020,7,,,,,
3,AL,Autauga County,1,1,3/18/2020,7,,,,,
4,AL,Autauga County,1,1,3/19/2020,7,,,,,


## Consistency Checks and Cleaning - Stay at Home Orders

### Drop Columns

In [9]:
# Drop unnecessary columns
## The source of the data is not important - the focus is on the actual close orders.
COVID_home_2 = COVID_home_order.drop(columns = ['Express_Preemption', 'Source_of_Action','URL','Citation'],)

In [10]:
# Check column drop
COVID_home_2.head()

Unnamed: 0,State_Tribe_Territory,County_Name,FIPS_State,FIPS_County,date,Order_code,Stay_at_Home_Order_Recommendation
0,AL,Autauga County,1,1,3/15/2020,7,
1,AL,Autauga County,1,1,3/16/2020,7,
2,AL,Autauga County,1,1,3/17/2020,7,
3,AL,Autauga County,1,1,3/18/2020,7,
4,AL,Autauga County,1,1,3/19/2020,7,


### Find duplicates

In [11]:
# Find home order duplicates
COVID_home_order_dups = COVID_home_2[COVID_home_2.duplicated()]

In [12]:
COVID_home_order_dups

Unnamed: 0,State_Tribe_Territory,County_Name,FIPS_State,FIPS_County,date,Order_code,Stay_at_Home_Order_Recommendation


In [13]:
COVID_home_order_dups.shape

(0, 7)

No duplicates found.

I am merging this dataset with mask mandate data. That dataset starts 10 April. I am also only interested in the time before vaccines because I do not want to adjust for vaccines as another variable. The first vaccine was issued in the US on 14 Dec 2020, so I am ending on that date.

### Change date column format

In [14]:
# Change date column to datetime format so it behaves as a number
COVID_home_2['date'] = pd.to_datetime(COVID_home_2['date'])

In [15]:
# Check change
COVID_home_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1677927 entries, 0 to 1677926
Data columns (total 7 columns):
 #   Column                             Non-Null Count    Dtype         
---  ------                             --------------    -----         
 0   State_Tribe_Territory              1677927 non-null  object        
 1   County_Name                        1677927 non-null  object        
 2   FIPS_State                         1677927 non-null  int64         
 3   FIPS_County                        1677927 non-null  int64         
 4   date                               1677927 non-null  datetime64[ns]
 5   Order_code                         1677927 non-null  int64         
 6   Stay_at_Home_Order_Recommendation  1461933 non-null  object        
dtypes: datetime64[ns](1), int64(3), object(3)
memory usage: 89.6+ MB


## Take out unwanted values

In [16]:
# Check min and max values to see which ones to eliminate
COVID_home_2['date'].max()

Timestamp('2021-08-15 00:00:00')

In [17]:
COVID_home_2['date'].min()

Timestamp('2020-03-15 00:00:00')

In [18]:
# Make subset of desired time values
COVID_home_3 = COVID_home_2[(COVID_home_2['date'] >= '2020-04-10') & (COVID_home_2['date'] < '2020-12-14')]

In [19]:
COVID_home_3['date'].max()

Timestamp('2020-12-13 00:00:00')

In [20]:
COVID_home_3['date'].min()

Timestamp('2020-04-10 00:00:00')

In [21]:
COVID_home_3.shape

(801784, 7)

Only interested in 50 states and DC, eliminate other values.

In [22]:
# Group by to index state names by FIPS code
COVID_home_3.groupby(['FIPS_State'])['State_Tribe_Territory'].value_counts()

FIPS_State  State_Tribe_Territory
1           AL                       16616
2           AK                        7192
4           AZ                        3720
5           AR                       18600
6           CA                       14384
8           CO                       15872
9           CT                        1984
10          DE                         744
11          DC                         248
12          FL                       16616
13          GA                       39432
15          HI                        1240
16          ID                       10912
17          IL                       25296
18          IN                       22816
19          IA                       24552
20          KS                       26040
21          KY                       29760
22          LA                       15872
23          ME                        3968
24          MD                        5952
25          MA                        3472
26          MI      

In [23]:
# Remove all but 50 states and DC
COVID_home_4 = COVID_home_3[(COVID_home_3.FIPS_State != 78) 
                            & (COVID_home_3.FIPS_State != 72) 
                            & (COVID_home_3.FIPS_State != 69)
                            & (COVID_home_3.FIPS_State != 66)
                            & (COVID_home_3.FIPS_State != 60)]

In [24]:
# Index by group by to confirm drop
COVID_home_4.groupby(['FIPS_State'])['State_Tribe_Territory'].value_counts(dropna = False)

FIPS_State  State_Tribe_Territory
1           AL                       16616
2           AK                        7192
4           AZ                        3720
5           AR                       18600
6           CA                       14384
8           CO                       15872
9           CT                        1984
10          DE                         744
11          DC                         248
12          FL                       16616
13          GA                       39432
15          HI                        1240
16          ID                       10912
17          IL                       25296
18          IN                       22816
19          IA                       24552
20          KS                       26040
21          KY                       29760
22          LA                       15872
23          ME                        3968
24          MD                        5952
25          MA                        3472
26          MI      

In [25]:
COVID_home_4.shape

(779216, 7)

The DF is taking up a lot of memory, so changing int64 where there are not as many values.

## Reduce memeory by changing data types

In [26]:
# Change int values to lower int values to save memory - order code only has 2 values so int8
COVID_home_4['Order_code'] = COVID_home_4['Order_code'].astype('int8')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  COVID_home_4['Order_code'] = COVID_home_4['Order_code'].astype('int8')


In [27]:
# Change int values to lower int values to save memory - FIPS only has ~50 values so int8
COVID_home_4['FIPS_State'] = COVID_home_4['FIPS_State'].astype('int8')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  COVID_home_4['FIPS_State'] = COVID_home_4['FIPS_State'].astype('int8')


In [28]:
#Check max value for county for int change
COVID_home_4['FIPS_County'].max()

840

In [29]:
# Change FIPS county to int 16 because that has max value over 30,000
COVID_home_4['FIPS_County'] = COVID_home_4['FIPS_County'].astype('int16')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  COVID_home_4['FIPS_County'] = COVID_home_4['FIPS_County'].astype('int16')


In [30]:
# Check final changes - significantly reduced memory
COVID_home_4.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 779216 entries, 26 to 917445
Data columns (total 7 columns):
 #   Column                             Non-Null Count   Dtype         
---  ------                             --------------   -----         
 0   State_Tribe_Territory              779216 non-null  object        
 1   County_Name                        779216 non-null  object        
 2   FIPS_State                         779216 non-null  int8          
 3   FIPS_County                        779216 non-null  int16         
 4   date                               779216 non-null  datetime64[ns]
 5   Order_code                         779216 non-null  int8          
 6   Stay_at_Home_Order_Recommendation  700626 non-null  object        
dtypes: datetime64[ns](1), int16(1), int8(2), object(3)
memory usage: 32.7+ MB


## Check for Missing Values

In [31]:
# Check for missing values in home order dataframe
COVID_home_4.isnull().sum()

State_Tribe_Territory                    0
County_Name                              0
FIPS_State                               0
FIPS_County                              0
date                                     0
Order_code                               0
Stay_at_Home_Order_Recommendation    78590
dtype: int64

In [32]:
#Create null subset
COVID_home_nan = COVID_home_4[COVID_home_4['Stay_at_Home_Order_Recommendation'].isnull() == True]

In [33]:
COVID_home_nan

Unnamed: 0,State_Tribe_Territory,County_Name,FIPS_State,FIPS_County,date,Order_code,Stay_at_Home_Order_Recommendation
19888,AR,Arkansas County,5,1,2020-07-10,7,
19889,AR,Arkansas County,5,1,2020-07-11,7,
20188,AR,Arkansas County,5,1,2020-07-12,7,
20213,AR,Arkansas County,5,1,2020-07-13,7,
20511,AR,Arkansas County,5,1,2020-07-14,7,
...,...,...,...,...,...,...,...
917441,WY,Weston County,56,45,2020-12-09,7,
917442,WY,Weston County,56,45,2020-12-10,7,
917443,WY,Weston County,56,45,2020-12-11,7,
917444,WY,Weston County,56,45,2020-12-12,7,


In [34]:
# Count values of nan values.
COVID_home_nan['Order_code'].value_counts(dropna=False)

7    78590
Name: Order_code, dtype: int64

The NaNs have been inaccurately labeled. They are no orders to stay home. Change to that code. They are all 7, which is stay at home, so that needs to be changed.

In [35]:
# Rename values from NaN to no order
COVID_home_4['Stay_at_Home_Order_Recommendation'] = COVID_home_4['Stay_at_Home_Order_Recommendation'].fillna('No order for individuals to stay home')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  COVID_home_4['Stay_at_Home_Order_Recommendation'] = COVID_home_4['Stay_at_Home_Order_Recommendation'].fillna('No order for individuals to stay home')


In [36]:
# Check values have been changed
COVID_home_4.isnull().sum()

State_Tribe_Territory                0
County_Name                          0
FIPS_State                           0
FIPS_County                          0
date                                 0
Order_code                           0
Stay_at_Home_Order_Recommendation    0
dtype: int64

## Check for mixed types

In [37]:
# Check for mixed types
for col in COVID_home_4.columns.tolist():
    weird = (COVID_home_4[[col]].applymap(type) != COVID_home_4[[col]].iloc[0].apply(type)).any(axis = 1) 
    if len (COVID_home_4[weird]) > 0:
        print (col)

It appears that through cleaning the data, mixed type was addressed.

In [38]:
#Check for Duplicates
COVID_home_dups = COVID_home_4[COVID_home_4.duplicated()]

In [39]:
COVID_home_dups

Unnamed: 0,State_Tribe_Territory,County_Name,FIPS_State,FIPS_County,date,Order_code,Stay_at_Home_Order_Recommendation


No duplicates

There is no numerical data for this dataset, so I am just cleaning and downloading it.

## Re-Name Columns for Later Merge

In [49]:
# Rename column for later merge
COVID_home_5 = COVID_home_4.rename(columns = {'Order_code' : 'order_code_home'})

In [51]:
COVID_home_5 = COVID_home_4.rename(columns = {'State_Tribe_Territory' : 'state'})

In [52]:
COVID_home_5.head()

Unnamed: 0,state,County_Name,FIPS_State,FIPS_County,date,order_code_home,Stay_at_Home_Order_Recommendation
26,AL,Autauga County,1,1,2020-04-10,1,Mandatory for all individuals
27,AL,Autauga County,1,1,2020-04-11,1,Mandatory for all individuals
28,AL,Autauga County,1,1,2020-04-12,1,Mandatory for all individuals
29,AL,Autauga County,1,1,2020-04-13,1,Mandatory for all individuals
30,AL,Autauga County,1,1,2020-04-14,1,Mandatory for all individuals


# Download Final Cleaned Data

In [53]:
#Download cleaned data
COVID_home_5.to_pickle(os.path.join(path, 'COVID_home_clean.pkl'))