## Import Libraries, Create Folder Path

In [2]:
# Import libraries
import pandas as pd
import numpy as np
import os

In [3]:
# Create path to folder
path = r'/Users/caitlin/iCloud/Caitlin/COVID Data'

In [4]:
path

'/Users/caitlin/iCloud/Caitlin/COVID Data'

## Download Data - Mask Mandates

In [5]:
# Download mask mandate data
COVID_mask_order = pd.read_csv(os.path.join(path, 'COVID_mask_mandates.csv'))

In [6]:
# Examine shape
COVID_mask_order.shape

(1593869, 10)

In [7]:
# Examine columns for mask mandate data
COVID_mask_order.head()

Unnamed: 0,State_Tribe_Territory,County_Name,FIPS_State,FIPS_County,date,order_code,Face_Masks_Required_in_Public,Source_of_Action,URL,Citation
0,AL,Autauga County,1,1,4/10/2020,2,,,,
1,AL,Autauga County,1,1,4/11/2020,2,,,,
2,AL,Autauga County,1,1,4/12/2020,2,,,,
3,AL,Autauga County,1,1,4/13/2020,2,,,,
4,AL,Autauga County,1,1,4/14/2020,2,,,,


## Consistency Checks and Cleaning - Mask Mandates

In [8]:
# Check info
COVID_mask_order.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1593869 entries, 0 to 1593868
Data columns (total 10 columns):
 #   Column                         Non-Null Count    Dtype 
---  ------                         --------------    ----- 
 0   State_Tribe_Territory          1593869 non-null  object
 1   County_Name                    1593869 non-null  object
 2   FIPS_State                     1593869 non-null  int64 
 3   FIPS_County                    1593869 non-null  int64 
 4   date                           1593869 non-null  object
 5   order_code                     1593869 non-null  int64 
 6   Face_Masks_Required_in_Public  987555 non-null   object
 7   Source_of_Action               987555 non-null   object
 8   URL                            942295 non-null   object
 9   Citation                       977273 non-null   object
dtypes: int64(3), object(7)
memory usage: 121.6+ MB


## Drop Columns

In [9]:
# Drop unnecessary columns
## The source of the data is not important - the focus is on the actual close orders.
COVID_mask_order_2 = COVID_mask_order.drop(columns = ['Source_of_Action','URL','Citation'],)

In [10]:
# Check column drop
COVID_mask_order_2.head()

Unnamed: 0,State_Tribe_Territory,County_Name,FIPS_State,FIPS_County,date,order_code,Face_Masks_Required_in_Public
0,AL,Autauga County,1,1,4/10/2020,2,
1,AL,Autauga County,1,1,4/11/2020,2,
2,AL,Autauga County,1,1,4/12/2020,2,
3,AL,Autauga County,1,1,4/13/2020,2,
4,AL,Autauga County,1,1,4/14/2020,2,


## Find Duplicates

In [11]:
# Find mask mandate duplicates
COVID_mask_order_dups = COVID_mask_order_2[COVID_mask_order_2.duplicated()]

In [12]:
COVID_mask_order_dups

Unnamed: 0,State_Tribe_Territory,County_Name,FIPS_State,FIPS_County,date,order_code,Face_Masks_Required_in_Public


In [13]:
COVID_mask_order_dups.shape

(0, 7)

No duplicates

## Change Data Type for Date

In [14]:
# Change date column to datetime format so it behaves as a number
COVID_mask_order_2['date'] = pd.to_datetime(COVID_mask_order_2['date'])

In [15]:
COVID_mask_order_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1593869 entries, 0 to 1593868
Data columns (total 7 columns):
 #   Column                         Non-Null Count    Dtype         
---  ------                         --------------    -----         
 0   State_Tribe_Territory          1593869 non-null  object        
 1   County_Name                    1593869 non-null  object        
 2   FIPS_State                     1593869 non-null  int64         
 3   FIPS_County                    1593869 non-null  int64         
 4   date                           1593869 non-null  datetime64[ns]
 5   order_code                     1593869 non-null  int64         
 6   Face_Masks_Required_in_Public  987555 non-null   object        
dtypes: datetime64[ns](1), int64(3), object(3)
memory usage: 85.1+ MB


## Eliminate unwanted values

In [16]:
# Check min and max values to see which ones to eliminate
COVID_mask_order_2['date'].max()

Timestamp('2021-08-15 00:00:00')

In [17]:
COVID_mask_order_2['date'].min()

Timestamp('2020-04-10 00:00:00')

I am only interested in the time before vaccines (14 Dec) so eliminating other values.

In [18]:
# Make subset of desired time values
COVID_mask_3 = COVID_mask_order_2[(COVID_mask_order_2['date'] < '2020-12-14')]

In [19]:
COVID_mask_3['date'].max()

Timestamp('2020-12-13 00:00:00')

In [20]:
# Group by to index state names by FIPS code
COVID_mask_3.groupby(['FIPS_State'])['State_Tribe_Territory'].value_counts()

FIPS_State  State_Tribe_Territory
1           AL                       16616
2           AK                        7192
4           AZ                        3720
5           AR                       18600
6           CA                       14384
8           CO                       15872
9           CT                        1984
10          DE                         744
11          DC                         248
12          FL                       16616
13          GA                       39432
15          HI                        1240
16          ID                       10912
17          IL                       25296
18          IN                       22816
19          IA                       24552
20          KS                       26040
21          KY                       29760
22          LA                       15872
23          ME                        3968
24          MD                        5952
25          MA                        3472
26          MI      

In [21]:
# Remove all but 50 states and DC
COVID_mask_4 = COVID_mask_3[(COVID_mask_3.FIPS_State != 78) 
                            & (COVID_mask_3.FIPS_State != 72) 
                            & (COVID_mask_3.FIPS_State != 69)
                            & (COVID_mask_3.FIPS_State != 66)
                            & (COVID_mask_3.FIPS_State != 60)]

In [22]:
# Index by group by to confirm drop
COVID_mask_4.groupby(['FIPS_State'])['State_Tribe_Territory'].value_counts()

FIPS_State  State_Tribe_Territory
1           AL                       16616
2           AK                        7192
4           AZ                        3720
5           AR                       18600
6           CA                       14384
8           CO                       15872
9           CT                        1984
10          DE                         744
11          DC                         248
12          FL                       16616
13          GA                       39432
15          HI                        1240
16          ID                       10912
17          IL                       25296
18          IN                       22816
19          IA                       24552
20          KS                       26040
21          KY                       29760
22          LA                       15872
23          ME                        3968
24          MD                        5952
25          MA                        3472
26          MI      

In [23]:
COVID_mask_4.shape

(779216, 7)

## Change data type to reduce memory

In [24]:
# Change int values to lower int values to save memory - order code only has 2 values so int8
COVID_mask_4['order_code'] = COVID_mask_4['order_code'].astype('int8')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  COVID_mask_4['order_code'] = COVID_mask_4['order_code'].astype('int8')


In [25]:
# Change int values to lower int values to save memory - FIPS only has ~50 values so int8
COVID_mask_4['FIPS_State'] = COVID_mask_4['FIPS_State'].astype('int8')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  COVID_mask_4['FIPS_State'] = COVID_mask_4['FIPS_State'].astype('int8')


In [26]:
#Check max value for county for int change
COVID_mask_4['FIPS_County'].max()

840

In [27]:
# Change FIPS county to int 16 because that has max value over 30,000
COVID_mask_4['FIPS_County'] = COVID_mask_4['FIPS_County'].astype('int16')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  COVID_mask_4['FIPS_County'] = COVID_mask_4['FIPS_County'].astype('int16')


In [28]:
# Check final changes - significantly reduced memory
COVID_mask_4.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 779216 entries, 0 to 835753
Data columns (total 7 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   State_Tribe_Territory          779216 non-null  object        
 1   County_Name                    779216 non-null  object        
 2   FIPS_State                     779216 non-null  int8          
 3   FIPS_County                    779216 non-null  int16         
 4   date                           779216 non-null  datetime64[ns]
 5   order_code                     779216 non-null  int8          
 6   Face_Masks_Required_in_Public  377210 non-null  object        
dtypes: datetime64[ns](1), int16(1), int8(2), object(3)
memory usage: 32.7+ MB


## Check for missing values

In [29]:
# Check for missing values in mask mandate dataframe
COVID_mask_4.isnull().sum()

State_Tribe_Territory                 0
County_Name                           0
FIPS_State                            0
FIPS_County                           0
date                                  0
order_code                            0
Face_Masks_Required_in_Public    402006
dtype: int64

In [30]:
# Value counts to check source of NaNs
COVID_mask_4['Face_Masks_Required_in_Public'].value_counts(dropna=False)

NaN    402006
Yes    370675
No       6535
Name: Face_Masks_Required_in_Public, dtype: int64

In [31]:
# Value counts to check source of NaNs
COVID_mask_4['order_code'].value_counts(dropna=False)

2    408541
1    370675
Name: order_code, dtype: int64

In [32]:
# Create dataframe for NaNs - masks
COVID_mask_nan = COVID_mask_4[COVID_mask_4['Face_Masks_Required_in_Public'].isnull() == True]

In [33]:
COVID_mask_nan['order_code'].value_counts(dropna=False)

2    402006
Name: order_code, dtype: int64

All of the NaN values in the mask mandate database have the same code - 2 - which is " both in retail businesses and in restaurants/food establishments" defined by the CDC. Since the 2s are equal to the NaN and NO, I am going to change the NaNs to reflect those values.

In [34]:
# Change NaN to No
COVID_mask_4['Face_Masks_Required_in_Public'] = COVID_mask_4['Face_Masks_Required_in_Public'].fillna('No')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  COVID_mask_4['Face_Masks_Required_in_Public'] = COVID_mask_4['Face_Masks_Required_in_Public'].fillna('No')


In [35]:
# Value counts to check source of NaNs
COVID_mask_4['Face_Masks_Required_in_Public'].value_counts(dropna=False)

No     408541
Yes    370675
Name: Face_Masks_Required_in_Public, dtype: int64

## Rename columns for later merge

In [46]:
# Changing column names
COVID_mask_4.rename(columns = {'State_Tribe_Territory' : 'state'}, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [49]:
COVID_mask_4.rename(columns = {'order_code' : 'order_code_mask'}, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [50]:
COVID_mask_4.head()

Unnamed: 0,state,County_Name,FIPS_State,FIPS_County,date,order_code_mask,Face_Masks_Required_in_Public
0,AL,Autauga County,1,1,2020-04-10,2,No
1,AL,Autauga County,1,1,2020-04-11,2,No
2,AL,Autauga County,1,1,2020-04-12,2,No
3,AL,Autauga County,1,1,2020-04-13,2,No
4,AL,Autauga County,1,1,2020-04-14,2,No


# Download dataset

In [51]:
#Download mask data
COVID_mask_4.to_pickle(os.path.join(path, 'COVID_mask_clean.pkl'))