# Step 3 - Prepare Data - Task 7 - Merge Dataset

## Import libraries

In [1]:
import os
import pandas as pd
from datetime import date


## Set up environment flag

In [2]:
# Runtime environment switches. Set the one that matches where this notebook
# is running to True.
using_Google_colab = False
using_Anaconda_on_Mac_or_Linux = True
using_Anaconda_on_windows = False

# Fail fast with a clear message: with every flag False, dir_input and
# dir_output would never be defined and a later cell would stop with a
# NameError that does not point back to this configuration.
if not (using_Google_colab or using_Anaconda_on_Mac_or_Linux or using_Anaconda_on_windows):
    raise ValueError(
        "No environment selected: set using_Google_colab, "
        "using_Anaconda_on_Mac_or_Linux or using_Anaconda_on_windows to True."
    )

# Input/output folders for the selected environment. The checks are
# independent, so if several flags are True the last matching one wins.
if using_Google_colab:
    dir_input = "/content/drive/MyDrive/COVID_Project/input"
    dir_output = "/content/drive/MyDrive/COVID_Project/output"
if using_Anaconda_on_Mac_or_Linux:
    dir_input = "../input"
    dir_output = "../output"
if using_Anaconda_on_windows:
    dir_input = r"..\input"
    dir_output = r"..\output"

## If using Google Colab, connect to Google Drive

In [3]:
if using_Google_colab:
    # Imported only on this branch, i.e. only when the Colab flag is set.
    from google.colab import drive

    # The Colab directories configured earlier live under this mount point.
    colab_mount_point = '/content/drive'
    drive.mount(colab_mount_point)


## PD 7.1 Activity 1 - Upload county level covid cases data by county

In [4]:
# Read the county-level confirmed-cases table from the output directory and
# cast the columns later used as merge keys (FIPS codes to int, Date to datetime).
confirmed_cases_csv = os.path.join(dir_output, 'confirmed_cases_by_county.csv')
confirmed_cases_dtypes = {
    'countyFIPS': int,
    'StateFIPS': int,
    'Date': 'datetime64[ns]',
}
df_sorted_confirmed_cases_county = pd.read_csv(confirmed_cases_csv).astype(confirmed_cases_dtypes)
df_sorted_confirmed_cases_county

Unnamed: 0.1,Unnamed: 0,State,StateFIPS,County Name,countyFIPS,Date,Total Cases,Incremental Cases,cases moving_avg
0,0,AL,1,Statewide Unallocated,0,2020-01-22,0,,
1,3193,AL,1,Statewide Unallocated,0,2020-01-23,0,0.0,
2,6386,AL,1,Statewide Unallocated,0,2020-01-24,0,0.0,
3,9579,AL,1,Statewide Unallocated,0,2020-01-25,0,0.0,
4,12772,AL,1,Statewide Unallocated,0,2020-01-26,0,0.0,
...,...,...,...,...,...,...,...,...,...
2353236,2337598,CT,9,Windham County,9015,2022-01-23,23067,0.0,201.428571
2353237,2340791,CT,9,Windham County,9015,2022-01-24,23620,553.0,178.000000
2353238,2343984,CT,9,Windham County,9015,2022-01-25,23811,191.0,186.714286
2353239,2347177,CT,9,Windham County,9015,2022-01-26,23984,173.0,184.571429


## PD 7.1 Activity 2 - Upload county level covid deaths by county

In [5]:
# Read the county-level deaths table from the output directory and cast the
# columns later used as merge keys (FIPS codes to int, Date to datetime).
covid_deaths_csv = os.path.join(dir_output, 'covid_deaths_by_county.csv')
covid_deaths_dtypes = {
    'countyFIPS': int,
    'StateFIPS': int,
    'Date': 'datetime64[ns]',
}
df_sorted_covid_deaths_county = pd.read_csv(covid_deaths_csv).astype(covid_deaths_dtypes)
df_sorted_covid_deaths_county

Unnamed: 0.1,Unnamed: 0,State,StateFIPS,County Name,countyFIPS,Date,Total Deaths,Inc Deaths,death moving_avg
0,0,AL,1,Statewide Unallocated,0,2020-01-22,0,,
1,68,AK,2,Statewide Unallocated,0,2020-01-22,0,0.0,
2,98,AZ,4,Statewide Unallocated,0,2020-01-22,0,0.0,
3,114,AR,5,Statewide Unallocated,0,2020-01-22,0,0.0,
4,190,CA,6,Statewide Unallocated,0,2020-01-22,0,0.0,
...,...,...,...,...,...,...,...,...,...
2353236,2337598,CT,9,Windham County,9015,2022-01-23,265,0.0,1.000000
2353237,2340791,CT,9,Windham County,9015,2022-01-24,267,2.0,1.285714
2353238,2343984,CT,9,Windham County,9015,2022-01-25,268,1.0,1.428571
2353239,2347177,CT,9,Windham County,9015,2022-01-26,272,4.0,1.714286


## PD 7.2 Activity 3 - Merge covid cases and deaths data for each county and date

In [6]:
# Inner-join cases and deaths on state FIPS, county FIPS and date.
# Non-key columns present in both inputs get the '_DROP' suffix on the deaths
# side and are filtered out afterwards, so only the cases-side copy is kept.
df_partial_abt_by_county = (
    df_sorted_confirmed_cases_county
    .merge(
        df_sorted_covid_deaths_county,
        how='inner',
        on=['StateFIPS','countyFIPS', 'Date'],
        suffixes=('', '_DROP'),
    )
    .filter(regex='^(?!.*_DROP)')
)
df_partial_abt_by_county

Unnamed: 0.1,Unnamed: 0,State,StateFIPS,County Name,countyFIPS,Date,Total Cases,Incremental Cases,cases moving_avg,Total Deaths,Inc Deaths,death moving_avg
0,0,AL,1,Statewide Unallocated,0,2020-01-22,0,,,0,,
1,3193,AL,1,Statewide Unallocated,0,2020-01-23,0,0.0,,0,0.0,0.000000
2,6386,AL,1,Statewide Unallocated,0,2020-01-24,0,0.0,,0,0.0,0.000000
3,9579,AL,1,Statewide Unallocated,0,2020-01-25,0,0.0,,0,0.0,0.000000
4,12772,AL,1,Statewide Unallocated,0,2020-01-26,0,0.0,,0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
2353236,2337598,CT,9,Windham County,9015,2022-01-23,23067,0.0,201.428571,265,0.0,1.000000
2353237,2340791,CT,9,Windham County,9015,2022-01-24,23620,553.0,178.000000,267,2.0,1.285714
2353238,2343984,CT,9,Windham County,9015,2022-01-25,23811,191.0,186.714286,268,1.0,1.428571
2353239,2347177,CT,9,Windham County,9015,2022-01-26,23984,173.0,184.571429,272,4.0,1.714286


## PD 7.3 Activity 4 - Upload county level Google social mobility data

In [7]:
# Read the 2020 US Google mobility report from the input directory and turn
# its 'date' column into datetimes.
mobility_report_csv = os.path.join(
    dir_input,
    'Google',
    'Region_Mobility_Report_CSVs',
    '2020_US_Region_Mobility_Report.csv',
)
df_google_mobility_data = pd.read_csv(mobility_report_csv).astype({'date': 'datetime64[ns]'})
df_google_mobility_data

Unnamed: 0,country_region_code,country_region,sub_region_1,sub_region_2,metro_area,iso_3166_2_code,census_fips_code,place_id,date,retail_and_recreation_percent_change_from_baseline,grocery_and_pharmacy_percent_change_from_baseline,parks_percent_change_from_baseline,transit_stations_percent_change_from_baseline,workplaces_percent_change_from_baseline,residential_percent_change_from_baseline
0,US,United States,,,,,,ChIJCzYy5IS16lQRQrfeQ5K5Oxw,2020-02-15,6.0,2.0,15.0,3.0,2.0,-1.0
1,US,United States,,,,,,ChIJCzYy5IS16lQRQrfeQ5K5Oxw,2020-02-16,7.0,1.0,16.0,2.0,0.0,-1.0
2,US,United States,,,,,,ChIJCzYy5IS16lQRQrfeQ5K5Oxw,2020-02-17,6.0,0.0,28.0,-9.0,-24.0,5.0
3,US,United States,,,,,,ChIJCzYy5IS16lQRQrfeQ5K5Oxw,2020-02-18,0.0,-1.0,6.0,1.0,0.0,1.0
4,US,United States,,,,,,ChIJCzYy5IS16lQRQrfeQ5K5Oxw,2020-02-19,2.0,0.0,8.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
812060,US,United States,Wyoming,Weston County,,,56045.0,ChIJd4Rqhed3YocR7ubT5-HgoJg,2020-12-24,,,,,-56.0,
812061,US,United States,Wyoming,Weston County,,,56045.0,ChIJd4Rqhed3YocR7ubT5-HgoJg,2020-12-28,,,,,-40.0,
812062,US,United States,Wyoming,Weston County,,,56045.0,ChIJd4Rqhed3YocR7ubT5-HgoJg,2020-12-29,,,,,-43.0,
812063,US,United States,Wyoming,Weston County,,,56045.0,ChIJd4Rqhed3YocR7ubT5-HgoJg,2020-12-30,,,,,-40.0,


## PD 7.3 Activity 5 - Understand Google Mobility data

In [8]:
# List the column names of the mobility table.
df_google_mobility_data.columns


Index(['country_region_code', 'country_region', 'sub_region_1', 'sub_region_2',
       'metro_area', 'iso_3166_2_code', 'census_fips_code', 'place_id', 'date',
       'retail_and_recreation_percent_change_from_baseline',
       'grocery_and_pharmacy_percent_change_from_baseline',
       'parks_percent_change_from_baseline',
       'transit_stations_percent_change_from_baseline',
       'workplaces_percent_change_from_baseline',
       'residential_percent_change_from_baseline'],
      dtype='object')

In [9]:
# Non-null count per column for the Los Angeles rows of the cases/deaths table.
# The trailing space inside the name literal is part of the value being matched.
la_cases_mask = df_partial_abt_by_county['County Name'].eq('Los Angeles County ')
df_partial_abt_by_county.loc[la_cases_mask].count()

Unnamed: 0           737
State                737
StateFIPS            737
County Name          737
countyFIPS           737
Date                 737
Total Cases          737
Incremental Cases    736
cases moving_avg     730
Total Deaths         737
Inc Deaths           736
death moving_avg     730
dtype: int64

In [10]:
# Non-null count per column for the Los Angeles rows of the mobility table
# (here the county name is matched without a trailing space).
la_mobility_mask = df_google_mobility_data['sub_region_2'].eq('Los Angeles County')
df_google_mobility_data.loc[la_mobility_mask].count()

country_region_code                                   321
country_region                                        321
sub_region_1                                          321
sub_region_2                                          321
metro_area                                              0
iso_3166_2_code                                         0
census_fips_code                                      321
place_id                                              321
date                                                  321
retail_and_recreation_percent_change_from_baseline    321
grocery_and_pharmacy_percent_change_from_baseline     321
parks_percent_change_from_baseline                    321
transit_stations_percent_change_from_baseline         321
workplaces_percent_change_from_baseline               321
residential_percent_change_from_baseline              321
dtype: int64

In [11]:
# Earliest mobility date available for Los Angeles County.
df_google_mobility_data.loc[
    df_google_mobility_data['sub_region_2'] == 'Los Angeles County', 'date'
].min()

Timestamp('2020-02-15 00:00:00')

In [12]:
# Latest mobility date available for Los Angeles County.
df_google_mobility_data.loc[
    df_google_mobility_data['sub_region_2'] == 'Los Angeles County', 'date'
].max()

Timestamp('2020-12-31 00:00:00')

In [15]:
# Earliest date in the cases/deaths table for Los Angeles County
# (the name literal includes a trailing space).
df_partial_abt_by_county.loc[
    df_partial_abt_by_county['County Name'] == 'Los Angeles County ', 'Date'
].min()

Timestamp('2020-01-22 00:00:00')

In [16]:
# Latest date in the cases/deaths table for Los Angeles County
# (the name literal includes a trailing space).
df_partial_abt_by_county.loc[
    df_partial_abt_by_county['County Name'] == 'Los Angeles County ', 'Date'
].max()

Timestamp('2022-01-27 00:00:00')

## PD 7.3 Activity 6 - Reformat Google Mobility data


In [17]:
# Keep only mobility rows that carry a county FIPS code, then store that code
# as an integer.
df_google_mobility_data_clean = (
    df_google_mobility_data
    .dropna(subset=['census_fips_code'])
    .astype({'census_fips_code': int})
)
df_google_mobility_data_clean


Unnamed: 0,country_region_code,country_region,sub_region_1,sub_region_2,metro_area,iso_3166_2_code,census_fips_code,place_id,date,retail_and_recreation_percent_change_from_baseline,grocery_and_pharmacy_percent_change_from_baseline,parks_percent_change_from_baseline,transit_stations_percent_change_from_baseline,workplaces_percent_change_from_baseline,residential_percent_change_from_baseline
642,US,United States,Alabama,Autauga County,,,1001,ChIJg9z7ewWPjogRA_8QrB0va7o,2020-02-15,5.0,7.0,,,-4.0,
643,US,United States,Alabama,Autauga County,,,1001,ChIJg9z7ewWPjogRA_8QrB0va7o,2020-02-16,0.0,1.0,-23.0,,-4.0,
644,US,United States,Alabama,Autauga County,,,1001,ChIJg9z7ewWPjogRA_8QrB0va7o,2020-02-17,8.0,0.0,,,-27.0,5.0
645,US,United States,Alabama,Autauga County,,,1001,ChIJg9z7ewWPjogRA_8QrB0va7o,2020-02-18,-2.0,0.0,,,2.0,0.0
646,US,United States,Alabama,Autauga County,,,1001,ChIJg9z7ewWPjogRA_8QrB0va7o,2020-02-19,-2.0,0.0,,,2.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
812060,US,United States,Wyoming,Weston County,,,56045,ChIJd4Rqhed3YocR7ubT5-HgoJg,2020-12-24,,,,,-56.0,
812061,US,United States,Wyoming,Weston County,,,56045,ChIJd4Rqhed3YocR7ubT5-HgoJg,2020-12-28,,,,,-40.0,
812062,US,United States,Wyoming,Weston County,,,56045,ChIJd4Rqhed3YocR7ubT5-HgoJg,2020-12-29,,,,,-43.0,
812063,US,United States,Wyoming,Weston County,,,56045,ChIJd4Rqhed3YocR7ubT5-HgoJg,2020-12-30,,,,,-40.0,


## PD 7.4 Activity 7 - Merge Covid Cases, deaths and Google Data

In [18]:
# Attach Google mobility figures to every cases/deaths row (left join, so rows
# without a mobility match are kept and their mobility columns stay empty).
#
# The right-hand side is the cleaned mobility frame: rows without a county
# FIPS code have been dropped and census_fips_code is an integer, so the join
# key is compared against countyFIPS without NaN-bearing float keys.
# Right-hand columns whose names clash with the left side get the '_DROP'
# suffix and are filtered out afterwards.
df_abt_by_county = pd.merge(df_partial_abt_by_county,
                            df_google_mobility_data_clean,
                            left_on=['countyFIPS', 'Date'],
                            right_on=['census_fips_code', 'date'],
                            suffixes=('', '_DROP'),
                            how='left').filter(regex='^(?!.*_DROP)')

# A left join must not add rows; a larger result means the mobility table has
# more than one row for some (county FIPS, date) pair.
assert len(df_abt_by_county) == len(df_partial_abt_by_county), \
    "Mobility merge changed the row count: duplicate (census_fips_code, date) keys in mobility data"
df_abt_by_county

Unnamed: 0.1,Unnamed: 0,State,StateFIPS,County Name,countyFIPS,Date,Total Cases,Incremental Cases,cases moving_avg,Total Deaths,...,iso_3166_2_code,census_fips_code,place_id,date,retail_and_recreation_percent_change_from_baseline,grocery_and_pharmacy_percent_change_from_baseline,parks_percent_change_from_baseline,transit_stations_percent_change_from_baseline,workplaces_percent_change_from_baseline,residential_percent_change_from_baseline
0,0,AL,1,Statewide Unallocated,0,2020-01-22,0,,,0,...,,,,NaT,,,,,,
1,3193,AL,1,Statewide Unallocated,0,2020-01-23,0,0.0,,0,...,,,,NaT,,,,,,
2,6386,AL,1,Statewide Unallocated,0,2020-01-24,0,0.0,,0,...,,,,NaT,,,,,,
3,9579,AL,1,Statewide Unallocated,0,2020-01-25,0,0.0,,0,...,,,,NaT,,,,,,
4,12772,AL,1,Statewide Unallocated,0,2020-01-26,0,0.0,,0,...,,,,NaT,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2353236,2337598,CT,9,Windham County,9015,2022-01-23,23067,0.0,201.428571,265,...,,,,NaT,,,,,,
2353237,2340791,CT,9,Windham County,9015,2022-01-24,23620,553.0,178.000000,267,...,,,,NaT,,,,,,
2353238,2343984,CT,9,Windham County,9015,2022-01-25,23811,191.0,186.714286,268,...,,,,NaT,,,,,,
2353239,2347177,CT,9,Windham County,9015,2022-01-26,23984,173.0,184.571429,272,...,,,,NaT,,,,,,


## PD 7.4 Activity 8 - Save County Data as Analytics Base Table

In [19]:
# Write the full merged table to the output directory.
abt_by_county_csv = os.path.join(dir_output, 'abt_by_county.csv')
df_abt_by_county.to_csv(abt_by_county_csv)

In [20]:
# Select Los Angeles County by its county FIPS code (6037) instead of by name.
# The name-based filter only matched because the stored county name ends with
# a space; the FIPS code does not depend on name formatting and mirrors how
# the Orange County subset is selected.
df_abt_LA_county = df_abt_by_county[df_abt_by_county['countyFIPS'] == 6037]
df_abt_LA_county

Unnamed: 0.1,Unnamed: 0,State,StateFIPS,County Name,countyFIPS,Date,Total Cases,Incremental Cases,cases moving_avg,Total Deaths,...,iso_3166_2_code,census_fips_code,place_id,date,retail_and_recreation_percent_change_from_baseline,grocery_and_pharmacy_percent_change_from_baseline,parks_percent_change_from_baseline,transit_stations_percent_change_from_baseline,workplaces_percent_change_from_baseline,residential_percent_change_from_baseline
2269223,209,CA,6,Los Angeles County,6037,2020-01-22,375,,,0,...,,,,NaT,,,,,,
2269224,3402,CA,6,Los Angeles County,6037,2020-01-23,379,4.0,,0,...,,,,NaT,,,,,,
2269225,6595,CA,6,Los Angeles County,6037,2020-01-24,382,3.0,,0,...,,,,NaT,,,,,,
2269226,9788,CA,6,Los Angeles County,6037,2020-01-25,384,2.0,,0,...,,,,NaT,,,,,,
2269227,12981,CA,6,Los Angeles County,6037,2020-01-26,385,1.0,,0,...,,,,NaT,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2269955,2337485,CA,6,Los Angeles County,6037,2022-01-23,2430653,40171.0,26989.142857,28276,...,,,,NaT,,,,,,
2269956,2340678,CA,6,Los Angeles County,6037,2022-01-24,2453693,23040.0,26929.285714,28433,...,,,,NaT,,,,,,
2269957,2343871,CA,6,Los Angeles County,6037,2022-01-25,2468026,14333.0,27376.857143,28458,...,,,,NaT,,,,,,
2269958,2347064,CA,6,Los Angeles County,6037,2022-01-26,2472960,4934.0,18528.428571,28463,...,,,,NaT,,,,,,


In [21]:
# Write the Los Angeles County subset to the output directory.
abt_la_county_csv = os.path.join(dir_output, 'abt_LA_county.csv')
df_abt_LA_county.to_csv(abt_la_county_csv)

In [23]:
# Rows of the merged table whose county FIPS code is 6059.
orange_county_mask = df_abt_by_county['countyFIPS'].eq(6059)
df_abt_Orange_county_CA = df_abt_by_county.loc[orange_county_mask]
df_abt_Orange_county_CA

Unnamed: 0.1,Unnamed: 0,State,StateFIPS,County Name,countyFIPS,Date,Total Cases,Incremental Cases,cases moving_avg,Total Deaths,...,iso_3166_2_code,census_fips_code,place_id,date,retail_and_recreation_percent_change_from_baseline,grocery_and_pharmacy_percent_change_from_baseline,parks_percent_change_from_baseline,transit_stations_percent_change_from_baseline,workplaces_percent_change_from_baseline,residential_percent_change_from_baseline
2277330,220,CA,6,Orange County,6059,2020-01-22,8,,,0,...,,,,NaT,,,,,,
2277331,3413,CA,6,Orange County,6059,2020-01-23,8,0.0,,0,...,,,,NaT,,,,,,
2277332,6606,CA,6,Orange County,6059,2020-01-24,9,1.0,,0,...,,,,NaT,,,,,,
2277333,9799,CA,6,Orange County,6059,2020-01-25,9,0.0,,0,...,,,,NaT,,,,,,
2277334,12992,CA,6,Orange County,6059,2020-01-26,9,0.0,,0,...,,,,NaT,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2278062,2337496,CA,6,Orange County,6059,2022-01-23,503296,2883.0,5496.285714,5978,...,,,,NaT,,,,,,
2278063,2340689,CA,6,Orange County,6059,2022-01-24,507671,4375.0,4701.142857,6035,...,,,,NaT,,,,,,
2278064,2343882,CA,6,Orange County,6059,2022-01-25,509881,2210.0,4562.714286,6035,...,,,,NaT,,,,,,
2278065,2347075,CA,6,Orange County,6059,2022-01-26,510999,1118.0,3245.000000,6035,...,,,,NaT,,,,,,


In [24]:
# Write the Orange County subset to the output directory.
abt_orange_county_csv = os.path.join(dir_output, 'abt_Orange_county.csv')
df_abt_Orange_county_CA.to_csv(abt_orange_county_csv)