# Step 3 - Prepare Data - Task 7 - Merge Dataset

## Import libraries

In [1]:
import pandas as pd
from datetime import date


## Set up environment flag

In [2]:
# Execution-environment flags: set exactly ONE of these to True. The cells that
# read and write CSV files choose their paths based on these flags.
using_Google_colab = False
using_Anaconda_on_Mac_or_Linux = True
using_Anaconda_on_windows = False

# Fail fast on a misconfiguration: with no flag set the load cells never assign
# their DataFrames, and with several set every enabled path is read in turn.
_enabled_environments = sum([using_Google_colab,
                             using_Anaconda_on_Mac_or_Linux,
                             using_Anaconda_on_windows])
if _enabled_environments != 1:
    raise ValueError('Set exactly one environment flag to True '
                     f'(currently {_enabled_environments} are set).')

## If using Google Colab, connect to Google Drive

In [3]:
# Mount Google Drive only when the Colab flag is set. The import stays inside
# the guard so the other environments never need the google.colab package.
if using_Google_colab:
    from google.colab import drive

    colab_mount_point = '/content/drive'
    drive.mount(colab_mount_point)

## PD 7.1 Activity 1 - Load county-level COVID cases data

In [5]:
# Location of the confirmed-cases CSV for each supported environment, paired
# with the flag that enables it. Every enabled entry is read in order, so the
# last enabled one is the frame that remains.
cases_csv_candidates = (
    (using_Google_colab, '/content/drive/MyDrive/COVID_Project/output/confirmed_cases_by_county.csv'),
    (using_Anaconda_on_Mac_or_Linux, '../output/confirmed_cases_by_county.csv'),
    (using_Anaconda_on_windows, r'../output/confirmed_cases_by_county.csv'),
)
for cases_env_enabled, cases_csv_path in cases_csv_candidates:
    if cases_env_enabled:
        df_sorted_confirmed_cases_county = pd.read_csv(cases_csv_path)

# Cast the FIPS codes to int and Date to datetime64; these columns are the
# keys of the cases/deaths merge later in the notebook.
cases_key_dtypes = {'countyFIPS': int, 'stateFIPS': int, 'Date': 'datetime64[ns]'}
df_sorted_confirmed_cases_county = df_sorted_confirmed_cases_county.astype(cases_key_dtypes)
df_sorted_confirmed_cases_county

Unnamed: 0.1,Unnamed: 0,State,stateFIPS,County Name,countyFIPS,Date,Total Cases,Incremental Cases,cases moving_avg
0,0,AL,1,Statewide Unallocated,0,2020-01-22,0,,
1,3195,AL,1,Statewide Unallocated,0,2020-01-23,0,0.0,
2,6390,AL,1,Statewide Unallocated,0,2020-01-24,0,0.0,
3,9585,AL,1,Statewide Unallocated,0,2020-01-25,0,0.0,
4,12780,AL,1,Statewide Unallocated,0,2020-01-26,0,0.0,
...,...,...,...,...,...,...,...,...,...
1067125,1051479,CT,9,Windham County,9015,2020-12-16,3921,130.0,90.142857
1067126,1054674,CT,9,Windham County,9015,2020-12-17,3998,77.0,90.285714
1067127,1057869,CT,9,Windham County,9015,2020-12-18,4153,155.0,103.714286
1067128,1061064,CT,9,Windham County,9015,2020-12-19,4153,0.0,103.714286


## PD 7.1 Activity 2 - Load county-level COVID deaths data

In [6]:
# Location of the COVID-deaths CSV for each supported environment, paired with
# the flag that enables it. Every enabled entry is read in order, so the last
# enabled one is the frame that remains.
deaths_csv_candidates = (
    (using_Google_colab, '/content/drive/MyDrive/COVID_Project/output/covid_deaths_by_county.csv'),
    (using_Anaconda_on_Mac_or_Linux, '../output/covid_deaths_by_county.csv'),
    (using_Anaconda_on_windows, r'../output/covid_deaths_by_county.csv'),
)
for deaths_env_enabled, deaths_csv_path in deaths_csv_candidates:
    if deaths_env_enabled:
        df_sorted_covid_deaths_county = pd.read_csv(deaths_csv_path)

# Cast the FIPS codes to int and Date to datetime64; these columns are the
# keys of the cases/deaths merge later in the notebook.
deaths_key_dtypes = {'countyFIPS': int, 'stateFIPS': int, 'Date': 'datetime64[ns]'}
df_sorted_covid_deaths_county = df_sorted_covid_deaths_county.astype(deaths_key_dtypes)
df_sorted_covid_deaths_county

Unnamed: 0.1,Unnamed: 0,State,stateFIPS,County Name,countyFIPS,Date,Total Deaths,Inc Deaths,death moving_avg
0,0,AL,1,Statewide Unallocated,0,2020-01-22,0,,
1,68,AK,2,Statewide Unallocated,0,2020-01-22,0,0.0,
2,99,AZ,4,Statewide Unallocated,0,2020-01-22,0,0.0,
3,115,AR,5,Statewide Unallocated,0,2020-01-22,0,0.0,
4,191,CA,6,Statewide Unallocated,0,2020-01-22,0,0.0,
...,...,...,...,...,...,...,...,...,...
1067125,1051479,CT,9,Windham County,9015,2020-12-16,48,2.0,1.714286
1067126,1054674,CT,9,Windham County,9015,2020-12-17,51,3.0,1.571429
1067127,1057869,CT,9,Windham County,9015,2020-12-18,51,0.0,1.285714
1067128,1061064,CT,9,Windham County,9015,2020-12-19,51,0.0,1.285714


## PD 7.2 Activity 3 - Merge covid cases and deaths data for each county and date

In [7]:
# Inner-join cases and deaths on state, county and date. Columns that exist in
# both frames get the '_DROP' suffix on the deaths side and are filtered out,
# so only the cases-side copy of each shared column is kept.
cases_deaths_keys = ['stateFIPS','countyFIPS', 'Date']
cases_deaths_joined = df_sorted_confirmed_cases_county.merge(
    df_sorted_covid_deaths_county,
    how='inner',
    on=cases_deaths_keys,
    suffixes=('', '_DROP'),
)
df_partial_abt_by_county = cases_deaths_joined.filter(regex='^(?!.*_DROP)')
df_partial_abt_by_county

Unnamed: 0.1,Unnamed: 0,State,stateFIPS,County Name,countyFIPS,Date,Total Cases,Incremental Cases,cases moving_avg,Total Deaths,Inc Deaths,death moving_avg
0,0,AL,1,Statewide Unallocated,0,2020-01-22,0,,,0,,
1,3195,AL,1,Statewide Unallocated,0,2020-01-23,0,0.0,,0,0.0,0.000000
2,6390,AL,1,Statewide Unallocated,0,2020-01-24,0,0.0,,0,0.0,0.000000
3,9585,AL,1,Statewide Unallocated,0,2020-01-25,0,0.0,,0,0.0,0.000000
4,12780,AL,1,Statewide Unallocated,0,2020-01-26,0,0.0,,0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
1067125,1051479,CT,9,Windham County,9015,2020-12-16,3921,130.0,90.142857,48,2.0,1.714286
1067126,1054674,CT,9,Windham County,9015,2020-12-17,3998,77.0,90.285714,51,3.0,1.571429
1067127,1057869,CT,9,Windham County,9015,2020-12-18,4153,155.0,103.714286,51,0.0,1.285714
1067128,1061064,CT,9,Windham County,9015,2020-12-19,4153,0.0,103.714286,51,0.0,1.285714


## PD 7.3 Activity 4 - Load county-level Google social mobility data

In [8]:
# Read the 2020 US Google mobility report CSV from the location that matches
# the active environment flag.
if using_Google_colab:
    df_google_mobility_data = pd.read_csv('/content/drive/MyDrive/COVID_Project/input/Google/Region_Mobility_Report_CSVs/2020_US_Region_Mobility_Report.csv')
if using_Anaconda_on_Mac_or_Linux:
    df_google_mobility_data = pd.read_csv('../input/Google/Region_Mobility_Report_CSVs/2020_US_Region_Mobility_Report.csv')
if using_Anaconda_on_windows:
    # Forward slashes are valid on Windows. A backslash path in a normal string
    # literal would be mangled here: '\202' is parsed as an octal escape.
    df_google_mobility_data = pd.read_csv('../input/Google/Region_Mobility_Report_CSVs/2020_US_Region_Mobility_Report.csv')

# Parse the date column as datetime64 so it can be compared with and merged
# against the datetime 'Date' column of the cases/deaths table.
df_google_mobility_data = df_google_mobility_data.astype({'date': 'datetime64[ns]'})
df_google_mobility_data

Unnamed: 0,country_region_code,country_region,sub_region_1,sub_region_2,metro_area,iso_3166_2_code,census_fips_code,date,retail_and_recreation_percent_change_from_baseline,grocery_and_pharmacy_percent_change_from_baseline,parks_percent_change_from_baseline,transit_stations_percent_change_from_baseline,workplaces_percent_change_from_baseline,residential_percent_change_from_baseline
0,US,United States,,,,,,2020-02-15,6.0,2.0,15.0,3.0,2.0,-1.0
1,US,United States,,,,,,2020-02-16,7.0,1.0,16.0,2.0,0.0,-1.0
2,US,United States,,,,,,2020-02-17,6.0,0.0,28.0,-9.0,-24.0,5.0
3,US,United States,,,,,,2020-02-18,0.0,-1.0,6.0,1.0,0.0,1.0
4,US,United States,,,,,,2020-02-19,2.0,0.0,8.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
796331,US,United States,Wyoming,Weston County,,,56045.0,2020-12-18,,,,,-17.0,
796332,US,United States,Wyoming,Weston County,,,56045.0,2020-12-21,,,,,-39.0,
796333,US,United States,Wyoming,Weston County,,,56045.0,2020-12-22,,,,,-38.0,
796334,US,United States,Wyoming,Weston County,,,56045.0,2020-12-23,,,,,-48.0,


## PD 7.3 Activity 5 - Understand Google Mobility data

In [9]:
# Show the column labels of the Google mobility report.
df_google_mobility_data.columns


Index(['country_region_code', 'country_region', 'sub_region_1', 'sub_region_2',
       'metro_area', 'iso_3166_2_code', 'census_fips_code', 'date',
       'retail_and_recreation_percent_change_from_baseline',
       'grocery_and_pharmacy_percent_change_from_baseline',
       'parks_percent_change_from_baseline',
       'transit_stations_percent_change_from_baseline',
       'workplaces_percent_change_from_baseline',
       'residential_percent_change_from_baseline'],
      dtype='object')

In [20]:
# Non-null count per column for the Los Angeles County rows of the merged
# cases/deaths table.
la_rows_in_partial_abt = df_partial_abt_by_county['County Name'] == 'Los Angeles County'
df_partial_abt_by_county.loc[la_rows_in_partial_abt].count()

Unnamed: 0           334
State                334
stateFIPS            334
County Name          334
countyFIPS           334
Date                 334
Total Cases          334
Incremental Cases    333
cases moving_avg     327
Total Deaths         334
Inc Deaths           333
death moving_avg     327
dtype: int64

In [10]:
# Non-null count per column for the Los Angeles County rows of the Google
# mobility report.
la_rows_in_mobility = df_google_mobility_data['sub_region_2'] == 'Los Angeles County'
df_google_mobility_data.loc[la_rows_in_mobility].count()

country_region_code                                   315
country_region                                        315
sub_region_1                                          315
sub_region_2                                          315
metro_area                                              0
iso_3166_2_code                                         0
census_fips_code                                      315
date                                                  315
retail_and_recreation_percent_change_from_baseline    315
grocery_and_pharmacy_percent_change_from_baseline     315
parks_percent_change_from_baseline                    315
transit_stations_percent_change_from_baseline         315
workplaces_percent_change_from_baseline               315
residential_percent_change_from_baseline              315
dtype: int64

In [11]:
# Earliest date with mobility data for Los Angeles County.
la_mobility_rows_for_min = df_google_mobility_data['sub_region_2'] == 'Los Angeles County'
df_google_mobility_data.loc[la_mobility_rows_for_min, 'date'].min()

Timestamp('2020-02-15 00:00:00')

In [12]:
# Latest date with mobility data for Los Angeles County.
la_mobility_rows_for_max = df_google_mobility_data['sub_region_2'] == 'Los Angeles County'
df_google_mobility_data.loc[la_mobility_rows_for_max, 'date'].max()

Timestamp('2020-12-25 00:00:00')

In [22]:
# Earliest date in the merged cases/deaths table for Los Angeles County.
la_abt_rows_for_min = df_partial_abt_by_county['County Name'] == 'Los Angeles County'
df_partial_abt_by_county.loc[la_abt_rows_for_min, 'Date'].min()

Timestamp('2020-01-22 00:00:00')

In [23]:
# Latest date in the merged cases/deaths table for Los Angeles County.
la_abt_rows_for_max = df_partial_abt_by_county['County Name'] == 'Los Angeles County'
df_partial_abt_by_county.loc[la_abt_rows_for_max, 'Date'].max()

Timestamp('2020-12-20 00:00:00')

## PD 7.3 Activity 6 - Reformat Google Mobility data


In [13]:
# Keep only rows that carry a census FIPS code, then store that code as an
# integer instead of a float.
df_google_mobility_data_clean = (
    df_google_mobility_data
    .dropna(subset=['census_fips_code'])
    .astype({'census_fips_code': int})
)
df_google_mobility_data_clean


Unnamed: 0,country_region_code,country_region,sub_region_1,sub_region_2,metro_area,iso_3166_2_code,census_fips_code,date,retail_and_recreation_percent_change_from_baseline,grocery_and_pharmacy_percent_change_from_baseline,parks_percent_change_from_baseline,transit_stations_percent_change_from_baseline,workplaces_percent_change_from_baseline,residential_percent_change_from_baseline
630,US,United States,Alabama,Autauga County,,,1001,2020-02-15,5.0,7.0,,,-4.0,
631,US,United States,Alabama,Autauga County,,,1001,2020-02-16,0.0,1.0,-23.0,,-4.0,
632,US,United States,Alabama,Autauga County,,,1001,2020-02-17,8.0,0.0,,,-27.0,5.0
633,US,United States,Alabama,Autauga County,,,1001,2020-02-18,-2.0,0.0,,,2.0,0.0
634,US,United States,Alabama,Autauga County,,,1001,2020-02-19,-2.0,0.0,,,2.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
796331,US,United States,Wyoming,Weston County,,,56045,2020-12-18,,,,,-17.0,
796332,US,United States,Wyoming,Weston County,,,56045,2020-12-21,,,,,-39.0,
796333,US,United States,Wyoming,Weston County,,,56045,2020-12-22,,,,,-38.0,
796334,US,United States,Wyoming,Weston County,,,56045,2020-12-23,,,,,-48.0,


## PD 7.4 Activity 7 - Merge Covid Cases, deaths and Google Data

In [14]:
# Left-join the mobility metrics onto the cases/deaths table by county FIPS
# code and date, so every cases/deaths row is kept and rows without mobility
# data get NaN/NaT in the mobility columns.
# The cleaned mobility frame is used on the right side: it has no rows with a
# missing census_fips_code and stores the code as int, matching countyFIPS.
# Columns present in both frames get the '_DROP' suffix on the mobility side
# and are removed by the regex filter.
df_abt_by_county = pd.merge(df_partial_abt_by_county,
                            df_google_mobility_data_clean,
                            left_on=['countyFIPS', 'Date'],
                            right_on=['census_fips_code', 'date'],
                            suffixes=('', '_DROP'),
                            how='left').filter(regex='^(?!.*_DROP)')
df_abt_by_county

Unnamed: 0.1,Unnamed: 0,State,stateFIPS,County Name,countyFIPS,Date,Total Cases,Incremental Cases,cases moving_avg,Total Deaths,...,metro_area,iso_3166_2_code,census_fips_code,date,retail_and_recreation_percent_change_from_baseline,grocery_and_pharmacy_percent_change_from_baseline,parks_percent_change_from_baseline,transit_stations_percent_change_from_baseline,workplaces_percent_change_from_baseline,residential_percent_change_from_baseline
0,0,AL,1,Statewide Unallocated,0,2020-01-22,0,,,0,...,,,,NaT,,,,,,
1,3195,AL,1,Statewide Unallocated,0,2020-01-23,0,0.0,,0,...,,,,NaT,,,,,,
2,6390,AL,1,Statewide Unallocated,0,2020-01-24,0,0.0,,0,...,,,,NaT,,,,,,
3,9585,AL,1,Statewide Unallocated,0,2020-01-25,0,0.0,,0,...,,,,NaT,,,,,,
4,12780,AL,1,Statewide Unallocated,0,2020-01-26,0,0.0,,0,...,,,,NaT,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1067125,1051479,CT,9,Windham County,9015,2020-12-16,3921,130.0,90.142857,48,...,,,9015.0,2020-12-16,16.0,49.0,23.0,-26.0,-22.0,9.0
1067126,1054674,CT,9,Windham County,9015,2020-12-17,3998,77.0,90.285714,51,...,,,9015.0,2020-12-17,-57.0,-52.0,-49.0,-69.0,-68.0,30.0
1067127,1057869,CT,9,Windham County,9015,2020-12-18,4153,155.0,103.714286,51,...,,,9015.0,2020-12-18,-11.0,5.0,-20.0,-35.0,-24.0,13.0
1067128,1061064,CT,9,Windham County,9015,2020-12-19,4153,0.0,103.714286,51,...,,,9015.0,2020-12-19,0.0,16.0,,-32.0,1.0,6.0


## PD 7.4 Activity 8 - Save County Data as Analytics Base Table

In [15]:
# Write the county-level analytics base table to CSV at the location that
# matches the active environment flag.
if using_Google_colab:
    df_abt_by_county.to_csv('/content/drive/MyDrive/COVID_Project/output/abt_by_county.csv')
if using_Anaconda_on_Mac_or_Linux:
    df_abt_by_county.to_csv('../output/abt_by_county.csv')
if using_Anaconda_on_windows:
    # Forward slashes are valid on Windows. A backslash path in a normal string
    # literal would be mangled here: '\a' is the BEL control character.
    df_abt_by_county.to_csv('../output/abt_by_county.csv')