# Step 3 -  Prepare Data - Task 4 - Transform - CLASS ASSIGNMENT

This notebook provides the Python code for taking COVID total deaths at the county level and creating a state-level aggregation.  It demonstrates how you can transform the data, sort the data, and engineer new features.  Its results are stored in an intermediate file for the rest of the exercises.

This is the total-deaths counterpart of the total-cases notebook: students develop a notebook like this one as the class assignment, and this version is the corresponding notebook included in the answer section.

## Import Libraries

In [1]:
import pandas as pd
from datetime import date


## Set up Environment Flag

In [2]:
# Execution-environment switches.  Enable the one that matches where this
# notebook is running: they decide whether Google Drive gets mounted and
# which path the input CSV is read from.
using_Anaconda_on_windows = False
using_Anaconda_on_Mac_or_Linux = True
using_Google_colab = False

## Connect to Google Drive

This step is only executed if you have set the environment flag `using_Google_colab` to `True`.

In [3]:
# Mount Google Drive at /content/drive; skipped entirely unless the
# using_Google_colab flag is set.
if using_Google_colab:
    # Imported inside the branch, presumably because the google.colab
    # package only exists in the Colab runtime (confirm if reused elsewhere).
    from google.colab import drive
    drive.mount('/content/drive')

## PD4.4 -  Read file in your chosen environment

In [4]:
# Resolve the location of the county-level deaths CSV from the environment
# flags.  The checks are deliberately independent `if`s (not `elif`) so that,
# exactly as before, the last enabled flag wins when more than one is set.
deaths_csv_path = None
if using_Google_colab:
    deaths_csv_path = '/content/drive/MyDrive/COVID_Project/input/USA_Facts/covid_deaths_usafacts.csv'
if using_Anaconda_on_Mac_or_Linux:
    deaths_csv_path = '../input/USA_Facts/covid_deaths_usafacts.csv'
if using_Anaconda_on_windows:
    deaths_csv_path = r'..\input\USA_Facts\covid_deaths_usafacts.csv'

# Fail with an actionable message instead of a NameError on the next line
# when no environment flag has been enabled.
if deaths_csv_path is None:
    raise ValueError(
        'No environment flag is set: set one of using_Google_colab, '
        'using_Anaconda_on_Mac_or_Linux or using_Anaconda_on_windows to True.'
    )

# Read the file once, from the selected path only.
df_total_deaths = pd.read_csv(deaths_csv_path)
# FIPS codes are identifiers rather than quantities, so store them as strings.
df_total_deaths = df_total_deaths.astype({'countyFIPS': str, 'stateFIPS': str})
df_total_deaths

Unnamed: 0,countyFIPS,County Name,State,stateFIPS,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,12/11/20,12/12/20,12/13/20,12/14/20,12/15/20,12/16/20,12/17/20,12/18/20,12/19/20,12/20/20
0,0,Statewide Unallocated,AL,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1001,Autauga County,AL,1,0,0,0,0,0,0,...,41,41,41,41,43,43,43,44,44,44
2,1003,Baldwin County,AL,1,0,0,0,0,0,0,...,141,141,141,141,141,145,145,146,147,147
3,1005,Barbour County,AL,1,0,0,0,0,0,0,...,30,30,30,30,30,30,30,30,32,32
4,1007,Bibb County,AL,1,0,0,0,0,0,0,...,39,39,39,39,39,42,42,42,42,42
5,1009,Blount County,AL,1,0,0,0,0,0,0,...,47,47,47,47,49,50,52,52,54,54
6,1011,Bullock County,AL,1,0,0,0,0,0,0,...,20,20,20,20,20,20,20,20,20,20
7,1013,Butler County,AL,1,0,0,0,0,0,0,...,44,44,44,44,44,44,44,45,45,45
8,1015,Calhoun County,AL,1,0,0,0,0,0,0,...,129,129,129,129,130,135,136,137,140,140
9,1017,Chambers County,AL,1,0,0,0,0,0,0,...,55,55,55,57,57,57,58,58,60,60


## PD 4.4 -  Select data for LA County

In [5]:
# Keep only the row(s) whose county name is exactly 'Los Angeles County'.
is_la_county = df_total_deaths['County Name'].eq('Los Angeles County')
df_total_deaths_LA = df_total_deaths.loc[is_la_county]
df_total_deaths_LA

Unnamed: 0,countyFIPS,County Name,State,stateFIPS,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,12/11/20,12/12/20,12/13/20,12/14/20,12/15/20,12/16/20,12/17/20,12/18/20,12/19/20,12/20/20
211,6037,Los Angeles County,CA,6,0,0,0,0,0,0,...,8199,8269,8298,8345,8431,8568,8664,8757,8817,8875


## PD 4.4 - Transform LA County data to total deaths by date

In [6]:
# Reshape the wide LA County data (one column per day) into long format:
# one row per date, with the column label in 'Date' and the value in
# 'Total Deaths'.  The identifying columns are repeated on every row.
la_id_columns = ['State', 'stateFIPS', 'County Name', 'countyFIPS']
df_total_deaths_LA_by_date = df_total_deaths_LA.melt(
    id_vars=la_id_columns,
    var_name='Date',
    value_name='Total Deaths',
)
df_total_deaths_LA_by_date

Unnamed: 0,State,stateFIPS,County Name,countyFIPS,Date,Total Deaths
0,CA,6,Los Angeles County,6037,1/22/20,0
1,CA,6,Los Angeles County,6037,1/23/20,0
2,CA,6,Los Angeles County,6037,1/24/20,0
3,CA,6,Los Angeles County,6037,1/25/20,0
4,CA,6,Los Angeles County,6037,1/26/20,0
5,CA,6,Los Angeles County,6037,1/27/20,0
6,CA,6,Los Angeles County,6037,1/28/20,0
7,CA,6,Los Angeles County,6037,1/29/20,0
8,CA,6,Los Angeles County,6037,1/30/20,0
9,CA,6,Los Angeles County,6037,1/31/20,0


## PD 4.4 -  Transform all County data to total deaths by date

In [7]:
# Same wide-to-long reshape as for LA County, applied to every county:
# each (county, date) pair becomes its own row with the value stored in
# 'Total Deaths'.
county_id_columns = ['State', 'stateFIPS', 'County Name', 'countyFIPS']
df_total_county_deaths_by_date = df_total_deaths.melt(
    id_vars=county_id_columns,
    var_name='Date',
    value_name='Total Deaths',
)
df_total_county_deaths_by_date

Unnamed: 0,State,stateFIPS,County Name,countyFIPS,Date,Total Deaths
0,AL,1,Statewide Unallocated,0,1/22/20,0
1,AL,1,Autauga County,1001,1/22/20,0
2,AL,1,Baldwin County,1003,1/22/20,0
3,AL,1,Barbour County,1005,1/22/20,0
4,AL,1,Bibb County,1007,1/22/20,0
5,AL,1,Blount County,1009,1/22/20,0
6,AL,1,Bullock County,1011,1/22/20,0
7,AL,1,Butler County,1013,1/22/20,0
8,AL,1,Calhoun County,1015,1/22/20,0
9,AL,1,Chambers County,1017,1/22/20,0


## PD4.5 -  Group total deaths by state 


In [8]:
# Collapse the county rows into one row per state by summing every daily
# column.  numeric_only=True explicitly keeps the text columns
# ('County Name' and the string-typed 'countyFIPS') out of the aggregation.
# Since pandas 2.0 the default is numeric_only=False, under which those
# columns would be aggregated as well and would then be melted as if they
# were dates in the next step.
df_total_deaths_by_state = (
    df_total_deaths
    .groupby(['State', 'stateFIPS'])
    .sum(numeric_only=True)
    .reset_index()
)
df_total_deaths_by_state

Unnamed: 0,State,stateFIPS,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,...,12/11/20,12/12/20,12/13/20,12/14/20,12/15/20,12/16/20,12/17/20,12/18/20,12/19/20,12/20/20
0,AK,2,0,0,0,0,0,0,0,0,...,149,167,167,167,170,172,174,174,175,175
1,AL,1,0,0,0,0,0,0,0,0,...,4078,4078,4078,4098,4123,4197,4253,4296,4389,4389
2,AR,5,0,0,0,0,0,0,0,0,...,2874,2910,2944,2989,3016,3073,3112,3139,3191,3237
3,AZ,4,0,0,0,0,0,0,0,0,...,7245,7322,7357,7358,7422,7530,7677,7819,7937,7971
4,CA,6,0,0,0,0,0,0,0,0,...,20849,20970,21027,21183,21466,21855,22159,22431,22585,22670
5,CO,8,0,0,0,0,0,0,0,0,...,3839,3864,3949,3961,4077,4135,4203,4259,4339,4368
6,CT,9,0,0,0,0,0,0,0,0,...,5363,5363,5363,5444,5466,5506,5552,5581,5581,5581
7,DC,11,0,0,0,0,0,0,0,0,...,709,713,715,716,720,720,725,728,730,737
8,DE,10,0,0,0,0,0,0,0,0,...,807,815,816,816,826,833,845,854,858,862
9,FL,12,0,0,0,0,0,0,0,0,...,19714,19785,19866,19998,20073,20195,20292,20401,20473,20568


## PD 4.5 -  Transform state total deaths to total deaths by date


In [9]:
# Reshape the per-state totals from wide (one column per day) to long:
# one row per (state, date) with the summed value in 'Total Deaths'.
state_id_columns = ['State', 'stateFIPS']
df_total_deaths_by_state_by_date = df_total_deaths_by_state.melt(
    id_vars=state_id_columns,
    var_name='Date',
    value_name='Total Deaths',
)
df_total_deaths_by_state_by_date

Unnamed: 0,State,stateFIPS,Date,Total Deaths
0,AK,2,1/22/20,0
1,AL,1,1/22/20,0
2,AR,5,1/22/20,0
3,AZ,4,1/22/20,0
4,CA,6,1/22/20,0
5,CO,8,1/22/20,0
6,CT,9,1/22/20,0
7,DC,11,1/22/20,0
8,DE,10,1/22/20,0
9,FL,12,1/22/20,0
