# Data Preprocessing

In [1]:
#import packages
import pandas as pd
import numpy as np 

#Read datasets from CSSE github repo
confirmed = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv')
deaths = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv')
recoveries = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv')

In [2]:
#view the first 5 rows of each data frame
confirmed.head(5)
#print(deaths.head(5))
#print(recoveries.head(5))

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,8/10/21,8/11/21,8/12/21,8/13/21,8/14/21,8/15/21,8/16/21,8/17/21,8/18/21,8/19/21
0,,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,...,151013,151291,151563,151770,151770,151770,152142,152243,152363,152411
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,134201,134487,134761,135140,135550,135947,136147,136598,137075,137597
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,183347,184191,185042,185902,186655,187258,187968,188663,189384,190078
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,14873,14891,14908,14924,14924,14924,14954,14960,14976,14981
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,43890,43998,44174,44328,44534,44617,44739,44972,45175,45325


Data sources from CSSE are in wide format. To better analyze data, we need to transform from wide to long.

In [3]:
# Transform wide format to long format
confirmed = pd.melt(confirmed, id_vars=confirmed.columns[:4], value_vars = confirmed.columns[4:], var_name = 'date', value_name = 'confirmed')
deaths = pd.melt(deaths, id_vars=deaths.columns[:4], value_vars = deaths.columns[4:], var_name = 'date', value_name = 'deaths')
recoveries = pd.melt(recoveries, id_vars=recoveries.columns[:4], value_vars = recoveries.columns[4:], var_name = 'date', value_name = 'recoveries')

In [4]:
#view the first 5 rows of each data frame
confirmed.head(5)
#print(deaths.head(5))
#print(recoveries.head(5))

Unnamed: 0,Province/State,Country/Region,Lat,Long,date,confirmed
0,,Afghanistan,33.93911,67.709953,1/22/20,0
1,,Albania,41.1533,20.1683,1/22/20,0
2,,Algeria,28.0339,1.6596,1/22/20,0
3,,Andorra,42.5063,1.5218,1/22/20,0
4,,Angola,-11.2027,17.8739,1/22/20,0


## Group the Provinces/State into their Respective Countries

### Canada

The confirmed & deaths tables present Canada data by Province/State while the recoveries table only displays the total number of cases in the whole country. This conflict will need to addressed first before combining the tables to gether as unmatched join keys will be omitted.

In [5]:
#Summarize confirmed and deaths data by date
confirmed_canada = confirmed[confirmed['Country/Region'] == 'Canada'].groupby('date').sum()[['confirmed']]
deaths_canada = deaths[deaths['Country/Region'] == 'Canada'].groupby('date').sum()[['deaths']]
 
confirmed_canada.head()

Unnamed: 0_level_0,confirmed
date,Unnamed: 1_level_1
1/1/21,591149
1/10/21,666375
1/11/21,674624
1/12/21,681015
1/13/21,688097


In [6]:
#Extract columns from recoveries table
recoveries_canada = recoveries[recoveries['Country/Region'] == 'Canada']
canada_template = recoveries_canada[recoveries_canada.columns[:-1]].reset_index(drop=True)  
canada_template.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,date
0,,Canada,56.1304,-106.3468,1/22/20
1,,Canada,56.1304,-106.3468,1/23/20
2,,Canada,56.1304,-106.3468,1/24/20
3,,Canada,56.1304,-106.3468,1/25/20
4,,Canada,56.1304,-106.3468,1/26/20


reset_index() is a method to reset the index of a data frame.

Syntax: DataFrame.reset_index(level=None, drop=False, inplace=False, col_level=0, col_fill="")

drop: Boolean value, adds the replaced index column to the data if False

## Combining Tables

In [7]:
#Join aggregated confirmed and deaths data with extracted columns
confirmed_canada = canada_template.merge(confirmed_canada, how='inner', left_on='date', right_index=True)
deaths_canada = canada_template.merge(deaths_canada, how='inner', left_on='date', right_index=True)
confirmed_canada.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,date,confirmed
0,,Canada,56.1304,-106.3468,1/22/20,0
1,,Canada,56.1304,-106.3468,1/23/20,0
2,,Canada,56.1304,-106.3468,1/24/20,0
3,,Canada,56.1304,-106.3468,1/25/20,0
4,,Canada,56.1304,-106.3468,1/26/20,1


The left dataframe is canada_template, the right dataframe is confirmed_canada

left_on: label or list, or array-like 
* Column or index level names to join on in the left DataFrame.

right_index: bool, default False
* Use the index from the right DataFrame as the join key. Same caveats as left_index

In [8]:
#Add the aggregated data for Canada back to confirmed and deaths table
confirmed = confirmed[confirmed['Country/Region'] != 'Canada'].append(confirmed_canada)  #selected df without Canada
deaths = deaths[deaths['Country/Region'] != 'Canada'].append(deaths_canada)

confirmed.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,date,confirmed
0,,Afghanistan,33.93911,67.709953,1/22/20,0
1,,Albania,41.1533,20.1683,1/22/20,0
2,,Algeria,28.0339,1.6596,1/22/20,0
3,,Andorra,42.5063,1.5218,1/22/20,0
4,,Angola,-11.2027,17.8739,1/22/20,0


In [9]:
#Join confirmed, deaths and recoveries data together
data = confirmed.merge(deaths, how='inner', on=['Country/Region', 'Province/State','date']).merge(recoveries,how = 'inner', on =['Country/Region', 'Province/State','date'])
data['recoveries'] = data['recoveries'].astype(int)
#rearrange columns
data = data[['Province/State', 'Country/Region', 'date', 'Lat', 'Long', 'confirmed', 'deaths', 'recoveries']]
data.sample(5)

Unnamed: 0,Province/State,Country/Region,date,Lat,Long,confirmed,deaths,recoveries
2453,Faroe Islands,Denmark,1/31/20,61.8926,-6.9118,0,0,0
102743,,Morocco,2/15/21,31.7917,-7.0926,478595,8491,459549
92533,,South Sudan,1/7/21,6.877,31.307,3589,63,3131
19959,,Togo,4/6/20,8.6195,0.8248,58,3,23
98469,Mayotte,France,1/30/21,-12.8275,45.166244,8231,61,2964


## Population Data
One metric used in the Covid-19 dashboard is infection rate: $confirmed / population$. Countries' population is not available in the CSSE dataset so we will need to combine with another source. 

Source: [Tanu N Prabhu](https://www.kaggle.com/tanuprabhu/population-by-country-2020)

One very common problems when combining different data sources is unmatched value names.

In [10]:
#Read dataset
population = pd.read_csv('https://raw.githubusercontent.com/cpepingco/COVID-19-Dash-/main/population_by_country_2020.csv')
population.sample(5)

Unnamed: 0,Country (or dependency),Population (2020),Yearly Change,Net Change,Density (P/Km²),Land Area (Km²),Migrants (net),Fert. Rate,Med. Age,Urban Pop %,World Share
62,Chile,19144605,0.87 %,164163,26,743532,111708.0,1.7,35,85 %,0.25 %
102,Sierra Leone,8004158,2.10 %,163768,111,72180,-4200.0,4.3,19,43 %,0.10 %
213,Monaco,39290,0.71 %,278,26337,1,,N.A.,N.A.,N.A.,0.00 %
150,Latvia,1882408,-1.08 %,-20545,30,62200,-14837.0,1.7,44,69 %,0.02 %
190,Guam,169031,0.89 %,1481,313,540,-506.0,2.3,31,95 %,0.00 %


In [11]:
for c in data['Country/Region'].unique():
    if c not in population['Country (or dependency)'].unique():
        print(c)

Burma
Congo (Brazzaville)
Congo (Kinshasa)
Cote d'Ivoire
Czechia
Diamond Princess
Korea, South
Kosovo
MS Zaandam
Saint Kitts and Nevis
Saint Vincent and the Grenadines
Sao Tome and Principe
Summer Olympics 2020
Taiwan*
US
West Bank and Gaza


A few of the countries have to be manually replaced.

In [12]:
country_mapper = {
   # 'Congo (Brazzaville)': 'Congo',
   # 'Congo (Kinshasa)': 'Congo',
    "Cote d'Ivoire": "Côte d'Ivoire",
    'Czechia': 'Czech Republic (Czechia)',
    'Korea, South': 'South Korea',
    'Saint Vincent and the Grenadines': 'St. Vincent & Grenadines',
    'Taiwan*': 'Taiwan',
    'US': 'United States',
    'West Bank and Gaza': 'Israel',
    'Saint Kitts and Nevis': 'Saint Kitts & Nevis',
    'Burma': 'Myanmar',
    'Sao Tome and Principe': 'Sao Tome & Principe'
}

data['Country/Region'] = data['Country/Region'].replace(country_mapper)
#data.index = data['Country/Region']
data.sample(10)

Unnamed: 0,Province/State,Country/Region,date,Lat,Long,confirmed,deaths,recoveries
66095,,Croatia,9/29/20,45.1,15.2,16380,275,14947
118387,,Cambodia,4/16/21,11.55,104.9167,5480,38,2393
71385,St Martin,France,10/19/20,18.0708,-63.0501,531,8,380
125945,,Tanzania,5/14/21,-6.369028,34.888822,509,21,183
60329,,Finland,9/7/20,61.92411,25.748151,8327,336,7350
120923,,Samoa,4/25/21,-13.759,-172.1046,3,0,2
24107,,Mozambique,4/22/20,-18.6657,35.5296,41,0,8
103580,,Spain,2/18/21,40.463667,-3.74922,3121687,66704,150376
44006,,Czech Republic (Czechia),7/7/20,49.8175,15.473,12685,351,7910
102359,Heilongjiang,China,2/14/21,47.862,127.7615,1609,13,1504


### China Problem

In [14]:
#Group China by Country/Region and date
china_template = data[(data['Country/Region'] == 'China')]
china_template['Province/State'] = np.nan
china_template = china_template.groupby(['date','Country/Region']).sum()   #ignore lat and long because it is wrong
china_template.reset_index(inplace = True)
china_template[['Province/State', 'Country/Region', 'date', 'Lat', 'Long', 'confirmed', 'deaths', 'recoveries']]
china_template['Province/State'] = np.nan
china_template.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  china_template['Province/State'] = np.nan


Unnamed: 0,date,Country/Region,Province/State,Lat,Long,confirmed,deaths,recoveries
0,1/1/21,China,,1079.6253,3686.4211,96023,4782,90031
1,1/10/21,China,,1079.6253,3686.4211,96824,4792,90697
2,1/11/21,China,,1079.6253,3686.4211,96920,4793,90772
3,1/12/21,China,,1079.6253,3686.4211,97095,4794,90857
4,1/13/21,China,,1079.6253,3686.4211,97275,4796,90953


In [15]:
#add the china data back into the original df
data = pd.concat([data, china_template], ignore_index = True)
data.tail(10)

Unnamed: 0,Province/State,Country/Region,date,Lat,Long,confirmed,deaths,recoveries
152630,,China,9/28/20,1079.6253,3686.4211,90505,4739,85398
152631,,China,9/29/20,1079.6253,3686.4211,90528,4739,85423
152632,,China,9/3/20,1079.6253,3686.4211,89986,4728,84737
152633,,China,9/30/20,1079.6253,3686.4211,90545,4739,85460
152634,,China,9/4/20,1079.6253,3686.4211,90008,4728,84782
152635,,China,9/5/20,1079.6253,3686.4211,90025,4728,84837
152636,,China,9/6/20,1079.6253,3686.4211,90058,4730,84873
152637,,China,9/7/20,1079.6253,3686.4211,90078,4732,84900
152638,,China,9/8/20,1079.6253,3686.4211,90087,4733,84932
152639,,China,9/9/20,1079.6253,3686.4211,90100,4733,84957


### UK Problem

In [16]:
#Group UK by Country/Region and date
UK_template = data[(data['Country/Region'] == 'United Kingdom')]
UK_template['Province/State'] = np.nan
UK_template = UK_template.groupby(['date','Country/Region']).sum()   #ignore lat and long because it is wrong
UK_template.reset_index(inplace = True)
UK_template[['Province/State', 'Country/Region', 'date', 'Lat', 'Long', 'confirmed', 'deaths', 'recoveries']]
UK_template['Province/State'] = np.nan
UK_template.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  UK_template['Province/State'] = np.nan


Unnamed: 0,date,Country/Region,Province/State,Lat,Long,confirmed,deaths,recoveries
0,1/1/21,United Kingdom,,262.083198,-497.280566,2549693,74237,5682
1,1/10/21,United Kingdom,,262.083198,-497.280566,3081372,81567,7013
2,1/11/21,United Kingdom,,262.083198,-497.280566,3127647,82096,7153
3,1/12/21,United Kingdom,,262.083198,-497.280566,3173295,83342,7352
4,1/13/21,United Kingdom,,262.083198,-497.280566,3220957,84910,7434


In [17]:
#add UK back to original df
data = pd.concat([data, UK_template], ignore_index = True)
data.tail(10)

Unnamed: 0,Province/State,Country/Region,date,Lat,Long,confirmed,deaths,recoveries
153206,,United Kingdom,9/28/20,262.083198,-497.280566,441575,42090,2354
153207,,United Kingdom,9/29/20,262.083198,-497.280566,448731,42162,2370
153208,,United Kingdom,9/3/20,262.083198,-497.280566,342708,41616,1750
153209,,United Kingdom,9/30/20,262.083198,-497.280566,455848,42233,2375
153210,,United Kingdom,9/4/20,262.083198,-497.280566,344686,41626,1756
153211,,United Kingdom,9/5/20,262.083198,-497.280566,346513,41638,1806
153212,,United Kingdom,9/6/20,262.083198,-497.280566,349500,41640,1824
153213,,United Kingdom,9/7/20,262.083198,-497.280566,352453,41643,1824
153214,,United Kingdom,9/8/20,262.083198,-497.280566,354934,41675,1827
153215,,United Kingdom,9/9/20,262.083198,-497.280566,357615,41683,1833


### Australia Problem

In [18]:
#Group UK by Country/Region and date
Australia_template = data[(data['Country/Region'] == 'Australia')]
Australia_template['Province/State'] = np.nan
#Australia_template = Australia_template.loc[Australia_template['Province/State'].isna()]
Australia_template = Australia_template.groupby(['date','Country/Region']).sum()   #ignore lat and long because it is wrong
Australia_template.reset_index(inplace = True)
Australia_template[['Province/State', 'Country/Region', 'date', 'Lat', 'Long', 'confirmed', 'deaths', 'recoveries']]
Australia_template['Province/State'] = np.nan
Australia_template.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Australia_template['Province/State'] = np.nan


Unnamed: 0,date,Country/Region,Province/State,Lat,Long,confirmed,deaths,recoveries
0,1/1/21,Australia,,-256.8502,1130.8439,28460,909,22573
1,1/10/21,Australia,,-256.8502,1130.8439,28614,909,22621
2,1/11/21,Australia,,-256.8502,1130.8439,28634,909,22627
3,1/12/21,Australia,,-256.8502,1130.8439,28650,909,22643
4,1/13/21,Australia,,-256.8502,1130.8439,28660,909,22650


In [19]:
#add Australia back to original df
data = pd.concat([data, Australia_template], ignore_index = True)
data.tail(10)

Unnamed: 0,Province/State,Country/Region,date,Lat,Long,confirmed,deaths,recoveries
153782,,Australia,9/28/20,-256.8502,1130.8439,27055,882,21558
153783,,Australia,9/29/20,-256.8502,1130.8439,27078,886,21623
153784,,Australia,9/3/20,-256.8502,1130.8439,26136,737,18920
153785,,Australia,9/30/20,-256.8502,1130.8439,27096,888,21655
153786,,Australia,9/4/20,-256.8502,1130.8439,26207,748,19178
153787,,Australia,9/5/20,-256.8502,1130.8439,26278,753,19336
153788,,Australia,9/6/20,-256.8502,1130.8439,26321,762,19473
153789,,Australia,9/7/20,-256.8502,1130.8439,26373,770,19610
153790,,Australia,9/8/20,-256.8502,1130.8439,26465,781,19731
153791,,Australia,9/9/20,-256.8502,1130.8439,26524,788,19869


In [20]:
data.to_csv('covid19.csv')