# Covid-19

In [0]:
import numpy as np
import pandas as pd

## Download the data from Data Repository by Johns Hopkins CSSE

https://github.com/CSSEGISandData/COVID-19

In [0]:
!git clone https://github.com/CSSEGISandData/COVID-19.git

fatal: destination path 'COVID-19' already exists and is not an empty directory.


## Expore the data


In [0]:
ls -lt ./COVID-19/csse_covid_19_data/csse_covid_19_daily_reports | head

total 8300
-rw-r--r-- 1 root root 314337 Apr 17 03:34 04-16-2020.csv
-rw-r--r-- 1 root root      0 Apr 17 03:34 README.md
-rw-r--r-- 1 root root 312662 Apr 17 03:34 04-15-2020.csv
-rw-r--r-- 1 root root 311180 Apr 17 03:34 04-14-2020.csv
-rw-r--r-- 1 root root 309854 Apr 17 03:34 04-13-2020.csv
-rw-r--r-- 1 root root 305660 Apr 17 03:34 04-12-2020.csv
-rw-r--r-- 1 root root 304033 Apr 17 03:34 04-11-2020.csv
-rw-r--r-- 1 root root 301328 Apr 17 03:34 04-10-2020.csv
-rw-r--r-- 1 root root 298124 Apr 17 03:34 04-09-2020.csv


In [0]:
first = pd.read_csv("./COVID-19/csse_covid_19_data/csse_covid_19_daily_reports/01-22-2020.csv").rename(columns = {'Province/State' : 'Province_State', 
                        "Country/Region" : 'Country_Region',
                        'Last Update' : 'Last_Update'
})

In [0]:
first.head()

Unnamed: 0,Province_State,Country_Region,Last_Update,Confirmed,Deaths,Recovered
0,Anhui,Mainland China,1/22/2020 17:00,1.0,,
1,Beijing,Mainland China,1/22/2020 17:00,14.0,,
2,Chongqing,Mainland China,1/22/2020 17:00,6.0,,
3,Fujian,Mainland China,1/22/2020 17:00,1.0,,
4,Gansu,Mainland China,1/22/2020 17:00,,,


In [0]:
last = pd.read_csv("./COVID-19/csse_covid_19_data/csse_covid_19_daily_reports/04-16-2020.csv").rename(columns = {'Province/State' : 'Province_State', 
                        "Country/Region" : 'Country_Region',
                        'Last Update' : 'Last_Update',
                        'Lat' : 'Latitude',
                        'Long_' : 'Longitude'
})

In [0]:
last.head()

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Latitude,Longitude,Confirmed,Deaths,Recovered,Active,Combined_Key
0,45001.0,Abbeville,South Carolina,US,2020-04-16 23:30:51,34.223334,-82.461707,10,0,0,10,"Abbeville, South Carolina, US"
1,22001.0,Acadia,Louisiana,US,2020-04-16 23:30:51,30.295065,-92.414197,108,6,0,102,"Acadia, Louisiana, US"
2,51001.0,Accomack,Virginia,US,2020-04-16 23:30:51,37.767072,-75.632346,19,0,0,19,"Accomack, Virginia, US"
3,16001.0,Ada,Idaho,US,2020-04-16 23:30:51,43.452658,-116.241552,567,9,0,558,"Ada, Idaho, US"
4,19001.0,Adair,Iowa,US,2020-04-16 23:30:51,41.330756,-94.471059,1,0,0,1,"Adair, Iowa, US"


In [0]:
pd.concat((first, last), axis = 0)

Unnamed: 0,Province_State,Country_Region,Last_Update,Confirmed,Deaths,Recovered,FIPS,Admin2,Latitude,Longitude,Active,Combined_Key
0,Anhui,Mainland China,1/22/2020 17:00,1.0,,,,,,,,
1,Beijing,Mainland China,1/22/2020 17:00,14.0,,,,,,,,
2,Chongqing,Mainland China,1/22/2020 17:00,6.0,,,,,,,,
3,Fujian,Mainland China,1/22/2020 17:00,1.0,,,,,,,,
4,Gansu,Mainland China,1/22/2020 17:00,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
3037,,West Bank and Gaza,2020-04-16 23:30:31,374.0,2.0,63.0,,,31.952200,35.233200,309.0,West Bank and Gaza
3038,,Western Sahara,2020-04-16 23:30:31,6.0,0.0,0.0,,,24.215500,-12.885800,6.0,Western Sahara
3039,,Yemen,2020-04-16 23:30:31,1.0,0.0,0.0,,,15.552727,48.516388,1.0,Yemen
3040,,Zambia,2020-04-16 23:30:31,48.0,2.0,30.0,,,-13.133897,27.849332,16.0,Zambia


## Data loading into Pandas

In [0]:
import glob
import os

files = glob.glob("./COVID-19/csse_covid_19_data/csse_covid_19_daily_reports/*.csv")
files.sort(key=os.path.getmtime)

Let's do a little Data Cleaning ...

In [0]:
data = pd.DataFrame()
for file in files:
  df = pd.read_csv(file).rename(columns = {'Province/State' : 'State', 
                        "Country/Region" : 'Country',
                        'Province_State' : 'State', 
                        "Country_Region" : 'Country',
                        'Last Update' : 'Last_Update',
                        'Confirmed' : 'ConfirmedAcum',
                        'Deaths' : 'DeathsAcum',
                        'Recovered' : 'RecoveredAcum'})
  df = df.assign(Date = pd.to_datetime(file[-14:-4], format = '%m-%d-%Y'),
                 Country = df.Country.str.strip())
  data = pd.concat((data, df), axis = 0)


In [0]:
data = data[["Date", "Admin2", "State", "Country", "ConfirmedAcum", "DeathsAcum", "RecoveredAcum"]]
data = data.replace({'Bahamas, The' : 'Bahamas',
                         'Congo (Brazzaville)' : 'Congo',
                         'Congo (Kinshasa)' : 'Congo',
                         "Cote d'Ivoire" : "Cote d'Ivoire",
                         "Curacao" : "Curaçao",
                         'Czech Republic' : 'Czech Republic (Czechia)',
                         'Czechia' : 'Czech Republic (Czechia)',
                         'Faroe Islands' : 'Faeroe Islands',
                         'Macau' : 'Macao',
                         'Mainland China' : 'China',
                         'Palestine' : 'State of Palestine',
                         'Reunion' : 'Réunion',
                         'Saint Kitts and Nevis' : 'Saint Kitts & Nevis',
                         'Sao Tome and Principe' : 'Sao Tome & Principe',
                         'US' : 'United States',
                         'Gambia, The' : 'Gambia',
                         'Hong Kong SAR' : 'Hong Kong',
                         'Korea, South' : 'South Korea',
                         'Macao SAR' : 'Macao',
                         'Taiwan*' : 'Taiwan',
                         'Viet Nam' : 'Vietnam',
                         'West Bank and Gaza' : 'State of Palestine'
                         })
data = data.fillna({'State' : 'NA', 'Admin2' : 'NA'})
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86394 entries, 0 to 86393
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Date           86394 non-null  datetime64[ns]
 1   Admin2         86394 non-null  object        
 2   State          86394 non-null  object        
 3   Country        86394 non-null  object        
 4   ConfirmedAcum  86394 non-null  float64       
 5   DeathsAcum     86394 non-null  float64       
 6   RecoveredAcum  86394 non-null  float64       
dtypes: datetime64[ns](1), float64(3), object(3)
memory usage: 4.6+ MB


In [0]:
data = data.groupby(["Date", "Country", "State", "Admin2"]).agg("sum").reset_index()

In [0]:
data.head()

Unnamed: 0,Date,Country,State,Admin2,ConfirmedAcum,DeathsAcum,RecoveredAcum
0,2020-01-22,China,Anhui,,1.0,0.0,0.0
1,2020-01-22,China,Beijing,,14.0,0.0,0.0
2,2020-01-22,China,Chongqing,,6.0,0.0,0.0
3,2020-01-22,China,Fujian,,1.0,0.0,0.0
4,2020-01-22,China,Gansu,,0.0,0.0,0.0


In [0]:
data[data.Country == 'Spain'].head()

Unnamed: 0,Date,Country,State,Admin2,ConfirmedAcum,DeathsAcum,RecoveredAcum
545,2020-02-01,Spain,,,1.0,0.0,0.0
612,2020-02-02,Spain,,,1.0,0.0,0.0
679,2020-02-03,Spain,,,1.0,0.0,0.0
749,2020-02-04,Spain,,,1.0,0.0,0.0
819,2020-02-05,Spain,,,1.0,0.0,0.0


## Enrich the Data

In [0]:
data = data.sort_values(['State', 'Country', 'Date'])

data = data.assign(ConfirmedPrevious = data.groupby(['Admin2', 'State', 'Country']).shift(1)["ConfirmedAcum"],
                   DeathsPrevious = data.groupby(['Admin2', 'State', 'Country']).shift(1)["DeathsAcum"],
                   RecoveredPrevious = data.groupby(['Admin2', 'State', 'Country']).shift(1)["RecoveredAcum"],
            ).fillna({ 'ConfirmedPrevious' : 0, 'DeathsPrevious' : 0, 'RecoveredPrevious' : 0 })

In [0]:
data.head()

Unnamed: 0,Date,Country,State,Admin2,ConfirmedAcum,DeathsAcum,RecoveredAcum,ConfirmedPrevious,DeathsPrevious,RecoveredPrevious
2599,2020-02-28,Canada,"Montreal, QC",,1.0,0.0,0.0,0.0,0.0,0.0
2713,2020-02-29,Canada,"Montreal, QC",,1.0,0.0,0.0,1.0,0.0,0.0
2834,2020-03-01,Canada,"Montreal, QC",,1.0,0.0,0.0,1.0,0.0,0.0
2961,2020-03-02,Canada,"Montreal, QC",,1.0,0.0,0.0,1.0,0.0,0.0
3103,2020-03-03,Canada,"Montreal, QC",,1.0,0.0,0.0,1.0,0.0,0.0


In [0]:
data = data.assign(Confirmed = data.ConfirmedAcum -  data.ConfirmedPrevious,
            Deaths = data.DeathsAcum - data.DeathsPrevious,
            Recovered = data.RecoveredAcum - data.RecoveredPrevious
            )
data = data.drop(['ConfirmedPrevious', 'DeathsPrevious', 'RecoveredPrevious'], axis = 1)

In [0]:
data[data.Country == 'Spain']

Unnamed: 0,Date,Country,State,Admin2,ConfirmedAcum,DeathsAcum,RecoveredAcum,Confirmed,Deaths,Recovered
545,2020-02-01,Spain,,,1.0,0.0,0.0,1.0,0.0,0.0
612,2020-02-02,Spain,,,1.0,0.0,0.0,0.0,0.0,0.0
679,2020-02-03,Spain,,,1.0,0.0,0.0,0.0,0.0,0.0
749,2020-02-04,Spain,,,1.0,0.0,0.0,0.0,0.0,0.0
819,2020-02-05,Spain,,,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
71548,2020-04-12,Spain,,,166831.0,17209.0,62391.0,3804.0,603.0,3282.0
74536,2020-04-13,Spain,,,170099.0,17756.0,64727.0,3268.0,547.0,2336.0
77537,2020-04-14,Spain,,,172541.0,18056.0,67504.0,2442.0,300.0,2777.0
80550,2020-04-15,Spain,,,177644.0,18708.0,70853.0,5103.0,652.0,3349.0


## Data By Country

---



In [0]:
data_by_country = data.groupby(["Date", "Country"]).agg("sum").reset_index()
data_by_country = data_by_country.sort_values(['Country', 'Date'])
data_by_country.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8376 entries, 848 to 3045
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Date           8376 non-null   datetime64[ns]
 1   Country        8376 non-null   object        
 2   ConfirmedAcum  8376 non-null   float64       
 3   DeathsAcum     8376 non-null   float64       
 4   RecoveredAcum  8376 non-null   float64       
 5   Confirmed      8376 non-null   float64       
 6   Deaths         8376 non-null   float64       
 7   Recovered      8376 non-null   float64       
dtypes: datetime64[ns](1), float64(6), object(1)
memory usage: 588.9+ KB


In [0]:
data_by_country.head()

Unnamed: 0,Date,Country,ConfirmedAcum,DeathsAcum,RecoveredAcum,Confirmed,Deaths,Recovered
848,2020-02-24,Afghanistan,1.0,0.0,0.0,1.0,0.0,0.0
886,2020-02-25,Afghanistan,1.0,0.0,0.0,0.0,0.0,0.0
928,2020-02-26,Afghanistan,1.0,0.0,0.0,0.0,0.0,0.0
977,2020-02-27,Afghanistan,1.0,0.0,0.0,0.0,0.0,0.0
1030,2020-02-28,Afghanistan,1.0,0.0,0.0,0.0,0.0,0.0


In [0]:
data_by_country[data_by_country.Country == 'United States']

Unnamed: 0,Date,Country,ConfirmedAcum,DeathsAcum,RecoveredAcum,Confirmed,Deaths,Recovered
7,2020-01-22,United States,1.0,0.0,0.0,1.0,0.0,0.0
22,2020-01-23,United States,1.0,0.0,0.0,0.0,0.0,0.0
33,2020-01-24,United States,2.0,0.0,0.0,1.0,0.0,0.0
47,2020-01-25,United States,2.0,0.0,0.0,1.0,0.0,0.0
62,2020-01-26,United States,5.0,0.0,0.0,3.0,0.0,0.0
...,...,...,...,...,...,...,...,...
7631,2020-04-12,United States,555313.0,22020.0,32988.0,28915.0,1557.0,1718.0
7815,2020-04-13,United States,580619.0,23529.0,43482.0,25310.0,1514.0,10494.0
7999,2020-04-14,United States,607670.0,25832.0,47763.0,27062.0,2303.0,4281.0
8183,2020-04-15,United States,636350.0,28326.0,52096.0,28671.0,2494.0,4333.0


## Population

https://www.worldometers.info/world-population/population-by-country/

In [0]:
population = pd.read_excel("https://github.com/dvillaj/world-population/blob/master/data/world-popultation-2020.xlsx?raw=true", sheet_name="Data")

In [0]:
population.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 235 entries, 0 to 234
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Country            235 non-null    object 
 1   Population (2020)  235 non-null    int64  
 2   Yearly Change      235 non-null    float64
 3   Net Change         235 non-null    int64  
 4   Density (P/Km²)    235 non-null    float64
 5   Land Area (Km²)    235 non-null    int64  
 6   Migrants (net)     201 non-null    float64
 7   Fertility Rate     201 non-null    float64
 8   Average Age        201 non-null    float64
 9   Urban Pop %        222 non-null    float64
 10  World Share        235 non-null    float64
dtypes: float64(7), int64(3), object(1)
memory usage: 20.3+ KB


In [0]:
population = population.rename(columns = {
    'Population (2020)' : 'Population',
    'Yearly Change' : 'Yearly_Change',
    'Net Change' : 'Net_Change',
    'Density (P/Km²)' : 'Density',
    'Land Area (Km²)' : 'Land_Area',
    'Migrants (net)' : 'Migrants',
    'Fertility Rate' : 'Fertility',
    'Average Age' : 'Mean_Age',
    'Urban Pop %' : 'Urban_Pop',
    'World Share' : 'World_Share'
})

In [0]:
population.head()

Unnamed: 0,Country,Population,Yearly_Change,Net_Change,Density,Land_Area,Migrants,Fertility,Mean_Age,Urban_Pop,World_Share
0,Afghanistan,38928346,2.33,886592,60.0,652860,-62920.0,4.6,18.0,25.0,0.5
1,Albania,2877797,-0.11,-3120,105.0,27400,-14000.0,1.6,36.0,63.0,0.04
2,Algeria,43851044,1.85,797990,18.0,2381740,-10000.0,3.1,29.0,73.0,0.56
3,American Samoa,55191,-0.22,-121,276.0,200,,,,88.0,0.0
4,Andorra,77265,0.16,123,164.0,470,,,,88.0,0.0
