# Analysis of COVID-19 Cases in the world


Some requirements:

- Convert data into tidy format
- Export to normalized tables in SQLite3 Database
- Use SQL statements (from within pandas) to retrieve data needed for each visualization

In [39]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [40]:
# Read the raw COVID-19 dataset from disk and preview its first rows
raw_data_path = "../data/raw/Data.csv"
df = pd.read_csv(raw_data_path)
df.head()

Unnamed: 0,Entity,Continent,Latitude,Longitude,Average temperature per year,Hospital beds per 1000 people,Medical doctors per 1000 people,GDP/Capita,Population,Median age,Population aged 65 and over (%),Date,Daily tests,Cases,Deaths
0,Albania,Europe,41.15,20.17,14,2.89,1.29,5353.2,2873457,38,14,2020-02-25,8.0,,
1,Albania,Europe,41.15,20.17,14,2.89,1.29,5353.2,2873457,38,14,2020-02-26,5.0,,
2,Albania,Europe,41.15,20.17,14,2.89,1.29,5353.2,2873457,38,14,2020-02-27,4.0,,
3,Albania,Europe,41.15,20.17,14,2.89,1.29,5353.2,2873457,38,14,2020-02-28,1.0,,
4,Albania,Europe,41.15,20.17,14,2.89,1.29,5353.2,2873457,38,14,2020-02-29,8.0,,


In [41]:
# List the column labels of the raw DataFrame
df.columns

Index(['Entity', 'Continent', 'Latitude', 'Longitude',
       'Average temperature per year', 'Hospital beds per 1000 people',
       'Medical doctors per 1000 people', 'GDP/Capita', 'Population',
       'Median age', 'Population aged 65 and over (%)', 'Date', 'Daily tests',
       'Cases', 'Deaths'],
      dtype='object')

Since the column name "Entity" is misleading, I will rename it to "Country". First, though, I will check that every value in that column is actually a country name.

In [42]:
# Gather the distinct values of the 'Entity' column so they can be checked
# by eye before the column is renamed
country_list = {entity for entity in df['Entity']}
country_list

{'Albania',
 'Algeria',
 'Argentina',
 'Armenia',
 'Australia',
 'Austria',
 'Bahrain',
 'Bangladesh',
 'Belarus',
 'Belgium',
 'Bhutan',
 'Bolivia',
 'Bosnia and Herzegovina',
 'Bulgaria',
 'Canada',
 'Cape Verde',
 'Chile',
 'Colombia',
 'Costa Rica',
 'Croatia',
 'Cuba',
 'Cyprus',
 'Denmark',
 'Dominican Republic',
 'Ecuador',
 'El Salvador',
 'Estonia',
 'Ethiopia',
 'Fiji',
 'Finland',
 'France',
 'Ghana',
 'Greece',
 'Guatemala',
 'Hungary',
 'Iceland',
 'India',
 'Indonesia',
 'Iran',
 'Iraq',
 'Ireland',
 'Israel',
 'Italy',
 'Jamaica',
 'Japan',
 'Jordan',
 'Kazakhstan',
 'Kenya',
 'Kuwait',
 'Latvia',
 'Libya',
 'Lithuania',
 'Luxembourg',
 'Madagascar',
 'Malawi',
 'Malaysia',
 'Malta',
 'Mauritania',
 'Mexico',
 'Mongolia',
 'Morocco',
 'Mozambique',
 'Myanmar',
 'Namibia',
 'Nepal',
 'New Zealand',
 'Nigeria',
 'Norway',
 'Oman',
 'Pakistan',
 'Panama',
 'Paraguay',
 'Peru',
 'Philippines',
 'Poland',
 'Portugal',
 'Qatar',
 'Romania',
 'Russia',
 'Rwanda',
 'Saudi Arabia

In [43]:
# Give the 'Entity' column the more descriptive label 'Country'
# (rename returns a new frame; `df` itself is not modified)
new_df = df.rename({'Entity': 'Country'}, axis='columns')

In [44]:
# Reorder the columns so the identifying fields (Country, Continent, Date) come first

shift_keys = ['Country', 'Continent', 'Date']
# Identifying fields that actually exist in the frame, in the order listed above
leading_columns = [name for name in shift_keys if name in new_df.columns]
# Every other column, in its current relative order
trailing_columns = [name for name in new_df.columns if name not in shift_keys]
new_df = new_df[leading_columns + trailing_columns]
new_df

Unnamed: 0,Country,Continent,Date,Latitude,Longitude,Average temperature per year,Hospital beds per 1000 people,Medical doctors per 1000 people,GDP/Capita,Population,Median age,Population aged 65 and over (%),Daily tests,Cases,Deaths
0,Albania,Europe,2020-02-25,41.15,20.17,14,2.89,1.29,5353.2,2873457,38,14,8.0,,
1,Albania,Europe,2020-02-26,41.15,20.17,14,2.89,1.29,5353.2,2873457,38,14,5.0,,
2,Albania,Europe,2020-02-27,41.15,20.17,14,2.89,1.29,5353.2,2873457,38,14,4.0,,
3,Albania,Europe,2020-02-28,41.15,20.17,14,2.89,1.29,5353.2,2873457,38,14,1.0,,
4,Albania,Europe,2020-02-29,41.15,20.17,14,2.89,1.29,5353.2,2873457,38,14,8.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38467,Zimbabwe,Africa,2021-02-24,-19.02,29.15,20,1.70,0.08,1464.0,16529904,19,3,1804.0,35960.0,1456.0
38468,Zimbabwe,Africa,2021-02-25,-19.02,29.15,20,1.70,0.08,1464.0,16529904,19,3,2965.0,35994.0,1458.0
38469,Zimbabwe,Africa,2021-02-26,-19.02,29.15,20,1.70,0.08,1464.0,16529904,19,3,,36044.0,1463.0
38470,Zimbabwe,Africa,2021-02-27,-19.02,29.15,20,1.70,0.08,1464.0,16529904,19,3,,36058.0,1463.0


In [45]:
# Substitute 0 for every missing (NaN) entry in the table.
# NOTE(review): the fill is applied to all columns alike, so unreported
# 'Daily tests', 'Cases' and 'Deaths' values become zeros and are counted
# in the averages computed in later cells — confirm this is intended.
new_df = new_df.fillna(value=0)
new_df

Unnamed: 0,Country,Continent,Date,Latitude,Longitude,Average temperature per year,Hospital beds per 1000 people,Medical doctors per 1000 people,GDP/Capita,Population,Median age,Population aged 65 and over (%),Daily tests,Cases,Deaths
0,Albania,Europe,2020-02-25,41.15,20.17,14,2.89,1.29,5353.2,2873457,38,14,8.0,0.0,0.0
1,Albania,Europe,2020-02-26,41.15,20.17,14,2.89,1.29,5353.2,2873457,38,14,5.0,0.0,0.0
2,Albania,Europe,2020-02-27,41.15,20.17,14,2.89,1.29,5353.2,2873457,38,14,4.0,0.0,0.0
3,Albania,Europe,2020-02-28,41.15,20.17,14,2.89,1.29,5353.2,2873457,38,14,1.0,0.0,0.0
4,Albania,Europe,2020-02-29,41.15,20.17,14,2.89,1.29,5353.2,2873457,38,14,8.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38467,Zimbabwe,Africa,2021-02-24,-19.02,29.15,20,1.70,0.08,1464.0,16529904,19,3,1804.0,35960.0,1456.0
38468,Zimbabwe,Africa,2021-02-25,-19.02,29.15,20,1.70,0.08,1464.0,16529904,19,3,2965.0,35994.0,1458.0
38469,Zimbabwe,Africa,2021-02-26,-19.02,29.15,20,1.70,0.08,1464.0,16529904,19,3,0.0,36044.0,1463.0
38470,Zimbabwe,Africa,2021-02-27,-19.02,29.15,20,1.70,0.08,1464.0,16529904,19,3,0.0,36058.0,1463.0


In [46]:
# Per-country average of every numeric column
rows_by_country = new_df.groupby('Country')
country_df = rows_by_country.mean(numeric_only=True)
country_df

Unnamed: 0_level_0,Latitude,Longitude,Average temperature per year,Hospital beds per 1000 people,Medical doctors per 1000 people,GDP/Capita,Population,Median age,Population aged 65 and over (%),Daily tests,Cases,Deaths
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Albania,41.15,20.17,14.0,2.89,1.29,5353.2,2873457.0,38.0,14.0,1170.264865,2.277428e+04,464.572973
Algeria,28.03,1.66,25.0,1.90,1.83,3974.0,41318142.0,29.0,6.0,8.378378,4.565502e+04,1480.656757
Argentina,-38.42,-63.62,14.0,5.00,3.91,9912.3,44271041.0,31.0,11.0,15085.977723,6.445117e+05,16392.730198
Armenia,40.07,45.04,11.0,4.20,2.80,4622.7,2930450.0,35.0,11.0,273.035616,6.676794e+04,1174.460274
Australia,-25.27,133.78,22.0,3.84,3.50,55060.3,24598933.0,37.0,16.0,23341.352500,1.697356e+04,466.437500
...,...,...,...,...,...,...,...,...,...,...,...,...
United States,37.09,-95.71,11.0,2.77,2.57,65297.5,325719178.0,38.0,16.0,812883.797030,7.991328e+06,180869.995050
Uruguay,-32.52,-55.77,16.0,2.80,3.74,16190.1,3456750.0,35.0,15.0,2279.059490,8.799329e+03,104.288952
Vietnam,14.06,108.28,25.0,2.60,0.82,2715.3,95540800.0,32.0,7.0,828.017370,8.168933e+02,17.302730
Zambia,-13.13,27.85,21.0,2.00,0.09,1305.1,17094130.0,17.0,2.0,3000.876437,1.571817e+04,277.178161


In [47]:
# Per-continent average of every numeric column
rows_by_continent = new_df.groupby('Continent')
continent_df = rows_by_continent.mean(numeric_only=True)
continent_df

Unnamed: 0_level_0,Latitude,Longitude,Average temperature per year,Hospital beds per 1000 people,Medical doctors per 1000 people,GDP/Capita,Population,Median age,Population aged 65 and over (%),Daily tests,Cases,Deaths
Continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Africa,3.551885,15.747523,23.348312,1.362825,0.447113,2373.875488,34656690.0,21.680748,3.783802,3104.280237,56471.208664,1269.688013
Asia,26.648408,78.826848,20.860524,3.075836,1.657297,14729.942616,109073900.0,31.011977,7.025745,45337.286213,350663.834214,6149.283761
Europe,48.987978,15.519167,10.086875,4.928825,3.420736,33769.866981,13657620.0,41.531816,18.220805,24512.868427,175556.997074,5883.651686
North America,21.854118,-85.625612,21.450269,2.127629,2.174985,18735.789637,54450390.0,32.438021,10.341744,87639.33317,910815.10975,25423.49706
Oceania,-27.652714,161.632464,19.341071,2.95158,2.517446,35401.060268,10641590.0,34.163393,12.533036,9669.627679,6613.240179,173.610714
South America,-19.40002,-67.570861,18.278689,2.182247,1.904809,8725.912637,23053940.0,30.153005,9.274932,10908.385929,334457.302937,10599.414959


## Normalizing Data

1NF - Remove duplicate rows and keep every value atomic  
2NF - Remove partial dependencies (i.e. no non-key attribute depends on only part of a composite key)  
3NF - Remove transitive dependencies (i.e. no non-key attribute is determined by another attribute other than the primary key)

In [48]:
# Keep only the first occurrence of each fully identical row (1NF)
is_repeated_row = new_df.duplicated()
new_df = new_df[~is_repeated_row]
new_df

Unnamed: 0,Country,Continent,Date,Latitude,Longitude,Average temperature per year,Hospital beds per 1000 people,Medical doctors per 1000 people,GDP/Capita,Population,Median age,Population aged 65 and over (%),Daily tests,Cases,Deaths
0,Albania,Europe,2020-02-25,41.15,20.17,14,2.89,1.29,5353.2,2873457,38,14,8.0,0.0,0.0
1,Albania,Europe,2020-02-26,41.15,20.17,14,2.89,1.29,5353.2,2873457,38,14,5.0,0.0,0.0
2,Albania,Europe,2020-02-27,41.15,20.17,14,2.89,1.29,5353.2,2873457,38,14,4.0,0.0,0.0
3,Albania,Europe,2020-02-28,41.15,20.17,14,2.89,1.29,5353.2,2873457,38,14,1.0,0.0,0.0
4,Albania,Europe,2020-02-29,41.15,20.17,14,2.89,1.29,5353.2,2873457,38,14,8.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38467,Zimbabwe,Africa,2021-02-24,-19.02,29.15,20,1.70,0.08,1464.0,16529904,19,3,1804.0,35960.0,1456.0
38468,Zimbabwe,Africa,2021-02-25,-19.02,29.15,20,1.70,0.08,1464.0,16529904,19,3,2965.0,35994.0,1458.0
38469,Zimbabwe,Africa,2021-02-26,-19.02,29.15,20,1.70,0.08,1464.0,16529904,19,3,0.0,36044.0,1463.0
38470,Zimbabwe,Africa,2021-02-27,-19.02,29.15,20,1.70,0.08,1464.0,16529904,19,3,0.0,36058.0,1463.0


In [49]:
# 2NF: remove partial dependencies by moving the location attributes into their own table
## The columns split out are Country/Latitude/Longitude; the author treats {Latitude + Longitude} as a composite key, since the pair is taken to identify the country

candidate_keys = ['Country', 'Latitude', 'Longitude']
location_columns = new_df[candidate_keys]
# One row per distinct (Country, Latitude, Longitude) combination, renumbered from 0
country_details = location_columns.drop_duplicates().reset_index(drop=True)
country_details

Unnamed: 0,Country,Latitude,Longitude
0,Albania,41.15,20.17
1,Algeria,28.03,1.66
2,Argentina,-38.42,-63.62
3,Armenia,40.07,45.04
4,Australia,-25.27,133.78
...,...,...,...
99,United States,37.09,-95.71
100,Uruguay,-32.52,-55.77
101,Vietnam,14.06,108.28
102,Zambia,-13.13,27.85


In [50]:
# Main table: 'Country' first, followed by every column that is not one of
# the candidate keys (so Latitude and Longitude are left out)
remaining_columns = [name for name in new_df.columns if name not in candidate_keys]
second_normalized_table = new_df[['Country', *remaining_columns]]
second_normalized_table

Unnamed: 0,Country,Continent,Date,Average temperature per year,Hospital beds per 1000 people,Medical doctors per 1000 people,GDP/Capita,Population,Median age,Population aged 65 and over (%),Daily tests,Cases,Deaths
0,Albania,Europe,2020-02-25,14,2.89,1.29,5353.2,2873457,38,14,8.0,0.0,0.0
1,Albania,Europe,2020-02-26,14,2.89,1.29,5353.2,2873457,38,14,5.0,0.0,0.0
2,Albania,Europe,2020-02-27,14,2.89,1.29,5353.2,2873457,38,14,4.0,0.0,0.0
3,Albania,Europe,2020-02-28,14,2.89,1.29,5353.2,2873457,38,14,1.0,0.0,0.0
4,Albania,Europe,2020-02-29,14,2.89,1.29,5353.2,2873457,38,14,8.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
38467,Zimbabwe,Africa,2021-02-24,20,1.70,0.08,1464.0,16529904,19,3,1804.0,35960.0,1456.0
38468,Zimbabwe,Africa,2021-02-25,20,1.70,0.08,1464.0,16529904,19,3,2965.0,35994.0,1458.0
38469,Zimbabwe,Africa,2021-02-26,20,1.70,0.08,1464.0,16529904,19,3,0.0,36044.0,1463.0
38470,Zimbabwe,Africa,2021-02-27,20,1.70,0.08,1464.0,16529904,19,3,0.0,36058.0,1463.0


In [51]:
# 3NF: remove transitive dependencies (no attribute is determined by another attribute other than the primary key)
## Country -> Latitude, Longitude, Average temperature per year, Hospital beds per 1000 people, Medical doctors per 1000 people, GDP/Capita, Population, Median age, Population aged 65 and over (%)
# NOTE(review): this cell only takes an independent copy of
# second_normalized_table; the country-level attributes listed above (apart
# from Latitude/Longitude, split out earlier) still live in this table —
# confirm whether they should be moved out as well.
final_normalized = second_normalized_table.copy(deep=True)
final_normalized

Unnamed: 0,Country,Continent,Date,Average temperature per year,Hospital beds per 1000 people,Medical doctors per 1000 people,GDP/Capita,Population,Median age,Population aged 65 and over (%),Daily tests,Cases,Deaths
0,Albania,Europe,2020-02-25,14,2.89,1.29,5353.2,2873457,38,14,8.0,0.0,0.0
1,Albania,Europe,2020-02-26,14,2.89,1.29,5353.2,2873457,38,14,5.0,0.0,0.0
2,Albania,Europe,2020-02-27,14,2.89,1.29,5353.2,2873457,38,14,4.0,0.0,0.0
3,Albania,Europe,2020-02-28,14,2.89,1.29,5353.2,2873457,38,14,1.0,0.0,0.0
4,Albania,Europe,2020-02-29,14,2.89,1.29,5353.2,2873457,38,14,8.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
38467,Zimbabwe,Africa,2021-02-24,20,1.70,0.08,1464.0,16529904,19,3,1804.0,35960.0,1456.0
38468,Zimbabwe,Africa,2021-02-25,20,1.70,0.08,1464.0,16529904,19,3,2965.0,35994.0,1458.0
38469,Zimbabwe,Africa,2021-02-26,20,1.70,0.08,1464.0,16529904,19,3,0.0,36044.0,1463.0
38470,Zimbabwe,Africa,2021-02-27,20,1.70,0.08,1464.0,16529904,19,3,0.0,36058.0,1463.0


In [52]:
# Save cleaned data into the data/clean folder

# Create the output folder first: to_csv does not create missing directories,
# so on a fresh checkout without data/clean the writes below would fail.
clean_dir = Path("../data/clean")
clean_dir.mkdir(parents=True, exist_ok=True)

# Both tables are written with their row index, as before
final_normalized.to_csv(clean_dir / "CleanData.csv")
country_details.to_csv(clean_dir / "CountryDetails.csv")