In [1]:
'''
https://www.kaggle.com/helgejo/an-interactive-data-science-tutorial
'''

import warnings
warnings.filterwarnings('ignore')

# Handle table-like data and matrices
import numpy as np
import pandas as pd

# Modelling Algorithms
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier , GradientBoostingClassifier

# Modelling Helpers
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import Normalizer , scale
from sklearn.model_selection import train_test_split , StratifiedKFold
from sklearn.feature_selection import RFECV

# Visualisation
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns

# Configure visualisations
%matplotlib inline
# Select matplotlib's 'ggplot' style sheet, then set seaborn's 'white' axes style.
mpl.style.use( 'ggplot' )
sns.set_style( 'white' )
# Default figure size for the notebook, given as a ( width , height ) tuple.
pylab.rcParams[ 'figure.figsize' ] = 8 , 6

In [2]:
def plot_histograms( df , variables , n_rows , n_cols ):
    """Draw one 10-bin histogram per variable on an n_rows x n_cols subplot grid.

    Every subplot is titled with the skewness of its column, rounded to two
    decimal places, and has its tick labels hidden. The finished figure is
    shown with ``plt.show()``; nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to plot.
    variables : iterable
        Column labels of ``df``; one subplot is created per label.
    n_rows , n_cols : int
        Grid dimensions passed to ``fig.add_subplot``.
    """
    fig = plt.figure( figsize = ( 16 , 12 ) )
    for position , var_name in enumerate( variables , start = 1 ):
        ax = fig.add_subplot( n_rows , n_cols , position )
        df[ var_name ].hist( bins = 10 , ax = ax )
        # round() without ndigits returns a whole number, which turned most
        # titles into "Skew: 0"; keep two decimals so the value is informative.
        skewness = round( float( df[ var_name ].skew() ) , 2 )
        ax.set_title( 'Skew: ' + str( skewness ) )
        ax.set_xticklabels( [] , visible=False )
        ax.set_yticklabels( [] , visible=False )
    fig.tight_layout()  # Improves appearance a bit.
    plt.show()

def plot_distribution( df , var , target , **kwargs ):
    """Plot shaded kernel-density curves of ``var``, coloured by ``target``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data handed to ``sns.FacetGrid``.
    var : column label
        Variable whose density is drawn; the x axis runs from 0 to its maximum.
    target : column label
        Column used as the ``hue`` of the grid.
    **kwargs
        Only ``row`` and ``col`` are read (each defaults to None) and are
        forwarded to ``sns.FacetGrid`` for faceting.
    """
    grid = sns.FacetGrid(
        df ,
        hue = target ,
        aspect = 4 ,
        row = kwargs.get( 'row' ) ,
        col = kwargs.get( 'col' ) ,
    )
    grid.map( sns.kdeplot , var , shade = True )
    upper_limit = df[ var ].max()
    grid.set( xlim = ( 0 , upper_limit ) )
    grid.add_legend()

def plot_categories( df , cat , target , **kwargs ):
    """Draw a seaborn bar plot of ``target`` against ``cat`` on a facet grid.

    Parameters
    ----------
    df : pandas.DataFrame
        Data handed to ``sns.FacetGrid``.
    cat : column label
        First variable passed to ``sns.barplot``.
    target : column label
        Second variable passed to ``sns.barplot``.
    **kwargs
        Only ``row`` and ``col`` are read (each defaults to None) and are
        forwarded to ``sns.FacetGrid`` for faceting.
    """
    facet_layout = { name : kwargs.get( name ) for name in ( 'row' , 'col' ) }
    grid = sns.FacetGrid( df , **facet_layout )
    grid.map( sns.barplot , cat , target )
    grid.add_legend()

def plot_correlation_map( df ):
    """Render the correlation matrix of ``df`` as an annotated square heatmap.

    The matrix comes from ``df.corr()`` and is drawn on a new 12 x 10 figure
    with a diverging colour palette. Nothing is returned.
    """
    correlations = df.corr()
    _figure , axes = plt.subplots( figsize = ( 12 , 10 ) )
    heatmap_options = dict(
        cmap = sns.diverging_palette( 220 , 10 , as_cmap = True ) ,
        square = True ,
        cbar_kws = { 'shrink' : .9 } ,
        annot = True ,
        annot_kws = { 'fontsize' : 12 } ,
    )
    sns.heatmap( correlations , ax = axes , **heatmap_options )

def describe_more( df ):
    """Summarise every column of ``df`` by its number of distinct values and dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with the columns ``Variable`` (column
        label), ``Levels`` (count of distinct non-null values) and
        ``Datatype`` (the column's dtype), sorted by ``Levels`` ascending.
        The index keeps each column's original position in ``df``.
    """
    names = list( df )
    levels = pd.DataFrame( {
        'Variable' : names ,
        # Series.nunique() counts distinct non-null values, the same number as
        # len( pd.value_counts( ... ) ) without the deprecated top-level call.
        'Levels' : [ df[ name ].nunique() for name in names ] ,
        'Datatype' : [ df[ name ].dtypes for name in names ] ,
    } )
    return levels.sort_values( by = 'Levels' )

def plot_variable_importance( X , y ):
    """Fit a decision tree on ( X , y ) and plot its feature importances.

    The tree is created with ``random_state = 99`` and, once fitted, is passed
    with the same ``X`` and ``y`` to ``plot_model_var_imp``.
    """
    fitted_tree = DecisionTreeClassifier( random_state = 99 ).fit( X , y )
    plot_model_var_imp( fitted_tree , X , y )
    
def plot_model_var_imp( model , X , y ):
    """Plot the ten largest feature importances of a fitted model and print its score.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``feature_importances_`` and ``score( X , y )``.
    X : pandas.DataFrame
        Feature matrix; its column labels name the bars.
    y : array-like
        Target values passed to ``model.score``.
    """
    imp = pd.DataFrame( 
        model.feature_importances_  , 
        columns = [ 'Importance' ] , 
        index = X.columns 
    )
    imp = imp.sort_values( [ 'Importance' ] , ascending = True )
    # With an ascending sort the largest values are at the END of the frame.
    # Slicing the first ten rows plotted the ten least important features
    # whenever there were more than ten; tail( 10 ) keeps the ten most important.
    imp.tail( 10 ).plot( kind = 'barh' )
    print (model.score( X , y ))

`time_series_covid19_confirmed_global.csv`
- Latitude/longitude, time series confirmed cases
- Country code not in this file.
- Some country strings are not standard like:  "Korea, South"

`API_EN.POP.DNST_DS2_en_csv_v2_988966.csv`
- Population density for 2019. The 2020 information is not there.

`covid-19-tests-vs-cases-positivity-comparisons.csv`
- Tests that came out positive
- Missing a lot of numbers for how many tests were performed. We need to grab these numbers from `full-list-covid-19-tests-per-day.csv`


**Plan of action?**

- Are we planning to use the region/province data, or will we consolidate it per country?
- Should we combine all the fields we want into a single file?
- We need mapping of country codes to country names. Let’s pick one file that looks sane and use it. Then we have to map country names from `time_series_covid19_confirmed_global.csv` back to the correct code.
- We should normalize the start/end dates. We need to get the intersection of start/end dates for all files and only use that.

In [3]:
# Load the confirmed-cases table; no date parsing is requested for this file.
confirmed_df = pd.read_csv("datasets/time_series_covid19_confirmed_global.csv")
# For these two files we want to automatically convert dates from column 3 (index 2) to DateTime objects
tests_performed_df = pd.read_csv("datasets/full-list-covid-19-tests-per-day.csv", parse_dates=[2])
tests_positive_df = pd.read_csv("datasets/covid-19-tests-vs-cases-positivity-comparisons.csv", parse_dates=[2])
# Print each frame's shape as a quick check that the files loaded.
print("Confirmed shape: ", confirmed_df.shape)
print("Tests performed shape: ", tests_performed_df.shape)
print("Tests positive shape: ", tests_positive_df.shape)

Confirmed shape:  (264, 96)
Tests performed shape:  (2683, 4)
Tests positive shape:  (14537, 5)


In [4]:
# Basic sanity check: print a label, then display the first rows of the confirmed-cases frame.
print("CONFIRMED CASES:")
confirmed_df.head()

CONFIRMED CASES:


Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,4/13/20,4/14/20,4/15/20,4/16/20,4/17/20,4/18/20,4/19/20,4/20/20,4/21/20,4/22/20
0,,Afghanistan,33.0,65.0,0,0,0,0,0,0,...,665,714,784,840,906,933,996,1026,1092,1176
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,467,475,494,518,539,548,562,584,609,634
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,1983,2070,2160,2268,2418,2534,2629,2718,2811,2910
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,646,659,673,673,696,704,713,717,717,723
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,19,19,19,19,19,24,24,24,24,25


In [5]:
# Print a label, then display the first rows of the tests-performed frame.
print("TESTS PERFORMED:")
tests_performed_df.head()

TESTS PERFORMED:


Unnamed: 0,Entity,Code,Date,Daily change in total tests
0,Argentina,ARG,2020-04-09,1520
1,Argentina,ARG,2020-04-10,1529
2,Argentina,ARG,2020-04-11,1648
3,Argentina,ARG,2020-04-14,3047
4,Argentina,ARG,2020-04-15,1569


In [6]:
# Print a label, then display the first rows of the tests-positive frame.
print("TESTS POSITIVE:")
tests_positive_df.head()

TESTS POSITIVE:


Unnamed: 0,Entity,Code,Date,Total tests,Total confirmed cases of COVID-19 (cases)
0,Afghanistan,AFG,2019-12-31,,0.0
1,Afghanistan,AFG,2020-01-01,,0.0
2,Afghanistan,AFG,2020-01-02,,0.0
3,Afghanistan,AFG,2020-01-03,,0.0
4,Afghanistan,AFG,2020-01-04,,0.0


In [7]:
# Reshape the PERFORMED tests into a time series: one row per Entity, one column per Date
pivoted_performed_df = tests_performed_df.pivot(
    index="Entity",
    columns="Date",
    values=["Daily change in total tests"],
)
# Entity labels containing a comma are collected, reported, and dropped
unwanted_indexes = [entity for entity in pivoted_performed_df.index if "," in entity]
print("Unwanted PERFORMED indexes: ", unwanted_indexes)
filtered_performed_df = pivoted_performed_df.drop(index=unwanted_indexes)
filtered_performed_df.head()

Unwanted PERFORMED indexes:  ['India, people tested', 'Italy, people tested', 'Japan, tests performed', 'United Kingdom, tests performed', 'United States, specimens tested (CDC)']


Unnamed: 0_level_0,Daily change in total tests,Daily change in total tests,Daily change in total tests,Daily change in total tests,Daily change in total tests,Daily change in total tests,Daily change in total tests,Daily change in total tests,Daily change in total tests,Daily change in total tests,Daily change in total tests,Daily change in total tests,Daily change in total tests,Daily change in total tests,Daily change in total tests,Daily change in total tests,Daily change in total tests,Daily change in total tests,Daily change in total tests,Daily change in total tests,Daily change in total tests
Date,2020-01-18,2020-01-20,2020-01-21,2020-01-22,2020-01-23,2020-01-24,2020-01-25,2020-01-26,2020-01-27,2020-01-28,...,2020-04-13,2020-04-14,2020-04-15,2020-04-16,2020-04-17,2020-04-18,2020-04-19,2020-04-20,2020-04-21,2020-04-22
Entity,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Argentina,,,,,,,,,,,...,,3047.0,1569.0,2083.0,2193.0,2292.0,1770.0,1856.0,2043.0,2617.0
Australia,,,,,,,,,,,...,8195.0,4357.0,4884.0,8626.0,11527.0,14980.0,14486.0,10749.0,8097.0,12599.0
Austria,,,,,,,,,,,...,3535.0,3384.0,5005.0,6015.0,6456.0,6660.0,3311.0,3706.0,6069.0,12776.0
Bahrain,,,,,,,,,,,...,3354.0,3486.0,2459.0,3672.0,2668.0,2956.0,4164.0,3575.0,4073.0,3416.0
Bangladesh,,,,,,,,,,,...,1570.0,1905.0,1740.0,2135.0,2190.0,2114.0,2634.0,2663.0,2974.0,3096.0


In [8]:
# Reshape the POSITIVE tests into a time series: one row per Entity, one column per Date
pivoted_positive_df = tests_positive_df.pivot(
    index="Entity",
    columns="Date",
    values=["Total confirmed cases of COVID-19 (cases)"],
)
# Entity labels containing a comma are collected, reported, and dropped
unwanted_indexes = [entity for entity in pivoted_positive_df.index if "," in entity]
print("Unwanted POSITIVE indexes: ", unwanted_indexes)
filtered_positive_df = pivoted_positive_df.drop(index=unwanted_indexes)
filtered_positive_df.head()

Unwanted POSITIVE indexes:  ['India, people tested', 'Italy, people tested', 'Japan, tests performed', 'Singapore, swabs tested', 'United Kingdom, tests performed', 'United States, specimens tested (CDC)', 'World excl. China, South Korea, Japan and Singapore']


Unnamed: 0_level_0,Total confirmed cases of COVID-19 (cases),Total confirmed cases of COVID-19 (cases),Total confirmed cases of COVID-19 (cases),Total confirmed cases of COVID-19 (cases),Total confirmed cases of COVID-19 (cases),Total confirmed cases of COVID-19 (cases),Total confirmed cases of COVID-19 (cases),Total confirmed cases of COVID-19 (cases),Total confirmed cases of COVID-19 (cases),Total confirmed cases of COVID-19 (cases),Total confirmed cases of COVID-19 (cases),Total confirmed cases of COVID-19 (cases),Total confirmed cases of COVID-19 (cases),Total confirmed cases of COVID-19 (cases),Total confirmed cases of COVID-19 (cases),Total confirmed cases of COVID-19 (cases),Total confirmed cases of COVID-19 (cases),Total confirmed cases of COVID-19 (cases),Total confirmed cases of COVID-19 (cases),Total confirmed cases of COVID-19 (cases),Total confirmed cases of COVID-19 (cases)
Date,2019-12-31,2020-01-01,2020-01-02,2020-01-03,2020-01-04,2020-01-05,2020-01-06,2020-01-07,2020-01-08,2020-01-09,...,2020-04-13,2020-04-14,2020-04-15,2020-04-16,2020-04-17,2020-04-18,2020-04-19,2020-04-20,2020-04-21,2020-04-22
Entity,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Afghanistan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,607.0,665.0,714.0,784.0,794.0,845.0,908.0,996.0,1031.0,1092.0
Africa,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,14522.0,15291.0,16281.0,17243.0,18329.0,19897.0,21057.0,22303.0,23267.0,24617.0
Albania,,,,,,,,,,,...,446.0,467.0,475.0,494.0,518.0,539.0,548.0,562.0,584.0,609.0
Algeria,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1914.0,1983.0,2070.0,2160.0,2268.0,2418.0,2535.0,2629.0,2718.0,2811.0
Andorra,,,,,,,,,,,...,638.0,646.0,659.0,673.0,682.0,696.0,704.0,713.0,717.0,717.0


In [9]:
# Collect the country names that contain "," or "*" (non-standard spellings)
countries = confirmed_df["Country/Region"]
bad_countries = [name for name in countries if "," in name or "*" in name]
print("Bad country strings: ", bad_countries)

# Through manual inspection, we determined that we need to properly change these inconsistent values
fix_dict = {
    "Korea, South": "South Korea",
    "Taiwan*": "Taiwan",
}
print("Bad country fix dict: ", fix_dict)

# Fix the values.
# The column label in .loc is essential: with a row mask alone, the replacement
# string is written into EVERY column of the matching rows, overwriting Lat, Long
# and all of the daily case counts for those countries.
for k, v in fix_dict.items():
    confirmed_df.loc[confirmed_df["Country/Region"] == k, "Country/Region"] = v

# Make sure the bad strings are no longer there
countries = confirmed_df["Country/Region"]
bad_countries = [name for name in countries if "," in name or "*" in name]
print("Remaining bad country strings: ", bad_countries)

# Inspect that the strings are expected
countries[143], countries[207]

Bad country strings:  ['Korea, South', 'Taiwan*']
Bad country fix dict:  {'Korea, South': 'South Korea', 'Taiwan*': 'Taiwan'}
Remaining bad country strings:  []


('South Korea', 'Taiwan')