In [1]:
'''
https://www.kaggle.com/helgejo/an-interactive-data-science-tutorial
'''

import warnings
warnings.filterwarnings('ignore')

# Handle table-like data and matrices
import numpy as np
import pandas as pd

# Modelling Algorithms
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier , GradientBoostingClassifier

# Modelling Helpers
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import Normalizer , scale
from sklearn.model_selection import train_test_split , StratifiedKFold
from sklearn.feature_selection import RFECV

# Visualisation
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns

# Configure visualisations
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Select matplotlib's 'ggplot' style sheet.
mpl.style.use( 'ggplot' )
# Select seaborn's 'white' style.
sns.set_style( 'white' )
# Default figure size as a ( width , height ) tuple.
pylab.rcParams[ 'figure.figsize' ] = 8 , 6

In [2]:
def plot_histograms( df , variables , n_rows , n_cols ):
    """Draw one 10-bin histogram per variable on an n_rows x n_cols grid.

    Every subplot is titled with the skewness of its column (two decimals)
    and has its tick labels hidden. The figure is shown via ``plt.show()``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to plot.
    variables : iterable
        Column labels of ``df``; subplot ``i + 1`` receives the i-th label.
    n_rows , n_cols : int
        Grid shape forwarded to ``fig.add_subplot``. Presumably the grid must
        have at least as many cells as there are variables -- TODO confirm.
    """
    fig = plt.figure( figsize = ( 16 , 12 ) )
    for i, var_name in enumerate( variables ):
        ax = fig.add_subplot( n_rows , n_cols , i+1 )
        df[ var_name ].hist( bins=10 , ax=ax )
        # round() without ndigits returned an int, so every title read like
        # 'Skew: 0'; keep two decimals so the value is actually informative.
        skew = round( float( df[ var_name ].skew() ) , 2 )
        ax.set_title( 'Skew: ' + str( skew ) )
        ax.set_xticklabels( [] , visible=False )
        ax.set_yticklabels( [] , visible=False )
    fig.tight_layout()  # Improves appearance a bit.
    plt.show()

def plot_distribution( df , var , target , **kwargs ):
    """Plot shaded KDE curves of ``var``, coloured by ``target``.

    Only the optional keywords ``row`` and ``col`` are read from ``kwargs``;
    both default to None and are handed to ``sns.FacetGrid``. The x axis is
    limited to the range from 0 to the maximum of ``df[ var ]``.
    """
    grid = sns.FacetGrid(
        df ,
        hue = target ,
        aspect = 4 ,
        row = kwargs.get( 'row' ) ,
        col = kwargs.get( 'col' ) ,
    )
    grid.map( sns.kdeplot , var , shade = True )
    upper_limit = df[ var ].max()
    grid.set( xlim = ( 0 , upper_limit ) )
    grid.add_legend()

def plot_categories( df , cat , target , **kwargs ):
    """Draw bar plots of ``target`` against the categorical column ``cat``.

    Only the optional keywords ``row`` and ``col`` are read from ``kwargs``;
    both default to None and are handed to ``sns.FacetGrid``.
    """
    grid = sns.FacetGrid(
        df ,
        row = kwargs.get( 'row' ) ,
        col = kwargs.get( 'col' ) ,
    )
    grid.map( sns.barplot , cat , target )
    grid.add_legend()

def plot_correlation_map( df ):
    """Draw an annotated heatmap of the pairwise correlations in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. Only numeric and boolean columns take part in the
        correlation; all other columns are left out.
    """
    # Older pandas dropped non-numeric columns inside corr() silently, while
    # pandas >= 2.0 raises on them. Selecting the columns explicitly keeps the
    # helper working on mixed frames with either version.
    numeric_df = df.select_dtypes( include = [ 'number' , 'bool' ] )
    corr = numeric_df.corr()
    _ , ax = plt.subplots( figsize =( 12 , 10 ) )
    cmap = sns.diverging_palette( 220 , 10 , as_cmap = True )
    _ = sns.heatmap(
        corr,
        cmap = cmap,
        square=True,
        cbar_kws={ 'shrink' : .9 },
        ax=ax,
        annot = True,
        annot_kws = { 'fontsize' : 12 }
    )

def describe_more( df ):
    """Summarise each column of ``df`` by distinct-value count and dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with the fields 'Variable' (column
        label), 'Levels' (number of distinct non-missing values) and
        'Datatype' (column dtype), sorted by 'Levels' in ascending order.
    """
    names = list( df.columns )
    levels = pd.DataFrame( {
        'Variable' : names ,
        # nunique() skips missing values, like the value_counts() call it
        # replaces; the top-level pd.value_counts helper is deprecated.
        'Levels' : [ df[ name ].nunique() for name in names ] ,
        'Datatype' : [ df[ name ].dtype for name in names ] ,
    } )
    # Return the sorted copy instead of sorting in place.
    return levels.sort_values( by = 'Levels' )

def plot_variable_importance( X , y ):
    """Fit a decision tree on ( X , y ) and plot its feature importances.

    The tree uses a fixed ``random_state`` of 99; plotting and score printing
    are delegated to ``plot_model_var_imp``.
    """
    fitted_tree = DecisionTreeClassifier( random_state = 99 ).fit( X , y )
    plot_model_var_imp( fitted_tree , X , y )
    
def plot_model_var_imp( model , X , y , top_n = 10 ):
    """Plot the most important features of a fitted model and print its score.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``feature_importances_`` and ``score``.
    X : pandas.DataFrame
        Feature frame; its column labels name the importances.
    y : array-like
        Target values forwarded to ``model.score``.
    top_n : int, default 10
        Number of highest-importance features to draw.
    """
    imp = pd.DataFrame(
        model.feature_importances_ ,
        columns = [ 'Importance' ] ,
        index = X.columns
    )
    imp = imp.sort_values( [ 'Importance' ] , ascending = True )
    # The frame is sorted ascending, so the largest importances sit at the
    # end. Slicing the head ( imp[ : 10 ] ) plotted the *least* important
    # features whenever there were more than ten; take the tail instead.
    imp.tail( top_n ).plot( kind = 'barh' )
    print (model.score( X , y ))

`time_series_covid19_confirmed_global.csv`
- Latitude/longitude, time series confirmed cases
- Country code not in this file.
- Some country strings are not standard, e.g. "Korea, South".

`API_EN.POP.DNST_DS2_en_csv_v2_988966.csv`
- Population density for 2019. The 2020 information is not there.

`covid-19-tests-vs-cases-positivity-comparisons.csv`
- Tests that came out positive
- Missing a lot of numbers for how many tests were performed. We need to grab these numbers from `full-list-covid-19-tests-per-day.csv`


**Plan of action?**

- Are we planning to use the regions/province data, or will we consolidate it into one country?
- Should we combine all the fields we want into a single file?
- We need a mapping of country codes to country names. Let’s pick one file that looks sane and use it. Then we have to map the country names from `time_series_covid19_confirmed_global.csv` back to the correct codes.
- We should normalize the start/end dates. We need to get the intersection of start/end dates for all files and only use that.

In [3]:
# Read the three raw CSV files (paths relative to the notebook) in one pass.
confirmed_df , tests_performed_df , tests_positive_df = map(
    pd.read_csv ,
    (
        "datasets/time_series_covid19_confirmed_global.csv" ,
        "datasets/full-list-covid-19-tests-per-day.csv" ,
        "datasets/covid-19-tests-vs-cases-positivity-comparisons.csv" ,
    ) ,
)

# Report the (rows, columns) shape of each frame as a quick load check.
print("Tests performed shape: ", tests_performed_df.shape)
print("Tests positive shaped: ", tests_positive_df.shape)
print("Confirmed shape: ", confirmed_df.shape)

Tests performed shape:  (2683, 4)
Tests positive shaped:  (14537, 5)
Confirmed shape:  (264, 96)


In [4]:
# Parse the 'Date' column of both testing frames into pandas datetimes.
for testing_df in ( tests_performed_df , tests_positive_df ):
    testing_df['Date'] = pd.to_datetime(testing_df['Date'])

NameError: name 'tests_performed' is not defined

In [None]:
# Basic sanity check: print each loaded frame under a label so the three
# data sets can be eyeballed side by side.
print("CONFIRMED CASES:\n", confirmed_df)
print("TESTS PERFORMED:\n", tests_performed_df)
# Label previously read "TESST POSITIVE" (typo).
print("TESTS POSITIVE:\n", tests_positive_df)

In [None]:
# Reshape PERFORMED tests into a time series: one row per Entity, one column per Date.
pivoted_performed_df = tests_performed_df.pivot(
    index="Entity",
    columns="Date",
    values=["Daily change in total tests"],
)

# Entities whose label contains a comma are dropped from the time series.
unwanted_indexes = [entity for entity in pivoted_performed_df.index if "," in entity]
print("Unwanted PERFORMED indexes: ", unwanted_indexes)

filtered_performed_df = pivoted_performed_df.drop(unwanted_indexes)
filtered_performed_df.head()

In [None]:
# Reshape POSITIVE tests into a time series: one row per Entity, one column per Date.
pivoted_positive_df = tests_positive_df.pivot(
    index="Entity",
    columns="Date",
    values=["Total confirmed cases of COVID-19 (cases)"],
)

# Entities whose label contains a comma are dropped from the time series.
unwanted_indexes = [entity for entity in pivoted_positive_df.index if "," in entity]
print("Unwanted POSITIVE indexes: ", unwanted_indexes)

filtered_positive_df = pivoted_positive_df.drop(unwanted_indexes)
filtered_positive_df.head()

In [None]:
# Collect the country names that contain a comma or an asterisk.
countries = confirmed_df["Country/Region"]
bad_countries = [name for name in countries if "," in name or "*" in name]
print("Bad countries: ", bad_countries)
# TODO: Have to convert the bad country to the right one.