In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import random
import math
import time
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor

import datetime
import operator
%matplotlib inline

In [2]:
# Project Topic/Title: Predicting Covid-19 cases for healthcare systems globally.

# Problem Statement: Covid-19 is a global pandemic that has caused great stress and 
#                    impacted the world in various ways. Most notably, it has placed 
#                    great stress on the healthcare systems worldwide.

# Motivation: There is a need to predict cases as accurately as possible 
#             so that the healthcare systems can better accommodate the 
#             necessary resources for treatment, enabling them to better cope.


In [3]:
confirmed_cases = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv')
death_cases = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv')
recovered_cases = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv')

In [4]:
confirmed_cases.head()


Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,9/9/21,9/10/21,9/11/21,9/12/21,9/13/21,9/14/21,9/15/21,9/16/21,9/17/21,9/18/21
0,,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,...,153840,153962,153982,153990,154094,154180,154283,154361,154487,154487
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,154316,155293,156162,157026,157436,158431,159423,160365,161324,162173
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,199275,199560,199822,200068,200301,200528,200770,200989,201224,201425
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,15078,15083,15083,15083,15096,15099,15108,15113,15124,15124
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,49628,49943,50348,50446,50738,51047,51407,51827,52208,52307


In [5]:
death_cases.head()


Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,9/9/21,9/10/21,9/11/21,9/12/21,9/13/21,9/14/21,9/15/21,9/16/21,9/17/21,9/18/21
0,,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,...,7157,7164,7167,7167,7169,7171,7174,7183,7186,7186
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,2531,2535,2539,2543,2548,2553,2557,2563,2569,2574
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,5519,5539,5558,5578,5596,5614,5630,5651,5670,5681
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,130,130,130,130,130,130,130,130,130,130
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,1313,1322,1327,1339,1345,1358,1360,1371,1378,1388


In [6]:
recovered_cases.head()


Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,9/9/21,9/10/21,9/11/21,9/12/21,9/13/21,9/14/21,9/15/21,9/16/21,9/17/21,9/18/21
0,,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
cols = confirmed_cases.keys()
cols

Index(['Province/State', 'Country/Region', 'Lat', 'Long', '1/22/20', '1/23/20',
       '1/24/20', '1/25/20', '1/26/20', '1/27/20',
       ...
       '9/9/21', '9/10/21', '9/11/21', '9/12/21', '9/13/21', '9/14/21',
       '9/15/21', '9/16/21', '9/17/21', '9/18/21'],
      dtype='object', length=610)

In [8]:
confirmed_filtered = confirmed_cases.loc[:, cols[4]:cols[-1]]
confirmed_filtered.head()

Unnamed: 0,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,1/30/20,1/31/20,...,9/9/21,9/10/21,9/11/21,9/12/21,9/13/21,9/14/21,9/15/21,9/16/21,9/17/21,9/18/21
0,0,0,0,0,0,0,0,0,0,0,...,153840,153962,153982,153990,154094,154180,154283,154361,154487,154487
1,0,0,0,0,0,0,0,0,0,0,...,154316,155293,156162,157026,157436,158431,159423,160365,161324,162173
2,0,0,0,0,0,0,0,0,0,0,...,199275,199560,199822,200068,200301,200528,200770,200989,201224,201425
3,0,0,0,0,0,0,0,0,0,0,...,15078,15083,15083,15083,15096,15099,15108,15113,15124,15124
4,0,0,0,0,0,0,0,0,0,0,...,49628,49943,50348,50446,50738,51047,51407,51827,52208,52307


In [9]:
singapore_confirmed_filtered = confirmed_cases[confirmed_cases["Country/Region"] == "Singapore"].loc[:, cols[4]:cols[-1]]
singapore_confirmed_filtered

Unnamed: 0,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,1/30/20,1/31/20,...,9/9/21,9/10/21,9/11/21,9/12/21,9/13/21,9/14/21,9/15/21,9/16/21,9/17/21,9/18/21
230,0,1,3,3,4,5,7,7,10,13,...,70039,70612,71167,71687,72294,73131,73938,74848,75783,76792


In [10]:
deaths_filtered = death_cases.loc[:, cols[4]:cols[-1]]
deaths_filtered.head()

Unnamed: 0,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,1/30/20,1/31/20,...,9/9/21,9/10/21,9/11/21,9/12/21,9/13/21,9/14/21,9/15/21,9/16/21,9/17/21,9/18/21
0,0,0,0,0,0,0,0,0,0,0,...,7157,7164,7167,7167,7169,7171,7174,7183,7186,7186
1,0,0,0,0,0,0,0,0,0,0,...,2531,2535,2539,2543,2548,2553,2557,2563,2569,2574
2,0,0,0,0,0,0,0,0,0,0,...,5519,5539,5558,5578,5596,5614,5630,5651,5670,5681
3,0,0,0,0,0,0,0,0,0,0,...,130,130,130,130,130,130,130,130,130,130
4,0,0,0,0,0,0,0,0,0,0,...,1313,1322,1327,1339,1345,1358,1360,1371,1378,1388


In [11]:
recovered_filtered = recovered_cases.loc[:, cols[4]:cols[-1]]
recovered_filtered.head()

Unnamed: 0,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,1/30/20,1/31/20,...,9/9/21,9/10/21,9/11/21,9/12/21,9/13/21,9/14/21,9/15/21,9/16/21,9/17/21,9/18/21
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:

dates = confirmed_filtered.keys()
world_cases = []
singapore_cases = []
total_deaths = [] 
mortality_rate = []
total_recovered = [] 

for i in dates:
    confirmed_sum = confirmed_filtered[i].sum()
    singapore_confirmed_sum = singapore_confirmed_filtered[i].sum()
    death_sum = deaths_filtered[i].sum()
    recovered_sum = recovered_filtered[i].sum()
    world_cases.append(confirmed_sum)
    singapore_cases.append(singapore_confirmed_sum)
    total_deaths.append(death_sum)
    mortality_rate.append(death_sum/confirmed_sum)
    total_recovered.append(recovered_sum)

In [13]:
confirmed_sum, death_sum, recovered_sum, world_cases, singapore_cases

(228182335,
 4685838,
 0,
 [557,
  655,
  941,
  1433,
  2118,
  2927,
  5578,
  6167,
  8235,
  9927,
  12038,
  16787,
  19887,
  23898,
  27643,
  30803,
  34396,
  37130,
  40160,
  42769,
  44811,
  45229,
  60382,
  66909,
  69052,
  71235,
  73270,
  75152,
  75652,
  76212,
  76841,
  78602,
  78982,
  79550,
  80404,
  81381,
  82740,
  84126,
  86018,
  88400,
  90379,
  92980,
  95282,
  98095,
  102016,
  106113,
  110051,
  114232,
  119056,
  126719,
  132507,
  146879,
  157983,
  169249,
  184042,
  200033,
  219607,
  246706,
  277580,
  309731,
  344829,
  387512,
  428581,
  479654,
  542593,
  607576,
  677332,
  734030,
  799349,
  876287,
  959294,
  1042432,
  1126245,
  1185380,
  1256206,
  1330154,
  1399798,
  1483095,
  1570086,
  1655401,
  1729777,
  1849402,
  1920435,
  2004305,
  2082514,
  2178207,
  2266243,
  2343442,
  2420188,
  2495745,
  2571756,
  2653241,
  2737238,
  2821356,
  2903704,
  2975173,
  3045420,
  3121234,
  3198423,
  3281710,
  

In [14]:

days_since_1_22 = np.array([i for i in range(len(dates))]).reshape(-1, 1)
world_cases = np.array(world_cases).reshape(-1, 1)
singapore_cases = np.array(singapore_cases).reshape(-1, 1)
total_deaths = np.array(total_deaths).reshape(-1, 1)
total_recovered = np.array(total_recovered).reshape(-1, 1)

days_since_1_22

array([[  0],
       [  1],
       [  2],
       [  3],
       [  4],
       [  5],
       [  6],
       [  7],
       [  8],
       [  9],
       [ 10],
       [ 11],
       [ 12],
       [ 13],
       [ 14],
       [ 15],
       [ 16],
       [ 17],
       [ 18],
       [ 19],
       [ 20],
       [ 21],
       [ 22],
       [ 23],
       [ 24],
       [ 25],
       [ 26],
       [ 27],
       [ 28],
       [ 29],
       [ 30],
       [ 31],
       [ 32],
       [ 33],
       [ 34],
       [ 35],
       [ 36],
       [ 37],
       [ 38],
       [ 39],
       [ 40],
       [ 41],
       [ 42],
       [ 43],
       [ 44],
       [ 45],
       [ 46],
       [ 47],
       [ 48],
       [ 49],
       [ 50],
       [ 51],
       [ 52],
       [ 53],
       [ 54],
       [ 55],
       [ 56],
       [ 57],
       [ 58],
       [ 59],
       [ 60],
       [ 61],
       [ 62],
       [ 63],
       [ 64],
       [ 65],
       [ 66],
       [ 67],
       [ 68],
       [ 69],
       [ 70],
      

In [15]:
world_cases

array([[      557],
       [      655],
       [      941],
       [     1433],
       [     2118],
       [     2927],
       [     5578],
       [     6167],
       [     8235],
       [     9927],
       [    12038],
       [    16787],
       [    19887],
       [    23898],
       [    27643],
       [    30803],
       [    34396],
       [    37130],
       [    40160],
       [    42769],
       [    44811],
       [    45229],
       [    60382],
       [    66909],
       [    69052],
       [    71235],
       [    73270],
       [    75152],
       [    75652],
       [    76212],
       [    76841],
       [    78602],
       [    78982],
       [    79550],
       [    80404],
       [    81381],
       [    82740],
       [    84126],
       [    86018],
       [    88400],
       [    90379],
       [    92980],
       [    95282],
       [    98095],
       [   102016],
       [   106113],
       [   110051],
       [   114232],
       [   119056],
       [   126719],


In [16]:
days_in_future = 10
future_forecast = np.array([i for i in range(len(dates)+days_in_future)]).reshape(-1, 1)
adjusted_dates = future_forecast[:-10]

future_forecast

array([[  0],
       [  1],
       [  2],
       [  3],
       [  4],
       [  5],
       [  6],
       [  7],
       [  8],
       [  9],
       [ 10],
       [ 11],
       [ 12],
       [ 13],
       [ 14],
       [ 15],
       [ 16],
       [ 17],
       [ 18],
       [ 19],
       [ 20],
       [ 21],
       [ 22],
       [ 23],
       [ 24],
       [ 25],
       [ 26],
       [ 27],
       [ 28],
       [ 29],
       [ 30],
       [ 31],
       [ 32],
       [ 33],
       [ 34],
       [ 35],
       [ 36],
       [ 37],
       [ 38],
       [ 39],
       [ 40],
       [ 41],
       [ 42],
       [ 43],
       [ 44],
       [ 45],
       [ 46],
       [ 47],
       [ 48],
       [ 49],
       [ 50],
       [ 51],
       [ 52],
       [ 53],
       [ 54],
       [ 55],
       [ 56],
       [ 57],
       [ 58],
       [ 59],
       [ 60],
       [ 61],
       [ 62],
       [ 63],
       [ 64],
       [ 65],
       [ 66],
       [ 67],
       [ 68],
       [ 69],
       [ 70],
      

In [17]:
# Convert all the integers into datetime for better visualization

start = '1/22/2020'
start_date = datetime.datetime.strptime(start, '%m/%d/%Y')
future_forecast_dates = []
for i in range(len(future_forecast)):
    future_forecast_dates.append((start_date + datetime.timedelta(days=i)).strftime('%m/%d/%Y'))

In [18]:
latest_confirmed = confirmed_filtered[dates[-1]]
latest_deaths = deaths_filtered[dates[-1]]
latest_recoveries = recovered_filtered[dates[-1]]

latest_confirmed, latest_deaths, latest_recoveries

(0      154487
 1      162173
 2      201425
 3       15124
 4       52307
         ...  
 274    677023
 275    382584
 276      8630
 277    208422
 278    127739
 Name: 9/18/21, Length: 279, dtype: int64,
 0       7186
 1       2574
 2       5681
 3        130
 4       1388
        ...  
 274    16857
 275     3909
 276     1638
 277     3638
 278     4563
 Name: 9/18/21, Length: 279, dtype: int64,
 0      0
 1      0
 2      0
 3      0
 4      0
       ..
 259    0
 260    0
 261    0
 262    0
 263    0
 Name: 9/18/21, Length: 264, dtype: int64)

In [19]:
unique_countries = list(confirmed_cases['Country/Region'].unique())
unique_countries

['Afghanistan',
 'Albania',
 'Algeria',
 'Andorra',
 'Angola',
 'Antigua and Barbuda',
 'Argentina',
 'Armenia',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahamas',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belize',
 'Benin',
 'Bhutan',
 'Bolivia',
 'Bosnia and Herzegovina',
 'Botswana',
 'Brazil',
 'Brunei',
 'Bulgaria',
 'Burkina Faso',
 'Burma',
 'Burundi',
 'Cabo Verde',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Central African Republic',
 'Chad',
 'Chile',
 'China',
 'Colombia',
 'Comoros',
 'Congo (Brazzaville)',
 'Congo (Kinshasa)',
 'Costa Rica',
 "Cote d'Ivoire",
 'Croatia',
 'Cuba',
 'Cyprus',
 'Czechia',
 'Denmark',
 'Diamond Princess',
 'Djibouti',
 'Dominica',
 'Dominican Republic',
 'Ecuador',
 'Egypt',
 'El Salvador',
 'Equatorial Guinea',
 'Eritrea',
 'Estonia',
 'Eswatini',
 'Ethiopia',
 'Fiji',
 'Finland',
 'France',
 'Gabon',
 'Gambia',
 'Georgia',
 'Germany',
 'Ghana',
 'Greece',
 'Grenada',
 'Guatemala',
 'Guinea',
 'Guinea-Bissau',
 'Guyana',

In [20]:
start = '1/22/2020'
start_date = datetime.datetime.strptime(start, '%m/%d/%Y')
future_forecast_dates = []
for i in range(len(future_forecast)):
    future_forecast_dates.append((start_date + datetime.timedelta(days=i)).strftime('%m/%d/%Y'))

**Random Forest**

In [23]:
def model_and_accuracy(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 101)

    #put a trail loop inside 
    for i in range(4):
        
        n_estimators = 5 ** i

        rf_model = RandomForestRegressor(
            n_estimators = n_estimators,
            bootstrap = True,
            n_jobs = -1
        )

        rf_model.fit(X_train, y_train)
        y_pred = rf_model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y_test,y_pred) 
        rsq = r2_score(y_test,y_pred)
        print(f'Estimators: {n_estimators}')
        print("Prediction Score:", rf_model.score(X_test, y_test))
        print('Mean Squared Error : ',mse)
        print('Root Mean Squared Error : ',rmse)
        print('Mean Absolute Error : ',mae)
        print('R Square:', rsq)
        print('--------------------------------------')
        print()

def get_most_important_features(X, y, n_estimators):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state =101)

    rf_model = RandomForestRegressor(
        n_estimators = n_estimators,
        bootstrap = True,
        n_jobs = -1
    )

    rf_model.fit(X_train, y_train)
    y_pred = rf_model.predict(X_test)

    #plt.plot(pd.DataFrame(rf_model.feature_importances_, index=X.columns, columns=['score']).sort_values(by='score', ascending=False))

In [24]:
# X = cfm_year_df.drop(columns=['y_year'])
# y = cfm_year_df['y_year']

model_and_accuracy(days_since_1_22, world_cases)

  rf_model.fit(X_train, y_train)
  rf_model.fit(X_train, y_train)
  rf_model.fit(X_train, y_train)


Estimators: 1
Prediction Score: 0.9999441865983104
Mean Squared Error :  280641927878.9426
Root Mean Squared Error :  529756.4797894808
Mean Absolute Error :  412251.84426229505
R Square: 0.9999441865983104
--------------------------------------

Estimators: 5
Prediction Score: 0.9999704103653001
Mean Squared Error :  148783121545.1022
Root Mean Squared Error :  385724.15214127075
Mean Absolute Error :  253829.6786885244
R Square: 0.9999704103653001
--------------------------------------

Estimators: 25
Prediction Score: 0.9999807963644902
Mean Squared Error :  96560057774.6637
Root Mean Squared Error :  310741.142713133
Mean Absolute Error :  208833.35278688517
R Square: 0.9999807963644902
--------------------------------------



  rf_model.fit(X_train, y_train)


Estimators: 125
Prediction Score: 0.999982136546287
Mean Squared Error :  89821332096.39029
Root Mean Squared Error :  299702.07222571934
Mean Absolute Error :  209690.0956721317
R Square: 0.999982136546287
--------------------------------------

