In [3]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

from datetime import datetime

# from sklearn import preprocessing
# from sklearn.model_selection import KFold
# import lightgbm as lgb
# import xgboost as xgb
# import catboost as cb

import warnings
warnings.filterwarnings('ignore')

In [4]:
confirmed_cases = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv')
death_cases = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv')
recovered_cases = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv')

In [5]:
def convert_date_str(df):
    try:
        df.columns = list(df.columns[:4]) + [datetime.strptime(d, "%m/%d/%y").date().strftime("%Y-%m-%d") for d in df.columns[4:]]
    except:
        print('_convert_date_str failed with %y, try %Y')
        df.columns = list(df.columns[:4]) + [datetime.strptime(d, "%m/%d/%Y").date().strftime("%Y-%m-%d") for d in df.columns[4:]]

convert_date_str(confirmed_cases)
convert_date_str(death_cases)
convert_date_str(recovered_cases)

In [6]:
confirmed_cases.rename(columns={"Province/State": "Province_State", "Country/Region": "Country_Region"}, inplace=True)
death_cases.rename(columns={"Province/State": "Province_State", "Country/Region": "Country_Region"}, inplace=True)
recovered_cases.rename(columns={"Province/State": "Province_State", "Country/Region": "Country_Region"}, inplace=True)

In [7]:
confirmed_cases.to_csv("time_series_covid19_confirmed_global.csv")
death_cases.to_csv("time_series_covid19_deaths_global.csv")
recovered_cases.to_csv("time_series_covid19_recovered_global.csv")

In [8]:
confirmed_cases.head()

Unnamed: 0,Province_State,Country_Region,Lat,Long,2020-01-22,2020-01-23,2020-01-24,2020-01-25,2020-01-26,2020-01-27,...,2021-09-08,2021-09-09,2021-09-10,2021-09-11,2021-09-12,2021-09-13,2021-09-14,2021-09-15,2021-09-16,2021-09-17
0,,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,...,153736,153840,153962,153982,153990,154094,154180,154283,154361,154487
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,153318,154316,155293,156162,157026,157436,158431,159423,160365,161324
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,198962,199275,199560,199822,200068,200301,200528,200770,200989,201224
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,15070,15078,15083,15083,15083,15096,15099,15108,15113,15124
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,49349,49628,49943,50348,50446,50738,51047,51407,51827,52208


In [9]:
death_cases.head()

Unnamed: 0,Province_State,Country_Region,Lat,Long,2020-01-22,2020-01-23,2020-01-24,2020-01-25,2020-01-26,2020-01-27,...,2021-09-08,2021-09-09,2021-09-10,2021-09-11,2021-09-12,2021-09-13,2021-09-14,2021-09-15,2021-09-16,2021-09-17
0,,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,...,7151,7157,7164,7167,7167,7169,7171,7174,7183,7186
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,2528,2531,2535,2539,2543,2548,2553,2557,2563,2569
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,5489,5519,5539,5558,5578,5596,5614,5630,5651,5670
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,130,130,130,130,130,130,130,130,130,130
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,1309,1313,1322,1327,1339,1345,1358,1360,1371,1378


In [10]:
recovered_cases.head()

Unnamed: 0,Province_State,Country_Region,Lat,Long,2020-01-22,2020-01-23,2020-01-24,2020-01-25,2020-01-26,2020-01-27,...,2021-09-08,2021-09-09,2021-09-10,2021-09-11,2021-09-12,2021-09-13,2021-09-14,2021-09-15,2021-09-16,2021-09-17
0,,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
confirmed_cases_melt = confirmed_cases.melt(
    id_vars=['Country_Region', 'Province_State', 'Lat', 'Long'], value_vars=confirmed_cases.columns[4:], var_name='Date', value_name='ConfirmedCases')
death_cases_melt = death_cases.melt(
    id_vars=['Country_Region', 'Province_State', 'Lat', 'Long'], value_vars=confirmed_cases.columns[4:], var_name='Date', value_name='Deaths')
recovered_cases_melt = death_cases.melt(
    id_vars=['Country_Region', 'Province_State', 'Lat', 'Long'], value_vars=confirmed_cases.columns[4:], var_name='Date', value_name='Recovered')

In [12]:
confirmed_cases_melt

Unnamed: 0,Country_Region,Province_State,Lat,Long,Date,ConfirmedCases
0,Afghanistan,,33.939110,67.709953,2020-01-22,0
1,Albania,,41.153300,20.168300,2020-01-22,0
2,Algeria,,28.033900,1.659600,2020-01-22,0
3,Andorra,,42.506300,1.521800,2020-01-22,0
4,Angola,,-11.202700,17.873900,2020-01-22,0
...,...,...,...,...,...,...
168790,Vietnam,,14.058324,108.277199,2021-09-17,667650
168791,West Bank and Gaza,,31.952200,35.233200,2021-09-17,381854
168792,Yemen,,15.552727,48.516388,2021-09-17,8593
168793,Zambia,,-13.133897,27.849332,2021-09-17,208353


In [13]:
death_cases_melt

Unnamed: 0,Country_Region,Province_State,Lat,Long,Date,Deaths
0,Afghanistan,,33.939110,67.709953,2020-01-22,0
1,Albania,,41.153300,20.168300,2020-01-22,0
2,Algeria,,28.033900,1.659600,2020-01-22,0
3,Andorra,,42.506300,1.521800,2020-01-22,0
4,Angola,,-11.202700,17.873900,2020-01-22,0
...,...,...,...,...,...,...
168790,Vietnam,,14.058324,108.277199,2021-09-17,16637
168791,West Bank and Gaza,,31.952200,35.233200,2021-09-17,3890
168792,Yemen,,15.552727,48.516388,2021-09-17,1628
168793,Zambia,,-13.133897,27.849332,2021-09-17,3637


In [14]:
merge_df = confirmed_cases_melt.merge(death_cases_melt, on=['Country_Region', 'Province_State', 'Lat', 'Long', 'Date'])
merge_df = merge_df.merge(recovered_cases_melt, on=['Country_Region', 'Province_State', 'Lat', 'Long', 'Date'])

In [15]:
merge_df.tail()

Unnamed: 0,Country_Region,Province_State,Lat,Long,Date,ConfirmedCases,Deaths,Recovered
168790,Vietnam,,14.058324,108.277199,2021-09-17,667650,16637,16637
168791,West Bank and Gaza,,31.9522,35.2332,2021-09-17,381854,3890,3890
168792,Yemen,,15.552727,48.516388,2021-09-17,8593,1628,1628
168793,Zambia,,-13.133897,27.849332,2021-09-17,208353,3637,3637
168794,Zimbabwe,,-19.015438,29.154857,2021-09-17,127632,4562,4562


In [16]:
global_df = merge_df.groupby('Date')[['ConfirmedCases', 'Deaths']].sum().reset_index()
global_df['new_case'] = global_df['ConfirmedCases'] - global_df['ConfirmedCases'].shift(1)
global_df['growth_factor'] = global_df['new_case'] / global_df['new_case'].shift(1)
global_df.tail()

Unnamed: 0,Date,ConfirmedCases,Deaths,new_case,growth_factor
600,2021-09-13,225366201,4639619,609291.0,1.674405
601,2021-09-14,225915368,4649482,549167.0,0.901321
602,2021-09-15,226478640,4659842,563272.0,1.025684
603,2021-09-16,227056250,4670256,577610.0,1.025455
604,2021-09-17,227649349,4679137,593099.0,1.026816


In [17]:
global_melt_df = pd.melt(global_df, id_vars=['Date'], value_vars=['ConfirmedCases', 'Deaths', 'new_case'])
global_melt_df.head()

Unnamed: 0,Date,variable,value
0,2020-01-22,ConfirmedCases,557.0
1,2020-01-23,ConfirmedCases,655.0
2,2020-01-24,ConfirmedCases,941.0
3,2020-01-25,ConfirmedCases,1433.0
4,2020-01-26,ConfirmedCases,2118.0


In [18]:
fig = px.line(global_melt_df, x="Date", y="value", color='variable', 
              title="Global Cases")
fig.show()

In [19]:
fig = px.line(global_melt_df, x="Date", y="value", color='variable',
              title="Global Cases (Log)",
             log_y=True)
fig.show()

In [20]:
global_df['mortality'] = global_df['Deaths'] / global_df['ConfirmedCases']

fig = px.line(global_df, x="Date", y="mortality", 
              title="Mortality Rate")
fig.show()

In [21]:
prediction_week = []
prediction_month = []
prediction_quarter = []
prediction_year = []

for index, df in merge_df.groupby('Country_Region'):
    prediction_week_df = df[['ConfirmedCases']].shift(-5)
    prediction_month_df = df[['ConfirmedCases']].shift(-20)
    prediction_quarter_df = df[['ConfirmedCases']].shift(-60)
    prediction_year_df = df[['ConfirmedCases']].shift(-240)
    
    prediction_week.append(prediction_week_df)
    prediction_month.append(prediction_month_df)
    prediction_quarter.append(prediction_quarter_df)
    prediction_year.append(prediction_year_df)

y_week = pd.concat(prediction_week).rename(columns={'ConfirmedCases': 'y_week'})
y_month = pd.concat(prediction_month).rename(columns={'ConfirmedCases': 'y_month'})
y_quarter = pd.concat(prediction_quarter).rename(columns={'ConfirmedCases': 'y_quarter'})
y_year = pd.concat(prediction_year).rename(columns={'ConfirmedCases': 'y_year'})

# Join labels back to main dataframe
merge_df = pd.concat([merge_df, y_week, y_month, y_quarter, y_year], axis=1)

merge_df2 = merge_df.copy()

merge_df

Unnamed: 0,Country_Region,Province_State,Lat,Long,Date,ConfirmedCases,Deaths,Recovered,y_week,y_month,y_quarter,y_year
0,Afghanistan,,33.939110,67.709953,2020-01-22,0,0,0,0.0,0.0,34.0,38969.0
1,Albania,,41.153300,20.168300,2020-01-22,0,0,0,0.0,0.0,89.0,12073.0
2,Algeria,,28.033900,1.659600,2020-01-22,0,0,0,0.0,0.0,201.0,49413.0
3,Andorra,,42.506300,1.521800,2020-01-22,0,0,0,0.0,0.0,113.0,1564.0
4,Angola,,-11.202700,17.873900,2020-01-22,0,0,0,0.0,0.0,2.0,3848.0
...,...,...,...,...,...,...,...,...,...,...,...,...
168790,Vietnam,,14.058324,108.277199,2021-09-17,667650,16637,16637,,,,
168791,West Bank and Gaza,,31.952200,35.233200,2021-09-17,381854,3890,3890,,,,
168792,Yemen,,15.552727,48.516388,2021-09-17,8593,1628,1628,,,,
168793,Zambia,,-13.133897,27.849332,2021-09-17,208353,3637,3637,,,,


In [22]:
prediction_week_df

Unnamed: 0,ConfirmedCases
278,0.0
557,0.0
836,0.0
1115,0.0
1394,0.0
...,...
167678,
167957,
168236,
168515,


In [23]:
# Apply one hot encoding on categorical features 
merge_df = pd.concat([merge_df.drop(columns=['Country_Region']), pd.get_dummies(merge_df['Country_Region'])], axis=1)

In [24]:
merge_df = pd.concat([merge_df.drop(columns=['Province_State']), pd.get_dummies(merge_df['Province_State'])], axis=1)

In [25]:
merge_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 168795 entries, 0 to 168794
Columns: 292 entries, Lat to Zhejiang
dtypes: float64(6), int64(3), object(1), uint8(282)
memory usage: 59.6+ MB


In [26]:
merge_df

Unnamed: 0,Lat,Long,Date,ConfirmedCases,Deaths,Recovered,y_week,y_month,y_quarter,y_year,...,Tibet,Turks and Caicos Islands,Unknown,Victoria,Wallis and Futuna,Western Australia,Xinjiang,Yukon,Yunnan,Zhejiang
0,33.939110,67.709953,2020-01-22,0,0,0,0.0,0.0,34.0,38969.0,...,0,0,0,0,0,0,0,0,0,0
1,41.153300,20.168300,2020-01-22,0,0,0,0.0,0.0,89.0,12073.0,...,0,0,0,0,0,0,0,0,0,0
2,28.033900,1.659600,2020-01-22,0,0,0,0.0,0.0,201.0,49413.0,...,0,0,0,0,0,0,0,0,0,0
3,42.506300,1.521800,2020-01-22,0,0,0,0.0,0.0,113.0,1564.0,...,0,0,0,0,0,0,0,0,0,0
4,-11.202700,17.873900,2020-01-22,0,0,0,0.0,0.0,2.0,3848.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
168790,14.058324,108.277199,2021-09-17,667650,16637,16637,,,,,...,0,0,0,0,0,0,0,0,0,0
168791,31.952200,35.233200,2021-09-17,381854,3890,3890,,,,,...,0,0,0,0,0,0,0,0,0,0
168792,15.552727,48.516388,2021-09-17,8593,1628,1628,,,,,...,0,0,0,0,0,0,0,0,0,0
168793,-13.133897,27.849332,2021-09-17,208353,3637,3637,,,,,...,0,0,0,0,0,0,0,0,0,0


In [27]:
def to_integer(dt_time):
    return 10000 * dt_time.year + 100 * dt_time.month + dt_time.day
merge_df['Date'] = pd.to_datetime(merge_df['Date'])
merge_df['Date'] = merge_df['Date'].dt.strftime("%Y%m%d").astype(int)

In [28]:
cfm_week_df = merge_df.drop(columns=['y_month', 'y_quarter', 'y_year'])
cfm_week_df = cfm_week_df.dropna()

cfm_month_df = merge_df.drop(columns=['y_week', 'y_quarter', 'y_year'])
cfm_month_df = cfm_month_df.dropna()

cfm_quarter_df = merge_df.drop(columns=['y_week', 'y_month', 'y_year'])
cfm_quarter_df = cfm_quarter_df.dropna()

cfm_year_df = merge_df.drop(columns=['y_week', 'y_month', 'y_quarter'])
cfm_year_df = cfm_year_df.dropna()

In [29]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split 
# from sklearn import metrics 
from sklearn.metrics import mean_squared_error,mean_absolute_error
from sklearn.metrics import r2_score

In [30]:
def model_and_accuracy(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 101)

    #put a trail loop inside 
    for i in range(4):
        
        n_estimators = 5 ** i

        rf_model = RandomForestRegressor(
            n_estimators = n_estimators,
            bootstrap = True,
            n_jobs = -1
        )

        rf_model.fit(X_train, y_train)
        y_pred = rf_model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y_test,y_pred) 
        rsq = r2_score(y_test,y_pred)
        print(f'Estimators: {n_estimators}')
        print("Prediction Score:", rf_model.score(X_test, y_test))
        print('Mean Squared Error : ',mse)
        print('Root Mean Squared Error : ',rmse)
        print('Mean Absolute Error : ',mae)
        print('R Square:', rsq)
        print('--------------------------------------')
        print()

def get_most_important_features(X, y, n_estimators):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state =101)

    rf_model = RandomForestRegressor(
        n_estimators = n_estimators,
        bootstrap = True,
        n_jobs = -1
    )

    rf_model.fit(X_train, y_train)
    y_pred = rf_model.predict(X_test)

    display(pd.DataFrame(rf_model.feature_importances_, index=X.columns, columns=['score']).sort_values(by='score', ascending=False))

In [31]:
# X = cfm_week_df.drop(columns=['y_week'])
# y = cfm_week_df['y_week']

# model_and_accuracy(X, y)

In [32]:
X = cfm_year_df.drop(columns=['y_year'])
y = cfm_year_df['y_year']

model_and_accuracy(X, y)

Estimators: 1
Prediction Score: 0.9999161713535712
Mean Squared Error :  401095107.9211297
Root Mean Squared Error :  20027.358985176495
Mean Absolute Error :  2963.55410233569
R Square: 0.9999161713535712
--------------------------------------

Estimators: 5
Prediction Score: 0.9997432313229736
Mean Squared Error :  1228561650.5835066
Root Mean Squared Error :  35050.84379274637
Mean Absolute Error :  2250.4011515721254
R Square: 0.9997432313229736
--------------------------------------

Estimators: 25
Prediction Score: 0.9999666075236275
Mean Squared Error :  159773054.7367412
Root Mean Squared Error :  12640.136658151334
Mean Absolute Error :  1744.1212937736411
R Square: 0.9999666075236275
--------------------------------------

Estimators: 125
Prediction Score: 0.9999575819453665
Mean Squared Error :  202957758.7830986
Root Mean Squared Error :  14246.32439554493
Mean Absolute Error :  1714.622978352316
R Square: 0.9999575819453665
--------------------------------------



In [33]:
# def get_most_important_features(X, y, n_estimators):
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state =101)

#     rf_model = RandomForestRegressor(
#         n_estimators = n_estimators,
#         bootstrap = True,
#         n_jobs = -1
#     )

#     rf_model.fit(X_train, y_train)
#     y_pred = rf_model.predict(X_test)

#     display(pd.DataFrame(rf_model.feature_importances_, index=X.columns, columns=['score']).sort_values(by='score', ascending=False))

#     plt.figure(figsize=(8,6))
#     plt.plot(y_test,y_test,color='deeppink')
#     plt.scatter(y_test,y_pred,color='dodgerblue')
#     plt.xlabel('Actual Target Value',fontsize=15)
#     plt.ylabel('Predicted Target Value',fontsize=15)
#     plt.title('Random Forest Regressor (R2 Score= 0.95)',fontsize=14)
#     plt.show()

In [34]:
# get_most_important_features(X, y, 125)

In [35]:
import xgboost as xgb

ModuleNotFoundError: No module named 'xgboost'

In [None]:
def booster(X, y, n_estimators):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state =101)
    xgbr= xgb.XGBRegressor(n_estimators=n_estimators, learning_rate=0.01, gamma=0, subsample=.7,
                           colsample_bytree=.7, max_depth=10,
                           min_child_weight=0, 
                           objective='reg:squarederror', nthread=-1, scale_pos_weight=1,
                           seed=27, reg_alpha=0.00006, n_jobs=-1)
    
    xgbr.fit(X_train,y_train)
    y_pred2 = rf_model.predict(X_test)
    
    print('RMSE_XGBoost Regression=', np.sqrt(metrics.mean_squared_error(y_test,y_pred)))
    print('R2 Score_XGBoost Regression=',metrics.r2_score(y_test,y_pred)

SyntaxError: unexpected EOF while parsing (<ipython-input-34-03296acf7c22>, line 13)

In [None]:
booster(X, y, 125)