In [141]:
import numpy as np
import pandas as pd

# Remove the row cap on displayed frames/series and print NumPy floats in
# fixed-point notation instead of scientific notation.
pd.set_option('display.max_rows', None)
np.set_printoptions(suppress=True)
# Load the observations table; the path is relative to the working directory.
df = pd.read_csv('data/covid_19_data.csv')

In [142]:
# Preprocessing adapted from https://www.kaggle.com/chirag9073/coronavirus-covid-19-outbreak-data-analysis

# Rebind instead of dropping in place, and tolerate an already-missing column,
# so re-running this cell does not raise KeyError once 'SNo' is gone.
df = df.drop(columns=['SNo'], errors='ignore')
# Parse the whole column in one vectorised call rather than per element.
df['ObservationDate'] = pd.to_datetime(df['ObservationDate'])

In [143]:
# Show the first ten rows to check the columns after preprocessing.
df.head(10)

Unnamed: 0,ObservationDate,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered
0,2020-01-22,Anhui,Mainland China,1/22/2020 17:00,1.0,0.0,0.0
1,2020-01-22,Beijing,Mainland China,1/22/2020 17:00,14.0,0.0,0.0
2,2020-01-22,Chongqing,Mainland China,1/22/2020 17:00,6.0,0.0,0.0
3,2020-01-22,Fujian,Mainland China,1/22/2020 17:00,1.0,0.0,0.0
4,2020-01-22,Gansu,Mainland China,1/22/2020 17:00,0.0,0.0,0.0
5,2020-01-22,Guangdong,Mainland China,1/22/2020 17:00,26.0,0.0,0.0
6,2020-01-22,Guangxi,Mainland China,1/22/2020 17:00,2.0,0.0,0.0
7,2020-01-22,Guizhou,Mainland China,1/22/2020 17:00,1.0,0.0,0.0
8,2020-01-22,Hainan,Mainland China,1/22/2020 17:00,4.0,0.0,0.0
9,2020-01-22,Hebei,Mainland China,1/22/2020 17:00,1.0,0.0,0.0


In [144]:
# Count missing values in each column.
df.isna().sum()

ObservationDate      0
Province/State     800
Country/Region       0
Last Update          0
Confirmed            0
Deaths               0
Recovered            0
dtype: int64

In [145]:
# First ten rows that carry no Province/State value.
df.loc[df['Province/State'].isna()].head(10)

Unnamed: 0,ObservationDate,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered
35,2020-01-22,,Japan,1/22/2020 17:00,2.0,0.0,0.0
36,2020-01-22,,Thailand,1/22/2020 17:00,2.0,0.0,0.0
37,2020-01-22,,South Korea,1/22/2020 17:00,1.0,0.0,0.0
73,2020-01-23,,Japan,1/23/20 17:00,1.0,0.0,0.0
74,2020-01-23,,Thailand,1/23/20 17:00,3.0,0.0,0.0
75,2020-01-23,,South Korea,1/23/20 17:00,1.0,0.0,0.0
76,2020-01-23,,Singapore,1/23/20 17:00,1.0,0.0,0.0
77,2020-01-23,,Philippines,1/23/20 17:00,0.0,0.0,0.0
78,2020-01-23,,Malaysia,1/23/20 17:00,0.0,0.0,0.0
79,2020-01-23,,Vietnam,1/23/20 17:00,2.0,0.0,0.0


In [146]:
# Current situation: largest reported value of each count for every
# (country, province) pair.
# The value columns are selected with a list: indexing a GroupBy with a bare
# tuple of names was deprecated in pandas 1.0 and raises in pandas 2.x.
# NOTE: rows whose Province/State is NaN are left out by groupby's default
# handling of missing keys.
grouped_df = df.groupby(['Country/Region', 'Province/State'])[['Confirmed', 'Deaths', 'Recovered']].max()
grouped_df.style.background_gradient(cmap='Pastel1_r')

Unnamed: 0_level_0,Unnamed: 1_level_0,Confirmed,Deaths,Recovered
Country/Region,Province/State,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Australia,From Diamond Princess,8,0,0
Australia,New South Wales,4,0,4
Australia,Queensland,5,0,1
Australia,South Australia,2,0,2
Australia,Victoria,4,0,4
Austria,,2,0,0
Canada,"Montreal, QC",1,0,0
Canada,British Columbia,7,0,3
Canada,"London, ON",1,0,1
Canada,Ontario,3,0,0


In [147]:
# Top 10 countries by confirmed cases on the most recent observation date.
# Series.max() replaces the Python builtin max(), which iterates element-wise.
latest_df = df[df['ObservationDate'] == df['ObservationDate'].max()].reset_index()
# List-based column selection: the tuple form raises in pandas 2.x.
grouped_df = latest_df.groupby('Country/Region')[['Confirmed', 'Deaths', 'Recovered']].sum().reset_index()

# One chained expression instead of rebinding `temp` three times.
temp = (
    grouped_df[['Country/Region', 'Confirmed']]
    .sort_values(by='Confirmed', ascending=False)
    .reset_index(drop=True)
)
temp.head(10).style.background_gradient(cmap='Pastel1_r')

Unnamed: 0,Country/Region,Confirmed
0,Mainland China,78824
1,South Korea,2337
2,Italy,888
3,Others,705
4,Iran,388
5,Japan,228
6,Hong Kong,94
7,Singapore,93
8,US,62
9,France,57


In [148]:
# Countries whose recovered count equals their confirmed count,
# ordered from most to fewest confirmed cases.
temp = (
    grouped_df.loc[
        grouped_df['Confirmed'] == grouped_df['Recovered'],
        ['Country/Region', 'Confirmed', 'Recovered'],
    ]
    .sort_values('Confirmed', ascending=False)
    .reset_index(drop=True)
)
temp.style.background_gradient(cmap='Greens')

Unnamed: 0,Country/Region,Confirmed,Recovered
0,Vietnam,16,16
1,India,3,3
2,Russia,2,2
3,Belgium,1,1
4,Cambodia,1,1
5,Egypt,1,1
6,Nepal,1,1
7,Sri Lanka,1,1


In [149]:
# Most recent stats: totals per observation date, newest date first.
# Columns are selected with a list because the tuple form raises in pandas 2.x.
# The descending order is kept on purpose: seq2dataset() further down reads the
# series newest-first and flips it back to chronological order itself.
daily_df = (
    df.groupby('ObservationDate')[['Confirmed', 'Deaths', 'Recovered']]
    .sum()
    .reset_index()
    .sort_values('ObservationDate', ascending=False)
)
daily_df.head(1).style.background_gradient(cmap='Pastel1')

Unnamed: 0,ObservationDate,Confirmed,Deaths,Recovered
37,2020-02-28 00:00:00,84124,2867,36711


In [150]:
# Render the whole daily table (newest first); with display.max_rows set to
# None earlier, no rows are truncated.
daily_df.style.background_gradient(cmap='Pastel1')

Unnamed: 0,ObservationDate,Confirmed,Deaths,Recovered
37,2020-02-28 00:00:00,84124,2867,36711
36,2020-02-27 00:00:00,82756,2814,33277
35,2020-02-26 00:00:00,81397,2770,30384
34,2020-02-25 00:00:00,80415,2708,27905
33,2020-02-24 00:00:00,79570,2629,25227
32,2020-02-23 00:00:00,78985,2469,23394
31,2020-02-22 00:00:00,78599,2458,22886
30,2020-02-21 00:00:00,76843,2251,18890
29,2020-02-20 00:00:00,76199,2247,18177
28,2020-02-19 00:00:00,75641,2122,16121


In [151]:
# Confirm the aggregated daily table has no missing values.
daily_df.isna().sum()

ObservationDate    0
Confirmed          0
Deaths             0
Recovered          0
dtype: int64

In [152]:
# Daily confirmed totals, newest first; the index labels are the row numbers
# from before the descending sort, so they count down.
daily_df['Confirmed']

37    84124.0
36    82756.0
35    81397.0
34    80415.0
33    79570.0
32    78985.0
31    78599.0
30    76843.0
29    76199.0
28    75641.0
27    75138.0
26    73260.0
25    71226.0
24    69032.0
23    66887.0
22    60370.0
21    45222.0
20    44803.0
19    42763.0
18    40151.0
17    37121.0
16    34392.0
15    30818.0
14    27636.0
13    23892.0
12    19881.0
11    16787.0
10    12038.0
9      9925.0
8      8235.0
7      6165.0
6      5578.0
5      2927.0
4      2118.0
3      1438.0
2       941.0
1       653.0
0       555.0
Name: Confirmed, dtype: float64

In [153]:
WINDOW_SIZE = 8


def seq2dataset(sr, window_size):
    """Turn a newest-first sequence into (window, next value) training pairs.

    ``sr`` is read positionally and is expected to run from the most recent
    observation to the oldest (the order ``daily_df`` has after its descending
    sort). The output is in chronological order: row ``k`` of ``X`` holds
    ``window_size`` consecutive values, oldest first, and ``y[k]`` is the value
    that immediately follows that window.

    Parameters
    ----------
    sr : pandas.Series, list or 1-D array
        Observations ordered newest first.
    window_size : int
        Number of consecutive observations in each input window (>= 0).

    Returns
    -------
    X : numpy.ndarray, shape (n_samples, window_size)
    y : numpy.ndarray, shape (n_samples,)
        ``n_samples`` is ``max(len(sr) - window_size, 0)``. When the sequence
        is too short to form a single window, empty arrays with these shapes
        are returned instead of failing on a 2-D slice of an empty array.

    Raises
    ------
    ValueError
        If ``window_size`` is negative.
    """
    if window_size < 0:
        raise ValueError("window_size must be non-negative")

    # Work on a plain array so the slicing below is always positional, then
    # flip it so the values run oldest -> newest.
    chronological = np.asarray(sr)[::-1]
    n_samples = len(chronological) - window_size
    if n_samples <= 0:
        empty_X = np.empty((0, window_size), dtype=chronological.dtype)
        empty_y = np.empty((0,), dtype=chronological.dtype)
        return empty_X, empty_y

    # Each row is a window followed by its target (window_size + 1 values).
    rows = np.array([
        chronological[start:start + window_size + 1]
        for start in range(n_samples)
    ])
    return rows[:, 0:window_size], rows[:, window_size]

In [154]:
# Window the newest-first 'Confirmed' series into WINDOW_SIZE-long inputs (X)
# and the value that follows each window (y).
X, y = seq2dataset(daily_df['Confirmed'], WINDOW_SIZE)

In [155]:
# Input windows: one row per sample, values ordered oldest to newest.
X

array([[  555.,   653.,   941.,  1438.,  2118.,  2927.,  5578.,  6165.],
       [  653.,   941.,  1438.,  2118.,  2927.,  5578.,  6165.,  8235.],
       [  941.,  1438.,  2118.,  2927.,  5578.,  6165.,  8235.,  9925.],
       [ 1438.,  2118.,  2927.,  5578.,  6165.,  8235.,  9925., 12038.],
       [ 2118.,  2927.,  5578.,  6165.,  8235.,  9925., 12038., 16787.],
       [ 2927.,  5578.,  6165.,  8235.,  9925., 12038., 16787., 19881.],
       [ 5578.,  6165.,  8235.,  9925., 12038., 16787., 19881., 23892.],
       [ 6165.,  8235.,  9925., 12038., 16787., 19881., 23892., 27636.],
       [ 8235.,  9925., 12038., 16787., 19881., 23892., 27636., 30818.],
       [ 9925., 12038., 16787., 19881., 23892., 27636., 30818., 34392.],
       [12038., 16787., 19881., 23892., 27636., 30818., 34392., 37121.],
       [16787., 19881., 23892., 27636., 30818., 34392., 37121., 40151.],
       [19881., 23892., 27636., 30818., 34392., 37121., 40151., 42763.],
       [23892., 27636., 30818., 34392., 37121., 401

In [156]:
# Targets: the value that follows each row of X.
y

array([ 8235.,  9925., 12038., 16787., 19881., 23892., 27636., 30818.,
       34392., 37121., 40151., 42763., 44803., 45222., 60370., 66887.,
       69032., 71226., 73260., 75138., 75641., 76199., 76843., 78599.,
       78985., 79570., 80415., 81397., 82756., 84124.])

In [157]:
from sklearn.model_selection import train_test_split

# shuffle=False keeps the samples in their existing (chronological) order, so
# the final 15% of windows form the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, shuffle=False)

In [158]:
# Training windows: the leading portion of X.
X_train

array([[  555.,   653.,   941.,  1438.,  2118.,  2927.,  5578.,  6165.],
       [  653.,   941.,  1438.,  2118.,  2927.,  5578.,  6165.,  8235.],
       [  941.,  1438.,  2118.,  2927.,  5578.,  6165.,  8235.,  9925.],
       [ 1438.,  2118.,  2927.,  5578.,  6165.,  8235.,  9925., 12038.],
       [ 2118.,  2927.,  5578.,  6165.,  8235.,  9925., 12038., 16787.],
       [ 2927.,  5578.,  6165.,  8235.,  9925., 12038., 16787., 19881.],
       [ 5578.,  6165.,  8235.,  9925., 12038., 16787., 19881., 23892.],
       [ 6165.,  8235.,  9925., 12038., 16787., 19881., 23892., 27636.],
       [ 8235.,  9925., 12038., 16787., 19881., 23892., 27636., 30818.],
       [ 9925., 12038., 16787., 19881., 23892., 27636., 30818., 34392.],
       [12038., 16787., 19881., 23892., 27636., 30818., 34392., 37121.],
       [16787., 19881., 23892., 27636., 30818., 34392., 37121., 40151.],
       [19881., 23892., 27636., 30818., 34392., 37121., 40151., 42763.],
       [23892., 27636., 30818., 34392., 37121., 401

In [159]:
# Test windows: the trailing portion of X held out by the unshuffled split.
X_test

array([[71226., 73260., 75138., 75641., 76199., 76843., 78599., 78985.],
       [73260., 75138., 75641., 76199., 76843., 78599., 78985., 79570.],
       [75138., 75641., 76199., 76843., 78599., 78985., 79570., 80415.],
       [75641., 76199., 76843., 78599., 78985., 79570., 80415., 81397.],
       [76199., 76843., 78599., 78985., 79570., 80415., 81397., 82756.]])

In [160]:
# Targets paired with X_train.
y_train

array([ 8235.,  9925., 12038., 16787., 19881., 23892., 27636., 30818.,
       34392., 37121., 40151., 42763., 44803., 45222., 60370., 66887.,
       69032., 71226., 73260., 75138., 75641., 76199., 76843., 78599.,
       78985.])

In [161]:
# Targets paired with X_test.
y_test

array([79570., 80415., 81397., 82756., 84124.])

In [162]:
# Build the windowed, unshuffled train/test split for every tracked metric.
# Each pass rebinds X, y and the four split arrays, so those names end up
# holding only the last column's data; the dict keeps every column's split.
splits_by_column = {}
for column_name in ['Confirmed', 'Deaths', 'Recovered']:
    X, y = seq2dataset(daily_df[column_name], WINDOW_SIZE)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, shuffle=False)
    splits_by_column[column_name] = (X_train, X_test, y_train, y_test)
    



SyntaxError: positional argument follows keyword argument (<ipython-input-162-408b7c25d0cc>, line 3)

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from sklearn.preprocessing import StandardScaler


class PandemicRegressor():
    """Small LSTM regressor that predicts the next value of one series.

    The network takes a window of ``window_size`` consecutive observations,
    shaped ``(1, window_size)``, and outputs a single value. After ``fit`` the
    instance exposes ``trainPredict``, ``testPredict`` and ``forecast``.
    """

    def __init__(self, df, window_size):
        """Store the inputs and build the untrained network.

        Parameters
        ----------
        df : object
            Source table; kept on ``self.df`` and not otherwise used by this
            class.
        window_size : int
            Number of consecutive observations fed to the network per sample.
        """
        self.df = df
        # Must be assigned before build_model(), which reads it.
        self.window_size = window_size
        self.regressor = self.build_model()
        # Filled in by fit().
        self.trainPredict = None
        self.testPredict = None
        self.forecast = None

    def build_model(self):
        """Return a compiled ``LSTM(4) -> Dense(1)`` model (MSE loss, Adam)."""
        # LSTM is not part of the cell-level keras imports, so import it here.
        from keras.layers import LSTM

        model = Sequential([
            LSTM(4, input_shape=(1, self.window_size)),
            Dense(1)
        ])
        model.compile(loss='mean_squared_error', optimizer='adam')
        return model

    @staticmethod
    def create_dataset(data, look_back, column):
        """Slice one column of a 2-D array into windows and next-step targets.

        Parameters
        ----------
        data : 2-D array-like
            Rows are consecutive observations in chronological order.
        look_back : int
            Window length.
        column : int
            Positional index of the column to model.

        Returns
        -------
        X : numpy.ndarray, shape (n_samples, look_back)
            Windows ordered oldest value first.
        y : numpy.ndarray, shape (n_samples,)
            The value directly after each window.
            ``n_samples`` is ``max(len(data) - look_back, 0)``.
        """
        series = np.asarray(data)[:, column]
        n_samples = len(series) - look_back
        if n_samples <= 0:
            return (np.empty((0, look_back), dtype=series.dtype),
                    np.empty((0,), dtype=series.dtype))
        windows = np.array([series[i:i + look_back] for i in range(n_samples)])
        targets = np.array(series[look_back:])
        return windows, targets

    @staticmethod
    def _rmse(actual, predicted):
        """Root-mean-square error between two equally sized 1-D arrays."""
        diff = np.asarray(actual, dtype=float) - np.asarray(predicted, dtype=float)
        return float(np.sqrt(np.mean(diff ** 2)))

    def fit(self, dframe, column, forecast_steps=168):
        """Train on the first 90% of rows, score both splits, then forecast.

        Parameters
        ----------
        dframe : pandas.DataFrame
            Rows in chronological order. Every column must be castable to
            float32 because the whole frame is converted at once.
        column : int
            Positional index of the column to model.
        forecast_steps : int, default 168
            Number of values to roll forward from the first validation window.

        Returns
        -------
        numpy.ndarray
            The rolled-forward forecast, also stored on ``self.forecast``.

        Raises
        ------
        ValueError
            If either split is too short to yield one window of length
            ``self.window_size``.
        """
        values = dframe.values.astype('float32')
        train_size = int(len(values) * 0.90)
        train, validate = values[0:train_size, :], values[train_size:len(values), :]

        # The window length has to match the input shape the model was built with.
        look_back = self.window_size
        trainX, trainY = self.create_dataset(train, look_back, column)
        testX, testY = self.create_dataset(validate, look_back, column)
        if len(trainX) == 0 or len(testX) == 0:
            raise ValueError(
                'Not enough rows for window_size=%d: the 90/10 split gives %d '
                'training and %d validation rows, each split needs more than '
                'window_size rows' % (look_back, len(train), len(validate))
            )

        # Reshape to (samples, timesteps=1, features=look_back).
        trainX = np.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
        testX = np.reshape(testX, (testX.shape[0], 1, testX.shape[1]))

        # NOTE(review): inputs are fed unscaled; StandardScaler is imported by
        # the notebook but never applied - confirm whether scaling was intended.
        model = self.regressor
        model.fit(trainX, trainY, epochs=300, batch_size=1, verbose=2)
        self.trainPredict = model.predict(trainX)
        self.testPredict = model.predict(testX)

        trainScore = self._rmse(trainY, self.trainPredict[:, 0])
        print('Train Score: %.2f RMSE' % (trainScore))
        testScore = self._rmse(testY, self.testPredict[:, 0])
        print('Test Score: %.2f RMSE' % (testScore))

        # Roll forward: predict one step, drop the oldest value from the front
        # of the window and append the prediction at the end.
        window = testX[0][0]
        forecast = []
        for _ in range(forecast_steps):
            predi = model.predict(np.array([[window]]))
            forecast.append(predi[0][0])
            window = np.append(window[1:], predi[0][0])
        self.forecast = np.array(forecast)
        return self.forecast
        
    
 