In [1]:
%reload_ext watermark
%watermark -v -p numpy,pandas,torch

CPython 3.7.6
IPython 7.13.0

numpy 1.18.1
pandas 0.23.4
torch 1.4.0


In [2]:
import torch

import os, wget, json
import numpy as np
import pandas as pd
from tqdm import tqdm
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.preprocessing import MinMaxScaler
from pandas.plotting import register_matplotlib_converters
from torch import nn, optim

In [3]:
from ML.lstm_torch import LSTM_data_loader, LSTM_Predictor, train_lstm, predict_future

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation/numerical warnings raised by the
# libraries used below — confirm that a blanket 'ignore' is really intended.
warnings.filterwarnings('ignore')

In [4]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

In [5]:
# Global seaborn theme: white-grid background, 'muted' palette, fonts scaled by 1.2.
# The palette selected here is replaced by the sns.set_palette(...) call in the next cell.
sns.set(style='whitegrid', palette='muted', font_scale=1.2)

In [6]:
# Default color cycle: eight colors taken from seaborn's "husl" palette.
sns.set_palette("husl", n_colors=8)

In [7]:
# Default figure size (width, height) for every plot in this notebook.
rcParams.update({'figure.figsize': (14, 10)})

# Register pandas' date/time converters with matplotlib.
register_matplotlib_converters()

In [8]:
# One seed shared by NumPy's global RNG and PyTorch's default generator.
RANDOM_SEED = 42

np.random.seed(seed=RANDOM_SEED)
# Kept as the last expression: the cell displays the returned torch Generator.
torch.manual_seed(seed=RANDOM_SEED)

<torch._C.Generator at 0x10fb5f6b0>

In [24]:
# Local file name and remote location of the confirmed-cases time series.
CONFIRMED_CSV = 'time_series_19-covid-Confirmed.csv'
CONFIRMED_URL = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Confirmed.csv'

# Drop any previous copy before fetching a fresh snapshot.
# NOTE(review): presumably done so the new download lands under the same
# name — verify against wget's handling of existing files.
if os.path.isfile(CONFIRMED_CSV):
    os.remove(CONFIRMED_CSV)

# Last expression: the cell displays the name of the downloaded file.
wget.download(CONFIRMED_URL)

'time_series_19-covid-Confirmed.csv'

In [11]:
# Load the CSV written by the download cell into the raw frame used by every
# loader below (wide layout: one column per date, as the head() preview shows).
tdf = pd.read_csv('time_series_19-covid-Confirmed.csv')

In [12]:
# Preview the first rows of the raw table.
tdf.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,3/13/20,3/14/20,3/15/20,3/16/20,3/17/20,3/18/20,3/19/20,3/20/20,3/21/20,3/22/20
0,,Thailand,15.0,101.0,2,3,5,7,8,8,...,75,82,114,147,177,212,272,322,411,599
1,,Japan,36.0,138.0,2,1,2,2,4,4,...,701,773,839,825,878,889,924,963,1007,1086
2,,Singapore,1.2833,103.8333,0,1,3,3,4,5,...,200,212,226,243,266,313,345,385,432,455
3,,Nepal,28.1667,84.25,0,0,0,1,1,1,...,1,1,1,1,1,1,1,1,1,2
4,,Malaysia,2.5,112.5,0,0,0,3,4,4,...,197,238,428,566,673,790,900,1030,1183,1306


In [13]:
# Read the GeoJSON-style description of the US states.
with open('./data/us-states.json', 'r') as geojson_file:
    us_states = json.load(geojson_file)

features = us_states['features']

# Feature ids in file order (used below as the state abbreviations).
state_abrs = [feature['id'] for feature in features]

# One single-entry {state name: id} dict per feature.
state_mapper_lst = [{feature['properties']['name']: feature['id']} for feature in features]

# Merge the single-entry dicts into one name -> id lookup; a later entry
# overrides an earlier one with the same name.
state_mapper = {
    state_name: state_id
    for entry in state_mapper_lst
    for state_name, state_id in entry.items()
}

In [15]:
# Loader configured for New York, built from the raw table and the
# abbreviation list / name lookup prepared above.
ny_loader_args = dict(
    df=tdf,
    region_abr='NY',
    country='US',
    region_list=state_abrs,
    state_mapper=state_mapper,
)
state_data_loader = LSTM_data_loader(**ny_loader_args)

In [25]:
# NOTE(review): subset_df() lives in ML.lstm_torch and is not visible here.
# Judging by the single-row frame displayed in the next cell, it appears to
# narrow loader.df to the selected region — confirm.
state_data_loader.subset_df()

In [26]:
# Inspect the loader's frame after subsetting.
state_data_loader.df.head()

Unnamed: 0,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,1/30/20,1/31/20,...,3/13/20,3/14/20,3/15/20,3/16/20,3/17/20,3/18/20,3/19/20,3/20/20,3/21/20,3/22/20
0,0,0,0,0,0,0,0,0,0,0,...,421,525,732,967,1706,2495,5365,8310,11710,15793


In [27]:
# Disabled experiment: keep only the columns in which at least one row is non-zero.
# state_data_loader.df = state_data_loader.df.loc[:, (state_data_loader.df != 0).any(axis=0)]

In [28]:
# Same preview again; the filtering line in the previous cell is commented out,
# so the frame is unchanged.
state_data_loader.df.head()

Unnamed: 0,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,1/30/20,1/31/20,...,3/13/20,3/14/20,3/15/20,3/16/20,3/17/20,3/18/20,3/19/20,3/20/20,3/21/20,3/22/20
0,0,0,0,0,0,0,0,0,0,0,...,421,525,732,967,1706,2495,5365,8310,11710,15793


In [29]:
# Scratch note, generic form of the same idea (drop columns that are entirely zero):
# df.loc[:, (df != 0).any(axis=0)]

In [30]:
# delta=True: per the message this cell prints, the data is converted to daily deltas.
state_data_loader.transform_df_datetime(delta=True)

Data is converted to daily delta


In [31]:
# NOTE(review): test_data_size=0 presumably means "no hold-out rows" — confirm in ML.lstm_torch.
state_data_loader.gen_data_sets(test_data_size=0)

# set_seq() returns an (inputs, targets) pair of numpy arrays; turn both into float tensors.
X_train, y_train = (
    torch.from_numpy(array).float()
    for array in state_data_loader.set_seq()
)

In [32]:
# Two-layer predictor with 512 units per layer, one input feature, no dropout.
# NOTE(review): sequences=3 presumably has to match the window length used by
# set_seq(), which is called with its default above — confirm the default is 3.
model = LSTM_Predictor(features=1,
                       neurons=512,
                       sequences=3,
                       layers=2,
                       dropout=0.0)

In [33]:
# Train for 300 epochs; the call returns the model together with two histories.
# NOTE(review): no test tensors are passed, so test_hist is presumably empty
# or unused here — confirm against train_lstm.
model, train_hist, test_hist = train_lstm(model,
                                      X_train,
                                      y_train,
                                      epochs=300)

Epoch 0 train loss: 1.6503336429595947
Epoch 40 train loss: 1.382882833480835
Epoch 80 train loss: 0.5032798051834106
Epoch 120 train loss: 0.4855053126811981
Epoch 160 train loss: 0.4836460053920746
Epoch 200 train loss: 0.48288995027542114
Epoch 240 train loss: 0.48266687989234924
Epoch 280 train loss: 0.4826158285140991


In [34]:
# Forecast horizon and the window length the model was built with.
seq_length = model.sequences
days_to_predict = 10

# Forecast the next `days_to_predict` values from the training windows.
outs = predict_future(
    n_future=days_to_predict,
    time_data=X_train,
    sequece_lenth=seq_length,
    model=model,
)

# Add a leading axis for the scaler, undo the scaling, then flatten back to 1-D.
scaled_forecast = np.expand_dims(outs, axis=0)
predicted_cases = state_data_loader.scaler.inverse_transform(scaled_forecast).flatten()
print(predicted_cases)

[ 3003.86217248  3666.25486726  4671.27832818  6008.22275805
  7715.08363688  9834.20222712 12377.1525178  15289.23026562
 18427.81729174 21580.48254061]


In [26]:
# Ten-value forecast per state, keyed by state abbreviation (cumulative counts,
# since transform_df_datetime is called with delta=False below).
data_saver_nodelta = {}

# Loop invariants: forecast horizon and the total used in the progress line.
days_to_predict = 10
n_states = len(state_abrs)

# enumerate(..., start=1) supplies the 1-based progress counter, replacing the
# hand-maintained `counter += 1`.
for counter, state in enumerate(state_abrs, start=1):

    print('{}: {} out of {}'.format(state, counter, n_states))

    # A fresh loader, model and training run for every state.
    state_data_loader = LSTM_data_loader(df=tdf,
                                         region_abr=state,
                                         country='US',
                                         region_list=state_abrs,
                                         state_mapper=state_mapper)

    state_data_loader.subset_df()

#     state_data_loader.drop_empty_days()

    state_data_loader.transform_df_datetime(delta=False)

    state_data_loader.gen_data_sets(test_data_size=0)

    # Windowed numpy arrays -> float tensors.
    X_train, y_train = state_data_loader.set_seq()
    X_train = torch.from_numpy(X_train).float()
    y_train = torch.from_numpy(y_train).float()

    model = LSTM_Predictor(features=1,
                           neurons=512,
                           sequences=3,
                           layers=2,
                           dropout=0.3)

    model, train_hist, test_hist = train_lstm(model,
                                              X_train,
                                              y_train,
                                              epochs=300)

    seq_length = model.sequences

    outs = predict_future(n_future=days_to_predict,
                          time_data=X_train,
                          sequece_lenth=seq_length,
                          model=model)

    # Undo the loader's scaling and flatten to a 1-D array of predictions.
    predicted_cases = state_data_loader.scaler.inverse_transform(
        np.expand_dims(outs, axis=0)
    ).flatten()
    print(predicted_cases)

    data_saver_nodelta[state] = predicted_cases

AL: 1 out of 50
Data in cumulative
Epoch 0 train loss: 1.840633749961853
Epoch 40 train loss: 1.2166332006454468
Epoch 80 train loss: 0.33005356788635254
Epoch 120 train loss: 0.32123637199401855
Epoch 160 train loss: 0.3199022710323334
Epoch 200 train loss: 0.3260878026485443
Epoch 240 train loss: 0.33394116163253784
Epoch 280 train loss: 0.30427086353302
[ 94.10637367 112.93426001 139.93814421 175.07719517 219.09715605
 272.85992146 336.72793722 409.90521097 489.91552305 572.62348366]
AK: 2 out of 50
Data in cumulative
Epoch 0 train loss: 1.1001687049865723
Epoch 40 train loss: 1.0096901655197144
Epoch 80 train loss: 0.9670840501785278
Epoch 120 train loss: 0.2620595693588257
Epoch 160 train loss: 0.2406560480594635
Epoch 200 train loss: 0.21714136004447937
Epoch 240 train loss: 0.22215820848941803
Epoch 280 train loss: 0.21491369605064392
[11.43580037 13.69935894 17.08621842 21.5384202  27.17175758 34.15551388
 42.63269305 52.63465762 63.98882604 76.26033354]
AZ: 3 out of 50
Data in

[ 45.38552812  51.71090466  59.67609179  69.3046959   80.69998205
  94.00727069 109.37991929 126.94241238 146.74718499 168.72679031]
LA: 18 out of 50
Data in cumulative
Epoch 0 train loss: 1.1894947290420532
Epoch 40 train loss: 0.9724841117858887
Epoch 80 train loss: 0.2605103552341461
Epoch 120 train loss: 0.24882975220680237
Epoch 160 train loss: 0.24182561039924622
Epoch 200 train loss: 0.2626684308052063
Epoch 240 train loss: 0.263649582862854
Epoch 280 train loss: 0.2561747431755066
[ 488.90627432  562.02367133  667.81807154  802.41174585  966.60646176
 1163.05891299 1394.68815994 1663.38749349 1968.69464135 2306.49245667]
ME: 19 out of 50
Data in cumulative
Epoch 0 train loss: 1.6901988983154297
Epoch 40 train loss: 0.60804682970047
Epoch 80 train loss: 0.26211467385292053
Epoch 120 train loss: 0.23888346552848816
Epoch 160 train loss: 0.23584289848804474
Epoch 200 train loss: 0.21497493982315063
Epoch 240 train loss: 0.22301700711250305
Epoch 280 train loss: 0.24453239142894745

Epoch 200 train loss: 0.47777873277664185
Epoch 240 train loss: 0.45783084630966187
Epoch 280 train loss: 0.4299822449684143
[ 22.63341856  29.04330158  39.19633102  53.3149519   72.00154114
  95.54708672 123.21822739 152.85644913 181.44902611 206.45236206]
OH: 35 out of 50
Data in cumulative
Epoch 0 train loss: 1.229668378829956
Epoch 40 train loss: 0.8569142818450928
Epoch 80 train loss: 0.8550047874450684
Epoch 120 train loss: 0.8533980250358582
Epoch 160 train loss: 0.8591839075088501
Epoch 200 train loss: 0.8648560643196106
Epoch 240 train loss: 0.8720873594284058
Epoch 280 train loss: 0.8738311529159546
[14.26499551 14.26577445 14.26594108 14.26590934 14.26583793 14.26577181
 14.2657242  14.26569378 14.26567394 14.26566204]
OK: 36 out of 50
Data in cumulative
Epoch 0 train loss: 1.869673728942871
Epoch 40 train loss: 1.2710238695144653
Epoch 80 train loss: 0.43728071451187134
Epoch 120 train loss: 0.40188536047935486
Epoch 160 train loss: 0.42275986075401306
Epoch 200 train loss:

In [19]:
# Shallow copy, so re-assigning keys on data_try leaves the source dict untouched.
# NOTE(review): data_saver_delta is not defined in any cell visible in this
# notebook (the loop above fills data_saver_nodelta) — this relies on kernel
# state from an earlier run; confirm where it comes from.
data_try = data_saver_delta.copy()

In [20]:
# Display the copied predictions (state abbreviation -> numpy array of forecasts).
data_try

{'AL': array([20.01230997, 20.17822817, 20.27628341, 20.33423495, 20.36848229,
        20.38872534, 20.4006874 , 20.40775818, 20.41193789, 20.41440734]),
 'AK': array([2.3265118 , 2.34048611, 2.34835893, 2.35279423, 2.35529315,
        2.35670096, 2.35749418, 2.35794121, 2.35819298, 2.3583346 ]),
 'AZ': array([ 31.1807251 ,  40.17219067,  54.98587132,  75.91169357,
        104.09345627, 140.22391319, 183.24567795, 229.55495834,
        274.03337479, 312.48937607]),
 'AR': array([14.49491167, 14.53693718, 14.56198537, 14.57691485, 14.58581486,
        14.59111899, 14.59428173, 14.59616554, 14.59728941, 14.59795836]),
 'CA': array([160.55788755, 174.33316779, 191.09898496, 210.54514694,
        232.58976436, 257.27895176, 284.72437501, 315.06116319,
        348.41566324, 384.87712169]),
 'CO': array([29.11274663, 29.11412966, 29.11486414, 29.11526325, 29.11547667,
        29.1155903 , 29.11565405, 29.11569008, 29.11570671, 29.11571503]),
 'CT': array([20.51810417, 20.52805996, 20.5334188

In [21]:
# Swap each state's numpy array for a plain Python list so the dict can be
# passed to the json module.
for state_key, forecast in data_try.items():
    data_try[state_key] = forecast.tolist()

In [22]:
# Serialize the dict of lists into a JSON-formatted string.
for_writing = json.dumps(data_try)

In [23]:
# `for_writing` is already a JSON document (it was produced by json.dumps), so
# it is written verbatim. Passing it through json.dump again would encode it a
# second time and store one escaped JSON *string* instead of the
# state -> predictions object.
with open('state_predictions_delta.json', 'w') as fp:
    fp.write(for_writing)

In [17]:
# Rows of the raw table whose Province/State is New York.
is_new_york = tdf['Province/State'] == 'New York'
tdf.loc[is_new_york]

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,3/13/20,3/14/20,3/15/20,3/16/20,3/17/20,3/18/20,3/19/20,3/20/20,3/21/20,3/22/20
99,New York,US,42.1657,-74.9481,0,0,0,0,0,0,...,421,525,732,967,1706,2495,5365,8310,11710,15793


In [76]:
# Fresh loader for New York; rebinding state_data_loader drops whichever
# loader a previous cell (or the per-state loop) left behind.
state_data_loader = LSTM_data_loader(df=tdf,
                                    region_abr='NY',
                                    country='US',
                                    region_list=state_abrs,
                                    state_mapper=state_mapper)

In [77]:
# NOTE(review): presumably restricts loader.df to the New York row — the helper
# is defined in ML.lstm_torch; confirm there.
state_data_loader.subset_df()

In [78]:
# delta=False: the cell prints "Data in cumulative", i.e. the series keeps
# running totals instead of being converted to daily deltas.
state_data_loader.transform_df_datetime(delta=False)

Data in cumulative


In [79]:
# Show the transformed, date-indexed series.
state_data_loader.df

2020-01-22        0
2020-01-23        0
2020-01-24        0
2020-01-25        0
2020-01-26        0
2020-01-27        0
2020-01-28        0
2020-01-29        0
2020-01-30        0
2020-01-31        0
2020-02-01        0
2020-02-02        0
2020-02-03        0
2020-02-04        0
2020-02-05        0
2020-02-06        0
2020-02-07        0
2020-02-08        0
2020-02-09        0
2020-02-10        0
2020-02-11        0
2020-02-12        0
2020-02-13        0
2020-02-14        0
2020-02-15        0
2020-02-16        0
2020-02-17        0
2020-02-18        0
2020-02-19        0
2020-02-20        0
2020-02-21        0
2020-02-22        0
2020-02-23        0
2020-02-24        0
2020-02-25        0
2020-02-26        0
2020-02-27        0
2020-02-28        0
2020-02-29        0
2020-03-01        0
2020-03-02        1
2020-03-03        2
2020-03-04       11
2020-03-05       23
2020-03-06       31
2020-03-07       76
2020-03-08      106
2020-03-09      142
2020-03-10      173
2020-03-11      220


In [80]:
# NOTE(review): test_data_size=0 presumably puts every row into train_data
# (its shape is checked in the next cell) — confirm in ML.lstm_torch.
state_data_loader.gen_data_sets(test_data_size=0)

In [82]:
# Shape of the training array produced by gen_data_sets().
state_data_loader.train_data.shape

(60, 1)

In [83]:
# Build input/target sequences with sequence_lenth=5 (the keyword's spelling is
# the loader's own). The results are left as numpy arrays in this cell.
X_train, y_train = state_data_loader.set_seq(sequence_lenth=5)

In [84]:
# Check the shape of the windowed inputs.
X_train.shape

(54, 5, 1)

In [74]:
# End-to-end rerun for New York on cumulative counts with a 5-step window.
state_data_loader = LSTM_data_loader(
    df=tdf,
    region_abr='NY',
    country='US',
    region_list=state_abrs,
    state_mapper=state_mapper,
)

# Prepare the series: subset, keep cumulative totals, build the training set.
state_data_loader.subset_df()
state_data_loader.transform_df_datetime(delta=False)
state_data_loader.gen_data_sets(test_data_size=0)

# Windowed numpy arrays -> float tensors.
X_train, y_train = (
    torch.from_numpy(array).float()
    for array in state_data_loader.set_seq(sequence_lenth=5)
)

# sequences=5 mirrors the window length passed to set_seq above.
model = LSTM_Predictor(
    features=1,
    neurons=512,
    sequences=5,
    layers=2,
    dropout=0.3,
)

model, train_hist, test_hist = train_lstm(
    model,
    X_train,
    y_train,
    epochs=300,
)

Data in cumulative
Epoch 0 train loss: 0.7386003732681274
Epoch 40 train loss: 0.7327730655670166
Epoch 80 train loss: 0.7332093715667725
Epoch 120 train loss: 0.7320506572723389
Epoch 160 train loss: 0.7329537272453308
Epoch 200 train loss: 0.7323644757270813
Epoch 240 train loss: 0.732552170753479
Epoch 280 train loss: 0.732114851474762


In [72]:
# Forecast horizon and the window length the model was built with.
seq_length = model.sequences
days_to_predict = 10

# Forecast the next `days_to_predict` values from the training windows.
outs = predict_future(
    n_future=days_to_predict,
    time_data=X_train,
    sequece_lenth=seq_length,
    model=model,
)

# Add a leading axis for the scaler, undo the scaling, then flatten back to 1-D.
scaled_forecast = np.expand_dims(outs, axis=0)
predicted_cases = state_data_loader.scaler.inverse_transform(scaled_forecast).flatten()
print(predicted_cases)

[403.88222404 400.79047695 398.25841486 396.48295268 395.33165053
 394.61937174 394.19203937 393.941468   393.79681364 393.71436588]
