In [1]:
import ast
import os
from time import time, process_time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf

from utility import *
from LSTM import *

# Root directory that every data / prediction / submission path below is built from.
# get_homedir is provided by one of the star imports above (utility or LSTM).
homedir = get_homedir()

In [2]:
# get_FIPS comes from the star imports; it is called with reduced=True and returns two objects.
# NOTE(review): neither FIPS_mapping nor FIPS_full is referenced again in the visible
# cells -- confirm whether this cell is still needed.
FIPS_mapping, FIPS_full = get_FIPS(reduced=True)

In [3]:
# Load, for each of the 5 clusters, the list of county FIPS codes written by the
# preprocessing step. Each file holds a Python list literal of FIPS strings.
FIPS_cluster = []
for c in range(5):
    with open(f'{homedir}/JK/preprocessing/0509/FIPS_cluster_cls={c}.txt', 'r') as f:
        # ast.literal_eval parses literals only, so a corrupted or tampered file
        # cannot execute arbitrary code the way eval() would.
        FIPS_cluster.append(ast.literal_eval(f.read()))
FIPS_cluster[0][:5]

['06049', '08009', '08017', '08061', '08073']

In [4]:
# One preprocessed array per cluster. The displayed shape of cluster 1 is (1004, 63, 43).
# NOTE(review): allow_pickle=True unpickles arbitrary objects -- only safe because these
# files are produced by our own preprocessing step.
cluster_paths = [f'{homedir}/JK/preprocessing/0509/dataList_cls={c}.npy' for c in range(5)]
dataList = [np.load(path, allow_pickle=True) for path in cluster_paths]
dataList[1].shape

(1004, 63, 43)

In [5]:
# Feature-name groups. Their concatenation (demo + mt + mb + ss) fixes the column
# order that target_idx is later looked up in.

# 2010 age brackets; each contributes a male-ratio column followed by a total-ratio column.
age_brackets_2010 = ['<5', '5-9', '10-14', '15-19', '20-24', '25-29', '30-34',
                     '35-44', '45-54', '55-59', '60-64', '65-74', '75-84', '>84']

columns_demo = (
    ['PopRatioMale2017',
     'PopRatio65+2017',
     'PopulationDensityperSqMile2010',
     'MedicareEnrollment,AgedTot2017',
     '#Hospitals',
     '#ICU_beds']
    + [f'{prefix}{bracket}2010'
       for bracket in age_brackets_2010
       for prefix in ('PopRatioMale', 'PopRatio')]
    + ['HeartDiseaseMortality',
       'StrokeMortality',
       'DiabetesPercentage',
       'Smokers_Percentage']
)
columns_mt = ['cases', 'deaths']      # contains the prediction target 'deaths'
columns_mb = ['m50', 'm50_index']
columns_ss = ['seasonality']

In [6]:
# --- Target column -----------------------------------------------------------
# Position of 'deaths' in the concatenated feature list (demo + mt + mb + ss).
feature_columns = [*columns_demo, *columns_mt, *columns_mb, *columns_ss]
target_idx = feature_columns.index('deaths')

# --- Windowing / split settings (all forwarded to train_val_split) -----------
# presumably history_size / target_size are counted in days -- TODO confirm
split_ratio = 0.1
history_size = 7
target_size = 14
step_size = 1

# --- Quantiles to fit; quantileList is supplied by one of the star imports ---
QUANTILE = list(quantileList)

# --- Network / optimiser settings --------------------------------------------
NUM_CELLS = 128   # units of the first LSTM layer; the second layer uses half
dropout = 0.2     # dropout of the second LSTM layer
lr = 0.001        # Adam learning rate

# --- Training schedule -------------------------------------------------------
# NOTE(review): the saved training log below shows "Epoch 1/3", so it was produced
# with a different EPOCHS value than the one set here.
EPOCHS = 100
EVALUATION_INTERVAL = 200   # used as steps_per_epoch in model.fit

In [7]:
# Train one LSTM per (cluster, quantile) pair, save each loss curve as .png/.npy,
# then write every cluster's forecast to CSV under PATH.
PATH = f"{homedir}/JK/prediction/0509"
# exist_ok makes re-runs silent; any other failure (permissions, bad homedir) now
# raises here instead of being printed and resurfacing later at savefig/to_csv.
os.makedirs(PATH, exist_ok=True)


def _quantile_loss_for(q):
    """Return a Keras loss callable that evaluates ``quantileLoss`` at quantile ``q``.

    Passing ``q`` through a factory freezes its value per model, whereas a lambda
    that reads the loop index looks up whatever value the index holds when the
    loss is eventually called.

    NOTE(review): Keras calls a loss as ``fn(y_true, y_pred)``. The arguments are
    forwarded positionally, exactly as the previous lambda did -- confirm that
    ``quantileLoss`` expects ``(quantile, y_true, y_pred)`` in that order.
    """
    return lambda y_true, y_pred: quantileLoss(q, y_true, y_pred)


for c in range(5):
    X_train, y_train, X_val, y_val = train_val_split(
        dataList[c], target_idx, history_size, target_size,
        split_ratio=split_ratio, step_size=step_size)

    # Fit the scaler on the training split only and reuse it for validation.
    # Extra return values are discarded under a named throwaway so that IPython's
    # `_` (last cell output) is not clobbered.
    scaler, *_unused = get_StandardScaler(X_train)
    X_train, y_train = normalizer(scaler, X_train, y_train, target_idx)
    X_val, y_val = normalizer(scaler, X_val, y_val, target_idx)

    train_data, val_data = load_Dataset(X_train, y_train, X_val, y_val)

    # Read the model dimensions off the dataset into loop-local names. The config
    # values history_size / target_size are deliberately NOT overwritten, because
    # they are passed to train_val_split again on the next cluster.
    input_steps = train_data.element_spec[0].shape[1]
    feature_size = train_data.element_spec[0].shape[2]
    output_steps = train_data.element_spec[1].shape[1]

    model_qntl = []

    for i, q in enumerate(QUANTILE):
        # File/label naming assumes the i-th quantile is 10*(i+1) percent
        # (presumably QUANTILE is 0.1 ... 0.9 -- TODO confirm).
        FILEPATH = f"/LSTM_class={c}_qntl={10*(i+1)}"
        print(f'Class={c}, quantile={10*(i+1)} is trained')
        optimizer = tf.keras.optimizers.Adam(learning_rate=lr)

        model = tf.keras.models.Sequential()
        model.add(tf.keras.layers.LSTM(NUM_CELLS, return_sequences=True, input_shape=(input_steps, feature_size)))
        model.add(tf.keras.layers.LSTM(round(NUM_CELLS/2), activation='relu', dropout=dropout))
        model.add(tf.keras.layers.Dense(output_steps))

        model.compile(optimizer=optimizer, loss=_quantile_loss_for(q))
        history = model.fit(train_data, epochs=EPOCHS, steps_per_epoch=EVALUATION_INTERVAL, validation_data=val_data, validation_steps=50)
        model_qntl.append(model)

        LOSS = np.asarray(history.history['loss'])
        VAL_LOSS = np.asarray(history.history['val_loss'])

        # Plot against the number of epochs actually recorded, so the x-axis always
        # matches the history length even if training stopped before EPOCHS.
        epochs_run = range(len(LOSS))
        fig, ax = plt.subplots()
        ax.plot(epochs_run, LOSS, 'b', label='Training loss')
        ax.plot(epochs_run, VAL_LOSS, 'r', label='Validation loss')
        ax.legend()

        fig.savefig(PATH+FILEPATH+'.png')
        # 5 clusters x len(QUANTILE) figures would otherwise stay open for the whole
        # cell. The curve is on disk, so release the figure (it is no longer shown inline).
        plt.close(fig)

        np.save(PATH+FILEPATH+'.npy', np.vstack((LOSS, VAL_LOSS)).astype(np.float32))

    df_future = predict_future(model_qntl, dataList[c], scaler, target_idx, FIPS=FIPS_cluster[c], date_ed=pd.Timestamp('2020-05-02'))
    df_future.to_csv(PATH+f'/LSTM_class={c}_0509.csv', index=False)

Class=0, quantile=10 is trained
Train for 200 steps, validate for 50 steps
Epoch 1/3
Epoch 2/3
Epoch 3/3
Class=0, quantile=20 is trained
Train for 200 steps, validate for 50 steps
Epoch 1/3
Epoch 2/3
Epoch 3/3
Class=0, quantile=30 is trained
Train for 200 steps, validate for 50 steps
Epoch 1/3
Epoch 2/3
Epoch 3/3
Class=0, quantile=40 is trained
Train for 200 steps, validate for 50 steps
Epoch 1/3
Epoch 2/3
Epoch 3/3
Class=0, quantile=50 is trained
Train for 200 steps, validate for 50 steps
Epoch 1/3
Epoch 2/3
Epoch 3/3
Class=0, quantile=60 is trained
Train for 200 steps, validate for 50 steps
Epoch 1/3
Epoch 2/3
Epoch 3/3
Class=0, quantile=70 is trained
Train for 200 steps, validate for 50 steps
Epoch 1/3
Epoch 2/3
Epoch 3/3
Class=0, quantile=80 is trained
Train for 200 steps, validate for 50 steps
Epoch 1/3
Epoch 2/3
Epoch 3/3
Class=0, quantile=90 is trained
Train for 200 steps, validate for 50 steps
Epoch 1/3
Epoch 2/3
Epoch 3/3
Saving future prediction.
Class=1, quantile=10 is train

In [6]:
# Stitch the per-cluster forecast CSVs into one frame.
cluster_forecasts = []
rows_so_far = 0
for c in range(5):
    cluster_forecasts.append(pd.read_csv(f'{homedir}/JK/prediction/0509/LSTM_class={c}_0509.csv', parse_dates=[0]))
    rows_so_far += len(cluster_forecasts[-1])
    print(rows_so_far)  # running row count after each cluster is added

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and it copied the
# whole frame on every iteration. A single concat keeps each file's original row
# labels, matching append's default (ignore_index=False).
forecasted = pd.concat(cluster_forecasts)
# correct_FIPS is provided by the star imports and is applied to every 'fips' value.
forecasted['fips'] = forecasted['fips'].apply(correct_FIPS)
forecasted.head()

2744
16800
41776
43176
43596


Unnamed: 0,date,fips,10,20,30,40,50,60,70,80,90
0,2020-05-03,6049,-1.5e-05,-4.793238e-05,-4e-06,-1.197355e-05,2e-06,3.127055e-05,2e-06,1.855148e-05,1.646625e-05
1,2020-05-04,6049,-4e-06,-1.884066e-05,5e-06,-3.372226e-05,-1e-06,-3.580935e-07,-6e-06,6.146729e-07,8.833595e-07
2,2020-05-05,6049,-1.5e-05,-4.255725e-05,-2.9e-05,-8.409843e-07,-1.4e-05,3.035646e-06,1.1e-05,2.536178e-05,4.005106e-05
3,2020-05-06,6049,5e-06,-3.119791e-05,-6e-06,-4.197983e-05,-1.5e-05,1.506833e-05,-8e-06,5.352264e-05,3.483007e-05
4,2020-05-07,6049,-3.1e-05,4.712492e-07,-2.7e-05,-4.528556e-06,-2.5e-05,1.567975e-05,5e-06,-4.617032e-06,3.235601e-05


In [7]:
# fix_FIPS and to_multi_idx come from the star imports; their internals are not visible here.
# The displayed result is indexed by an 'id' string of the form '<date>-<fips>' with one
# column per quantile label (10 ... 90).
# NOTE(review): `forecasted` is rebound to its own transformed value, so re-running this
# cell applies fix_FIPS twice -- confirm that fix_FIPS is idempotent.
forecasted = fix_FIPS(forecasted, fipslabel='fips', datelabel='date')
submission_df = to_multi_idx(forecasted, fipslabel='fips', datelabel='date')
submission_df.head()

Unnamed: 0_level_0,10,20,30,40,50,60,70,80,90
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2020-05-03-1001,2.087624,2.033535,1.936095,1.785388,1.848744,2.660135,2.985103,2.370857,2.59804
2020-05-03-1003,3.259027,2.849348,3.743574,2.131448,3.236057,4.783477,2.758943,1.873414,2.659592
2020-05-03-1005,-0.001873,0.058421,-0.001209,0.085599,-0.001467,0.009388,0.07606,0.27274,0.00983
2020-05-03-1007,5.3e-05,-0.000286,-0.000574,-0.003864,0.021152,-8.4e-05,0.437375,0.483777,0.950784
2020-05-03-1009,-1e-06,-0.000303,-0.000323,0.003381,-0.001314,0.003189,0.688213,0.000175,0.012162


In [9]:
# Baseline submission table, keyed by the 'id' column ('<date>-<fips>' strings).
# Chaining set_index yields the same frame as the in-place call without mutating
# an intermediate object.
base_pred = pd.read_csv(f'{homedir}/JK/prediction/0509/base_prediction.csv').set_index('id')
base_pred.head()

Unnamed: 0_level_0,10,20,30,40,50,60,70,80,90
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2020-04-01-10001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-04-01-10003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-04-01-10005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-04-01-1001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-04-01-1003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# Overwrite baseline rows in place with the LSTM forecasts. DataFrame.update aligns on
# index/column labels and copies only non-NA values; ids present in submission_df but
# absent from base_pred are silently ignored.
# NOTE(review): matching relies on both frames using the same '<date>-<fips>' id format
# (e.g. no zero-padding differences in the FIPS part) -- confirm.
base_pred.update(submission_df)

In [13]:
# Turn the 'id' index back into a regular column and write the final submission file.
# A descriptive name replaces `_`, which IPython uses for the last cell output;
# building and saving in one cell also removes the cross-cell dependency on it.
submission_out = base_pred.reset_index()
submission_out.to_csv(f'{homedir}/submissions/0509_LSTM_for_comparison.csv', index=False)