In [1]:
!ls

[34mcovid19-global-forecasting-week-1[m[m [34mtmp[m[m
eda.ipynb


In [2]:
!ls covid19-global-forecasting-week-1

submission.csv test.csv       train.csv


In [1]:
data_dir = "./covid19-global-forecasting-week-1/"

In [15]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import datetime
import tensorflow as tf
import math

# Read dataset

In [89]:
train_df_raw = pd.read_csv(data_dir+"train.csv")
test_df_raw = pd.read_csv(data_dir+"test.csv")
all_df = train_df_raw
test_df = test_df_raw

# Set parameters

In [87]:
# paramerers
maxlen = 30
hidden_number = 32
input_number = 5
output_number = 2
batch_size = 32
epochs = 50
lr = 0.001

# Normalization

In [17]:
def map_datetime(date):
    return (date - datetime.datetime.strptime('2020-01-22', "%Y-%m-%d")).days

In [18]:
def to_datetime(date):
    return datetime.datetime.strptime(date, "%Y-%m-%d")

In [90]:
all_df["ConfirmedCases"] = (all_df["ConfirmedCases"] + 1).map(math.log10)
cases_max = all_df["ConfirmedCases"].max()
fatal_max = all_df["Fatalities"].max()

all_df["Lat"] = all_df["Lat"]/180.
all_df["Long"] = all_df["Long"]/180.
all_df["ConfirmedCases"] = all_df["ConfirmedCases"] / cases_max
all_df["Fatalities"] = all_df["Fatalities"] / fatal_max

all_df["Date"] = all_df["Date"].map(to_datetime)
all_df["Date"] = all_df["Date"].map(map_datetime)
date_max = all_df["Date"].max()
all_df["Date"] = all_df["Date"] / date_max

date_unit = all_df.iloc[1]["Date"] - all_df.iloc[0]["Date"]

val_df = all_df[all_df["Date"] > (all_df.Date.max() - date_unit*(maxlen + 1))]
train_df = all_df.drop(all_df[all_df["Date"] == all_df["Date"].max()].index)

In [91]:
test_df["Lat"] = test_df["Lat"]/180.
test_df["Long"] = test_df["Long"]/180.
test_df["Date"] = test_df["Date"].map(to_datetime)
test_df["Date"] = test_df["Date"].map(map_datetime)
test_df["Date"] = test_df["Date"] / date_max

# Preprocessing
## for LSTM

In [92]:
def make_sequences(train_df):
    inputs = []
    targets = []
    for i in range(len(train_df) - maxlen - 1):
        if train_df.iloc[i]["Lat"] == train_df.iloc[i+maxlen]["Lat"] and \
           train_df.iloc[i]["Long"] == train_df.iloc[i+maxlen]["Long"]:
            inputs.append(np.array(train_df.iloc[i:i+maxlen][["Date", "Lat", "Long", "ConfirmedCases", "Fatalities"]]).tolist())
            targets.append(np.array(train_df.iloc[i+maxlen][["ConfirmedCases", "Fatalities"]]).tolist())
    return inputs, targets

In [93]:
train_inputs, train_targets = make_sequences(train_df)


In [94]:
val_inputs, val_targets = make_sequences(val_df)

# Make model

In [24]:
model = tf.keras.Sequential()
model.add(tf.keras.layers.LSTM(hidden_number, batch_input_shape=[None, maxlen, input_number], return_sequences=True))
model.add(tf.keras.layers.LSTM(hidden_number))
model.add(tf.keras.layers.Dense(output_number, activation="sigmoid"))

In [25]:
optimizer = tf.keras.optimizers.Adam(lr=lr)
model.compile(loss="mean_squared_error", optimizer=optimizer)

In [26]:
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='auto', patience=5)
model.fit(train_inputs, train_targets,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(val_inputs, val_targets),
          callbacks = [early_stopping]
          )

Train on 7953 samples, validate on 312 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50


<tensorflow.python.keras.callbacks.History at 0xb469d1ac8>

# stateful for prediction

In [74]:
model2 = tf.keras.Sequential()
model2.add(tf.keras.layers.LSTM(hidden_number, batch_input_shape=[1, maxlen, input_number], return_sequences=True, stateful=True))
model2.add(tf.keras.layers.LSTM(hidden_number, stateful=True))
model2.add(tf.keras.layers.Dense(output_number, activation="sigmoid"))

In [13]:
#model.save_weights('./tmp/param.hdf5')
model.load_weights('./tmp/param.hdf5')

In [76]:
model2.load_weights('./tmp/param.hdf5')

In [30]:
date_unit = train_df.iloc[2]["Date"] - train_df.iloc[1]["Date"]

In [76]:
idx = test_df.groupby(["Province/State", "Country/Region"]).count().index[0]

In [79]:
idx[1]

'US'

In [119]:
test_df = test_df.fillna({"Province/State": "NAN"})
train_df = train_df.fillna({"Province/State": "NAN"})

In [120]:
results = []
for idx in test_df.groupby(["Province/State", "Country/Region"]).count().index:
    test_df_on_idx = test_df[(test_df["Province/State"] == idx[0]) &
                             (test_df["Country/Region"] == idx[1])]
    train_df_on_idx = train_df[(train_df["Country/Region"] == idx[1]) &
                               (train_df["Province/State"] == idx[0])]
    inputs = np.array(train_df_on_idx[["Date", "Lat", "Long", "ConfirmedCases", "Fatalities"]])[-maxlen:]
    inputs = inputs.reshape(maxlen, input_number)
    for day in range(43):
        result = model.predict(np.array(inputs).reshape(1, maxlen, input_number)).reshape(-1)
        inputs = np.concatenate((inputs[1:], np.append(inputs[-1, :3], result).reshape(1, input_number)), axis=0)
        results.append([10**(result[0]*cases_max), result[1]*fatal_max])

In [121]:
test_df[], len(results)

(284, 12212)

In [123]:
submit_df = pd.read_csv(data_dir+"submission.csv")

In [134]:
np.array(results)[np.array(results) == np.nan]

array([], dtype=float64)

In [141]:
results[0]

[122.82014843514558, 6.496423825621605]

In [154]:
cases = []
fatals = []
for i in range(len(results)):
    n = results[i][0] 
    f = results[i][1]
    try:
        cases.append(int(n))
    except:
        cases.append(0)
    
    try:
        fatals.append(int(f))
    except:
        fatals.append(0)

In [155]:
submit_df["ConfirmedCases"] = cases
submit_df["Fatalities"] = fatals

In [157]:
submit_df.to_csv("submission.csv")