In [None]:
import scipy as sp
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import statsmodels as stats
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import theano 
import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.layers import LSTM
%matplotlib inline
import myutil_lstm as myutil

Using TensorFlow backend.


In [None]:
# Re-import the local helper module (imported above as `myutil`) so that edits
# made to it are picked up without restarting the kernel. The return value is
# assigned to `_` so the cell does not display the module repr.
import importlib
_ = importlib.reload(myutil)

## Get data and take first look at dataset

In [None]:
# Read the three raw CSV files through the project helper, in the order
# training features, training labels, test features.
dfx_train, dfy_train, dfx_test = (
    myutil.get_indexed_dataset(csv_path)
    for csv_path in (
        'data/dengue_features_train.csv',
        'data/dengue_labels_train.csv',
        'data/dengue_features_test.csv',
    )
)

# Join the labels onto the training features (pd.merge with no keys joins on
# the columns both frames share) so they can be explored together later on.
dftrain = myutil.set_index(pd.merge(dfx_train, dfy_train))
dftrain.head()

## Deal with NaN on both training and test datasets together

In [None]:
# Training and test rows are stacked into one frame so that missing values can
# be treated for both partitions together.
# The test features have no label column, so a placeholder total_cases of -1 is
# added first: it lets the two frames concatenate cleanly and marks the test
# rows so they can be separated from the training rows again later.
dfx_test['total_cases'] = -1
dfall = myutil.set_index(pd.concat([dftrain, dfx_test], axis=0)).sort_index(axis=0)
dfall.head()

In [None]:
# Number of missing values per column in the stacked train + test frame.
dfall.isnull().sum()

In [None]:
# Fill the missing values through the project helper, handing it a copy of the
# stacked frame. NOTE(review): the helper's name suggests NaNs are replaced by
# a per-week mean; confirm against myutil.
dfall = myutil.set_nan_to_week_mean(dfall.copy())
# Re-count the missing values per column to see what the fill left behind.
dfall.isnull().sum()

## Split dataset

In [None]:
# Split the stacked frame into one frame per city. The unpacking order assumes
# the helper returns the 'iq' frame first and the 'sj' frame second.
# TODO: confirm the return order against myutil.
dfall_iq, dfall_sj = myutil.split_dataset_by_city(dfall)

## Continue preprocessing

In [None]:
# drop unnecessary columns
def drop_columns(df):
    """Return ``df`` without the ``city``, ``year`` and ``week_start_date`` columns.

    The caller's frame is left untouched: a new DataFrame is returned instead
    of dropping in place, so the function is safe to call on a frame that is
    still referenced elsewhere.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the three columns listed above.

    Returns
    -------
    pandas.DataFrame
        A new frame with the remaining columns in their original order.

    Raises
    ------
    KeyError
        If any of the three columns is missing from ``df``.
    """
    return df.drop(columns=['city', 'year', 'week_start_date'])

# Strip the identifier columns from each city's frame; every call is handed a
# copy, and the cleaned result replaces the per-city frame.
dfall_iq, dfall_sj = (
    drop_columns(city_frame.copy()) for city_frame in (dfall_iq, dfall_sj)
)

## Restore training and test partitions (now that NaNs have been properly filled)

In [None]:
# Test rows were tagged with total_cases == -1 before the frames were stacked,
# so the sign of total_cases separates the two partitions.
# Training rows are selected with >= 0 (not > 0): a week with zero reported
# cases is a genuine training observation, and a strict > 0 filter would drop
# it from the training set while it also fails the < 0 test filter.
dftrain_iq = dfall_iq[dfall_iq['total_cases'] >= 0].copy()
dftrain_sj = dfall_sj[dfall_sj['total_cases'] >= 0].copy()

# The placeholder label carries no information for the test rows, so it is
# removed again once they have been selected.
dftest_iq = dfall_iq[dfall_iq['total_cases'] < 0].drop(columns='total_cases')
dftest_sj = dfall_sj[dfall_sj['total_cases'] < 0].drop(columns='total_cases')

In [None]:
# split dataset into training and validation partitions
def prep_LSTM_run(city_data, timesteps):
    """Split one city's array into training/validation sets shaped for an LSTM.

    Parameters
    ----------
    city_data : 2-D numpy array
        The last column is used as the target; every other column is a
        feature. The number of feature columns must be a multiple of
        ``timesteps``.
    timesteps : int
        Number of time steps each sample's feature row is folded into.

    Returns
    -------
    X_train, X_valid, y_train, y_valid
        ``X_*`` are 3-D arrays ``[no_samples, timesteps, no_features]`` and
        ``y_*`` are the matching 1-D targets. One third (0.33) of the rows go
        to the validation set, with a fixed ``random_state`` of 42.

    Raises
    ------
    ValueError
        If the feature columns cannot be divided evenly into ``timesteps``.
    """
    X = city_data[:, :-1]
    y = city_data[:, -1]

    # Fail early with a clear message instead of letting a truncated integer
    # reach reshape() and fail there with a generic size mismatch.
    n_columns = X.shape[1]
    if n_columns % timesteps != 0:
        raise ValueError(
            "number of feature columns ({}) is not a multiple of timesteps ({})".format(
                n_columns, timesteps))
    features_per_step = n_columns // timesteps

    # NOTE(review): train_test_split shuffles rows by default, so validation
    # samples are interleaved in time with training samples. Confirm this is
    # intended for time-series data.
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.33, random_state=42)

    # LSTM expects 3D input of the form [no_samples, timesteps, no_features]
    X_train = X_train.reshape(X_train.shape[0], timesteps, features_per_step)
    X_valid = X_valid.reshape(X_valid.shape[0], timesteps, features_per_step)

    return X_train, X_valid, y_train, y_valid


def LSTM_run_iq(nptrain, timesteps=1, epochs=50, batch_size=32, exploring=False):
    """Build, train and return the stacked-LSTM regressor for the 'iq' city.

    The network is three LSTM layers of decreasing width followed by a single
    linear output unit, trained with MAE loss and the rmsprop optimizer. The
    training and validation loss curves are plotted after fitting.

    Parameters
    ----------
    nptrain : 2-D numpy array
        Training data as accepted by ``prep_LSTM_run`` (target in the last
        column).
    timesteps : int
        Number of time steps per sample, forwarded to ``prep_LSTM_run``.
    epochs, batch_size : int
        Forwarded to ``model.fit``.
    exploring : bool
        When true, print the partition shapes and the final train/validation
        loss.

    Returns
    -------
    The fitted keras ``Sequential`` model.
    """
    # create partitions for training
    X_train, X_valid, y_train, y_valid = prep_LSTM_run(nptrain, timesteps)
    if exploring:
        print(X_train.shape, X_valid.shape, y_train.shape, y_valid.shape)

    # Layer widths are derived from the input window: steps * features is the
    # number of values in one flattened sample.
    n_steps, n_features = X_train.shape[1], X_train.shape[2]
    flat_width = n_steps * n_features

    # build graph
    model = Sequential()
    model.add(LSTM(flat_width * 2, return_sequences=True,
                   input_shape=(n_steps, n_features)))
    model.add(LSTM(flat_width, return_sequences=True))
    model.add(LSTM(n_steps, return_sequences=False))
    model.add(Dense(1, activation='linear'))
    model.compile(loss='mae', optimizer='rmsprop')

    # fit net; shuffle=False keeps the sample order fixed between epochs
    history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size,
                        validation_data=(X_valid, y_valid), verbose=0, shuffle=False)

    # 'val_loss' is measured on the validation partition, not on the held-out
    # test set, so the curve is labelled accordingly.
    plt.plot(history.history['loss'], label='train')
    plt.plot(history.history['val_loss'], label='validation')

    if exploring:
        print("Final loss train: {}".format(history.history['loss'][-1]))
        print("Final loss valid: {}".format(history.history['val_loss'][-1]))

    plt.legend()
    plt.show()

    return model

# Get test dataset, create predictions and save them in the proper submission file format
def LSTM_predict_and_save(df_iq, model_iq, ts_iq, df_sj, model_sj, ts_sj, dftest_iq, dftest_sj, filename):
    """Predict case counts for both cities and write the submission CSV.

    For each city the test frame is preprocessed together with a copy of the
    training frame (via ``myutil.preprocess_test``), reshaped to
    ``[samples, timesteps, features]`` and fed to that city's model. The
    predictions are rounded to integers and floored at zero, because the
    linear output unit can go negative while a case count cannot.

    Parameters
    ----------
    df_iq, df_sj : pandas.DataFrame
        Training frames for each city; copies are passed to the preprocessing
        helper.
    model_iq, model_sj
        Fitted models exposing ``predict``.
    ts_iq, ts_sj : int
        Number of time steps each model was trained with.
    dftest_iq, dftest_sj : pandas.DataFrame
        Test feature frames for each city.
    filename : str
        Destination path of the CSV file.

    Notes
    -----
    The output takes its key columns and header from
    'data/submission_format.csv' and lists the 'sj' rows before the 'iq' rows.
    It assumes each city has as many rows in that file as there are
    predictions, and that ``predict`` returns a 2-D ``(n, 1)`` array; the
    column-wise concatenation fails otherwise.
    """
    def _predict_cases(df_train, df_test, model, timesteps):
        # Preprocess, fold each row into [timesteps, features] and predict.
        nptest = myutil.preprocess_test(df_train.copy(), df_test, timesteps)
        windows = nptest.reshape(nptest.shape[0], timesteps, nptest.shape[1] // timesteps)
        yhat = model.predict(windows)
        # Floor at zero before rounding: negative case counts are invalid.
        return np.maximum(yhat, 0).round().astype('int64')

    cases_iq = _predict_cases(df_iq, dftest_iq, model_iq, ts_iq)
    cases_sj = _predict_cases(df_sj, dftest_sj, model_sj, ts_sj)

    dfsubm = pd.read_csv('data/submission_format.csv')
    key_columns = ['city', 'year', 'weekofyear']

    def _city_rows(city, cases):
        # Key columns of one city's submission rows with its predictions appended.
        keys = dfsubm[dfsubm['city'] == city][key_columns].values
        return np.concatenate((keys, cases), axis=1)

    stacked = np.concatenate((_city_rows('sj', cases_sj), _city_rows('iq', cases_iq)), axis=0)
    dfresults = pd.DataFrame(stacked, columns=dfsubm.columns)
    dfresults.to_csv(filename, index=False)
    

In [None]:
# Number of time steps per sample for the 'iq' model; it is used both to
# preprocess the training frame and to shape the LSTM input.
periods_iq = 2
nptrain_iq = myutil.preprocess(dftrain_iq.copy(), periods_iq)
model_iq = LSTM_run_iq(
    nptrain_iq,
    timesteps=periods_iq,
    epochs=25,
    batch_size=periods_iq * 2,
    exploring=True,
)

In [None]:
def LSTM_run_sj(nptrain, timesteps=1, epochs=50, batch_size=32, exploring=False):
    """Build, train and return the stacked-LSTM regressor for the 'sj' city.

    The network is four LSTM layers of decreasing width followed by a single
    linear output unit, trained with MAE loss and the rmsprop optimizer. The
    training and validation loss curves are plotted after fitting.

    Parameters
    ----------
    nptrain : 2-D numpy array
        Training data as accepted by ``prep_LSTM_run`` (target in the last
        column).
    timesteps : int
        Number of time steps per sample, forwarded to ``prep_LSTM_run``.
    epochs, batch_size : int
        Forwarded to ``model.fit``.
    exploring : bool
        When true, print the partition shapes and the final train/validation
        loss.

    Returns
    -------
    The fitted keras ``Sequential`` model.
    """
    # create partitions for training
    X_train, X_valid, y_train, y_valid = prep_LSTM_run(nptrain, timesteps)
    if exploring:
        print(X_train.shape, X_valid.shape, y_train.shape, y_valid.shape)

    # Layer widths are derived from the input window: steps * features is the
    # number of values in one flattened sample.
    n_steps, n_features = X_train.shape[1], X_train.shape[2]
    flat_width = n_steps * n_features

    # build graph
    model = Sequential()
    model.add(LSTM(flat_width * 4, return_sequences=True,
                   input_shape=(n_steps, n_features)))
    model.add(LSTM(flat_width * 2, return_sequences=True))
    model.add(LSTM(flat_width * 1, return_sequences=True))
    model.add(LSTM(n_steps, return_sequences=False))
    model.add(Dense(1, activation='linear'))
    model.compile(loss='mae', optimizer='rmsprop')

    # fit net; shuffle=False keeps the sample order fixed between epochs
    history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size,
                        validation_data=(X_valid, y_valid), verbose=0, shuffle=False)

    # 'val_loss' is measured on the validation partition, not on the held-out
    # test set, so the curve is labelled accordingly.
    plt.plot(history.history['loss'], label='train')
    plt.plot(history.history['val_loss'], label='validation')

    if exploring:
        print("Final loss train: {}".format(history.history['loss'][-1]))
        print("Final loss valid: {}".format(history.history['val_loss'][-1]))

    plt.legend()
    plt.show()

    return model

In [None]:
# Number of time steps per sample for the 'sj' model; it is used both to
# preprocess the training frame and to shape the LSTM input.
periods_sj = 2
nptrain_sj = myutil.preprocess(dftrain_sj.copy(), periods_sj)
# The batch size is derived from this city's own window length (periods_sj),
# so changing the 'iq' setting no longer alters the 'sj' training run.
model_sj = LSTM_run_sj(
    nptrain_sj,
    timesteps=periods_sj,
    epochs=60,
    batch_size=periods_sj * 2,
    exploring=True,
)

## Get test dataset and create predictions

In [None]:
# Run both city models on their test partitions and write the submission file.
LSTM_predict_and_save(
    df_iq=dftrain_iq,
    model_iq=model_iq,
    ts_iq=periods_iq,
    df_sj=dftrain_sj,
    model_sj=model_sj,
    ts_sj=periods_sj,
    dftest_iq=dftest_iq,
    dftest_sj=dftest_sj,
    filename="data/submission_20171108_lstm_1.csv",
)