# Settings

In [4]:
import pandas as pd
import numpy as np
import math

from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.preprocessing import MinMaxScaler

from matplotlib import pyplot
%matplotlib inline

# Read dataset

In [2]:
# Load the raw table and keep everything except its first column.
# NOTE(review): the first column is presumably a saved row index - confirm.
raw_table = pd.read_csv('../RNN_data_version1.csv')
df = raw_table.iloc[:, 1:]

In [None]:
# Preview the first 20 rows of the loaded table.
df.head(20)

In [None]:
# (rows, columns) of the table after the first CSV column was dropped.
df.shape

# Make data ready for input

The data contains 1181 samples, each consisting of 5 timesteps. Every timestep has 19 numerical features.

The features are min-max normalised to the range (0, 1); this helps to counter the vanishing gradient problem, among other advantages.

In [5]:
# Split into train and test set (roughly 80% / 20% of the rows).
dataset = df.iloc[:,:].values
# A seeded generator keeps the split identical on every Restart & Run All.
rand_split = np.random.RandomState(42).rand(len(dataset))
train_list = rand_split < 0.8
test_list = rand_split >= 0.8

# normalize features: the scaler is fitted on the training rows only (so no
# information from the test rows leaks into it) and the SAME fitted scaling is
# then applied to the test rows. Without this second step the network would be
# evaluated on inputs and targets in a different range than it was trained on.
scaler = MinMaxScaler()
train = scaler.fit_transform(dataset[train_list])
test = scaler.transform(dataset[test_list])

# split into input and output (the last column is the target)
train_X, train_y = train[:,:-1], train[:,-1]
test_X, test_y = test[:,:-1], test[:,-1]

# reshape input to be 3D [samples, timesteps, features]
train_X = train_X.reshape((train_X.shape[0], 5, 19))
test_X = test_X.reshape((test_X.shape[0], 5, 19))
print(train_X.shape, train_y.shape, test_X.shape, test_y.shape)

(933, 5, 19) (933,) (248, 5, 19) (248,)


In [22]:
# Squared (max - min) range the scaler recorded for column index 95, i.e. the
# last column, which the split cell uses as the target.
np.square(scaler.data_max_[95] - scaler.data_min_[95])

36.0

# Train model

Many-to-one LSTM recurrent neural network with 32 neurons, dropout of 20% for regularization and adam optimizer.

### Long Short-Term Memory (LSTM) network

The following figure shows the operations of an LSTM cell.

<img src="images/lstm.png" style="width:700px;height:300px;">

### Nested k-fold cross-validation

This is done to compare the performance of different models.

In [8]:
# function to create model for KerasRegressor
def create_model(dropout_rate=0.2, neurons=32):
    """Build and compile a many-to-one LSTM regression network.

    Parameters
    ----------
    dropout_rate : float, optional
        Passed as ``dropout`` to the LSTM layer. Defaults to 0.2.
    neurons : int, optional
        Passed as ``units`` to the LSTM layer. Defaults to 32.

    Returns
    -------
    keras.models.Sequential
        A compiled model: one LSTM layer declared with ``input_shape=(5, 19)``
        followed by a single-unit Dense layer, compiled with 'mse' loss, the
        'adam' optimizer and 'mse' as reported metric.
    """
    # Function-scope imports: Keras is only imported when a model is built.
    from keras.models import Sequential
    from keras.layers import Dense, LSTM

    model = Sequential()
    model.add(LSTM(units=neurons, input_shape=(5, 19), dropout=dropout_rate))
    model.add(Dense(1))
    model.compile(loss='mse', optimizer='adam', metrics=['mse'])
    return model

# nested cross-validation: a grid search scored by an outer 3-fold loop
def train_model():
    """Score the hyper-parameter search itself with an outer 3-fold CV.

    Wraps ``create_model`` in a ``KerasRegressor``, puts a ``GridSearchCV``
    over batch size, epochs, dropout rate and LSTM size around it, and hands
    that search object to ``cross_val_score`` together with the notebook-level
    ``train_X`` / ``train_y``.

    Returns
    -------
    The value returned by ``cross_val_score`` (one score per outer fold).
    """
    from keras.wrappers.scikit_learn import KerasRegressor

    # candidate values tried by the inner grid search
    search_space = {
        'batch_size': [64],
        'epochs': [128, 256],
        'dropout_rate': [0.0, 0.2],
        'neurons': [32, 64, 128],
    }
    regressor = KerasRegressor(build_fn=create_model, verbose=0)
    inner_search = GridSearchCV(estimator=regressor, param_grid=search_space, n_jobs=1)
    return cross_val_score(inner_search, train_X, train_y, cv=3, verbose=1)
    
# Runs the full nested search; the recorded run reported about 43.8 minutes.
results = train_model()



[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 43.8min finished


In [47]:
# Convert the fold scores back to the target's original units: take their
# magnitude and multiply by the squared (max - min) range of column 95.
target_range = scaler.data_max_[95] - scaler.data_min_[95]
metrics = np.absolute(results) * np.square(target_range)
print(np.mean(metrics))
print(metrics)

0.536661780089
[ 0.70661271  0.37828904  0.5250836 ]


In [None]:
# Box plot of the rescaled per-fold errors held in `metrics`.
pyplot.boxplot(metrics)

### K-fold cross-validation

Hyperparameter grid search tuning

In [None]:
# function to create model for KerasRegressor
# (same network as the create_model defined in the nested cross-validation cell)
def create_model(dropout_rate=0.2, neurons=32):
    """Build and compile a many-to-one LSTM regression network.

    Parameters
    ----------
    dropout_rate : float, optional
        Passed as ``dropout`` to the LSTM layer. Defaults to 0.2.
    neurons : int, optional
        Passed as ``units`` to the LSTM layer. Defaults to 32.

    Returns
    -------
    keras.models.Sequential
        A compiled model: one LSTM layer declared with ``input_shape=(5, 19)``
        followed by a single-unit Dense layer, compiled with 'mse' loss, the
        'adam' optimizer and 'mse' as reported metric.
    """
    # Function-scope imports: Keras is only imported when a model is built.
    from keras.models import Sequential
    from keras.layers import Dense, LSTM

    model = Sequential()
    model.add(LSTM(units=neurons, input_shape=(5, 19), dropout=dropout_rate))
    model.add(Dense(1))
    model.compile(loss='mse', optimizer='adam', metrics=['mse'])
    return model

# hyper-parameter tuning: one grid search fitted on the training data
def train_model():
    """Fit a grid search over the LSTM hyper-parameters.

    Wraps ``create_model`` in a ``KerasRegressor`` and fits a ``GridSearchCV``
    (``n_jobs=-1``) over batch size, epochs, dropout rate and LSTM size on the
    notebook-level ``train_X`` / ``train_y``.

    Returns
    -------
    The value returned by ``GridSearchCV.fit``.
    """
    from keras.wrappers.scikit_learn import KerasRegressor

    # candidate values tried by the grid search
    search_space = {
        'batch_size': [64],
        'epochs': [128, 256],
        'dropout_rate': [0.0, 0.2],
        'neurons': [32, 64, 128],
    }
    regressor = KerasRegressor(build_fn=create_model, verbose=0)
    search = GridSearchCV(estimator=regressor, param_grid=search_space, n_jobs=-1)
    return search.fit(train_X, train_y)
    
# NOTE: this rebinds `results`, which the nested cross-validation cell also
# assigns; here it holds the return value of the grid search fit.
results = train_model()


In [None]:
# Report the best configuration, then mean (std) test score per configuration.
print("Best: %f using %s" % (results.best_score_, results.best_params_))
cv_table = results.cv_results_
means, stds, params = (
    cv_table[key] for key in ('mean_test_score', 'std_test_score', 'params')
)
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

In [None]:
# design neural net: one LSTM layer (128 units, no dropout) and a single-unit Dense output
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout

model = Sequential()
model.add(LSTM(units=128, input_shape=(5, 19), dropout=0.0))
model.add(Dense(units=1))
model.compile(loss='mse', optimizer='adam')

# fit neural net, with validation_split=0.2 so validation loss is recorded per epoch
fit_options = dict(epochs=128, batch_size=64, validation_split=0.2, verbose=2, shuffle=True)
history = model.fit(train_X, train_y, **fit_options)

In [None]:
# Training and validation loss per epoch, as recorded by model.fit.
for history_key, curve_label in (('loss', 'train'), ('val_loss', 'validation')):
    pyplot.plot(history.history[history_key], label=curve_label)
pyplot.legend()
pyplot.show()

# Calculate and compare model accuracy

In [32]:
def MAE(p, o):
    """Mean absolute error between predictions ``p`` and observations ``o``.

    Both arguments may be any array-like of equal shape (lists included);
    the mean is taken along the first axis.
    """
    p, o = np.asarray(p, dtype=float), np.asarray(o, dtype=float)
    return np.mean(np.abs(p - o), axis=0)


def MSE(p, o):
    """Mean squared error between predictions ``p`` and observations ``o``.

    Both arguments may be any array-like of equal shape (lists included);
    the mean is taken along the first axis. No square root is applied.
    """
    p, o = np.asarray(p, dtype=float), np.asarray(o, dtype=float)
    return np.mean((p - o) ** 2, axis=0)


# Kept for backward compatibility: despite its name this helper has never taken
# the square root, and the cells that call it label the result 'MSE'.
RMSE = MSE

In [30]:
# Columns compared for the baseline, read from the unscaled `df`.
# NOTE(review): column 95 is the last column (the model target); column 92 is
# presumably the mood at the last observed timestep - confirm. Under that
# reading the two names look swapped, which does not affect the reported
# errors because absolute and squared differences are symmetric.
originals = df.iloc[:,92].values
predictions = df.iloc[:,95].values

## Baseline accuracy

The baseline accuracy is calculated by predicting the mood at time t+1 with the mood at time t. Both MAE and MSE are shown (the `RMSE` helper computes the mean squared error, without taking the square root).

In [34]:
# Baseline errors. The `RMSE` helper does not take a square root, hence the 'MSE' label.
print('MAE: {}'.format(MAE(predictions, originals)))
print('MSE: {}'.format(RMSE(predictions, originals)))

MAE: 0.550179943390397
MSE: 0.5670021934142934


Check how to statistically define a confidence interval for RMSE baseline value. Two approaches can be considered.

- Assume that the errors (predictions - originals) are normally distributed

\begin{equation}
H \leftarrow 60 + \frac{30(B - R)}{V_{max} - V_{min}}, \quad \text{if } V_{max} = G
\end{equation}

- Use bootstrap to estimate confidence interval of RMSE

In [None]:
# Baseline residuals and their mean.
x = (predictions - originals)
np.mean(x)

In [None]:
# Probability plot of the residuals `x` against a normal distribution.
import scipy.stats as stats
stats.probplot(x, dist="norm", plot=pyplot)
pyplot.show()

In [None]:
# Histogram of the residuals `x`.
pyplot.hist(x)

## Model accuracy

Predict values for the test set with the model and then scale them back to the original units.

In [None]:
# Make predictions. The network was trained on min-max scaled data, so the
# held-out rows are scaled here with the scaler fitted on the training rows
# before they are fed to the model. Both the model input and the reference
# target are derived from the raw `dataset[test_list]` rows, so this cell does
# not depend on whether `test_X` / `test_y` hold scaled values.
test_raw = dataset[test_list]
test_scaled = scaler.transform(test_raw)
test_X_scaled = test_scaled[:, :-1].reshape((test_scaled.shape[0], 5, 19))

trainPredict = model.predict(train_X)
testPredict = model.predict(test_X_scaled)

# Invert the scaling of the predictions. The scaler expects a full row, so the
# predicted target is appended to the scaled features and only the last column
# of the result is kept.
inverse = np.concatenate((test_scaled[:, :-1], testPredict), axis=1)
inverse2 = scaler.inverse_transform(inverse)
testPrediction = inverse2[:,-1]

# Reference values in original units: read directly from the raw rows rather
# than inverse-transforming a target that may never have been scaled.
testY = test_raw[:, -1]

original = test_y[np.newaxis].T

In [None]:
# Test-set errors in original units. The `RMSE` helper does not take a square root, hence the 'MSE' label.
print('MAE: {}'.format(MAE(testPrediction, testY)))
print('MSE: {}'.format(RMSE(testPrediction, testY)))