# RNN
This file contains all code to train an RNN on our dataset.

## Classification

In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import f1_score, mean_squared_error, mean_absolute_error, r2_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, LSTM, GRU, Dense, Input
from tensorflow.keras.utils import to_categorical


# Load the classification feature table; the participant split below is derived from it.
df = pd.read_csv("../data/features_classification.csv")


In [55]:
# Split by participant (70% / 15% / 15%) so that all rows of one participant
# end up in exactly one of train / validation / test.
participant_ids = df['id'].unique()
np.random.seed(1934)
np.random.shuffle(participant_ids)

n_participants = len(participant_ids)
train_end = int(0.7 * n_participants)
val_end = int(0.85 * n_participants)

train_ids = participant_ids[:train_end]
val_ids = participant_ids[train_end:val_end]
test_ids = participant_ids[val_end:]


def _rows_for(ids):
    """Return the rows of df whose 'id' is one of the given participant ids."""
    return df[df['id'].isin(ids)]


train_df = _rows_for(train_ids)
val_df = _rows_for(val_ids)
test_df = _rows_for(test_ids)

In [56]:
import numpy as np
import tensorflow as tf
import random
import os

def set_seeds(seed=1934):
    """Seed Python's `random`, NumPy and TensorFlow, and set PYTHONHASHSEED.

    Args:
        seed: Integer used for every seeding call (default 1934).
    """
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

In [57]:
def train_test_class_model (train_df, test_df, model_type, model_name, n_epochs, n_hidden, save_model=False):
    """Train a one-layer recurrent classifier and score it on a held-out frame.

    Args:
        train_df: DataFrame used for fitting. Must contain 'target' and the
            bookkeeping columns that are dropped below.
        test_df: DataFrame with the same columns, used only for evaluation.
        model_type: Keras recurrent layer class (SimpleRNN, LSTM or GRU).
        model_name: Suffix of the file name used when the model is saved.
        n_epochs: Number of training epochs.
        n_hidden: Number of units in the recurrent layer.
        save_model: When truthy, save the fitted model to
            ./models/rnn_class_<model_name>.keras.

    Returns:
        Weighted F1 score of the predictions on test_df.
    """

    set_seeds(1934)

    # Columns that are identifiers, the label itself, or excluded features.
    non_feature_columns = ['Unnamed: 0', 'day', 'day_id', 'id', 'target', 'appCat.entertainment']
    X_train = train_df.drop(columns=non_feature_columns)
    X_test = test_df.drop(columns=non_feature_columns)

    # Fit the scaler on the training rows only, then apply it to both sets.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Encode labels as 0..k-1; the encoder is fitted on the training labels only.
    le = LabelEncoder()
    y_train_encoded = le.fit_transform(train_df['target'])
    # The integer codes are exactly what f1_score needs, so the test labels
    # are not one-hot encoded and argmax-ed back.
    y_true = le.transform(test_df['target'])

    y_train_cat = to_categorical(y_train_encoded)

    # Each row becomes a sequence of length 1: (samples, timesteps=1, features).
    X_train_rnn = X_train_scaled.reshape((X_train_scaled.shape[0], 1, X_train_scaled.shape[1]))
    X_test_rnn = X_test_scaled.reshape((X_test_scaled.shape[0], 1, X_test_scaled.shape[1]))

    model = Sequential([
        Input(shape=(1, X_train.shape[1])),
        model_type(n_hidden, activation='relu'),
        Dense(y_train_cat.shape[1], activation='softmax')
    ])

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    # NOTE(review): validation_split=0.2 withholds part of the training rows from
    # fitting, but nothing in this function reads the validation metrics — confirm
    # this is intended.
    model.fit(X_train_rnn, y_train_cat, epochs=n_epochs, batch_size=16, validation_split=0.2, verbose=0)

    if save_model:
        # Make sure the target directory exists before writing the model file.
        os.makedirs('./models', exist_ok=True)
        model.save(f'./models/rnn_class_{model_name}.keras')

    y_pred_proba = model.predict(X_test_rnn)
    y_pred = np.argmax(y_pred_proba, axis=1)

    return f1_score(y_true, y_pred, average='weighted')

In [47]:
# Single trial run with a SimpleRNN, scored on the test split.
model_type, model_name = SimpleRNN, "simplernn"
n_hidden, n_epochs = 64, 30

train_test_class_model(
    train_df=train_df,
    test_df=test_df,
    model_type=model_type,
    model_name=model_name,
    n_epochs=n_epochs,
    n_hidden=n_hidden,
    save_model=True,
)

[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step


0.46943569948968666

### Hyperparameter tuning
* Type of RNN
* Number of epochs
* Hidden units per layer

In [48]:
RNN_types = [SimpleRNN, LSTM, GRU]
n_epochs = [1, 10, 20, 40, 60, 80, 100, 120, 140, 160, 180, 200]
n_hidden = [8, 16, 32, 64, 128]

# Grid search over layer type, hidden units and epochs; every configuration is
# trained on train_df and scored on val_df. One CSV row per configuration:
# layer class, hidden units, epochs, weighted F1.
with open("./hyperparameter_tuning/RNN_classification.csv", "w") as outfile:
    # The loop variable is deliberately not called `type`: at notebook scope
    # that would replace the builtin for every cell executed afterwards.
    for rnn_type in RNN_types:
        print(rnn_type)
        for n in n_hidden:
            print(n)
            for e in n_epochs:
                print(e)
                model_name = f"{rnn_type.__name__.lower()}_n_hidden_{n}_n_epochs_{e}"
                f1 = train_test_class_model(train_df, val_df, rnn_type, model_name, e, n)
                outfile.write(f"{rnn_type},{n},{e},{f1}\n")
                # Push each row to disk right away so finished runs are kept
                # even if the kernel dies part-way through the grid.
                outfile.flush()
            

<class 'keras.src.layers.rnn.simple_rnn.SimpleRNN'>
8
1
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
10
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
40
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
60
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
80
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
120
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
140
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
160
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
180
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
200
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
16
1
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━

In [9]:
# The grid-search log has no header row; column 3 holds the weighted F1 score.
results_df = pd.read_csv("./hyperparameter_tuning/RNN_classification.csv", header=None)

f1_scores = results_df[3]
best_F1 = f1_scores.max()
# Keep every configuration that reaches the maximum, not just the first one.
best_row = results_df.loc[f1_scores == best_F1]

print(f"Best F1 score: {best_F1}")
print(best_row)

Best F1 score: 0.5557085484796328
                                           0   1    2         3
83  <class 'keras.src.layers.rnn.lstm.LSTM'>  16  200  0.555709


### Final model

In [58]:
# Configuration with the highest validation F1 in the grid-search log.
model_type, model_name = LSTM, "classification-final-model"
n_hidden, n_epochs = 16, 200

# Refit on train + validation rows, then score on the test split.
combined_df = pd.concat([train_df, val_df])

train_test_class_model(
    train_df=combined_df,
    test_df=test_df,
    model_type=model_type,
    model_name=model_name,
    n_epochs=n_epochs,
    n_hidden=n_hidden,
    save_model=True,
)

[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step


0.4022540261989022

## Regression

In [17]:
df = pd.read_csv("../data/features_regression.csv")

# Rebuild the participant-level splits from the regression table. Without this,
# train_df / val_df / test_df keep whatever an earlier cell derived (on a
# top-to-bottom run: rows of the classification table), and every regression
# model below would be fitted on the wrong frame.
# NOTE(review): assumes the regression CSV uses the same participant ids as the
# table the ids were drawn from — confirm.
train_df = df[df['id'].isin(train_ids)]
val_df = df[df['id'].isin(val_ids)]
test_df = df[df['id'].isin(test_ids)]

def train_test_regr_model (train_df, test_df, model_type, model_name, n_epochs, n_hidden, save_model=False):
    """Train a one-layer recurrent regressor and score it on a held-out frame.

    Args:
        train_df: DataFrame used for fitting. Must contain 'target' and the
            bookkeeping columns that are dropped below.
        test_df: DataFrame with the same columns, used only for evaluation.
        model_type: Keras recurrent layer class (SimpleRNN, LSTM or GRU).
        model_name: Currently unused (see the review note at the save step).
        n_epochs: Number of training epochs.
        n_hidden: Number of units in the recurrent layer.
        save_model: When truthy, save the fitted model to
            ./models/rnn_model_regression.keras.

    Returns:
        Tuple (mae, mse, r2) computed on test_df.
    """

    set_seeds(1934)

    # Columns that are identifiers, the label itself, or excluded features.
    non_feature_columns = ['Unnamed: 0', 'day', 'day_id', 'id', 'target', 'appCat.entertainment']
    X_train = train_df.drop(columns=non_feature_columns)
    y_train = train_df['target']

    X_test = test_df.drop(columns=non_feature_columns)
    y_test = test_df['target']

    # Fit the scaler on the training rows only, then apply it to both sets.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Each row becomes a sequence of length 1: (samples, timesteps=1, features).
    X_train_rnn = X_train_scaled.reshape((X_train_scaled.shape[0], 1, X_train_scaled.shape[1]))
    X_test_rnn = X_test_scaled.reshape((X_test_scaled.shape[0], 1, X_test_scaled.shape[1]))

    model = Sequential([
        Input(shape=(1, X_train.shape[1])),
        model_type(n_hidden, activation='relu'),
        Dense(1, activation='linear')
    ])

    model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_squared_error'])

    # NOTE(review): validation_split=0.2 withholds part of the training rows from
    # fitting, but nothing in this function reads the validation metrics — confirm
    # this is intended.
    model.fit(X_train_rnn, y_train, epochs=n_epochs, batch_size=16, validation_split=0.2, verbose=0)

    if save_model:
        # Make sure the target directory exists before writing the model file.
        os.makedirs('./models', exist_ok=True)
        # NOTE(review): the path does not include model_name, so every saving call
        # overwrites the same file (unlike the classification counterpart, which
        # saves to rnn_class_<model_name>.keras). The path is kept as-is because
        # other code may load this exact file — confirm before renaming.
        model.save('./models/rnn_model_regression.keras')

    # The Dense(1) head yields one column per row; flatten it to match y_test.
    y_pred = model.predict(X_test_rnn).ravel()

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    return mae, mse, r2


In [50]:
# Single trial run with a SimpleRNN, scored on the test split.
model_type, model_name = SimpleRNN, "simplernn"
n_hidden, n_epochs = 64, 30

train_test_regr_model(
    train_df=train_df,
    test_df=test_df,
    model_type=model_type,
    model_name=model_name,
    n_epochs=n_epochs,
    n_hidden=n_hidden,
    save_model=True,
)

[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step 


(1.009207010269165, 1.658814787864685, -1.2104926109313965)

### Hyperparameter tuning

In [51]:
RNN_types = [SimpleRNN, LSTM, GRU]
n_epochs = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200]
n_hidden = [8, 16, 32, 64, 128, 256, 512]

# Grid search over layer type, hidden units and epochs; every configuration is
# trained on train_df and scored on val_df. One CSV row per configuration:
# layer class, hidden units, epochs, MAE, MSE, R2.
with open("./hyperparameter_tuning/RNN_regression.csv", "w") as outfile:
    # The loop variable is deliberately not called `type`: at notebook scope
    # that would replace the builtin for every cell executed afterwards.
    for rnn_type in RNN_types:
        print(rnn_type)
        for n in n_hidden:
            print(n)
            for e in n_epochs:
                print(e)
                model_name = f"{rnn_type.__name__.lower()}_n_hidden_{n}_n_epochs_{e}"
                mae, mse, r2 = train_test_regr_model(train_df, val_df, rnn_type, model_name, e, n)
                outfile.write(f"{rnn_type},{n},{e},{mae},{mse},{r2}\n")
                # Push each row to disk right away so finished runs are kept
                # even if the kernel dies part-way through the grid.
                outfile.flush()
            

<class 'keras.src.layers.rnn.simple_rnn.SimpleRNN'>
8
10
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
30
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
40
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
60
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
70
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
80
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
90
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
110
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
120
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
130
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[

In [11]:
# The grid-search log has no header row. Columns: 0 layer class, 1 hidden units,
# 2 epochs, 3 MAE, 4 MSE, 5 R2.
results_df = pd.read_csv("./hyperparameter_tuning/RNN_regression.csv", header=None)

# Model selection uses the lowest validation MAE (column 3); lower is better.
best_mae = results_df[3].min()
best_row = results_df[results_df[3] == best_mae]

print(f"Best MAE: {best_mae}")
print(best_row)

Best F1 score: 0.6074029803276062
                                                    0  1    2         3  \
16  <class 'keras.src.layers.rnn.simple_rnn.Simple...  8  170  0.607403   

           4         5  
16  0.657825 -0.030179  


### Final model

In [61]:
# Configuration with the lowest validation MAE in the grid-search log.
model_type, model_name = SimpleRNN, "regression-final-model"
n_hidden, n_epochs = 8, 170

# Refit on train + validation rows, then score on the test split.
combined_df = pd.concat([train_df, val_df])

train_test_regr_model(
    train_df=combined_df,
    test_df=test_df,
    model_type=model_type,
    model_name=model_name,
    n_epochs=n_epochs,
    n_hidden=n_hidden,
    save_model=True,
)

[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step


(0.6326197981834412, 0.5996939539909363, 0.20086371898651123)