In [1]:
import json

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.svm import SVC

from keras.models import Model
from keras.layers import Input, Conv1D, Dense, Activation, GlobalAveragePooling1D

from keras.layers.normalization import BatchNormalization
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping

from imblearn.over_sampling import SMOTE

from utils import split_sequences

Using TensorFlow backend.


In [2]:
with open('config.json', 'r') as file:
    configs = json.load(file)

RANDOM_STATE = 42
MODEL_CHECKPOINT_PATH = 'models'
lookback_window = configs['LOOKBACK_WINDOW']
lookforward_window = configs['LOOKFORWARD_WINDOW']


In [3]:
df = pd.read_csv('dataset.csv')

df['readiness_group'] = pd.cut(df['readiness'], 3, labels=[0, 1, 2])

df.head()

Unnamed: 0,fatigue,mood,readiness,sleep_duration_h,sleep_quality,soreness,stress,pid,injury_severity,srpe,monotony,strain
0,2,3,5,6,3,2,3,1,0.0,0.0,-1.0,-1.0
1,2,3,6,6,3,2,3,1,0.0,0.0,-1.0,-1.0
2,3,3,8,6,3,3,3,1,0.0,0.0,-1.0,-1.0
3,3,3,8,6,3,3,3,1,0.0,0.0,-1.0,-1.0
4,3,3,8,5,3,3,3,1,0.0,210.0,-1.0,-1.0


In [67]:
X, y = split_sequences(
    df, 
    lookback_window, 
    lookforward_window, 
    y_variable_name='readiness_group', 
    exclude_columns=['pid', 'date', 'readiness', 'readiness_group']
)

nsamples, nx, ny = X.shape
X = X.reshape((nsamples, nx*ny))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE, shuffle=True)

# MinMaxScaler
minmax_scaler = MinMaxScaler()
X_train = minmax_scaler.fit_transform(X_train)
X_test = minmax_scaler.transform(X_test)

clf = LogisticRegressionCV(cv=5, max_iter=4000, random_state=RANDOM_STATE).fit(X_train, y_train)

y_pred = clf.predict(X_train)
print('On train data')
print('F1 macro: ', f1_score(y_train, y_pred, average='macro'))
print('F1 weighted: ', f1_score(y_train, y_pred, average='weighted'))

y_test_pred = clf.predict(X_test)
print('On test data')
print('F1 macro: ', f1_score(y_test, y_test_pred, average='macro'))
print('F1 weighted: ', f1_score(y_test, y_test_pred, average='weighted'))

In [None]:
# LogisticRegressionCV

X, y = split_sequences(
    df, 
    lookback_window, 
    lookforward_window, 
    y_variable_name='readiness_group', 
    exclude_columns=['pid', 'date', 'readiness', 'readiness_group']
)

nsamples, nx, ny = X.shape
X = X.reshape((nsamples, nx*ny))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE, shuffle=True)

sm = SMOTE(sampling_strategy='auto', random_state=42)
X_train, y_train = sm.fit_sample(X_train, y_train)

# MinMaxScaler
minmax_scaler = MinMaxScaler()
X_train = minmax_scaler.fit_transform(X_train)
X_test = minmax_scaler.transform(X_test)

clf = LogisticRegressionCV(cv=5, max_iter=4000, random_state=RANDOM_STATE).fit(X_train, y_train)

y_pred = clf.predict(X_train)
print('On train data')
print('F1 macro: ', f1_score(y_train, y_pred, average='macro'))
print('F1 weighted: ', f1_score(y_train, y_pred, average='weighted'))

y_test_pred = clf.predict(X_test)
print('On test data')
print('F1 macro: ', f1_score(y_test, y_test_pred, average='macro'))
print('F1 weighted: ', f1_score(y_test, y_test_pred, average='weighted'))

In [8]:
# SVM Classifier

X, y = split_sequences(
    df, 
    lookback_window, 
    lookforward_window,
    y_variable_name='readiness_group',
    exclude_columns=['pid', 'date', 'readiness', 'readiness_group']
)

nsamples, nx, ny = X.shape
X = X.reshape((nsamples, nx*ny))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE, shuffle=True)

sm = SMOTE(sampling_strategy='auto', random_state=42)
X_train, y_train = sm.fit_sample(X_train, y_train)

# MinMaxScaler
minmax_scaler = MinMaxScaler()
X_train = minmax_scaler.fit_transform(X_train)
X_test = minmax_scaler.transform(X_test)

svm_clf = SVC(decision_function_shape='ovo', max_iter=1000, gamma='scale')
svm_clf.fit(X_train, y_train)

y_pred = svm_clf.predict(X_train)
print('On train data')
print('F1 macro: ', f1_score(y_train, y_pred, average='macro'))
print('F1 weighted: ', f1_score(y_train, y_pred, average='weighted'))

y_test_pred = svm_clf.predict(X_test)
print('On test data')
print('F1 macro: ', f1_score(y_test, y_test_pred, average='macro'))
print('F1 weighted: ', f1_score(y_test, y_test_pred, average='weighted'))

On train data
F1 macro:  0.6652001843796245
F1 weighted:  0.6652001843796244
On test data
F1 macro:  0.16390057056588436
F1 weighted:  0.2378613417304261


On train data
F1 macro:  0.18350954147275553
F1 weighted:  0.27555744349757344
On test data
F1 macro:  0.11768428945470578
F1 weighted:  0.19027169451144277


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [28]:
# RandomForestClassifier
X, y = split_sequences(
    df, 
    lookback_window, 
    lookforward_window,
    y_variable_name='readiness_group',
    exclude_columns=['pid', 'date', 'readiness', 'readiness_group']
)

nsamples, nx, ny = X.shape
X = X.reshape((nsamples, nx*ny))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE, shuffle=True)

sm = SMOTE(sampling_strategy='auto', random_state=42)
# target_samples = {0: 150, 1: 100, 2: 150, 8: 150, 9: 100, 10: 100}
# sm = SMOTE(sampling_strategy=target_samples, random_state=42)
X_train, y_train = sm.fit_sample(X_train, y_train)

# MinMaxScaler
minmax_scaler = MinMaxScaler()
X_train = minmax_scaler.fit_transform(X_train)
X_test = minmax_scaler.transform(X_test)

rf_clf = RandomForestClassifier(n_estimators=50, max_depth=4, random_state=42, max_features='auto')
rf_clf.fit(X_train, y_train)

y_pred = rf_clf.predict(X_train)
print('On train data')
print('F1 macro: ', f1_score(y_train, y_pred, average='macro'))
print('F1 weighted: ', f1_score(y_train, y_pred, average='weighted'))

y_test_pred = rf_clf.predict(X_test)
print('On test data')
print('F1 macro: ', f1_score(y_test, y_test_pred, average='macro'))
print('F1 weighted: ', f1_score(y_test, y_test_pred, average='weighted'))

On train data
F1 macro:  0.7044406483601623
F1 weighted:  0.7044406483601622
On test data
F1 macro:  0.5047273842654936
F1 weighted:  0.590355565279333


In [7]:
# CNN classifier
X, y = split_sequences(
    df, 
    lookback_window, 
    lookforward_window,
    y_variable_name='readiness_group',
    exclude_columns=['pid', 'date', 'readiness', 'readiness_group']
)

nsamples, nx, ny = X.shape
X = X.reshape((nsamples, nx*ny))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE, shuffle=True)

# MinMaxScaler
minmax_scaler = MinMaxScaler()
X_train = minmax_scaler.fit_transform(X_train)
X_test = minmax_scaler.transform(X_test)

sm = SMOTE(sampling_strategy='auto', random_state=42)
X_train, y_train = sm.fit_sample(X_train, y_train)

nsamples = X_train.shape[0]
X_train = X_train.reshape((nsamples, nx, ny))

nsamples = X_test.shape[0]
X_test = X_test.reshape((nsamples, nx, ny))

# Internal neural network parameters
input_dim = X_train.shape[-1]
output_dim = len(np.bincount(y))

input_shape = (lookback_window, input_dim)

batch_size = configs['CNN_MODEL']['BATCH_SIZE']
epochs = configs['CNN_MODEL']['EPOCHS']

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

scores = []

for train, test in kfold.split(X_train, y_train):
    input_layer = Input(input_shape)
    
    conv1 = Conv1D(filters=128, kernel_size=8, padding='same')(input_layer)
    conv1 = BatchNormalization()(conv1)
    conv1 = Activation(activation='relu')(conv1)
    
    gap_layer = GlobalAveragePooling1D()(conv1)
    
    output_layer = Dense(output_dim, activation='softmax')(gap_layer)
    
    model = Model(inputs=input_layer, outputs=output_layer)
    model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(lr=configs['LEARNING_RATE']))    
    
    early_stop_callback = EarlyStopping(monitor='val_loss', patience=15, mode='min', verbose=1)
    
    model.fit(
            x=X_train[train], 
            y=y_train[train], 
            epochs=epochs,
            validation_data=(X_train[test], y_train[test]),
            batch_size=batch_size,        
            callbacks=[early_stop_callback],
            verbose=0
        )
        
    y_pred = np.argmax(model.predict(X_train[test]), axis=-1)    
    f1_weighted = f1_score(y_train[test], y_pred, average='weighted')    
    scores.append(f1_weighted)
    
print(f'F1 weighted on cross validation data. Mean: {np.mean(scores)}, std: {np.std(scores)}')



Epoch 00400: early stopping


Epoch 00432: early stopping


Epoch 00449: early stopping


Epoch 00446: early stopping


Epoch 00535: early stopping


On cross validation data. Mean F1 weighted : 0.7998759725615932, F1 weighted std: 0.025079057361894197
