# Traditional machine learning models for age prediction on EEG data

This notebook uses traditional ML methods to predict the age of infants using EEG data. The EEG data is preprocessed and features are extracted as shown in the notebook 'Deep learning EEG_dataset preprocessing'. 

In [1]:
import sys, os, fnmatch, csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

sys.path.insert(0, os.path.dirname(os.getcwd()))

from config import ROOT, PATH_CODE, PATH_DATA, PATH_DATA_PROCESSED, PATH_MODELS, PATH_METADATA, PATH_OUTPUT

## Load preprocessed data

Steps:

1. Get all the files in the output folder
2. Get the full paths of the files without the .h5 or .csv extensions
3. Load the features from the .h5 files
4. Assign the proper labels to the files based on the metadata
5. Assign the subject's code to the files based on the metadata
6. Split the data into a training, validation and test set (NOTE: make sure data points from same subjects don't end up in same set

In [3]:
from sklearn.model_selection import train_test_split

# Step 1: Get all the files in the output folder
file_names = os.listdir(PATH_OUTPUT)

# Step 2: Get the full paths of the files (without extensions)
files = [os.path.splitext(os.path.join(PATH_OUTPUT, file_name))[0] for file_name in fnmatch.filter(file_names, "*.h5")]

# Step 3: Load the features
frames = []

for idx, feature_file in enumerate(files):
    df_features = pd.read_hdf(feature_file + ".h5")
    df_metadata = pd.read_csv(feature_file.replace("extracted_features_", "processed_data_") + ".csv")
    
    # Step 4: Assign labels
    df_features['label'] = df_metadata['age_months'][0]
    
    # Step 5: Assign subject code
    df_features['code'] = df_metadata['code'][0]
    frames.append(df_features)

df = pd.concat(frames) 

# Step 6: Split data in train, validation and test
# df_train, df_temp = train_test_split(df, test_size=0.3, random_state=42)
# df_test, df_val = train_test_split(df_temp, test_size=0.5, random_state=42)

df_train, df_test = train_test_split(df, test_size=0.15, random_state=42)

# TODO: Split train/validation/test based on 'code' (subject number) to make sure the same code isn't in multiple sets at once

In [4]:
# print(f"Train/val/test proportions: {len(df_train)/len(df)}/{len(df_val)/len(df)}/{len(df_test)/len(df)}")
print(f"Train/test proportions: {len(df_train)/len(df)}/{len(df_test)/len(df)}")

Train/test proportions: 0.849999378655425/0.15000062134457504


In [5]:
X_train = df_train.drop(['label', 'code'], axis=1)
y_train = df_train['label']

# X_val = df_val.drop(['label', 'code'], axis=1)
# y_val = df_val['label']

X_test = df_test.drop(['label', 'code'], axis=1)
y_test = df_test['label']

In [6]:
# del(df, df_temp, frames, df_features, df_metadata, df_train, df_val, df_test)
del(file_names, files, df, frames, df_features, df_metadata, df_train, df_test)

In [11]:
print(f"{X_train.memory_usage(deep=True).sum()/1000000000}")
print(f"{y_train.memory_usage(deep=True)/1000000000}")
print(f"{X_test.memory_usage(deep=True).sum()/1000000000}")
print(f"{y_test.memory_usage(deep=True)/1000000000}")

4.1953824
0.0186048
0.740365208
0.003283216


## Model 1: Random Forest regressor

In [None]:
%%time

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

parameters = {'n_estimators': [10, 25, 50, 100], 
              'max_depth': [5, 10],
              'max_features': ['sqrt', 'log2'],
              'criterion' :['mse', 'mae'],
              'ccp_alpha': list(np.linspace(0, 1, 10))
             }

rf_clf = GridSearchCV(RandomForestRegressor(verbose=10), parameters, verbose=10, n_jobs=1)
# rf_clf.fit(pd.concat([X_train, X_val]), pd.concat([y_train, y_val]))
rf_clf.fit(X_train, y_train)

Fitting 5 folds for each of 320 candidates, totalling 1600 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:  2.2min
[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:  4.4min
[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed: 13.6min


In [1]:
# R2
score = rf_clf.score(X_test, y_test)

# MSE
predictions = rf_clf.predict(X_test)
rmse = mean_squared_error(y_test, predictions, squared=False)

print(f"Performance of Random Forest regressor: R-squared = {score} and MSE = {rmse}.")

In [None]:
from sklearn.svm import SVR

parameters = {'C': [0.1, 1, 100, 1000],
              'kernel': ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed'],
              'epsilon': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10],
              'gamma': ['scale', 'auto', 0.0001, 0.001, 0.005, 0.1, 1, 3, 5]
}


svr_clf = GridSearchCV(SVR(verbose=True), parameters, verbose=10)
svr_clf.fit(X_train, y_train)

In [None]:
# R2
score = svr_clf.score(X_test, y_test)

# MSE
predictions = svr_clf.predict(X_test)
rmse = mean_squared_error(y_test, predictions, squared=False)

print(f"Performance of Support Vector regressor: R-squared = {score} and MSE = {rmse}.")