In [37]:
import re
import numpy as np
import pandas as pd

### Mega dataset

In [38]:
def compute_average(group_name, df):
    group_columns = [col for col in df.columns if group_name in col]
    return df[group_columns].mean(axis=1)

In [39]:
df2 = pd.read_csv('../build_dataset/paper_data.csv')

paper_df = pd.DataFrame()

# Sort
df2 = df2[df2['specific.disorder'].isin(['Acute stress disorder', 'Healthy control', 'Depressive disorder'])]

paper_df['specific.disorder'] = df2['specific.disorder']

groups = ['alpha', 'beta', 'delta', 'theta']
for group in groups:
    paper_df[f'{group}'] = compute_average(group, df2)

new_column_names = {'alpha': 'Alpha', 'beta': 'Beta', 'theta': 'Theta', 'delta': 'Delta'}
paper_df.rename(columns=new_column_names, inplace=True)

paper_df.head()

Unnamed: 0,specific.disorder,Alpha,Beta,Delta,Theta
31,Acute stress disorder,38.488424,40.882201,39.141091,45.713918
32,Acute stress disorder,47.368559,48.817112,56.639885,50.719569
33,Acute stress disorder,51.872822,22.322245,23.39819,37.223425
34,Acute stress disorder,36.092599,32.847147,35.471763,43.941132
35,Acute stress disorder,28.514535,29.422266,33.819716,28.055607


In [40]:
import pandas as pd

df1 = pd.read_csv('../build_dataset/mega_preprocessed_data.csv')

df = pd.concat([df1, paper_df], axis=0, ignore_index=True)
df.head()

Unnamed: 0,Delta,Theta,Alpha,Beta,specific.disorder
0,12.24546,8.674308,7.131809,10.013944,happy
1,18.08269,15.852509,12.739708,14.830145,happy
2,18.653089,15.062034,11.98957,15.140479,happy
3,21.838218,17.541182,13.624017,17.053419,happy
4,22.067904,17.228788,14.240932,17.33292,happy


In [41]:
df['specific.disorder'].value_counts()

Depressive disorder      199
Healthy control           95
Acute stress disorder     38
studious                  14
happy                      6
Name: specific.disorder, dtype: int64

In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 352 entries, 0 to 351
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Delta              352 non-null    float64
 1   Theta              352 non-null    float64
 2   Alpha              352 non-null    float64
 3   Beta               352 non-null    float64
 4   specific.disorder  352 non-null    object 
dtypes: float64(4), object(1)
memory usage: 13.9+ KB


In [43]:
print(df.head())

       Delta      Theta      Alpha       Beta specific.disorder
0  12.245460   8.674308   7.131809  10.013944             happy
1  18.082690  15.852509  12.739708  14.830145             happy
2  18.653089  15.062034  11.989570  15.140479             happy
3  21.838218  17.541182  13.624017  17.053419             happy
4  22.067904  17.228788  14.240932  17.332920             happy


In [44]:
df['specific.disorder'].value_counts()

Depressive disorder      199
Healthy control           95
Acute stress disorder     38
studious                  14
happy                      6
Name: specific.disorder, dtype: int64

### Build model

In [45]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

X = df[['Delta', 'Theta', 'Alpha', 'Beta']]
y = df['specific.disorder']

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = xgb.XGBClassifier(objective='multi:softmax')

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100))


Accuracy: 54.93%


In [46]:
encoding_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))

# Print the mapping
print("Label Encoding Mapping:")
for class_name, encoded_value in encoding_mapping.items():
    print(f"{class_name} -> {encoded_value}")

Label Encoding Mapping:
Acute stress disorder -> 0
Depressive disorder -> 1
Healthy control -> 2
happy -> 3
studious -> 4


In [47]:
y_pred

array([1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 2, 2, 1, 1, 2, 1,
       1, 2, 1, 2, 2, 4, 1, 2, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 2, 1, 2, 2,
       3, 1, 2, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,
       1, 1, 1, 2, 1])

#### Optuna

In [28]:
import optuna

def objective(trial):
    params = {
        'objective': 'multi:softmax',
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
    }

    model = xgb.XGBClassifier(**params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)
best_params = study.best_params
best_accuracy = study.best_value
print("Best Hyperparameters:", best_params)
print("Best Accuracy: %.2f%%" % (best_accuracy * 100))


[I 2023-10-14 14:46:52,180] A new study created in memory with name: no-name-450a8dd6-871f-4cd5-bc37-6ca5db307c86
[I 2023-10-14 14:46:52,535] Trial 0 finished with value: 0.5633802816901409 and parameters: {'max_depth': 8, 'learning_rate': 0.0817895288865087, 'n_estimators': 121, 'min_child_weight': 6}. Best is trial 0 with value: 0.5633802816901409.
[I 2023-10-14 14:46:52,829] Trial 1 finished with value: 0.4788732394366197 and parameters: {'max_depth': 8, 'learning_rate': 0.019200356360620622, 'n_estimators': 99, 'min_child_weight': 5}. Best is trial 0 with value: 0.5633802816901409.
[I 2023-10-14 14:46:52,975] Trial 2 finished with value: 0.49295774647887325 and parameters: {'max_depth': 5, 'learning_rate': 0.010728192709063318, 'n_estimators': 72, 'min_child_weight': 6}. Best is trial 0 with value: 0.5633802816901409.
[I 2023-10-14 14:46:53,290] Trial 3 finished with value: 0.4788732394366197 and parameters: {'max_depth': 7, 'learning_rate': 0.08376593388411818, 'n_estimators': 198

Best Hyperparameters: {'max_depth': 7, 'learning_rate': 0.040338409359276746, 'n_estimators': 63, 'min_child_weight': 1}
Best Accuracy: 59.15%


In [29]:
best_params = study.best_params
best_accuracy = study.best_value
print("Best Hyperparameters:", best_params)
print("Best Accuracy: %.2f%%" % (best_accuracy * 100))

Best Hyperparameters: {'max_depth': 7, 'learning_rate': 0.040338409359276746, 'n_estimators': 63, 'min_child_weight': 1}
Best Accuracy: 59.15%


In [30]:
import plotly

fig = optuna.visualization.plot_slice(study, params=["max_depth", "learning_rate", 'n_estimators', 'min_child_weight'])
fig.show()

In [31]:
fig = optuna.visualization.plot_param_importances(study)
fig.show()

In [32]:
best_params

{'max_depth': 7,
 'learning_rate': 0.040338409359276746,
 'n_estimators': 63,
 'min_child_weight': 1}

In [33]:
model = xgb.XGBClassifier(**best_params)
model.fit(X_train, y_train)

In [35]:
import pickle

with open('eeg_to_mh_state.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)