In [1]:
import re
import numpy as np
import pandas as pd

### Mega dataset

In [2]:
def compute_average(group_name, df):
    group_columns = [col for col in df.columns if group_name in col]
    return df[group_columns].mean(axis=1)

In [3]:
df2 = pd.read_csv('../build_dataset/paper_data.csv')

# Initialize a new DataFrame to store the average values
paper_df = pd.DataFrame()

# Sort
df2 = df2[df2['specific.disorder'].isin(['Acute stress disorder', 'Healthy control', 'Depressive disorder'])]

paper_df['specific.disorder'] = df2['specific.disorder']

# Iterate through each group and compute the average
groups = ['alpha', 'beta', 'delta', 'theta']
for group in groups:
    paper_df[f'{group}'] = compute_average(group, df2)

new_column_names = {'alpha': 'Alpha', 'beta': 'Beta', 'theta': 'Theta', 'delta': 'Delta'}
paper_df.rename(columns=new_column_names, inplace=True)

paper_df.head()

Unnamed: 0,specific.disorder,Alpha,Beta,Delta,Theta
31,Acute stress disorder,38.488424,40.882201,39.141091,45.713918
32,Acute stress disorder,47.368559,48.817112,56.639885,50.719569
33,Acute stress disorder,51.872822,22.322245,23.39819,37.223425
34,Acute stress disorder,36.092599,32.847147,35.471763,43.941132
35,Acute stress disorder,28.514535,29.422266,33.819716,28.055607


In [4]:
import pandas as pd

df1 = pd.read_csv('../build_dataset/mega_preprocessed_data.csv')

df = pd.concat([df1, paper_df], axis=0, ignore_index=True)
df.head()

Unnamed: 0,Delta,Theta,Alpha,Beta,specific.disorder
0,12.24546,8.674308,7.131809,10.013944,happy
1,18.08269,15.852509,12.739708,14.830145,happy
2,18.653089,15.062034,11.98957,15.140479,happy
3,21.838218,17.541182,13.624017,17.053419,happy
4,22.067904,17.228788,14.240932,17.33292,happy


In [5]:
df['specific.disorder'].value_counts()

Depressive disorder      199
Healthy control           95
Acute stress disorder     38
happy                      6
Name: specific.disorder, dtype: int64

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 338 entries, 0 to 337
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Delta              338 non-null    float64
 1   Theta              338 non-null    float64
 2   Alpha              338 non-null    float64
 3   Beta               338 non-null    float64
 4   specific.disorder  338 non-null    object 
dtypes: float64(4), object(1)
memory usage: 13.3+ KB


In [7]:
print(df.head())

       Delta      Theta      Alpha       Beta specific.disorder
0  12.245460   8.674308   7.131809  10.013944             happy
1  18.082690  15.852509  12.739708  14.830145             happy
2  18.653089  15.062034  11.989570  15.140479             happy
3  21.838218  17.541182  13.624017  17.053419             happy
4  22.067904  17.228788  14.240932  17.332920             happy


In [8]:
df['specific.disorder'].value_counts()

Depressive disorder      199
Healthy control           95
Acute stress disorder     38
happy                      6
Name: specific.disorder, dtype: int64

### Build model

In [9]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

X = df[['Delta', 'Theta', 'Alpha', 'Beta']]
y = df['specific.disorder']

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = xgb.XGBClassifier(objective='multi:softmax')

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100))


Accuracy: 52.94%


In [10]:
encoding_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))

# Print the mapping
print("Label Encoding Mapping:")
for class_name, encoded_value in encoding_mapping.items():
    print(f"{class_name} -> {encoded_value}")

Label Encoding Mapping:
Acute stress disorder -> 0
Depressive disorder -> 1
Healthy control -> 2
happy -> 3


In [11]:
y_pred

array([0, 1, 2, 2, 1, 1, 1, 1, 0, 1, 2, 1, 1, 1, 1, 2, 0, 1, 1, 1, 1, 2,
       2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 2, 1, 1, 1, 3, 1, 1, 1, 1,
       1, 1, 0, 1, 2, 2, 1, 1, 2, 0, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1,
       1, 2])

#### Optuna

In [12]:
import optuna

def objective(trial):
    params = {
        'objective': 'multi:softmax',
        'num_class': 4,
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
    }

    model = xgb.XGBClassifier(**params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)
best_params = study.best_params
best_accuracy = study.best_value
print("Best Hyperparameters:", best_params)
print("Best Accuracy: %.2f%%" % (best_accuracy * 100))


  from .autonotebook import tqdm as notebook_tqdm
[I 2023-10-14 14:25:43,399] A new study created in memory with name: no-name-ab106fce-0cf7-452b-875b-88315204cd4f
[I 2023-10-14 14:25:43,904] Trial 0 finished with value: 0.5294117647058824 and parameters: {'max_depth': 10, 'learning_rate': 0.04593273909656244, 'n_estimators': 188, 'min_child_weight': 3}. Best is trial 0 with value: 0.5294117647058824.
[I 2023-10-14 14:25:44,089] Trial 1 finished with value: 0.5294117647058824 and parameters: {'max_depth': 3, 'learning_rate': 0.17551448888924445, 'n_estimators': 157, 'min_child_weight': 5}. Best is trial 0 with value: 0.5294117647058824.
[I 2023-10-14 14:25:44,380] Trial 2 finished with value: 0.47058823529411764 and parameters: {'max_depth': 4, 'learning_rate': 0.13431661555086566, 'n_estimators': 187, 'min_child_weight': 7}. Best is trial 0 with value: 0.5294117647058824.
[I 2023-10-14 14:25:44,910] Trial 3 finished with value: 0.5 and parameters: {'max_depth': 10, 'learning_rate': 0.

Best Hyperparameters: {'max_depth': 9, 'learning_rate': 0.10249788397004489, 'n_estimators': 59, 'min_child_weight': 3}
Best Accuracy: 55.88%


In [13]:
best_params = study.best_params
best_accuracy = study.best_value
print("Best Hyperparameters:", best_params)
print("Best Accuracy: %.2f%%" % (best_accuracy * 100))

Best Hyperparameters: {'max_depth': 9, 'learning_rate': 0.10249788397004489, 'n_estimators': 59, 'min_child_weight': 3}
Best Accuracy: 55.88%


In [14]:
import plotly

fig = optuna.visualization.plot_slice(study, params=["max_depth", "learning_rate", 'n_estimators', 'min_child_weight'])
fig.show()

In [16]:
fig = optuna.visualization.plot_param_importances(study)
fig.show()