### Additional Preprocessing Steps

In [60]:
import pandas as pd
import numpy as np

# load the dataset
data = pd.read_csv('/content/sample_data/dataset.csv')

# basic info
print(data.info())
print(data.head())

# class distribution
print(data['main.disorder'].value_counts())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 945 entries, 0 to 944
Columns: 1149 entries, no. to COH.gamma.O1.O2
dtypes: float64(1144), int64(1), object(4)
memory usage: 8.3+ MB
None
   no. sex   age    eeg.date  education     IQ       main.disorder  \
0    1   M  57.0   2012.8.30       13.0  102.0  Addictive disorder   
1    2   M  37.0    2012.9.6        6.0  120.0  Addictive disorder   
2    3   M  32.0   2012.9.10       16.0  113.0  Addictive disorder   
3    4   M  35.0   2012.10.8       18.0  126.0  Addictive disorder   
4    5   M  36.0  2012.10.18       16.0  112.0  Addictive disorder   

      specific.disorder  delta.FP1  delta.FP2  ...  COH.gamma.Pz.P4  \
0  Alcohol use disorder  35.998557  21.717375  ...        55.989192   
1  Alcohol use disorder  13.425118  11.002916  ...        45.595619   
2  Alcohol use disorder  29.941780  27.544684  ...        99.475453   
3  Alcohol use disorder  21.496226  21.846832  ...        59.986561   
4  Alcohol use disorder  37.775667  

In [61]:
from sklearn.preprocessing import LabelEncoder

# drop irrelevant columns
# data = data.drop(['no.','eeg.date', 'Unnamed: 122'], axis= 1)

data = data.drop(columns=["education","IQ","Unnamed: 122","no.","sex","age","eeg.date","specific.disorder"])

# encode sex
# data['sex'] = data['sex'].map({'M': 1, 'F':0})

le = LabelEncoder()
data['main.disorder'] = le.fit_transform(data['main.disorder'])

# numeric_cols = data.select_dtypes(include=[np.number]).columns

# # Handle outliers
# for col in numeric_cols:
#     if col in data.columns:  # Ensure column still exists
#         data[col] = data[col].clip(lower=data[col].quantile(0.01), upper=data[col].quantile(0.99))

In [63]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE

data_x = data.drop(['main.disorder'], axis=1)
data_y = data['main.disorder']

# split the data
x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, test_size=0.2, random_state=42)

# Handle class imbalance with SMOTE
# Set sampling strategy to match the majority class
class_counts = y_train.value_counts()
sampling_strategy = {label: max(class_counts) for label in class_counts.index}
smote = SMOTE(sampling_strategy=sampling_strategy, random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(x_train, y_train)
print("Class distribution after SMOTE:\n", pd.Series(y_train_balanced).value_counts())

x_train = X_train_balanced
y_train = y_train_balanced

Class distribution after SMOTE:
 main.disorder
3    210
5    210
0    210
1    210
6    210
2    210
4    210
Name: count, dtype: int64


In [64]:
# using random forest for classification
random_forest_model = RandomForestClassifier(
    n_estimators=100,
    min_samples_split=2,
    min_samples_leaf=1,
    max_depth = None,
    random_state=42,
    class_weight='balanced'
    )
random_forest_model.fit(x_train, y_train)
y_test_pred = random_forest_model.predict(x_test)

In [65]:
# hyperparameter tuning
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='f1_weighted',
    n_jobs=-1
)
grid_search.fit(x_train, y_train)

# best parameters
print("Best parameters:", grid_search.best_params_)

Best parameters: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}


In [56]:
best_random_forest_model = grid_search.best_estimator_

# evaluate best model
y_test_pred = best_random_forest_model.predict(x_test)

In [58]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score

accuracy = accuracy_score(y_test, y_test_pred)
precision = precision_score(y_test, y_test_pred, average='weighted')
recall = recall_score(y_test, y_test_pred, average='weighted')
f1 = f1_score(y_test, y_test_pred, average='weighted')

print(f"accuracy: {accuracy * 100 :.4f} ")
print(f"precision: {precision:.4f}")
print(f"recall: {recall:.4f}")
print(f"f1-score: {f1:.4f}")

y_test_prob = best_random_forest_model.predict_proba(x_test)
auc = roc_auc_score(y_test, y_test_prob, multi_class='ovr')
print(f"AUC-ROC: {auc:.4f}")

accuracy: 27.5132 
precision: 0.2764
recall: 0.2751
f1-score: 0.2705
AUC-ROC: 0.6300
