In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
import lightgbm as lgb
from xgboost import XGBClassifier

# For LSTM and CNN
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Conv1D, MaxPooling1D, Flatten
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam

# === Step 1: Load your training and testing datasets ===
df_train = pd.read_csv('processed_train.csv')
df_test = pd.read_csv('processed_test.csv')

# === Step 2: Fix Academic_Level ===
def map_academic_level(val):
    if str(val).lower() in ['undergraduate', 'graduate', 'high school', 'student', 'college']:
        return 'Student'
    else:
        return val

df_train['Academic_Level'] = df_train['Academic_Level'].apply(map_academic_level)
df_test['Academic_Level'] = df_test['Academic_Level'].apply(map_academic_level)

# === Step 3: Fix Country column by converting all test countries to "Bangladesh" ===
df_test['Country'] = 'Bangladesh'
df_test['Most_Used_Platform'] = 'Facebook'
# === Step 4: Label Encoding categorical columns based on train data only ===
label_cols = ['Gender', 'Academic_Level', 'Country', 'Most_Used_Platform', 'Affects_Academic_Performance', 'Relationship_Status']
label_encoders = {}

for col in label_cols:
    le = LabelEncoder()
    df_train[col] = le.fit_transform(df_train[col].astype(str))
    label_encoders[col] = le
    df_test[col] = le.transform(df_test[col].astype(str))

# === Step 5: Create binary and multiclass target labels ===
df_train['Mental_Health_Binary'] = df_train['Mental_Health_Score'].apply(lambda x: 1 if x <= 5 else 0)
df_test['Mental_Health_Binary'] = df_test['Mental_Health_Score'].apply(lambda x: 1 if x <= 5 else 0)

def multi_class_label(score):
    if score <= 4:
        return 0
    elif score <= 7:
        return 1
    else:
        return 2

df_train['Mental_Health_Multiclass'] = df_train['Mental_Health_Score'].apply(multi_class_label)
df_test['Mental_Health_Multiclass'] = df_test['Mental_Health_Score'].apply(multi_class_label)

# === Step 6: Feature Scaling for continuous features ===
cont_cols = ['Avg_Daily_Usage_Hours', 'Sleep_Hours_Per_Night', 'Addicted_Score']
scaler = StandardScaler()
df_train[cont_cols] = scaler.fit_transform(df_train[cont_cols])
df_test[cont_cols] = scaler.transform(df_test[cont_cols])

# === Step 7: Split features and targets ===
X_train = df_train.drop(columns=['Mental_Health_Score', 'Mental_Health_Binary', 'Mental_Health_Multiclass'])
y_train_binary = df_train['Mental_Health_Binary']
y_train_multi = df_train['Mental_Health_Multiclass']

X_test = df_test.drop(columns=['Mental_Health_Score', 'Mental_Health_Binary', 'Mental_Health_Multiclass'])
y_test_binary = df_test['Mental_Health_Binary']
y_test_multi = df_test['Mental_Health_Multiclass']

# === Step 8: Define models and hyperparameters ===
binary_models = {
    "Random Forest": RandomForestClassifier(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Naive Bayes": GaussianNB(),
    "SVM": SVC(probability=True),
    "MLP": MLPClassifier(max_iter=300),
    # LSTM separately below
}

binary_param_grid = {
    'Random Forest': {
        'n_estimators': [100, 200],
        'max_depth': [None, 10],
        'min_samples_split': [2, 5],
    },
    'Logistic Regression': {
        'C': [0.1, 1, 10],
        'solver': ['liblinear'],
    },
    'SVM': {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf'],
        'gamma': ['scale', 'auto'],
    },
    'MLP': {
        'hidden_layer_sizes': [(64,), (64, 32)],
        'activation': ['relu', 'tanh'],
        'alpha': [0.0001, 0.001],
    }
}

multi_models = {
    "Decision Tree": DecisionTreeClassifier(),
    "KNN": KNeighborsClassifier(),
    "LightGBM": lgb.LGBMClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'),
    "MLP": MLPClassifier(max_iter=300),
    # CNN separately below
}

multi_param_grid = {
    'Decision Tree': {
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5],
    },
    'KNN': {
        'n_neighbors': [3, 5, 7],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan'],
    },
    'LightGBM': {
        'num_leaves': [31, 64],
        'learning_rate': [0.1, 0.01],
        'n_estimators': [100, 200],
    },
    'XGBoost': {
        'max_depth': [3, 5],
        'learning_rate': [0.1, 0.01],
        'n_estimators': [100, 200],
    },
    'MLP': {
        'hidden_layer_sizes': [(64,), (128,)],
        'activation': ['relu', 'tanh'],
        'alpha': [0.0001, 0.001],
    }
}

# === Step 9: Train Binary models with GridSearchCV (skip Naive Bayes and LSTM) ===
binary_best_models = {}
for model_name, model in binary_models.items():
    if model_name == "Naive Bayes":
        model.fit(X_train, y_train_binary)
        binary_best_models[model_name] = model
        continue
    print(f"Training Binary model: {model_name}")
    grid_search = GridSearchCV(model, binary_param_grid[model_name], cv=5, n_jobs=-1, verbose=1)
    grid_search.fit(X_train, y_train_binary)
    binary_best_models[model_name] = grid_search.best_estimator_

# === Train LSTM binary model separately ===
def create_lstm_model_binary(input_shape):
    model = Sequential([
        LSTM(64, input_shape=input_shape, return_sequences=True),
        Dropout(0.5),
        LSTM(64),
        Dense(1, activation='sigmoid')
    ])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

print("Training LSTM Binary Model...")
X_train_lstm = np.reshape(X_train.values, (X_train.shape[0], 1, X_train.shape[1]))
lstm_model_binary = create_lstm_model_binary((1, X_train.shape[1]))
lstm_model_binary.fit(X_train_lstm, y_train_binary, epochs=10, batch_size=32, verbose=0)
binary_best_models["LSTM"] = lstm_model_binary

# === Step 10: Evaluate binary models on external test dataset ===
print("\nBinary Classification External Validation Results:")
for model_name, model in binary_best_models.items():
    if model_name == "LSTM":
        X_test_lstm = np.reshape(X_test.values, (X_test.shape[0], 1, X_test.shape[1]))
        y_pred_prob = model.predict(X_test_lstm)
        y_pred = (y_pred_prob > 0.5).astype(int).flatten()
    else:
        y_pred = model.predict(X_test)

    print(f"Model: {model_name}")
    print(f"Accuracy: {accuracy_score(y_test_binary, y_pred):.4f}")
    print(f"Precision: {precision_score(y_test_binary, y_pred, zero_division=0):.4f}")
    print(f"Recall: {recall_score(y_test_binary, y_pred, zero_division=0):.4f}")
    print(f"F1-Score: {f1_score(y_test_binary, y_pred, zero_division=0):.4f}\n")

# === Step 11: Train Multiclass models with GridSearchCV (skip CNN) ===
multi_best_models = {}
for model_name, model in multi_models.items():
    print(f"Training Multiclass model: {model_name}")
    grid_search = GridSearchCV(model, multi_param_grid[model_name], cv=5, n_jobs=-1, verbose=1)
    grid_search.fit(X_train, y_train_multi)
    multi_best_models[model_name] = grid_search.best_estimator_

# === Train CNN multiclass model separately ===
def create_cnn_model_multiclass(input_shape):
    model = Sequential([
        Conv1D(32, 3, activation='relu', input_shape=input_shape),
        MaxPooling1D(),
        Flatten(),
        Dense(64, activation='relu'),
        Dense(3, activation='softmax')
    ])
    model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])
    return model

print("Training CNN Multiclass Model...")
X_train_cnn = np.reshape(X_train.values, (X_train.shape[0], X_train.shape[1], 1))
y_train_cnn = to_categorical(y_train_multi, num_classes=3)
cnn_model_multiclass = create_cnn_model_multiclass((X_train.shape[1], 1))
cnn_model_multiclass.fit(X_train_cnn, y_train_cnn, epochs=10, batch_size=32, verbose=0)
multi_best_models["CNN"] = cnn_model_multiclass

# === Step 12: Evaluate multiclass models on external test dataset ===
print("\nMulticlass Classification External Validation Results:")
for model_name, model in multi_best_models.items():
    if model_name == "CNN":
        X_test_cnn = np.reshape(X_test.values, (X_test.shape[0], X_test.shape[1], 1))
        y_pred_prob = model.predict(X_test_cnn)
        y_pred = np.argmax(y_pred_prob, axis=1)
    else:
        y_pred = model.predict(X_test)

    print(f"Model: {model_name}")
    print(f"Accuracy: {accuracy_score(y_test_multi, y_pred):.4f}")
    print(f"Precision (weighted): {precision_score(y_test_multi, y_pred, average='weighted', zero_division=0):.4f}")
    print(f"Recall (weighted): {recall_score(y_test_multi, y_pred, average='weighted', zero_division=0):.4f}")
    print(f"F1-Score (weighted): {f1_score(y_test_multi, y_pred, average='weighted', zero_division=0):.4f}\n")


Training Binary model: Random Forest
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Training Binary model: Logistic Regression
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Training Binary model: SVM
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Training Binary model: MLP
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Training LSTM Binary Model...


  super().__init__(**kwargs)



Binary Classification External Validation Results:
Model: Random Forest
Accuracy: 0.7152
Precision: 0.5326
Recall: 1.0000
F1-Score: 0.6950

Model: Logistic Regression
Accuracy: 0.6954
Precision: 0.5158
Recall: 1.0000
F1-Score: 0.6806

Model: Naive Bayes
Accuracy: 0.7483
Precision: 0.5632
Recall: 1.0000
F1-Score: 0.7206

Model: SVM
Accuracy: 0.3245
Precision: 0.3245
Recall: 1.0000
F1-Score: 0.4900

Model: MLP
Accuracy: 0.7152
Precision: 0.5333
Recall: 0.9796
F1-Score: 0.6906

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 74ms/step
Model: LSTM
Accuracy: 0.4305
Precision: 0.3566
Recall: 0.9388
F1-Score: 0.5169

Training Multiclass model: Decision Tree
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Training Multiclass model: KNN
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Training Multiclass model: LightGBM
Fitting 5 folds for each of 8 candidates, totalling 40 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of test

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Training Multiclass model: MLP
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Training CNN Multiclass Model...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



Multiclass Classification External Validation Results:
Model: Decision Tree
Accuracy: 0.4768
Precision (weighted): 0.7582
Recall (weighted): 0.4768
F1-Score (weighted): 0.5319

Model: KNN
Accuracy: 0.5828
Precision (weighted): 0.6393
Recall (weighted): 0.5828
F1-Score (weighted): 0.6066

Model: LightGBM
Accuracy: 0.2583
Precision (weighted): 0.4674
Recall (weighted): 0.2583
F1-Score (weighted): 0.3220

Model: XGBoost
Accuracy: 0.4967
Precision (weighted): 0.5903
Recall (weighted): 0.4967
F1-Score (weighted): 0.5324

Model: MLP
Accuracy: 0.5166
Precision (weighted): 0.5967
Recall (weighted): 0.5166
F1-Score (weighted): 0.5469

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
Model: CNN
Accuracy: 0.5894
Precision (weighted): 0.6133
Recall (weighted): 0.5894
F1-Score (weighted): 0.5982



In [8]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

binary_results = {}

for model_name, model in binary_best_models.items():
    if model_name == "LSTM":
        X_test_lstm = np.reshape(X_test.values, (X_test.shape[0], 1, X_test.shape[1]))
        y_pred_prob = model.predict(X_test_lstm)
        y_pred = (y_pred_prob > 0.5).astype(int).flatten()
    else:
        y_pred = model.predict(X_test)

    binary_results[model_name] = {
        'accuracy': accuracy_score(y_test_binary, y_pred),
        'precision': precision_score(y_test_binary, y_pred, zero_division=0),
        'recall': recall_score(y_test_binary, y_pred, zero_division=0),
        'f1_score': f1_score(y_test_binary, y_pred, zero_division=0)
    }

binary_result_table = pd.DataFrame([
    [model,
     round(metrics['f1_score'], 6),
     round(metrics['accuracy'], 6),
     round(metrics['recall'], 6),
     round(metrics['precision'], 6)]
    for model, metrics in binary_results.items()
], columns=["Model", "F1-Score", "Accuracy", "Recall", "Precision"])

binary_result_table = binary_result_table.sort_values(by="F1-Score", ascending=False).reset_index(drop=True)

print("Binary Classification Results:")
print(binary_result_table.to_string(index=False))


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 
Binary Classification Results:
              Model  F1-Score  Accuracy   Recall  Precision
        Naive Bayes  0.720588  0.748344 1.000000   0.563218
      Random Forest  0.695035  0.715232 1.000000   0.532609
                MLP  0.690647  0.715232 0.979592   0.533333
Logistic Regression  0.680556  0.695364 1.000000   0.515789
               LSTM  0.516854  0.430464 0.938776   0.356589
                SVM  0.490000  0.324503 1.000000   0.324503


In [9]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np

multi_results = {}

for model_name, model in multi_best_models.items():
    if model_name == "CNN":
        X_test_cnn = np.reshape(X_test.values, (X_test.shape[0], X_test.shape[1], 1))
        y_pred_prob = model.predict(X_test_cnn)
        y_pred = np.argmax(y_pred_prob, axis=1)
    else:
        y_pred = model.predict(X_test)

    multi_results[model_name] = {
        'accuracy': accuracy_score(y_test_multi, y_pred),
        'precision': precision_score(y_test_multi, y_pred, average='weighted', zero_division=0),
        'recall': recall_score(y_test_multi, y_pred, average='weighted', zero_division=0),
        'f1_score': f1_score(y_test_multi, y_pred, average='weighted', zero_division=0)
    }

multi_result_table = pd.DataFrame([
    [model,
     round(metrics['f1_score'], 6),
     round(metrics['accuracy'], 6),
     round(metrics['recall'], 6),
     round(metrics['precision'], 6)]
    for model, metrics in multi_results.items()
], columns=["Model", "F1-Score", "Accuracy", "Recall", "Precision"])

multi_result_table = multi_result_table.sort_values(by="F1-Score", ascending=False).reset_index(drop=True)

print("\nMulticlass Classification Results:")
print(multi_result_table.to_string(index=False))


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 

Multiclass Classification Results:
        Model  F1-Score  Accuracy   Recall  Precision
          KNN  0.606576  0.582781 0.582781   0.639336
          CNN  0.598189  0.589404 0.589404   0.613304
          MLP  0.546873  0.516556 0.516556   0.596707
      XGBoost  0.532352  0.496689 0.496689   0.590321
Decision Tree  0.531905  0.476821 0.476821   0.758245
     LightGBM  0.322037  0.258278 0.258278   0.467399
