In [1]:
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt 
import numpy as np 

In [2]:
# Data Visualization (optional but useful)
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
# Preprocessing
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Model Selection & Evaluation
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, learning_curve
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedStratifiedKFold
from sklearn.metrics import fbeta_score, make_scorer, accuracy_score

from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings('ignore')

In [3]:
df1 = pd.read_csv("mental_health_400.csv")
df2 = pd.read_csv("mental_health_1000.csv")
merged_df = pd.concat([df1, df2], ignore_index=True)

merged_df.drop_duplicates(inplace=True)


In [4]:
merged_df.head(5)

Unnamed: 0,Age,Gender,Education_Level,Profession,Sleep_Hours,Physical_Activity_min/wk,Social_Interactions,Screen_Time_hrs/day,Work_Hours_hrs/wk,Mental_Health_Status
0,56,Female,Bachelor's,Unemployed,6.3,70.0,Low,6.0,24.4,Normal
1,46,Male,Bachelor's,Engineer,6.4,150.0,Medium,6.3,43.6,Good
2,32,Male,Bachelor's,Manager,9.1,267.0,High,5.9,12.7,Good
3,60,Non-binary,Master's/PhD,Tech,7.9,56.0,Low,3.3,8.9,Normal
4,25,Female,High School,Doctor,6.4,99.0,High,4.3,38.8,Good


In [9]:
merged_df.shape
df=merged_df

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1400 entries, 0 to 1399
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Age                       1400 non-null   int64  
 1   Gender                    1400 non-null   object 
 2   Education_Level           1400 non-null   object 
 3   Profession                1400 non-null   object 
 4   Sleep_Hours               1400 non-null   float64
 5   Physical_Activity_min/wk  1400 non-null   float64
 6   Social_Interactions       1400 non-null   object 
 7   Screen_Time_hrs/day       1400 non-null   float64
 8   Work_Hours_hrs/wk         1400 non-null   float64
 9   Mental_Health_Status      1400 non-null   object 
dtypes: float64(4), int64(1), object(5)
memory usage: 109.5+ KB


In [11]:
df=merged_df

In [12]:
df['Mental_Health_Status'].value_counts()

Mental_Health_Status
Good      818
Normal    549
Bad        33
Name: count, dtype: int64

In [13]:
df[df['Mental_Health_Status']== 'Bad']

Unnamed: 0,Age,Gender,Education_Level,Profession,Sleep_Hours,Physical_Activity_min/wk,Social_Interactions,Screen_Time_hrs/day,Work_Hours_hrs/wk,Mental_Health_Status
58,34,Female,Associate's,Engineer,5.6,115.0,Low,6.4,64.8,Bad
150,47,Male,High School,Teacher,5.3,33.0,Low,6.5,48.0,Bad
166,62,Male,Associate's,Manager,4.7,85.0,Medium,8.9,49.4,Bad
201,19,Female,Master's/PhD,Artist,4.0,122.0,Low,4.5,68.1,Bad
203,54,Non-binary,Bachelor's,Finance,4.2,59.0,Medium,6.4,54.0,Bad
235,46,Male,High School,Manager,5.1,106.0,Medium,7.7,67.8,Bad
250,35,Female,Bachelor's,Student,6.1,58.0,Low,8.7,30.3,Bad
276,37,Female,High School,Tech,5.6,0.0,Medium,4.6,65.5,Bad
281,25,Female,Master's/PhD,Tech,9.7,41.0,Low,7.2,64.9,Bad
337,63,Female,High School,Healthcare,5.9,92.0,High,11.2,67.3,Bad


In [14]:
df.isnull().sum()

Age                         0
Gender                      0
Education_Level             0
Profession                  0
Sleep_Hours                 0
Physical_Activity_min/wk    0
Social_Interactions         0
Screen_Time_hrs/day         0
Work_Hours_hrs/wk           0
Mental_Health_Status        0
dtype: int64

In [15]:
df.duplicated().sum()

0

In [16]:
import pandas as pd
from sklearn.utils import resample

df_good = df[df["Mental_Health_Status"] == "Good"]
df_normal = df[df["Mental_Health_Status"] == "Normal"]
df_bad = df[df["Mental_Health_Status"] == "Bad"]

df_bad_oversampled = resample(
    df_bad,
    replace=True, 
    n_samples=len(df_good)-350,  
    random_state=42
)

df_balanced = pd.concat([df_good, df_normal, df_bad_oversampled])

df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

In [17]:
df_balanced.head(6)

Unnamed: 0,Age,Gender,Education_Level,Profession,Sleep_Hours,Physical_Activity_min/wk,Social_Interactions,Screen_Time_hrs/day,Work_Hours_hrs/wk,Mental_Health_Status
0,43,Female,Bachelor's,Engineer,7.0,216.0,High,4.5,48.4,Good
1,19,Male,Master's/PhD,Teacher,8.3,201.0,High,8.0,11.4,Good
2,43,Male,High School,Healthcare,8.6,46.0,High,10.3,13.8,Normal
3,61,Male,Master's/PhD,Teacher,8.6,167.0,Medium,3.3,60.3,Good
4,42,Non-binary,Associate's,Engineer,4.1,24.0,Medium,4.3,74.1,Bad
5,60,Male,Master's/PhD,Manager,6.8,132.0,High,4.7,56.4,Good


In [18]:
label_mapping = {"Bad": 0, "Normal": 1, "Good": 2}
df_balanced["Mental_Health_Status"] = df_balanced["Mental_Health_Status"].map(label_mapping)


In [19]:
df_balanced.head(4)

Unnamed: 0,Age,Gender,Education_Level,Profession,Sleep_Hours,Physical_Activity_min/wk,Social_Interactions,Screen_Time_hrs/day,Work_Hours_hrs/wk,Mental_Health_Status
0,43,Female,Bachelor's,Engineer,7.0,216.0,High,4.5,48.4,2
1,19,Male,Master's/PhD,Teacher,8.3,201.0,High,8.0,11.4,2
2,43,Male,High School,Healthcare,8.6,46.0,High,10.3,13.8,1
3,61,Male,Master's/PhD,Teacher,8.6,167.0,Medium,3.3,60.3,2


In [20]:
df_balanced.shape

(1835, 10)

In [21]:
df_balanced['Mental_Health_Status'].value_counts()

Mental_Health_Status
2    818
1    549
0    468
Name: count, dtype: int64

In [22]:
df_train, df_test = train_test_split(df_balanced, test_size=0.2, stratify = df_balanced['Mental_Health_Status']) # Make it balance with this target variable in both sides
X_train = df_train.copy()
y_train = X_train.pop("Mental_Health_Status")

In [23]:
X_test = df_test.copy()
y_test = X_test.pop("Mental_Health_Status")
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(1468, 9) (1468,)
(367, 9) (367,)


In [24]:
categorical_cols = ['Gender', 'Education_Level', 'Profession', 'Social_Interactions']
numeric_cols = ['Age', 'Sleep_Hours', 'Physical_Activity_min/wk', 'Screen_Time_hrs/day', 'Work_Hours_hrs/wk']

In [25]:
num_pl = Pipeline(
    steps=[
           ('imputer', KNNImputer(n_neighbors=7)), # Handle missing data = KNN predict with n=7
           ('scaler', MinMaxScaler()) # Handle missing data = MinMax scaler
    ]
)

In [26]:
cat_pl = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')), # Handle missing data = most frequent 
        ('onehot', OneHotEncoder()), # categorical data ==> One hot vector
    ]
)

In [27]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ]
)

# Pipeline
completed_pl = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(random_state=42))
    ]
)

# Train
completed_pl.fit(X_train, y_train)

# Evaluate
y_train_pred = completed_pl.predict(X_train)
print(f"Accuracy on train: {accuracy_score(y_train, y_train_pred):.2f}")

y_pred = completed_pl.predict(X_test)
print(f"Accuracy on test: {accuracy_score(y_test, y_pred):.2f}")

Accuracy on train: 1.00
Accuracy on test: 0.95


In [58]:
log_reg_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=200))
])

log_reg_pipeline.fit(X_train, y_train)
y_pred = log_reg_pipeline.predict(X_test)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred))

Logistic Regression Accuracy: 0.8201634877384196


In [59]:
from sklearn.svm import SVC
svm_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', SVC(kernel='rbf', C=1, gamma='scale'))
])

svm_pipeline.fit(X_train, y_train)
y_pred = svm_pipeline.predict(X_test)
print("SVM Accuracy:", accuracy_score(y_test, y_pred))

SVM Accuracy: 0.9155313351498637


In [60]:
from xgboost import XGBClassifier
xgb_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(
        n_estimators=200,
        max_depth=5,
        learning_rate=0.1,
        use_label_encoder=False,
        eval_metric='mlogloss',
        random_state=42
    ))
])

xgb_pipeline.fit(X_train, y_train)

y_train_pred = xgb_pipeline.predict(X_train)
y_test_pred = xgb_pipeline.predict(X_test)

print(f"XGBoost Train Accuracy: {accuracy_score(y_train, y_train_pred):.2f}")
print(f"XGBoost Test Accuracy: {accuracy_score(y_test, y_test_pred):.2f}")

XGBoost Train Accuracy: 1.00
XGBoost Test Accuracy: 0.97


In [None]:
# saving model for SVM and Xgboost 

In [62]:
import pickle

# Example: trained pipelines
# svm_pipeline -> SVM model pipeline
# xgb_pipeline -> XGBoost model pipeline

# Save SVM pipeline
with open("svm_mental_health.pkl", "wb") as f:
    pickle.dump(svm_pipeline, f)

# Save XGBoost pipeline
with open("xgb_mental_health.pkl", "wb") as f:
    pickle.dump(xgb_pipeline, f)



In [29]:
import pickle
with open("rf_mental_health.pkl", "wb") as f:
    pickle.dump(completed_pl, f)
