In [6]:
# Mental Health Prediction Using 5 Features
#
# Trains a random-forest classifier that predicts the `treatment` column of
# survey.csv from five survey answers, prints hold-out metrics, and saves the
# fitted model (plus the label encoders needed to reproduce the feature
# encoding at inference time) into the app/ folder.

# ---------------------
# 0. Imports
# ---------------------
import os

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# ---------------------
# 1. Load the dataset
# ---------------------
df = pd.read_csv('survey.csv')

# ---------------------
# 2. Select important features + target
# ---------------------
selected_features = ['Age', 'Gender', 'self_employed', 'family_history', 'work_interfere']
# .copy() gives an independent frame, so the column assignments below never
# write into a slice of the raw data (avoids SettingWithCopyWarning).
df = df[selected_features + ['treatment']].copy()

# ---------------------
# 3. Preprocessing
# ---------------------

# Fill missing values with the "negative" answer for each question.
df['self_employed'] = df['self_employed'].fillna('No')
df['work_interfere'] = df['work_interfere'].fillna('Never')

# Label Encoding
# One encoder per column, kept in `encoders`, so the text -> integer mapping
# of every column can be replayed on new inputs. LabelEncoder numbers the
# labels in sorted order, so the codes carry no semantic ordering.
# NOTE(review): if Gender contains free-text answers, each distinct spelling
# gets its own arbitrary code - consider normalising it first; confirm against
# the raw data.
encoders = {}
for col in df.columns:
    # Encode every non-numeric column; this also covers pandas' dedicated
    # string dtypes, which a plain `dtype == 'object'` comparison would miss.
    if not pd.api.types.is_numeric_dtype(df[col]):
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        encoders[col] = le

# ---------------------
# 4. Train/Test Split
# ---------------------
X = df[selected_features]
y = df['treatment']

# Fixed random_state keeps the 80/20 split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ---------------------
# 5. Train Model
# ---------------------
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# ---------------------
# 6. Evaluate Model
# ---------------------
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# ---------------------
# 7. Save Model
# ---------------------
# Create the target folder first so a fresh checkout does not fail on dump.
os.makedirs('app', exist_ok=True)
# Persist the fitted encoders next to the model: without them the consumer of
# model.pkl cannot turn raw answers into the integer codes used in training.
joblib.dump(encoders, 'app/encoders.pkl')
joblib.dump(model, 'app/model.pkl')  # Save into the app/ folder


Accuracy: 0.7698412698412699

Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.72      0.76       129
           1       0.74      0.82      0.78       123

    accuracy                           0.77       252
   macro avg       0.77      0.77      0.77       252
weighted avg       0.77      0.77      0.77       252



['app/model.pkl']

In [7]:
# Show the feature columns, in the order the model was trained on.
feature_names = list(X.columns)
print(feature_names)



['Age', 'Gender', 'self_employed', 'family_history', 'work_interfere']


In [8]:
# Share of each encoded target class, to check how balanced the labels are.
class_balance = y.value_counts(normalize=True)
class_balance


treatment
1    0.505957
0    0.494043
Name: proportion, dtype: float64

In [9]:
# Try predicting on a few custom samples.
# The rows are wrapped in a DataFrame carrying the training column names, so
# each value is matched to its feature by name and scikit-learn does not warn
# about missing feature names.
# NOTE: the integers are LabelEncoder codes, assigned in sorted label order,
# not on a semantic scale. For example, if work_interfere holds
# Never/Often/Rarely/Sometimes, code 3 means "Sometimes", not "Often".
# Verify each code against the fitted encoder classes before interpreting a row.
samples = pd.DataFrame(
    [
        [22, 1, 1, 1, 3],  # Age 22, with codes 1, 1, 1, 3 for the categorical answers
        [40, 0, 0, 0, 0],  # Age 40, with code 0 for every categorical answer
    ],
    columns=X.columns,
)
test1 = model.predict(samples.iloc[[0]])
test2 = model.predict(samples.iloc[[1]])

print("Test 1:", test1)
print("Test 2:", test2)


Test 1: [1]
Test 2: [0]


