In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import joblib
import warnings

# Silence every warning for the rest of the session so the printed output stays
# compact. NOTE(review): this is a blanket filter, so it also hides any
# deprecation or data-quality warnings raised by pandas / scikit-learn.
warnings.filterwarnings('ignore')

# --- 1. Load the Data ---
# Relative path: the CSV must sit in the notebook's working directory.
# The doubled ".csv" suffix is intentional and matches the name on disk.
filename = 'Student-Mental-health.csv.csv'
print(f"Loading '{filename}'...")

try:
    df = pd.read_csv(filename)
except FileNotFoundError:
    print(f"--- ERROR: File not found: '{filename}' ---")
    # Re-raise rather than calling exit(): inside a notebook exit() asks the
    # kernel to shut down (losing all session state) instead of simply failing
    # this cell, and when run as a plain script it would terminate with a
    # success status. Propagating the error stops execution here with a
    # traceback, so nothing below can run without a freshly loaded `df`.
    raise
else:
    print(f"Successfully loaded '{filename}'")

# --- 2. Data Pre-processing & Feature Engineering ---
# Keep only fully populated rows, then replace every space in the column
# headers with an underscore so the names are easier to reference.
df = df.dropna()
df.columns = [header.replace(' ', '_') for header in df.columns]

# --- 3. Define Target and Features ---
# The label is the depression answer; the features are every remaining column
# except the 'Timestamp' column, which is dropped together with the label.
target_col = 'Do_you_have_Depression?'
non_feature_cols = [target_col, 'Timestamp']
X = df.drop(columns=non_feature_cols)
y = df[target_col]

print(f"\nTarget (y): {target_col}")
print(f"Features (X): {X.columns.to_list()}")

# --- 4. Convert All Text Columns to Numbers ---
# Every non-numeric feature column is replaced by integer codes from its own
# LabelEncoder. The check is "not numeric" rather than `dtype == 'object'`
# because text columns are not always stored as object dtype (pandas' dedicated
# string dtypes compare unequal to 'object'), and a column skipped here would
# reach the model as raw strings.
print("Converting text columns to numbers...")
# Fitted encoder per column, kept so the text -> integer mapping stays
# inspectable (`feature_encoders[col].classes_`) and can be re-applied to new
# rows instead of being discarded with each loop iteration.
feature_encoders = {}
for col in X.columns:
    if not pd.api.types.is_numeric_dtype(X[col]):
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col])
        feature_encoders[col] = le

# --- 5. Split and Train ---
# Hold out 20% of the rows for evaluation. The split is seeded for
# repeatability and stratified on y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
print(f"\nTraining data shape: {X_train.shape}")

# --- 6. Class-weighted random forest ---
# class_weight='balanced' is the setting the notebook relies on to make the
# model account for the uneven label counts.
forest_params = {
    'n_estimators': 100,
    'class_weight': 'balanced',
    'random_state': 42,
}
model = RandomForestClassifier(**forest_params)
print("Training the student questionnaire model (with balancing)...")
model.fit(X_train, y_train)
print("Model trained successfully!")

# --- 7. Evaluate the New Model ---
# Score the held-out rows: overall accuracy first, then the per-class
# precision / recall / F1 table.
print("\n--- Model Evaluation ---")
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
accuracy_pct = accuracy * 100
print(f"Model Accuracy: {accuracy_pct:.2f}%")
print("\nClassification Report:")
# The per-class rows matter more than the headline number here; check the
# 'Yes' row in particular.
report_text = classification_report(y_test, y_pred)
print(report_text)

# --- 8. Save the Model ---
# Persist the fitted model only when test accuracy is strictly above 70%.
# NOTE(review): the gate looks at accuracy alone; per-class recall is not
# considered when deciding whether to save.
min_accuracy = 0.70
if not (accuracy > min_accuracy):
    print(f"\nModel accuracy ({accuracy*100:.2f}%) is still not high enough. Not saving.")
else:
    model_filename = 'questionnaire_model.joblib'
    joblib.dump(model, model_filename)
    print(f"\nModel accuracy is high enough! Saved to {model_filename}")
    

Loading 'Student-Mental-health.csv.csv'...
Successfully loaded 'Student-Mental-health.csv.csv'

Target (y): Do_you_have_Depression?
Features (X): ['Choose_your_gender', 'Age', 'What_is_your_course?', 'Your_current_year_of_Study', 'What_is_your_CGPA?', 'Marital_status', 'Do_you_have_Anxiety?', 'Do_you_have_Panic_attack?', 'Did_you_seek_any_specialist_for_a_treatment?']
Converting text columns to numbers...

Training data shape: (80, 9)
Training the student questionnaire model (with balancing)...
Model trained successfully!

--- Model Evaluation ---
Model Accuracy: 85.00%

Classification Report:
              precision    recall  f1-score   support

          No       0.81      1.00      0.90        13
         Yes       1.00      0.57      0.73         7

    accuracy                           0.85        20
   macro avg       0.91      0.79      0.81        20
weighted avg       0.88      0.85      0.84        20


Model accuracy is high enough! Saved to questionnaire_model.joblib


In [1]:
# Shell escape that lists the notebook's working directory; presumably a quick
# check of which data files and notebooks are present.
# NOTE(review): `dir` is the Windows shell's listing command, so this cell is
# not portable to other platforms, and its output exposes the local folder path.
!dir

 Volume in drive C is Windows
 Volume Serial Number is 20A2-098C

 Directory of C:\Users\smile\OneDrive\miniproject1

02-11-2025  00:11    <DIR>          .
31-10-2025  16:02    <DIR>          ..
01-11-2025  23:53    <DIR>          .ipynb_checkpoints
01-11-2025  23:41        31,469,558 Combined Data.csv.csv
01-11-2025  23:41             7,339 Student-Mental-health.csv.csv
01-11-2025  18:30             6,194 train_chatbot.ipynb
02-11-2025  00:11             4,231 train_questionnaire.ipynb
               4 File(s)     31,487,322 bytes
               3 Dir(s)  47,378,456,576 bytes free
