In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import joblib
import warnings

# Suppress warnings
warnings.filterwarnings('ignore')

# --- 1. Load the Data ---
filename = 'Student-Mental-health.csv' 
print(f"Loading '{filename}'...")

try:
    df = pd.read_csv(filename)
    print(f"Successfully loaded '{filename}'")
except FileNotFoundError:
    print(f"--- ERROR: File not found: '{filename}' ---")
    print("Please make sure the file is in the same folder as your notebook.")
    exit()

# --- 2. Data Pre-processing & Feature Engineering ---
# Drop rows with any missing values - this dataset is small so it's fine.
df = df.dropna()

# Rename columns to be code-friendly (remove spaces)
df.columns = df.columns.str.replace(' ', '_')

# --- 3. Define Target and Features ---
# TARGET (y): A clear Yes/No question
target_col = 'Do_you_have_Depression?'

# FEATURES (X): All other columns
X = df.drop(columns=[target_col, 'Timestamp']) # Timestamp is not a useful feature
y = df[target_col]

print(f"\nTarget (y): {target_col}")
print(f"Features (X): {X.columns.to_list()}")

# --- 4. Convert All Text Columns to Numbers ---
print("Converting text columns to numbers...")
for col in X.columns:
    if X[col].dtype == 'object':
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col])

# --- 5. Split and Train ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"\nTraining data shape: {X_train.shape}")

# 
model = RandomForestClassifier(n_estimators=100, random_state=42)
print("Training the student questionnaire model...")
model.fit(X_train, y_train)
print("Model trained successfully!")

# --- 6. Evaluate the New Model ---
print("\n--- Model Evaluation ---")
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%") # <-- This should be very high!
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# --- 7. Save the Model ---
if accuracy > 0.75: # Save if it's over 75%
    model_filename = 'questionnaire_model.joblib'
    joblib.dump(model, model_filename)
    print(f"\nModel accuracy is high! Saved to {model_filename}")
else:
    print(f"\nModel accuracy ({accuracy*100:.2f}%) is not high enough. Not saving.")

Loading 'Student-Mental-health.csv'...
--- ERROR: File not found: 'Student-Mental-health.csv' ---
Please make sure the file is in the same folder as your notebook.


NameError: name 'df' is not defined

In [None]:
!dir