In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib
import time # We'll import time to measure it

# --- 1. Load the Data ---
try:
    data = pd.read_csv('data.csv')
    print(f"Data loaded. Your file has {len(data)} rows.")
except FileNotFoundError:
    print("Error: 'data.csv' not found.")
    print("Please make sure 'data.csv' is in the same folder as this notebook.")
    exit()

# --- 2. Define Features (X) and Target (y) ---
# We are using the correct lowercase columns
required_cols = ['q1', 'q2', 'q3', 'q4', 'q5', 'q6', 'q7', 'q8', 'q9', 'PHQ9 score']
if not all(col in data.columns for col in required_cols):
    print(f"Error: Your 'data.csv' file is missing one of these columns: {required_cols}")
    print(f"Your file only has these columns: {list(data.columns)}")
    exit()

features = ['q1', 'q2', 'q3', 'q4', 'q5', 'q6', 'q7', 'q8', 'q9']
X = data[features]

def categorize_severity(score):
    if score <= 4:
        return 'None-minimal'
    elif score <= 9:
        return 'Mild'
    elif score <= 14:
        return 'Moderate'
    elif score <= 19:
        return 'Moderately Severe'
    else:
        return 'Severe'

y = data['PHQ9 score'].apply(categorize_severity)

print("Data preprocessed successfully.")

# --- 3. Split the Data ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Total samples: {len(X)}, Training samples: {len(X_train)}")

# --- 4. Train the Model ---
print("\nTraining the SUPER-FAST model...")

# --- THIS IS THE CHANGE ---
# We changed n_estimators from 50 to 10. This will be very fast.
start_time = time.time() # Start timer
model = RandomForestClassifier(n_estimators=10, random_state=42)
# --- END OF CHANGE ---

model.fit(X_train, y_train)
end_time = time.time() # Stop timer
print(f"Model training complete. It took {end_time - start_time:.2f} seconds.")

# --- 5. Check Model Accuracy ---
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"Model Accuracy on Test Data: {acc * 100:.2f}%")

# --- 6. Save the Model ---
joblib.dump(model, 'questionnaire_model.joblib')

print("\n-------------------------------------------------")
print("SUCCESS! New model 'questionnaire_model.joblib' has been saved.")
print("This model is now trained on the PHQ-9 questions.")
print("-------------------------------------------------")

IndentationError: unexpected indent (947724311.py, line 62)