In [2]:
%pip install xgboost

Collecting xgboost
  Using cached xgboost-3.1.3-py3-none-win_amd64.whl.metadata (2.0 kB)
Using cached xgboost-3.1.3-py3-none-win_amd64.whl (72.0 MB)
Installing collected packages: xgboost
Successfully installed xgboost-3.1.3
Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
import numpy as np
import pickle
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score

In [2]:
# 1. Setup Directories
os.makedirs("data", exist_ok=True)
os.makedirs("artifacts", exist_ok=True)

In [3]:
# Dataset Load 
df = pd.read_csv("data/student_lifestyle_100k.csv")
print("Dataset Loaded Successfully.")
df.head()


Dataset Loaded Successfully.


Unnamed: 0,Student_ID,Age,Gender,Department,CGPA,Sleep_Duration,Study_Hours,Social_Media_Hours,Physical_Activity,Stress_Level,Depression
0,1001,22,Female,Science,3.5,7.3,3.3,3.4,114,5,False
1,1002,20,Male,Engineering,2.72,5.5,7.2,6.0,142,2,False
2,1003,20,Male,Medical,3.01,5.4,2.3,1.8,137,3,False
3,1004,21,Male,Engineering,3.63,8.1,2.0,4.6,130,3,False
4,1005,19,Male,Arts,3.14,6.8,2.6,4.3,4,6,False


In [6]:
# Preprocessing Setup
# Target column define
target_col = 'Depression'
X = df.drop(columns=[target_col])
y = df[target_col]
y.head()

0    False
1    False
2    False
3    False
4    False
Name: Depression, dtype: bool

In [7]:
# Target column
target_col = 'Depression' 
X = df.drop(columns=[target_col])
y = df[target_col].astype(int)  # True/False -> 1/0

# Feature Identification

numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object', 'category']).columns


print(f"Numeric Features: {len(numeric_features)}")
print(f"Categorical Features: {len(categorical_features)}")

Numeric Features: 8
Categorical Features: 2


In [9]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [10]:
# Pipeline Creation
# We use ColumnTransformer to apply different processing to different data types
preprocessor = ColumnTransformer(
    transformers=[
        # Apply Scaling to numeric data (Mean=0, Variance=1)
        ('num', StandardScaler(), numeric_features),
        
        # Apply One-Hot Encoding to categorical data
        # handle_unknown='ignore' ensures the code doesn't crash if new categories appear later
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])

# Create the full pipeline: Preprocessing -> Dimensionality Reduction (PCA)
# PCA is required by your project instructions to create embeddings
embedding_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=0.95)) # Keep 95% of the variance/information
])

In [11]:
# Transform Data & Save Reference Data
# This step converts the raw data into vectors (PCA components)
X_embedded = embedding_pipeline.fit_transform(X)

# Create column names for the PCA data
pca_columns = [f"PCA_{i+1}" for i in range(X_embedded.shape[1])]
ref_data = pd.DataFrame(X_embedded, columns=pca_columns)
ref_data['target'] = y # Add target column back for reporting

# Save ref_data.csv for the Evidently AI (Reporting) step later
ref_data.to_csv("data/ref_data.csv", index=False)
print(f"ref_data.csv saved successfully with {X_embedded.shape[1]} PCA features.")

ref_data.csv saved successfully with 11 PCA features.


In [12]:
# Model Comparison
# Split data into Training and Testing sets (80% Train, 20% Test)
X_train, X_test, y_train, y_test = train_test_split(X_embedded, y, test_size=0.2, random_state=42)

# Define the models we want to test
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

best_model = None
best_score = 0
best_model_name = ""

print("\n--- Model Comparison Results ---")
for name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)
    
    # Test the model
    y_pred = model.predict(X_test)
    
    # Calculate scores
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted') 
    
    print(f"{name} -> Accuracy: {acc:.4f}, F1-Score: {f1:.4f}")
    
    # Keep track of the best performing model
    if acc > best_score:
        best_score = acc
        best_model = model
        best_model_name = name

print(f"\nWinner Model: {best_model_name} with Accuracy: {best_score:.4f}")


--- Model Comparison Results ---
Logistic Regression -> Accuracy: 0.8998, F1-Score: 0.8523
Random Forest -> Accuracy: 0.8996, F1-Score: 0.8552


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost -> Accuracy: 0.8964, F1-Score: 0.8549

Winner Model: Logistic Regression with Accuracy: 0.8998


In [13]:
# Save Artifacts
# We save the best model found in the comparison
with open("artifacts/model.pickle", "wb") as f:
    pickle.dump(best_model, f)

# We save the full pipeline (Scaler + OneHotEncoder + PCA)
# This is crucial for the API to process new data exactly like training data
with open("artifacts/preprocessing_pipeline.pickle", "wb") as f:
    pickle.dump(embedding_pipeline, f)


# The API will just interpret 0 as 'False' and 1 as 'True' automatically.

print("All artifacts saved successfully in 'artifacts' folder.")

All artifacts saved successfully in 'artifacts' folder.
