# Student Performance Classification - Model Training

This notebook uses the already cleaned and preprocessed datasets (`train_cleaned.csv` and `test_cleaned.csv`).
It loads the data directly, trains two baseline classifiers (Logistic Regression and Random Forest) and one unsupervised K-Means clustering model, and saves each fitted model to disk without repeating any preprocessing.

In [1]:
import os
import pickle
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Output directory for everything this notebook saves; create it up front
# so later cells can open files inside it without checking first.
MODEL_DIR = "models"
os.makedirs(MODEL_DIR, exist_ok=True)

# Dataset locations are relative paths because the notebook runs from src/ml/.
TRAIN_PATH = "../../datasets/train_cleaned.csv"
TEST_PATH = "../../datasets/test_cleaned.csv"


## 1. Load Data

In [2]:
# Load the train and test CSVs and split each into features (X) and target (y).
print(f"Loading training data from {TRAIN_PATH}")
train_df = pd.read_csv(TRAIN_PATH)

print(f"Loading test data from {TEST_PATH}")
test_df = pd.read_csv(TEST_PATH)

target_col = 'final_grade'

X_train = train_df.drop(columns=[target_col])
y_train = train_df[target_col]

# The training column order is the reference layout for every model below.
feature_names = X_train.columns.tolist()

# Both files must expose exactly the same feature columns. Checking here gives
# a readable error at load time instead of a failure later at predict time.
test_feature_set = set(test_df.columns) - {target_col}
missing_in_test = [col for col in feature_names if col not in test_feature_set]
extra_in_test = sorted(test_feature_set - set(feature_names))
if missing_in_test or extra_in_test:
    raise ValueError(
        "Train/test feature mismatch: "
        f"missing from test={missing_in_test}, unexpected in test={extra_in_test}"
    )

# Select the test features in training order so a differently ordered CSV
# cannot misalign columns between fit and predict.
X_test = test_df[feature_names]
y_test = test_df[target_col]

print(f"\nTrain Shape: {X_train.shape}")
print(f"Test Shape: {X_test.shape}")

# Save the ordered feature names for future inference
with open(os.path.join(MODEL_DIR, "feature_names.pkl"), "wb") as f:
    pickle.dump(feature_names, f)

Loading training data from ../../datasets/train_cleaned.csv
Loading test data from ../../datasets/test_cleaned.csv

Train Shape: (12000, 22)
Test Shape: (3000, 22)


## 2. Define Category Labels

In [3]:
# Grade values observed in the training target, in ascending order.
observed_grades = y_train.unique()
labels_in_data = sorted(observed_grades)

# One display name per label, in the same order as `labels_in_data`.
target_names = []
for grade in labels_in_data:
    target_names.append(f"Grade {grade}")

print("Classes identified:", target_names)

Classes identified: ['Grade 0', 'Grade 1', 'Grade 2', 'Grade 3', 'Grade 4', 'Grade 5']


## 3. Logistic Regression

In [4]:
# Logistic Regression baseline: fit on the training split, score on the test
# split, print the metrics, then pickle the fitted model.
print("Training Logistic Regression...")
lr_model = LogisticRegression(max_iter=1000, random_state=42, class_weight="balanced")
lr_model.fit(X_train, y_train)

y_pred_lr = lr_model.predict(X_test)
acc_lr = accuracy_score(y_test, y_pred_lr)

print(f"\nLogistic Regression Accuracy: {acc_lr*100:.2f}%")
print("Classification Report:")
# `target_names` was derived from `labels_in_data` (labels seen in y_train).
# Passing the same labels explicitly keeps names and rows aligned; without it,
# classification_report raises if a class is absent from y_test / y_pred_lr.
print(
    classification_report(
        y_test,
        y_pred_lr,
        labels=labels_in_data,
        target_names=target_names,
        zero_division=0,
    )
)

# Save Model
with open(os.path.join(MODEL_DIR, "logistic_regression.pkl"), "wb") as f:
    pickle.dump(lr_model, f)

Training Logistic Regression...



Logistic Regression Accuracy: 93.83%
Classification Report:
              precision    recall  f1-score   support

     Grade 0       0.93      1.00      0.96       359
     Grade 1       0.96      0.92      0.94       676
     Grade 2       0.94      0.93      0.93       754
     Grade 3       0.95      0.92      0.94       739
     Grade 4       0.89      0.96      0.92       328
     Grade 5       0.94      1.00      0.97       144

    accuracy                           0.94      3000
   macro avg       0.93      0.95      0.94      3000
weighted avg       0.94      0.94      0.94      3000



## 4. Random Forest Classifier

In [5]:
# Random Forest baseline: fit on the training split, score on the test split,
# print the metrics, then pickle the fitted model.
# NOTE(review): the recorded run reports 99.70% test accuracy; worth confirming
# that no feature column is derived from `final_grade` (possible leakage).
print("Training Random Forest Classifier...")
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight="balanced", n_jobs=-1)
rf_model.fit(X_train, y_train)

y_pred_rf = rf_model.predict(X_test)
acc_rf = accuracy_score(y_test, y_pred_rf)

print(f"\nRandom Forest Accuracy: {acc_rf*100:.2f}%")
print("Classification Report:")
# `target_names` was derived from `labels_in_data` (labels seen in y_train).
# Passing the same labels explicitly keeps names and rows aligned; without it,
# classification_report raises if a class is absent from y_test / y_pred_rf.
print(
    classification_report(
        y_test,
        y_pred_rf,
        labels=labels_in_data,
        target_names=target_names,
        zero_division=0,
    )
)

# Save Model
with open(os.path.join(MODEL_DIR, "random_forest.pkl"), "wb") as f:
    pickle.dump(rf_model, f)

Training Random Forest Classifier...

Random Forest Accuracy: 99.70%
Classification Report:
              precision    recall  f1-score   support

     Grade 0       0.99      1.00      1.00       359
     Grade 1       1.00      1.00      1.00       676
     Grade 2       1.00      1.00      1.00       754
     Grade 3       1.00      1.00      1.00       739
     Grade 4       0.99      1.00      0.99       328
     Grade 5       1.00      0.99      1.00       144

    accuracy                           1.00      3000
   macro avg       1.00      1.00      1.00      3000
weighted avg       1.00      1.00      1.00      3000



## 5. K-Means Clustering

In [6]:
print("Training K-Means Clustering (Unsupervised, k=5)...")

# Unsupervised baseline: only the feature matrix is passed to fit; the
# target labels are not used.
# NOTE(review): five clusters are requested while six grade classes were
# identified earlier in this notebook. Confirm that k=5 is intentional.
km_model = KMeans(n_clusters=5, random_state=42, n_init=10).fit(X_train)

print("K-Means clustering complete.")

# Pickle the fitted clustering model into the model directory.
kmeans_path = os.path.join(MODEL_DIR, "kmeans.pkl")
with open(kmeans_path, "wb") as model_file:
    pickle.dump(km_model, model_file)

Training K-Means Clustering (Unsupervised, k=5)...


K-Means clustering complete.
