In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import joblib

In [2]:
# Load the raw student-performance table (path is relative to the notebook's directory).
df = pd.read_csv(filepath_or_buffer='../data/student_performance_with_PS_and_labels.csv')

In [3]:
# Build the modelling frame from the raw table.
#
# * 'ps' is removed before modelling (presumably the score the labels were
#   derived from -- TODO confirm).
# * Any 'Unnamed: N' column is removed as well. pandas assigns that name to a
#   header-less CSV column; here it appears to be a saved row index (0, 1, 2, ...),
#   i.e. a row counter rather than a student attribute, so keeping it would let
#   the classifier learn from row position.
index_like_cols = [c for c in df.columns if str(c).startswith('Unnamed')]
model_df = df.drop(columns=['ps', *index_like_cols])
model_df.head()

Unnamed: 0,last_sem_spi,internal_assessment_avg,attendance_percent,total_backlogs,pyq_solving_freq,study_hours_weekly,sleep_category,gaming_hours_weekly,assignment_delay_count,department,travel_time_category,extra_curricular_level,sleep_score,travel_score,extra_score,performance_category
0,4.37,52.4,78.4,1,3,28.9,6-8,21.7,2,EC,<30,Medium,1.0,1.0,0.7,Average
1,9.56,50.0,34.8,1,4,48.7,6-8,13.2,2,ME,<30,Low,1.0,1.0,0.3,Average
2,7.59,40.6,47.7,3,0,25.8,>8,6.7,2,EE,30-60,Medium,0.8,0.7,0.7,At Risk
3,6.39,66.4,73.1,1,4,5.6,>8,7.6,1,ICT,<30,Low,0.8,1.0,0.3,Average
4,2.4,58.6,58.6,3,2,26.2,6-8,4.5,1,CE,30-60,High,1.0,0.7,1.0,At Risk


In [4]:
# String-valued columns that get integer-encoded before training.
categorical_cols = [
    # input features
    "sleep_category", "department", "travel_time_category", "extra_curricular_level",
    # prediction target
    "performance_category",
]

In [5]:
# Fit one LabelEncoder per categorical column and overwrite that column of
# model_df with its integer codes. The fitted encoders are kept, keyed by
# column name, so the same mapping can be reused later.
# NOTE: this rewrites model_df in place, so running the cell a second time
# would fit the encoders on the integer codes instead of the original labels.
encoders = {}
for column_name in categorical_cols:
    encoder = LabelEncoder()
    encoders[column_name] = encoder
    model_df[column_name] = encoder.fit_transform(model_df[column_name])

In [6]:
# Separate the encoded target from the feature matrix.
y = model_df["performance_category"]
X = model_df.drop(columns="performance_category")

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# Keyword arguments unpacked into XGBClassifier.
# The un-rounded float values look like the output of an automated search
# rather than hand-picked settings -- TODO confirm their provenance.
params = dict(
    n_estimators=995,
    max_depth=3,
    min_child_weight=2,
    learning_rate=0.2767290889210465,
    subsample=0.6725823903100828,
    colsample_bytree=0.688567048234252,
    gamma=0.26935104909682095,
    tree_method='hist',
    booster='gbtree',
)

In [8]:
# Instantiate the classifier from the params mapping and train it on the
# training split. The fit call stays the last expression so the cell still
# displays the fitted estimator.
xgb_model = XGBClassifier(**params)
xgb_model.fit(X=X_train, y=y_train)

In [9]:
# Accuracy on the held-out split, expressed as a percentage.
100 * xgb_model.score(X_test, y_test)

96.0

In [10]:
# Persist the trained classifier and the per-column label encoders to the
# current working directory. Both files are needed together: the model was
# trained on the integer codes these encoders produce.
joblib.dump(value=xgb_model, filename='xgb_model.joblib')
joblib.dump(value=encoders, filename='encoders.joblib')