Student Test Scores challenge - Predicting scores on students
---

In [231]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor

In [232]:
def final_normalize(df):
    df.columns = [c.replace(' ', '_').strip().lower() for c in df.columns]
    df = df.loc[:, ~df.columns.duplicated()]
    return df

trn_df = pd.read_csv("KaggleData/playground-series-s6e1/train.csv")
test_df = pd.read_csv("KaggleData/playground-series-s6e1/test.csv")
orig_df = pd.read_csv("KaggleData/playground-series-s6e1/StudentPerformanceFactors.csv")

trn_df = final_normalize(trn_df)
test_df = final_normalize(test_df)
orig_df = final_normalize(orig_df)

orig_df.columns = [c.lower() for c in orig_df.columns]

mapping = {
    'hours_studied': 'study_hours',
    'attendance': 'class_attendance',
}
orig_df = orig_df.rename(columns=mapping)

cat_cols = ['gender', 'internet_access']
for col in cat_cols:
    for df in [trn_df, test_df, orig_df]:
        if col in df.columns:
            df[col] = df[col].astype(str).str.lower()

ordinal_maps = {
    "sleep_quality": {"poor": 0, "average": 1, "good": 2},
    "facility_rating": {"low": 0, "medium": 1, "high": 2},
    "exam_difficulty": {"easy": 0, "moderate": 1, "hard": 2},
    "internet_access": {"no": 0, "yes": 1}
}

for df in [trn_df, test_df, orig_df]:
    for col, mapping in ordinal_maps.items():
        if col in df.columns:
            new_col = f"{col.split('_')[0]}_num" 
            df[new_col] = df[col].map(mapping)

trn_df['is_original'] = 0
orig_df['is_original'] = 1

common_cols = list(set(trn_df.columns) & set(orig_df.columns))

final_selection = list(set(common_cols + ['is_original']))
train_df = pd.concat([trn_df, orig_df[final_selection]], axis=0, ignore_index=True)

seed = 42

In [233]:
def make_features(df):
    df['high_study'] = (df['study_hours'] >= 7).astype(int)
    df['study_hours_sq'] = df['study_hours'] ** 2
    df['class_attendance_sq'] = df['class_attendance'] ** 2
    df['active_hours']        = 24 - df["sleep_hours"]
    df['free_time']         = df['active_hours'] - df['study_hours']       
    return df
    
train_df = make_features(train_df)
test_df = make_features(test_df)

kf = KFold(n_splits=5, shuffle=True, random_state=seed)
overall_mean = train_df['exam_score'].mean()

def target_encode_smooth(df, col, target, m=20):
    global_mean = df[target].mean()
    stats = df.groupby(col)[target].agg(['mean','count'])
    smooth = (stats['count'] * stats['mean'] + m * global_mean) / (stats['count'] + m)
    return smooth

for col in ["course", "study_method"]:
    new_col = f"{col}_te"
    train_df[new_col] = 0

    for tr_idx, val_idx in kf.split(train_df):
        smooth_map = target_encode_smooth(
            train_df.iloc[tr_idx], col, "exam_score", m=20
        )
        train_df.loc[val_idx, new_col] = (
            train_df.loc[val_idx, col].map(smooth_map)
        )

    train_df[new_col] = train_df[new_col].fillna(overall_mean)

    # test
    smooth_map = target_encode_smooth(train_df, col, "exam_score", m=20)
    test_df[new_col] = test_df[col].map(smooth_map).fillna(overall_mean)

In [237]:
nominal_cols = ["gender", "internet_access"]
ordinal_cols = ["sleep_quality", "facility_rating", "exam_difficulty"]

drop_cols = [
    "id", "age", "exam_num", "internet_num",
    "study_method", "course", "effort_index"
]

X = train_df.drop(["exam_score"] + [c for c in drop_cols if c in train_df.columns], axis=1)
X_test = test_df.drop([c for c in drop_cols if c in test_df.columns] + ["id"], axis=1)
y = train_df["exam_score"]

In [238]:
print(X.select_dtypes(include="object").columns.tolist())


['gender', 'internet_access', 'sleep_quality', 'facility_rating', 'exam_difficulty']


In [239]:
cat_features = [
    "gender", "internet_access",
    "sleep_quality", "facility_rating",
    "exam_difficulty"
]

for col in cat_features:
    X[col] = X[col].astype(str)
    X_test[col] = X_test[col].astype(str)

kf = KFold(n_splits=5, shuffle=True, random_state=42)

oof = np.zeros(len(X))
test_preds = np.zeros(len(X_test))

for fold, (tr_idx, val_idx) in enumerate(kf.split(X)):
    print(f"Fold {fold+1}")

    X_tr, X_val = X.iloc[tr_idx], X.iloc[val_idx]
    y_tr, y_val = y.iloc[tr_idx], y.iloc[val_idx]

    model = CatBoostRegressor(
        iterations=8000,
        learning_rate=0.03,
        depth=5,
        loss_function="RMSE",
        random_seed=42,
        subsample=0.8,
        verbose=200,
        early_stopping_rounds=200
    )

    model.fit(
        X_tr, y_tr,
        eval_set=(X_val, y_val),
        cat_features=cat_features,
        use_best_model=True
    )

    oof[val_idx] = model.predict(X_val)
    test_preds += model.predict(X_test) / kf.n_splits

rmse = np.sqrt(mean_squared_error(y, oof))
print("OOF RMSE:", rmse)

Fold 1
0:	learn: 18.4788489	test: 18.4407950	best: 18.4407950 (0)	total: 55.4ms	remaining: 7m 23s
200:	learn: 8.8423280	test: 8.8342912	best: 8.8342912 (200)	total: 9.11s	remaining: 5m 53s
400:	learn: 8.8028991	test: 8.7997633	best: 8.7997633 (400)	total: 17.8s	remaining: 5m 36s
600:	learn: 8.7849723	test: 8.7841017	best: 8.7841017 (600)	total: 26.7s	remaining: 5m 29s
800:	learn: 8.7726362	test: 8.7739380	best: 8.7739380 (800)	total: 35.9s	remaining: 5m 23s
1000:	learn: 8.7647456	test: 8.7678761	best: 8.7678761 (1000)	total: 44.3s	remaining: 5m 10s
1200:	learn: 8.7556172	test: 8.7607345	best: 8.7607345 (1200)	total: 53.8s	remaining: 5m 4s
1400:	learn: 8.7480775	test: 8.7554101	best: 8.7554101 (1400)	total: 1m 3s	remaining: 4m 58s
1600:	learn: 8.7404449	test: 8.7500692	best: 8.7500692 (1600)	total: 1m 13s	remaining: 4m 52s
1800:	learn: 8.7340476	test: 8.7459407	best: 8.7459407 (1800)	total: 1m 23s	remaining: 4m 45s
2000:	learn: 8.7282683	test: 8.7423496	best: 8.7423496 (2000)	total: 1m 

CatBoostError: catboost/libs/data/model_dataset_compatibility.cpp:81: At position 10 should be feature with name is_original (found high_study).

In [None]:
submission = pd.DataFrame({
    "id": test_df["id"],
    "exam_score": test_preds
})
submission.to_csv("submission.csv", index=False)