In [1]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import xgboost as xgb
import pandas as pd
import numpy as np
import warnings

# Suppress every warning category for the rest of the session so library
# notices do not flood the training logs.
warnings.simplefilter("ignore")

# Seed NumPy's legacy global random generator.
GLOBAL_SEED = 42
np.random.seed(GLOBAL_SEED)


In [2]:
# Name of the column to predict.
TARGET = 'exam_score'

# Locations of the three CSV inputs that go through feature engineering.
train_file = "/kaggle/input/playground-series-s6e1/train.csv"
test_file = "/kaggle/input/playground-series-s6e1/test.csv"
original_file = "/kaggle/input/exam-score-prediction-dataset/Exam_Score_Prediction.csv"

# Read the three inputs in one pass, then the submission template.
train_df, test_df, original_df = (
    pd.read_csv(csv_path) for csv_path in (train_file, test_file, original_file)
)
submission_df = pd.read_csv("/kaggle/input/playground-series-s6e1/sample_submission.csv")

# Every training column except the target and the row identifier is a raw
# input feature; the original column order is preserved.
non_feature_cols = {TARGET, 'id'}
base_features = [col for col in train_df.columns if col not in non_feature_cols]

In [3]:
def preprocess(df, categorical_features=None):
    """Return the model input frame: raw columns as strings plus engineered numerics.

    The engineered columns are computed from the numeric ``study_hours``,
    ``class_attendance``, ``sleep_hours`` and ``age`` columns *before* the raw
    columns are cast to ``str``; the cast only affects the returned copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``study_hours``, ``class_attendance``, ``sleep_hours``,
        ``age`` and every column named in ``categorical_features``.
        The input frame is not modified.
    categorical_features : sequence of str, optional
        Raw columns to cast to ``str`` and place first in the output. When
        omitted, the notebook-level ``base_features`` list is used, which is
        the behaviour of the original one-argument call.

    Returns
    -------
    pandas.DataFrame
        Columns are ``categorical_features`` followed by the engineered
        features, in the order they are defined below.

    Raises
    ------
    KeyError
        If a required column is missing from ``df``.
    """
    if categorical_features is None:
        # Backward-compatible default: fall back to the notebook-level list.
        categorical_features = base_features
    categorical_features = list(categorical_features)

    study_hours = df['study_hours']
    class_attendance = df['class_attendance']
    sleep_hours = df['sleep_hours']

    # One mapping defines both the engineered values and their output order,
    # so the returned column list cannot drift from the assignments.
    engineered = {
        # Fixed linear combination of the three numeric inputs.
        # NOTE(review): the coefficients look like a pre-fitted linear model;
        # their origin is not visible in this notebook.
        'feature_formula': (
            5.9051154511950499 * study_hours +
            0.34540967058057986 * class_attendance +
            1.423461171860262 * sleep_hours + 4.7819
        ),
        # Polynomial terms.
        'study_hours_squared': study_hours ** 2,
        'study_hours_cubed': study_hours ** 3,
        'class_attendance_squared': class_attendance ** 2,
        'sleep_hours_squared': sleep_hours ** 2,
        'age_squared': df['age'] ** 2,
        # Log / square-root transforms.
        'log_study_hours': np.log1p(study_hours),
        'log_class_attendance': np.log1p(class_attendance),
        'log_sleep_hours': np.log1p(sleep_hours),
        'sqrt_study_hours': np.sqrt(study_hours),
        'sqrt_class_attendance': np.sqrt(class_attendance),
    }

    # ``assign`` returns a new frame, leaving the caller's ``df`` untouched.
    df_temp = df.assign(**engineered)

    # Cast the raw columns to strings (a later cell converts them to the
    # 'category' dtype).
    # NOTE(review): how missing values are represented after this cast should
    # be confirmed if the inputs can contain NaN.
    for col in categorical_features:
        df_temp[col] = df_temp[col].astype(str)

    return df_temp[categorical_features + list(engineered)]

In [4]:
# Run feature engineering on each source; targets are kept separately with a
# fresh 0..n-1 index.
X_raw, X_test_raw, X_orig_raw = (
    preprocess(source_df) for source_df in (train_df, test_df, original_df)
)
y = train_df[TARGET].reset_index(drop=True)
y_orig = original_df[TARGET].reset_index(drop=True)

# Engineered columns that must stay numeric.
numeric_cols = ['feature_formula', 'study_hours_squared', 'study_hours_cubed',
                'class_attendance_squared', 'sleep_hours_squared', 'age_squared',
                'log_study_hours', 'log_class_attendance', 'log_sleep_hours',
                'sqrt_study_hours', 'sqrt_class_attendance']

# Stack train / test / original so each raw column is converted to 'category'
# once over the combined rows, giving all three splits the same category set.
full_data = pd.concat([X_raw, X_test_raw, X_orig_raw], axis=0)

column_dtypes = {
    **dict.fromkeys(base_features, 'category'),
    **dict.fromkeys(numeric_cols, float),
}
for column_name, target_dtype in column_dtypes.items():
    full_data[column_name] = full_data[column_name].astype(target_dtype)

# Cut the stacked frame back into its three parts by position.
n_train, n_test = len(train_df), len(test_df)
X = full_data.iloc[:n_train].copy()
X_test = full_data.iloc[n_train:n_train + n_test].copy()
X_original = full_data.iloc[n_train + n_test:].copy()

In [5]:
# Keyword arguments unpacked into xgb.XGBRegressor for every fold.
xgb_params = {
    # Boosting schedule and stopping rule.
    'n_estimators': 10000,
    'learning_rate': 0.007,
    'early_stopping_rounds': 100,
    'eval_metric': 'rmse',
    # Tree construction.
    'tree_method': 'hist',
    'max_depth': 7,
    'enable_categorical': True,
    # Row / column sampling and L2 regularisation.
    'subsample': 0.8,
    'colsample_bytree': 0.6,
    'colsample_bynode': 0.7,
    'reg_lambda': 3,
    'random_state': 42,
}

# Shuffled 7-fold splitter with a fixed seed.
kf = KFold(n_splits=7, shuffle=True, random_state=42)

# Collectors filled by the training loop: one out-of-fold prediction per
# training row, and one array of test predictions per fold.
oof_predictions = np.zeros(X.shape[0])
test_predictions = list()

In [6]:
# Reset the collectors here as well as where they are first created: without
# this, re-running only this cell would append another set of fold predictions
# to the previous run's list and the averaged submission would silently mix
# stale models with fresh ones.
test_predictions = []
oof_predictions = np.zeros(len(X))

# Reference score the final OOF RMSE is compared against (reported as "Step 2").
STEP2_OOF_RMSE = 8.64444

for fold, (train_index, val_index) in enumerate(kf.split(X, y)):
    print(f"\n--- Fold {fold+1} ---")

    X_train_fold, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train_fold, y_val = y.iloc[train_index], y.iloc[val_index]

    # The original dataset is appended to the training part of every fold only;
    # the validation fold contains competition rows exclusively.
    X_train_combined = pd.concat([X_train_fold, X_original], axis=0)
    y_train_combined = pd.concat([y_train_fold, y_orig], axis=0)

    # The validation fold is the eval_set, so it is what early stopping monitors.
    model = xgb.XGBRegressor(**xgb_params)
    model.fit(X_train_combined, y_train_combined, eval_set=[(X_val, y_val)], verbose=1000)

    # Out-of-fold predictions land in the rows this fold held out.
    val_preds = model.predict(X_val)
    oof_predictions[val_index] = val_preds
    rmse = np.sqrt(mean_squared_error(y_val, val_preds))
    print(f"RMSE: {rmse:.5f}")

    # One test-set prediction per fold; they are averaged after the loop.
    test_preds = model.predict(X_test)
    test_predictions.append(test_preds)

# Score all held-out predictions together against the full training target.
oof_rmse = np.sqrt(mean_squared_error(y, oof_predictions))

print("\n-----------------------")
print(f"OOF RMSE: {oof_rmse:.5f}")
print(f"Improvement from Step 2: {STEP2_OOF_RMSE - oof_rmse:.5f}")

# Persist the out-of-fold predictions keyed by training id.
oof_df = pd.DataFrame({'id': train_df['id'], TARGET: oof_predictions})
oof_df.to_csv('xgb_oof.csv', index=False)

# The submission is the plain mean of the per-fold test predictions.
submission_df[TARGET] = np.mean(test_predictions, axis=0)
submission_df.to_csv('submission_xgb.csv', index=False)
submission_df.head()


--- Fold 1 ---
[0]	validation_0-rmse:18.75075
[1000]	validation_0-rmse:8.61330
[1831]	validation_0-rmse:8.60827
RMSE: 8.60819

--- Fold 2 ---
[0]	validation_0-rmse:18.81103
[1000]	validation_0-rmse:8.65226
[2000]	validation_0-rmse:8.64677
[2096]	validation_0-rmse:8.64722
RMSE: 8.64676

--- Fold 3 ---
[0]	validation_0-rmse:18.79167
[1000]	validation_0-rmse:8.65697
[1752]	validation_0-rmse:8.65240
RMSE: 8.65198

--- Fold 4 ---
[0]	validation_0-rmse:18.78449
[1000]	validation_0-rmse:8.63120
[1804]	validation_0-rmse:8.62602
RMSE: 8.62591

--- Fold 5 ---
[0]	validation_0-rmse:18.90668
[1000]	validation_0-rmse:8.66786
[1616]	validation_0-rmse:8.66327
RMSE: 8.66324

--- Fold 6 ---
[0]	validation_0-rmse:18.80899
[1000]	validation_0-rmse:8.62592
[1673]	validation_0-rmse:8.62206
RMSE: 8.62178

--- Fold 7 ---
[0]	validation_0-rmse:18.87746
[1000]	validation_0-rmse:8.67392
[1849]	validation_0-rmse:8.66540
RMSE: 8.66524

-----------------------
OOF RMSE: 8.64047
Improvement from Step 2: 0.00397


Unnamed: 0,id,exam_score
0,630000,69.029686
1,630001,70.60379
2,630002,90.430923
3,630003,56.605061
4,630004,45.200356
