In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error


## Data Set Overview

In [10]:
df_training = pd.read_csv("data/training_data.csv")

In [11]:
df_training.head()

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,target
0,432.475954,289.373016,481.3156,358.755566,802.659004,176.761177,72.648102,720.969179,36.327684,83.768878,...,4.385848,516.789458,19.624422,13.16244,42.351948,35.920392,20.755984,13.8143,384.497136,14.364922
1,517.59625,330.448341,585.920055,22.684031,169.81324,335.60164,284.451476,748.101047,73.701438,358.147215,...,5.563334,2.960064,20.721878,17.740184,1.726915,167.576065,75.492679,2.480979,303.710869,19.984801
2,189.43935,553.88882,165.83379,202.465927,176.695586,321.155049,407.278389,161.245668,282.269025,221.570899,...,4.536947,581.823741,101.695639,0.653592,486.859084,117.491548,6.420465,20.713314,22.651537,12.944351
3,237.307878,195.894881,416.752252,468.729031,611.693517,301.411711,241.880655,49.597044,122.396821,13.828319,...,5.518968,45.014729,196.350455,47.638515,411.414213,67.142022,115.630943,8.927957,388.240433,14.79244
4,602.845256,16.103208,221.759979,345.765574,558.588369,276.704241,408.069566,19.390813,138.769765,146.662193,...,2.136214,133.59043,197.634584,26.278027,111.127557,172.181136,85.869642,30.537857,625.931837,11.802634


In [12]:
df_training.shape

(800, 21)

In [13]:
X = df_training.drop(columns=["target"])
y = df_training["target"]

In [14]:
X.shape, y.shape

((800, 20), (800,))

In [41]:
y.describe()

count    800.000000
mean      14.631342
std        5.089503
min        0.279805
25%       10.879914
50%       14.687955
75%       18.224713
max       27.360789
Name: target, dtype: float64

In [None]:
#split data into 5 folds
kf = KFold(n_splits=5, shuffle=True, random_state=42)

## Baseline Model -  Linear Regression

In [22]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

rmse_scores = []

for train_index, val_index in kf.split(X):
    
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]
    
    model = LinearRegression()
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_val)
    
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    rmse_scores.append(rmse)

print("RMSE per fold:", rmse_scores)
print("Mean RMSE:", np.mean(rmse_scores))
print("Std RMSE:", np.std(rmse_scores))

RMSE per fold: [np.float64(2.6117882181457994), np.float64(2.9490783946861154), np.float64(2.974830679658575), np.float64(2.9340089582684405), np.float64(2.748458542860123)]
Mean RMSE: 2.8436329587238105
Std RMSE: 0.14093229325185003


## Random Forest Regressor

In [24]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(
    n_estimators=200,
    random_state=42,
    n_jobs=-1
)

In [25]:
rmse_scores = []

for train_index, val_index in kf.split(X):
    
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]
    
    # model = LinearRegression()
    model = RandomForestRegressor(
            n_estimators=200,
            random_state=42,
            n_jobs=-1)
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_val)
    
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    rmse_scores.append(rmse)

print("RMSE per fold:", rmse_scores)
print("Mean RMSE:", np.mean(rmse_scores))
print("Std RMSE:", np.std(rmse_scores))

RMSE per fold: [np.float64(2.5394760307993374), np.float64(2.6745020690529153), np.float64(2.5586372490788882), np.float64(2.3233149086953833), np.float64(2.5014375494706247)]
Mean RMSE: 2.5194735614194297
Std RMSE: 0.11381673528702003


## Xgboost model

In [26]:
from xgboost import XGBRegressor

In [33]:
rmse_scores = []

for train_index, val_index in kf.split(X):
    
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]
    
    # model = LinearRegression()
    # model = RandomForestRegressor(
    #         n_estimators=200,
    #         random_state=42,
    #         n_jobs=-1)
    
    

    model = XGBRegressor(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=4,
        random_state=42,
        n_jobs=-1
    )
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_val)
    
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    rmse_scores.append(rmse)

print("RMSE per fold:", rmse_scores)
print("Mean RMSE:", np.mean(rmse_scores))
print("Std RMSE:", np.std(rmse_scores))

RMSE per fold: [np.float64(2.011800402733268), np.float64(2.284069100680037), np.float64(2.08698304024405), np.float64(2.0935944211946236), np.float64(1.9687668018013051)]
Mean RMSE: 2.0890427533306566
Std RMSE: 0.1081599939912026


XGBoost significantly outperformed previous models obtaining the best rmse metric 2.08 and std rmse 0.1

In [29]:
import joblib

joblib.dump(model, "models/xgb_model.pkl")

['models/xgb_model.pkl']

In [30]:
df_blind = pd.read_csv("data/blind_test_data.csv")

X_blind = df_blind.copy()

In [34]:
predictions = model.predict(X_blind)

In [35]:
submission = pd.DataFrame({
    "target_pred": predictions
})

submission.to_csv("blind_predictions.csv", index=False)

## Hyperparameter tunning

In [36]:
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor

param_dist = {
    "n_estimators": [200, 300, 400, 500],
    "learning_rate": [0.01, 0.03, 0.05, 0.1],
    "max_depth": [3, 4, 5, 6],
    "subsample": [0.7, 0.8, 0.9, 1.0],
    "colsample_bytree": [0.7, 0.8, 0.9, 1.0]
}

In [37]:
xgb = XGBRegressor(
    random_state=42,
    n_jobs=-1
)

In [38]:
search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=20,
    scoring="neg_root_mean_squared_error",
    cv=5,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

In [39]:
search.fit(X, y)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


0,1,2
,estimator,"XGBRegressor(...ree=None, ...)"
,param_distributions,"{'colsample_bytree': [0.7, 0.8, ...], 'learning_rate': [0.01, 0.03, ...], 'max_depth': [3, 4, ...], 'n_estimators': [200, 300, ...], ...}"
,n_iter,20
,scoring,'neg_root_mean_squared_error'
,n_jobs,-1
,refit,True
,cv,5
,verbose,1
,pre_dispatch,'2*n_jobs'
,random_state,42

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.9
,device,
,early_stopping_rounds,
,enable_categorical,False


In [None]:
print("Best RMSE:", -search.best_score_)
print("Best Params:", search.best_params_)

Best RMSE: 1.9496858494216451
Best Params: {'subsample': 0.8, 'n_estimators': 500, 'max_depth': 3, 'learning_rate': 0.05, 'colsample_bytree': 0.9}


The tuned XGBoost model achieved the lowest cross-validated RMSE.

Final configuration:

- n_estimators: 500
- learning_rate: 0.05
- max_depth: 3
- subsample: 0.8
- colsample_bytree: 0.9

This configuration balances model capacity and regularization