# Model Training 

Model Training Part of the Project

Based on the complexity and high-dimensionality of the data, the following models are chosen:
- XGBoost
- Random Forest
- Neural Network

These models were chosen because they don't have some strict assumptions to get a meaningful result,
can capture underlying patterns very well. Furthermore, might apply Principal Component Analysis, Cross-Validation, & Hyper-parameter tuning to the model, probably using optuna

## XGBoost

In [1]:
import pandas as pd

#Load pre-split data
train_df = pd.read_parquet("../data/splits/train.parquet")
test_df = pd.read_parquet("../data/splits/test.parquet")

#Separate features and target
X_train, y_train = train_df.drop(columns=["rating"]), train_df["rating"]
X_test, y_test = test_df.drop(columns=["rating"]), test_df["rating"]


In [2]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14151 entries, 0 to 14150
Data columns (total 82 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   anime_id                   14151 non-null  float64
 1   episodes                   14151 non-null  float64
 2   members                    14151 non-null  float64
 3   duration(mins)             14151 non-null  float64
 4   popularity                 14151 non-null  float64
 5   favorites                  14151 non-null  float64
 6   type_Movie                 14151 non-null  int64  
 7   type_Music                 14151 non-null  int64  
 8   type_ONA                   14151 non-null  int64  
 9   type_OVA                   14151 non-null  int64  
 10  type_Special               14151 non-null  int64  
 11  type_TV                    14151 non-null  int64  
 12  type_Unknown               14151 non-null  int64  
 13  demographics_Josei         14151 non-null  int

In [7]:
y_train.head()

0    7.20
1     NaN
2    7.27
3    7.33
4    5.60
Name: rating, dtype: float64

In [5]:
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import root_mean_squared_error
from xgboost.callback import EarlyStopping
import numpy as np
import optuna

def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 200, 2000),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-3, 10.0, log=True),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-3, 10.0, log=True),
        "random_state": 42,
        "n_jobs": -1,
        
        #Required parameters for xgb.train
        "objective":"reg:squarederror",
        "eval_metic":"rmse",
        "tree_method":"auto"
    }

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    rmse_scores = []

    for train_idx, valid_idx in kf.split(X_train):
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[valid_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[valid_idx]
        
        D_train = xgb.DMatrix(X_tr, y_tr)
        D_valid = xgb.DMatrix(X_val, y_val)
        
        model = xgb.train(
            params = params,
            dtrain = D_train,
            num_boost_round=params["n_estimators"],
            evals=[(D_train, "Train"), (D_valid, "Valid")],
            callbacks=EarlyStopping(rounds=50,metric_name='rmse', data_name='Valid'),
            verbose_eval=False
        )

        preds = model.predict(X_val)
        rmse = root_mean_squared_error(y_val, preds, squared=False)
        rmse_scores.append(rmse)
        
        #Report progress to Optuna
        trial.report(rmse, len(rmse_scores))
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    return np.mean(rmse_scores)


In [6]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50)

[I 2025-11-18 15:37:39,133] A new study created in memory with name: no-name-205389fc-928d-4fe6-a9d2-baf311c21281
[W 2025-11-18 15:37:39,174] Trial 0 failed with parameters: {'n_estimators': 570, 'learning_rate': 0.1237237148071676, 'max_depth': 10, 'subsample': 0.850213008741199, 'colsample_bytree': 0.7592922185153682, 'reg_lambda': 0.038939183199494506, 'reg_alpha': 0.7974803245442037} because of the following error: XGBoostError('[15:37:39] /Users/runner/work/xgboost/xgboost/src/data/data.cc:565: Check failed: valid: Label contains NaN, infinity or a value too large.\nStack trace:\n  [bt] (0) 1   libxgboost.dylib                    0x00000001589752dc dmlc::LogMessageFatal::~LogMessageFatal() + 124\n  [bt] (1) 2   libxgboost.dylib                    0x0000000158b0f274 xgboost::MetaInfo::SetInfoFromHost(xgboost::Context const*, xgboost::StringView, xgboost::Json) + 2820\n  [bt] (2) 3   libxgboost.dylib                    0x0000000158b0e5e4 xgboost::MetaInfo::SetInfo(xgboost::Context c

XGBoostError: [15:37:39] /Users/runner/work/xgboost/xgboost/src/data/data.cc:565: Check failed: valid: Label contains NaN, infinity or a value too large.
Stack trace:
  [bt] (0) 1   libxgboost.dylib                    0x00000001589752dc dmlc::LogMessageFatal::~LogMessageFatal() + 124
  [bt] (1) 2   libxgboost.dylib                    0x0000000158b0f274 xgboost::MetaInfo::SetInfoFromHost(xgboost::Context const*, xgboost::StringView, xgboost::Json) + 2820
  [bt] (2) 3   libxgboost.dylib                    0x0000000158b0e5e4 xgboost::MetaInfo::SetInfo(xgboost::Context const&, xgboost::StringView, xgboost::StringView) + 464
  [bt] (3) 4   libxgboost.dylib                    0x000000015898b900 XGDMatrixSetInfoFromInterface + 228
  [bt] (4) 5   libffi.8.dylib                      0x0000000110308050 ffi_call_SYSV + 80
  [bt] (5) 6   libffi.8.dylib                      0x000000011030589c ffi_call_int + 1444
  [bt] (6) 7   _ctypes.cpython-312-darwin.so       0x0000000107fe88fc _ctypes_callproc + 1172
  [bt] (7) 8   _ctypes.cpython-312-darwin.so       0x0000000107fe22d0 PyCFuncPtr_call + 1244
  [bt] (8) 9   python3.12                          0x0000000102ad0860 _PyObject_MakeTpCall + 312



In [None]:
print("Best RMSE(CV):", study.best_value)
print("Best hyperparameters:", study.best_params)

In [None]:


# Train final model on full training set
best_model = XGBRegressor(
    **study.best_params,
    objective="reg:squarederror",
    tree_method = "auto",
    n_jobs=-1
)
best_model.fit(
    X_train,y_train,
    eval_set=[(X_test, y_test)],
    early_stopping_rounds=50,
    verbose=True
)

In [None]:
## Evaluate model on test set
from sklearn.metrics import mean_squared_error

test_preds = best_model.predict(X_test)
rmse_test = np.sqrt(mean_squared_error(y_test, test_preds))
print("Test RMSE:",rmse_test)