# XGBoost Model Exploration

## Training

### Import Modules

In [58]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

### Load datasets

In [59]:
# Load the dataset
train_df = pd.read_csv('./../../data/processed/train_dropped.csv')
test_df = pd.read_csv('./../../data/raw/test.csv')

# Separate features and target variable
X = train_df.drop(columns=['Price'])
y = train_df['Price']

# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = X.select_dtypes(include=['number']).columns.tolist()

print("Categorical columns:", categorical_cols)
print("Numerical columns:", numerical_cols)

Categorical columns: ['Brand', 'Material', 'Size', 'Laptop Compartment', 'Waterproof', 'Style', 'Color']
Numerical columns: ['id', 'Compartments', 'Weight Capacity (kg)']


### Preprocessing

In [60]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ],
    remainder='passthrough',
    force_int_remainder_cols=False
)

### Pipeline

In [61]:
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', XGBRegressor(
        objective='reg:squarederror',
        random_state=42,
        max_depth=7,              # Maximum tree depth for base learners.
        learning_rate=0.1,        # Boosting learning rate.
        n_estimators=100,         # Number of boosting rounds.
        subsample=0.8,            # Fraction of training data used per tree.
        colsample_bytree=0.8,     # Fraction of features used per tree.
        reg_alpha=0.0,            # L1 regularization term.
        reg_lambda=1.0,           # L2 regularization term.
        gamma=0,                  # Minimum loss reduction to make a split.
        min_child_weight=1        # Minimum sum of instance weight in a child.
    ))
])

### Fit

In [62]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the pipeline to the training data
pipeline.fit(X_train, y_train)

## Prediction & Submissions

### Predict On Train Split

In [63]:
# Predict on the test data
y_pred = pipeline.predict(X_test)

# Evaluate the model performance using RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Test RMSE:", rmse)


Test RMSE: 38.92467031028846


### Predict On Test set

In [64]:
# Predict on test data
y_pred_test = pipeline.predict(test_df)

# Create a submission DataFrame with only the 'id' and predicted 'Price'
submission = pd.DataFrame({
    'id': test_df['id'],
    'Price': y_pred_test
})

# Save the submission to CSV
submission.to_csv('./../../submissions/xgboost_submission_1.csv', index=False)