In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb

# Read the raw transactions table from the CSV file in the working directory
df = pd.read_csv(filepath_or_buffer='transactions.csv')

In [7]:
# Encode categorical features.
# Missing 'region' / 'quarter' values are replaced with the literal 'Unknown'
# before encoding, so they get their own integer code instead of being dropped.
# A separate encoder is kept per column so each mapping stays available
# (e.g. via `classes_` / `inverse_transform`).
le_region = LabelEncoder()
le_quarter = LabelEncoder()
df['region_encoded'] = le_region.fit_transform(df['region'].fillna('Unknown'))
df['quarter_encoded'] = le_quarter.fit_transform(df['quarter'].fillna('Unknown'))

# Column to predict and the model inputs
target = 'total_transaction_amount'
features = ['region_encoded', 'quarter_encoded', 'number_of_purchases']

# Keep only the modelling columns and drop any row that still has a missing
# value (the encoded columns cannot be missing, so this only affects
# 'number_of_purchases' and the target).
data = df[features + [target]].dropna()

X = data[features]
y = data[target]

# Report how many rows survived the dropna above
print(f"Dataset loaded. Samples: {len(data)}")

Dateset loaded. Samples: 1088


In [8]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Baseline: XGBoost regressor with library-default hyperparameters
xgb_model = XGBRegressor(random_state=42)
xgb_model.fit(X_train, y_train)

# Predict on the held-out rows
y_pred = xgb_model.predict(X_test)

# Evaluate the baseline on the held-out rows (RMSE is derived from MSE)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# Emit the report as one call; each argument lands on its own line
print(
    "--- Initial Model Metrics ---",
    f"MSE: {mse:.4f}",
    f"RMSE: {rmse:.4f}",
    f"MAE: {mae:.4f}",
    f"R2 Score: {r2:.4f}",
    sep="\n",
)

--- Initial Model Metrics ---
MSE: 26796520.9925
RMSE: 5176.5356
MAE: 4263.1249
R2 Score: -0.5675


In [9]:
# Hyperparameter tuning: exhaustive search over a 3 x 3 x 3 grid
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.2],
)

# 3-fold cross-validation on the training split only; the scorer is negated MSE
# because GridSearchCV always maximizes its score.
grid_search = GridSearchCV(
    XGBRegressor(random_state=42),
    param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
    verbose=1,
)

grid_search.fit(X_train, y_train)

# Report the winning parameter combination
print(
    "\n--- GridSearchCV Results ---",
    f"Best Parameters: {grid_search.best_params_}",
    sep="\n",
)

# The best estimator is refit on the whole training split by GridSearchCV
# (refit=True is the default); score it on the held-out test rows.
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)

# Same metric set as the baseline, computed for the tuned model
mse_best = mean_squared_error(y_true=y_test, y_pred=y_pred_best)
rmse_best = np.sqrt(mse_best)
mae_best = mean_absolute_error(y_true=y_test, y_pred=y_pred_best)
r2_best = r2_score(y_true=y_test, y_pred=y_pred_best)

print(
    "\n--- Tuned Model Metrics ---",
    f"MSE: {mse_best:.4f}",
    f"RMSE: {rmse_best:.4f}",
    f"MAE: {mae_best:.4f}",
    f"R2 Score: {r2_best:.4f}",
    sep="\n",
)

Fitting 3 folds for each of 27 candidates, totalling 81 fits

--- GridSearchCV Results ---
Best Parameters: {'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 50}

--- Tuned Model Metrics ---
MSE: 17190233.1845
RMSE: 4146.1106
MAE: 3478.6046
R2 Score: -0.0056
