In [None]:
#load the packages
import glob
import os
import random
from datetime import datetime
from itertools import product

import joblib
import numpy as np
import pandas as pd
import torch
import xgboost as xgb
from joblib import dump
from scipy.sparse import csr_matrix
from sklearn.ensemble import GradientBoostingRegressor #Gradient Boosting regressor
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

# Single Model Creation for Final Model

# Multi Model Creation for Performance Comparison

In [None]:
def run_xgb_optimizer(model_data, n_estimators_list, max_depths_list, lr_list, es_rounds_list, data_frac):
    """Fit one XGBoost regressor per hyperparameter combination and score each.

    The data is sub-sampled, split into train / validation / test parts
    (0.6 / 0.2 / 0.2 of the sampled rows), and every combination of the four
    hyperparameter iterables is fitted on the training part with the
    validation part as the early-stopping ``eval_set``. Each fitted model is
    then scored on the test part.

    Parameters
    ----------
    model_data : pandas.DataFrame
        Must contain a ``"Price"`` column (the target); every other column
        is used as a feature.
    n_estimators_list, max_depths_list, lr_list, es_rounds_list : iterable
        Candidate values for ``n_estimators``, ``max_depth``,
        ``learning_rate`` and ``early_stopping_rounds``.
    data_frac : float
        Fraction of ``model_data`` rows to sample (``random_state=42``)
        before splitting.

    Returns
    -------
    models_list : list
        Fitted ``xgb.XGBRegressor`` objects, in grid order (last
        hyperparameter varies fastest).
    overview_df : pandas.DataFrame
        One row per model, aligned with ``models_list``, with columns
        ``n_estimators``, ``max_depth``, ``learning_rate``,
        ``early_stopping_rounds``, ``rmse`` and ``r2``.

    Notes
    -----
    ``rmse`` and ``r2`` are computed on the test split. If they are used to
    pick the best configuration, the winning score is an optimistic estimate
    of generalisation performance.
    """
    overview_columns = ['n_estimators', 'max_depth', 'learning_rate',
                        'early_stopping_rounds', 'rmse', 'r2']

    # Sample the data (fixed seed so repeated runs see the same rows)
    model_data = model_data.sample(frac=data_frac, random_state=42)

    # Split the data into features (X) and target (y)
    X = model_data.drop("Price", axis=1)
    y = model_data["Price"]

    # First split to create training + validation and test sets
    X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Second split to create training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)  # 0.25 x 0.8 = 0.2

    # Fitted models and their specifications, kept index-aligned
    models_list = []
    overview_data = []

    # product() walks the grid in the same order as four nested loops
    # (rightmost argument varies fastest) and materialises each input once,
    # so one-shot iterables are handled correctly as well.
    param_grid = product(n_estimators_list, max_depths_list, lr_list, es_rounds_list)
    for curr_n_estimators, curr_max_depth, curr_lr, curr_es_rounds in param_grid:
        # Create the model
        curr_model = xgb.XGBRegressor(n_estimators=curr_n_estimators,
                                      max_depth=curr_max_depth,
                                      learning_rate=curr_lr,
                                      n_jobs=-1,
                                      tree_method='hist',
                                      eval_metric='rmse',
                                      random_state=42,
                                      early_stopping_rounds=curr_es_rounds)

        # Fit the model; the validation split drives early stopping
        curr_model.fit(X_train, y_train,
                       eval_set=[(X_val, y_val)],
                       verbose=False)

        # Predict on the test data
        y_pred = curr_model.predict(X_test)

        # Calculate performance metrics.
        # RMSE is taken as sqrt(MSE) rather than via `squared=False`, which
        # was deprecated in scikit-learn 1.4 and removed in 1.6.
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        r2 = r2_score(y_test, y_pred)

        # Save the model and its specifications
        models_list.append(curr_model)
        overview_data.append({
            'n_estimators': curr_n_estimators,
            'max_depth': curr_max_depth,
            'learning_rate': curr_lr,
            'early_stopping_rounds': curr_es_rounds,
            'rmse': rmse,
            'r2': r2
        })

    # Explicit columns keep the schema stable even when the grid is empty
    overview_df = pd.DataFrame(overview_data, columns=overview_columns)

    return models_list, overview_df

In [None]:
def create_xgb_model(model_data, n_estimators, max_depth, learning_rate, early_stopping_rounds, data_frac):
    """Fit a single XGBoost regressor and score it on a held-out test split.

    The data is sub-sampled, split into train / validation / test parts
    (0.6 / 0.2 / 0.2 of the sampled rows), fitted on the training part with
    the validation part as the early-stopping ``eval_set``, and scored on
    the test part.

    Parameters
    ----------
    model_data : pandas.DataFrame
        Must contain a ``"Price"`` column (the target); every other column
        is used as a feature.
    n_estimators, max_depth, learning_rate, early_stopping_rounds
        Forwarded unchanged to ``xgb.XGBRegressor``.
    data_frac : float
        Fraction of ``model_data`` rows to sample (``random_state=42``)
        before splitting.

    Returns
    -------
    tuple
        ``(model, r2, rmse, y_test, y_pred)``: the fitted regressor, its
        R² and RMSE on the test split, the test targets and the matching
        predictions.
    """
    # Sample the data (fixed seed so repeated runs see the same rows)
    sampled_data = model_data.sample(frac=data_frac, random_state=42)

    # Split the data into features (X) and target (y)
    X = sampled_data.drop("Price", axis=1)
    y = sampled_data["Price"]

    # First split to create training + validation and test sets
    X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Second split to create training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)  # 0.25 x 0.8 = 0.2

    # Create the model
    model = xgb.XGBRegressor(n_estimators=n_estimators,
                             max_depth=max_depth,
                             learning_rate=learning_rate,
                             n_jobs=-1,
                             tree_method='hist',
                             eval_metric='rmse',
                             random_state=42,
                             early_stopping_rounds=early_stopping_rounds)

    # Fit the model; the validation split drives early stopping
    model.fit(X_train, y_train,
              eval_set=[(X_val, y_val)],
              verbose=False)

    # Predict on the test data
    y_pred = model.predict(X_test)

    # Calculate performance metrics.
    # RMSE is taken as sqrt(MSE) rather than via `squared=False`, which was
    # deprecated in scikit-learn 1.4 and removed in 1.6.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    return model, r2, rmse, y_test, y_pred