In [8]:
#load the packages
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
import random

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import GradientBoostingRegressor #Gradient Boosting regressor
from sklearn.feature_selection import VarianceThreshold
import xgboost as xgb
import torch
import glob
import os
import joblib
from datetime import datetime
from joblib import dump

%run Functions/Data_Transformation_Functions.ipynb
%run Functions/Basic_Functions.ipynb
%run Functions/XGB_Functions.ipynb

More explanation can be found in readme.txt.

# Import and Prepare Data

In [None]:
# Progress banner: marks the start of step 1 (importing the dummied data) in the run output
print("######################### 1. Importing Dummied Data #########################")

Import Data

In [2]:
# Folder that holds the dummy-transformed CSV exports.
# Built with os.path.join so the separator matches the host OS (the previous literal
# hard-coded Windows backslashes). The empty last component keeps the trailing
# separator the original string had, in case the file-lookup helper concatenates
# file names directly onto this path.
input_path = os.path.join("Data", "Transformed Data", "Dummy Transformed", "")

In [3]:
# Pick which export to load, counted from the newest:
# 1 = most recent file, 2 = second most recent, ...
n_most_recent_file = 1 # rank of the file to load (1 = newest)
# get_most_recent_file comes from one of the %run helper notebooks; its result is passed
# straight to pd.read_csv below, so it is expected to be a file path inside input_path.
most_recent_file = get_most_recent_file(input_path, n_most_recent_file)
print(f"{n_most_recent_file}. most recent file:\n\n{most_recent_file}")

# Read the selected CSV into a DataFrame.
# low_memory = False disables pandas' chunked parsing, so column dtypes are inferred
# from the whole file instead of piece by piece.
dummy_data = pd.read_csv(most_recent_file, low_memory = False)
print("Imported Dummy Data!")

1. most recent file:

Data\Transformed Data\Dummy Transformed\data_dummied_2024_07_13 12-23.csv
Imported Dummy Data!


Prepare Data

In [4]:
# Set the two bookkeeping columns (listing URL and scrape date) aside
urls, dates_scraped = dummy_data["URL"], dummy_data["Date_scraped"]

# model_data (pd.DataFrame): every remaining column; this is the table handed to
# model training and testing
model_data = dummy_data.drop(["URL", "Date_scraped"], axis="columns")

In [None]:
# Closing banner for step 1: one blank line above, two blank lines below
print()
print("######################### 1. Finished Importing Dummied Data #########################")
print()
print()

# Single Model Creation

In [None]:
# Progress banner: marks the start of step 2 (creating the XGB model) in the run output
print("######################### 2. Creating XGB Model #########################")

Set Parameters

In [5]:
# Hyperparameters forwarded to create_xgb_model in the next cell.
# The "default" values quoted below are those stated in the original notes for
# create_xgb_model - TODO confirm against its definition in Functions/XGB_Functions.ipynb.

# n_estimators (int): number of boosting rounds (stated default: 1000).
n_estimators = 1000
# max_depth (int): maximum depth of each tree (stated default: 9). Deeper trees can model
# more complex patterns but also increase the risk of overfitting.
max_depth = 9
# learning_rate (float): step size shrinkage, also known as eta (stated default: 0.1).
# Scales the contribution of each tree; smaller values are more robust to overfitting
# but require more trees.
learning_rate = 0.1
# early_stopping_rounds (int): activates early stopping (stated default: 10). Training stops
# when the validation error has not improved for this many consecutive rounds.
early_stopping_rounds = 10
# data_frac (float): fraction of the data used for training and testing (stated default: 1).
# Values below 1 use a subset, which is useful for quick experiments.
data_frac = 1

Create Models

In [10]:
# Train one XGB model with the parameters chosen above.
# The helper hands back the fitted model, its R-squared and RMSE, and the target/prediction
# pair used for evaluation (presumably from the held-out test split - verify in
# Functions/XGB_Functions.ipynb).
single_model, r2, rmse, y_test, y_pred = create_xgb_model(
    model_data=model_data,
    n_estimators=n_estimators,
    max_depth=max_depth,
    learning_rate=learning_rate,
    early_stopping_rounds=early_stopping_rounds,
    data_frac=data_frac,
)

Get Overview

In [7]:
# Report the two quality metrics of the trained model, preceded by a blank line
print()
print("Model Results:")
print(f"R-squared: {r2}, RMSE: {rmse}")

R-squared: 0.9661815253905058, RMSE: 4938.983994716927


Calculate Average Percentage Error

In [13]:
# Mean absolute percentage error, expressed as a fraction (0.12 == 12 %).
# The absolute value wraps the whole ratio: with abs() applied to the numerator only,
# a negative target would produce a negative term that cancels part of the other errors.
# NOTE: a target of exactly 0 still yields inf/nan for that row.
print(f"Average Procentual Prediction Error: {np.mean(np.abs((y_test - y_pred) / y_test))}")

0.1231206692388009

In [None]:
# Closing banner for step 2, followed by two blank lines
print("######################### 2. Finished Creating XGB Model #########################")
print()
print()

# Save Final Model

In [None]:
# Progress banner: marks the start of the model-saving step in the run output
print("######################### 2. Saving XGB Model #########################")

In [13]:
# Switch: set to False to run the notebook without writing a model file
save_model = True

if save_model:

    now = datetime.now()

    # Timestamp with minute resolution, e.g. "2024_07_13 12-23"; it makes each saved
    # model file name unique per run (two saves within the same minute overwrite)
    formatted_time = now.strftime("%Y_%m_%d %H-%M")
    output_file = f"Models/xgb_model_{formatted_time}.joblib"

    # Make sure the target folder exists so saving does not fail on a fresh checkout
    os.makedirs(os.path.dirname(output_file), exist_ok=True)

    # Serialize the trained model with joblib
    dump(single_model, output_file)

In [None]:
# Closing banner for the saving step: one blank line above, two blank lines below
print()
print("######################### 2. Finished Saving XGB Model #########################")
print()
print()

# Clear Work Space

Automatically clear the workspace at the end of the script so that no temporary variables are kept when this script is run via the Major Pipeline Terminal.

In [None]:
# Switch: set to False to keep the variables alive for interactive inspection
clear_workspace = True
if clear_workspace:
    # IPython magic: removes all user-defined names from the namespace;
    # -f skips the confirmation prompt so the notebook can run unattended
    %reset -f