In [8]:
#load the packages
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
import random

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import GradientBoostingRegressor #Gradient Boosting regressor
from sklearn.feature_selection import VarianceThreshold
import xgboost as xgb
import torch
import glob
import os
import joblib
from datetime import datetime
from joblib import dump

%run Functions/Data_Transformation_Functions.ipynb
%run Functions/Basic_Functions.ipynb
%run Functions/XGB_Functions.ipynb

# Import and Prepare Data

Import Data

In [2]:
# Folder that holds the dummy-transformed CSV exports.
# Built with os.path.join so the separators match the host OS; the trailing ""
# keeps the final separator that the original hard-coded string ended with.
input_path = os.path.join("Data", "Transformed Data", "Dummy Transformed", "")

In [3]:
# Which file to pick from input_path: 1 = most recent, 2 = second most recent, ...
n_most_recent_file = 1

# Resolve the file via the helper loaded from the Functions notebooks and report the choice.
most_recent_file = get_most_recent_file(input_path, n_most_recent_file)
print(f"{n_most_recent_file}. most recent file:\n\n{most_recent_file}")

# Read the selected CSV into a DataFrame.
dummy_data = pd.read_csv(most_recent_file, low_memory=False)
print("Imported Dummy Data!")

1. most recent file:

Data\Transformed Data\Dummy Transformed\data_dummied_2024_07_13 12-23.csv
Imported Dummy Data!


Prepare Data

In [4]:
# Keep the URL and scrape-date columns aside as their own Series.
urls, dates_scraped = dummy_data["URL"], dummy_data["Date_scraped"]

# Everything except those two columns is handed to the model.
model_data = dummy_data.drop(["URL", "Date_scraped"], axis="columns")

# Multi Model Creation

Set Parameters

In [5]:
# Settings passed to run_xgb_optimizer in the next cell.
# Presumably each list holds the candidate values to try for that
# hyperparameter - TODO confirm against XGB_Functions.
n_estimators_list = [10_000, 20_000]
max_depths_list = [9]
lr_list = [0.01]
es_rounds_list = [10]

# presumably the fraction of model_data used for this run - TODO confirm
data_frac = 0.2

Create Models

In [9]:
# Run the optimizer helper with the settings defined above; it hands back
# the models together with an overview object.
models, overview_df = run_xgb_optimizer(
    model_data,
    n_estimators_list,
    max_depths_list,
    lr_list,
    es_rounds_list,
    data_frac,
)

Get Overview

In [None]:
# Show the overview returned by run_xgb_optimizer as this cell's output.
overview_df

# Single Model Creation

Set Parameters

In [5]:
# Settings for the single model built by create_xgb_model in the next cell.
n_estimators = 2_000
max_depth = 9
learning_rate = 0.1
early_stopping_rounds = 10

# NOTE: this rebinds data_frac, which the multi-model section set to 0.2.
data_frac = 1

Create Models

In [10]:
# Build one model with the settings above. The helper returns the fitted model,
# its R-squared and RMSE, and the test targets with their predictions.
single_model, r2, rmse, y_test, y_pred = create_xgb_model(
    model_data=model_data,
    n_estimators=n_estimators,
    max_depth=max_depth,
    learning_rate=learning_rate,
    early_stopping_rounds=early_stopping_rounds,
    data_frac=data_frac,
)

Get Overview

In [7]:
# Report the two scores returned by create_xgb_model for the single model.
print(f"R-squared: {r2}, RMSE: {rmse}")

R-squared: 0.9661815253905058, RMSE: 4938.983994716927


Calculate Mean Absolute Percentage Error

In [13]:
# Mean absolute percentage error, expressed as a fraction (not multiplied by 100).
# The denominator is |y_test| so a negative target cannot turn its term negative
# and pull the average down; for all-positive targets the value is unchanged.
np.mean(np.abs(y_test - y_pred) / np.abs(y_test))

0.1231206692388009

# Save Final Model

In [13]:
# Set to False to run the notebook without writing a model file.
save_model = True

if save_model:

    now = datetime.now()

    # Minute-resolution timestamp in the file name keeps saves from different runs apart.
    formatted_time = now.strftime("%Y_%m_%d %H-%M")
    output_file = f"Models/xgb_model_{formatted_time}.joblib"

    # Create the target folder first: on a fresh checkout it may not exist,
    # and dump would then fail with FileNotFoundError.
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    dump(single_model, output_file)