In [171]:
%pip install --upgrade pip --index-url https://pypi.org/simple -q
%pip install -r requirements.txt --index-url https://pypi.org/simple -q

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [172]:
import os
from datetime import datetime

import numpy as np
import pandas as pd
import sklearn
import xgboost
from scipy import stats
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor

# Report the versions of the core libraries so the recorded results can be
# tied to a specific environment.
for library_label, library_module in (
    ("NumPy", np),
    ("Pandas", pd),
    ("Scikit-learn", sklearn),
    ("XGBoost", xgboost),
):
    print(f"{library_label} version: {library_module.__version__}")

NumPy version: 1.24.3
Pandas version: 1.4.4
Scikit-learn version: 1.4.0
XGBoost version: 2.1.2


In [173]:
# Lift pandas' column limit so printed frames show every column.
pd.set_option('display.max_columns', None)

# Read the curated dataset; the path is relative, so it resolves against the
# kernel's current working directory.
file_path = '../resources/curated_input.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

# Plain-text preview of the first five rows.
print(df.head(n=5))

          CarName fueltype aspiration doornumber    carbody drivewheel  \
0  toyota corolla   diesel        std       four  hatchback        fwd   
1   toyota carina      gas        std       four      wagon        4wd   
2  toyota corolla      gas        std        two  hatchback        rwd   
3   toyota corona      gas        std        two  hatchback        rwd   
4     nissan otti      gas        std       four      sedan        fwd   

  enginelocation  wheelbase    color  carlength  carwidth  carheight  \
0          front       95.7   yellow      166.3      64.4       52.8   
1          front       95.7   purple      169.7      63.6       59.1   
2          front       98.4     navy      176.2      65.6       52.0   
3          front      102.9  fuchsia      183.5      67.7       52.0   
4          front      100.4   yellow      184.6      66.5       55.1   

   curbweight  cylindernumber  enginesize  compressionratio  horsepower  \
0        2275             4.0         110      

In [174]:
# Summarize the loaded frame: index range plus each column's non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 235 entries, 0 to 234
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   CarName           235 non-null    object 
 1   fueltype          235 non-null    object 
 2   aspiration        235 non-null    object 
 3   doornumber        235 non-null    object 
 4   carbody           235 non-null    object 
 5   drivewheel        235 non-null    object 
 6   enginelocation    235 non-null    object 
 7   wheelbase         235 non-null    float64
 8   color             235 non-null    object 
 9   carlength         235 non-null    float64
 10  carwidth          235 non-null    float64
 11  carheight         235 non-null    float64
 12  curbweight        235 non-null    int64  
 13  cylindernumber    235 non-null    float64
 14  enginesize        235 non-null    int64  
 15  compressionratio  235 non-null    float64
 16  horsepower        235 non-null    float64
 1

In [175]:
# Data preprocessing
# Label-encode every object-dtype column of `df` in place and keep each fitted
# encoder in `label_encoders` (keyed by column name) so the integer codes can be
# mapped back to the original categories later.
# NOTE(review): the encoders are fit on the full frame before the train/test
# split below -- confirm that is acceptable for this evaluation.
label_encoders = {}
categorical_columns = df.select_dtypes(include=['object']).columns

for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Split into features (everything except 'Price') and target ('Price'),
# holding out 20% of the rows for testing with a fixed seed.
X = df.drop('Price', axis=1)
y = df['Price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the XGBoost model.
# (This statement previously carried stray leading indentation, which is an
# IndentationError at the top level of a cell.)
xgb_model = XGBRegressor(
    n_estimators=100,
    learning_rate=0.06,
    gamma=0,
    subsample=0.8,
    colsample_bytree=1,
    max_depth=7,
)
xgb_model.fit(X_train, y_train)

# Evaluate on the held-out test set.
y_pred = xgb_model.predict(X_test)
# RMSE is computed as sqrt(MSE): the `squared=False` argument is deprecated
# since scikit-learn 1.4 and removed in 1.6, while this form works on all versions.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
print(f"RMSE: {rmse}")
print(f"R^2: {r2}")

RMSE: 1912.0344674779114
R^2: 0.9614432764528569




In [176]:
# Persist the trained model as JSON under a timestamped file name
# (second resolution), so runs at different times write separate files.
# NOTE: this rebinds `file_path`, which earlier held the input CSV path.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
file_path = f'../models/xgb_model_{timestamp}.json'

# Create the target directory first: saving fails if it does not exist
# (e.g. on a fresh checkout), and exist_ok=True makes re-runs harmless.
os.makedirs(os.path.dirname(file_path), exist_ok=True)
xgb_model.save_model(file_path)

[CV] END colsample_bytree=0.75, gamma=0.2, learning_rate=0.08, max_depth=5, n_estimators=200, subsample=1.0; total time=   0.3s
[CV] END colsample_bytree=0.75, gamma=0.2, learning_rate=0.08, max_depth=5, n_estimators=300, subsample=0.5; total time=   0.4s
[CV] END colsample_bytree=0.75, gamma=0.2, learning_rate=0.08, max_depth=7, n_estimators=100, subsample=1.0; total time=   0.2s
[CV] END colsample_bytree=0.75, gamma=0.2, learning_rate=0.08, max_depth=7, n_estimators=100, subsample=1.0; total time=   0.2s
[CV] END colsample_bytree=0.75, gamma=0.2, learning_rate=0.08, max_depth=7, n_estimators=300, subsample=0.5; total time=   0.5s
[CV] END colsample_bytree=0.75, gamma=0.2, learning_rate=0.08, max_depth=7, n_estimators=300, subsample=0.5; total time=   0.5s
[CV] END colsample_bytree=0.75, gamma=0.2, learning_rate=0.08, max_depth=9, n_estimators=200, subsample=0.5; total time=   0.3s
[CV] END colsample_bytree=0.75, gamma=0.2, learning_rate=0.08, max_depth=9, n_estimators=200, subsample=

[CV] END colsample_bytree=1.0, gamma=0, learning_rate=0.08, max_depth=5, n_estimators=300, subsample=1.0; total time=   0.5s
[CV] END colsample_bytree=1.0, gamma=0, learning_rate=0.08, max_depth=7, n_estimators=100, subsample=0.5; total time=   0.2s
[CV] END colsample_bytree=1.0, gamma=0, learning_rate=0.08, max_depth=7, n_estimators=200, subsample=1.0; total time=   0.4s
[CV] END colsample_bytree=1.0, gamma=0, learning_rate=0.08, max_depth=7, n_estimators=200, subsample=1.0; total time=   0.4s
[CV] END colsample_bytree=1.0, gamma=0, learning_rate=0.08, max_depth=9, n_estimators=100, subsample=0.75; total time=   0.2s
[CV] END colsample_bytree=1.0, gamma=0, learning_rate=0.08, max_depth=9, n_estimators=100, subsample=0.75; total time=   0.2s
[CV] END colsample_bytree=1.0, gamma=0, learning_rate=0.08, max_depth=9, n_estimators=200, subsample=1.0; total time=   0.5s
[CV] END colsample_bytree=1.0, gamma=0, learning_rate=0.08, max_depth=9, n_estimators=200, subsample=1.0; total time=   0.5

[CV] END colsample_bytree=0.75, gamma=0.1, learning_rate=0.05, max_depth=7, n_estimators=200, subsample=0.75; total time=   0.3s
[CV] END colsample_bytree=0.75, gamma=0.1, learning_rate=0.05, max_depth=7, n_estimators=200, subsample=0.75; total time=   0.3s
[CV] END colsample_bytree=0.75, gamma=0.1, learning_rate=0.05, max_depth=9, n_estimators=100, subsample=0.5; total time=   0.2s
[CV] END colsample_bytree=0.75, gamma=0.1, learning_rate=0.05, max_depth=9, n_estimators=100, subsample=0.5; total time=   0.2s
[CV] END colsample_bytree=0.75, gamma=0.1, learning_rate=0.05, max_depth=9, n_estimators=100, subsample=1.0; total time=   0.2s
[CV] END colsample_bytree=0.75, gamma=0.1, learning_rate=0.05, max_depth=9, n_estimators=100, subsample=1.0; total time=   0.2s
[CV] END colsample_bytree=0.75, gamma=0.1, learning_rate=0.05, max_depth=9, n_estimators=300, subsample=0.75; total time=   0.5s
[CV] END colsample_bytree=0.75, gamma=0.1, learning_rate=0.05, max_depth=9, n_estimators=300, subsamp

[CV] END colsample_bytree=1.0, gamma=0, learning_rate=0.01, max_depth=9, n_estimators=100, subsample=0.75; total time=   0.2s
[CV] END colsample_bytree=1.0, gamma=0, learning_rate=0.01, max_depth=9, n_estimators=200, subsample=0.75; total time=   0.4s
[CV] END colsample_bytree=1.0, gamma=0, learning_rate=0.01, max_depth=9, n_estimators=200, subsample=1.0; total time=   0.4s
[CV] END colsample_bytree=1.0, gamma=0, learning_rate=0.05, max_depth=5, n_estimators=100, subsample=0.75; total time=   0.2s
[CV] END colsample_bytree=1.0, gamma=0, learning_rate=0.05, max_depth=5, n_estimators=100, subsample=0.75; total time=   0.2s
[CV] END colsample_bytree=1.0, gamma=0, learning_rate=0.05, max_depth=5, n_estimators=200, subsample=0.5; total time=   0.3s
[CV] END colsample_bytree=1.0, gamma=0, learning_rate=0.05, max_depth=5, n_estimators=200, subsample=0.75; total time=   0.3s
[CV] END colsample_bytree=1.0, gamma=0, learning_rate=0.05, max_depth=5, n_estimators=300, subsample=1.0; total time=   