In [1]:
# imports
import timeit
import pandas as pd
import numpy as np
import pickle

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, \
RandomizedSearchCV, StratifiedKFold, KFold

from sklearn.preprocessing import OneHotEncoder, StandardScaler, MaxAbsScaler, MinMaxScaler
from sklearn.impute import SimpleImputer

from sklearn.compose import ColumnTransformer

from sklearn.pipeline import Pipeline

from pickle import dump, load

from sklearn.metrics import r2_score, mean_squared_error


from sklearn.linear_model import LinearRegression

import warnings

# Filter out any warning
warnings.filterwarnings("ignore")

  from pandas.core.computation.check import NUMEXPR_INSTALLED


## Read in Data

In [2]:
df = pd.read_csv('Data/cleaned_vehicles_df.csv', index_col = 0)
df

Unnamed: 0,posting_year,year_manufactured,manufacturer,model,state,region,price,fuel,miles,cylinders,title_status,transmission,drive,type,paint_color
0,2021,2014,gmc,sierra 1500 crew cab slt,al,auburn,33590,gas,57923.0,8 cylinders,clean,other,,pickup,white
1,2021,2010,chevrolet,silverado 1500,al,auburn,22590,gas,71229.0,8 cylinders,clean,other,,pickup,blue
2,2021,2020,chevrolet,silverado 1500 crew,al,auburn,39590,gas,19160.0,8 cylinders,clean,other,,pickup,red
3,2021,2017,toyota,tundra double cab sr,al,auburn,30990,gas,41124.0,8 cylinders,clean,other,,pickup,red
4,2021,2013,ford,f-150 xlt,al,auburn,15000,gas,128000.0,6 cylinders,clean,automatic,rwd,truck,black
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399878,2021,2019,nissan,maxima s sedan 4d,wy,wyoming,23590,gas,32226.0,6 cylinders,clean,other,fwd,sedan,
399879,2021,2020,volvo,s60 t5 momentum sedan 4d,wy,wyoming,30590,gas,12029.0,,clean,other,fwd,sedan,red
399880,2021,2020,cadillac,xt4 sport suv 4d,wy,wyoming,34990,diesel,4174.0,,clean,other,,hatchback,white
399881,2021,2018,lexus,es 350 sedan 4d,wy,wyoming,28990,gas,30112.0,6 cylinders,clean,other,fwd,sedan,silver


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 399883 entries, 0 to 399882
Data columns (total 15 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   posting_year       399883 non-null  int64  
 1   year_manufactured  399883 non-null  int64  
 2   manufacturer       399883 non-null  object 
 3   model              399883 non-null  object 
 4   state              399883 non-null  object 
 5   region             399883 non-null  object 
 6   price              399883 non-null  int64  
 7   fuel               397874 non-null  object 
 8   miles              399883 non-null  float64
 9   cylinders          235585 non-null  object 
 10  title_status       392710 non-null  object 
 11  transmission       398275 non-null  object 
 12  drive              280317 non-null  object 
 13  type               316647 non-null  object 
 14  paint_color        280565 non-null  object 
dtypes: float64(1), int64(3), object(11)
memory usage: 48.8+ 

### Filtering outliers

In [4]:
print(df['price'].min())
print(df['price'].max())

0
3736928711


In [5]:
# Function to filter outliers using IQR
def filter_iqr_outliers(df, columns):
    for col in columns:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]
    return df


In [6]:
df = filter_iqr_outliers(df, ['price'])

#### Filter out 0 price vehicles

In [7]:
df = df[df['price']>0]

## Build Baseline Models to Predict Price

### Train-Test Split

#### Check number of unique values for categorical fields 

In [8]:
# Initialize an empty dictionary to store results
unique_counts = {}

# Loop through each column in the DataFrame
for col in df.columns:
    if df[col].dtype == 'object':  # Check if column is categorical (object type)
        unique_counts[col] = df[col].nunique()  # Count number of unique values

# Print the number of unique values for each categorical column
for col, count in unique_counts.items():
    print(f"Column '{col}' has {count} unique values.")

Column 'manufacturer' has 41 unique values.
Column 'model' has 21840 unique values.
Column 'state' has 51 unique values.
Column 'region' has 404 unique values.
Column 'fuel' has 5 unique values.
Column 'cylinders' has 8 unique values.
Column 'title_status' has 6 unique values.
Column 'transmission' has 3 unique values.
Column 'drive' has 3 unique values.
Column 'type' has 13 unique values.
Column 'paint_color' has 12 unique values.


In [9]:
# target variable is price; drop posting_year as not relevant
df = df.drop(['posting_year', 'region', 'model'], axis=1)
X = df.drop(['price'], axis = 1)

y = df['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, train_size = 0.90)

In [10]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(327672, 11)
(36409, 11)
(327672,)
(36409,)


## Data Cleaning

Will still need to impute missing values in the following columns:

In [11]:
# Fill missing values with mode for each column
X_train = X_train.apply(lambda x: x.fillna(x.mode()[0]))
X_train.head()

Unnamed: 0,year_manufactured,manufacturer,state,fuel,miles,cylinders,title_status,transmission,drive,type,paint_color
25784,2012,subaru,ca,gas,52355.0,4 cylinders,clean,automatic,4wd,sedan,white
20236,2015,toyota,ca,gas,66732.0,6 cylinders,clean,other,fwd,hatchback,white
276430,2002,toyota,oh,gas,155500.0,6 cylinders,clean,automatic,fwd,convertible,red
344512,2009,gmc,tx,gas,186168.0,8 cylinders,clean,automatic,4wd,pickup,grey
160207,2017,kia,ky,hybrid,30257.0,4 cylinders,clean,automatic,fwd,sedan,white


In [12]:
# helper function to impute missing values in dataframe columns
def impute_cols(dataset):
    '''
    Helper function takes in the dataset and identifies the columns with missing values and imputes the 
    mode value in each column.
    '''
    
    # filter for cols with missing values as a series
    cols = dataset.isnull().sum() >0

    # extract the cols from the series
    missing_cols = cols[cols].index.tolist()
    
    # fill missing cat cols with mode
    mode_values = dataset[missing_cols].mode()
#     display("Mode Values in each column:", mode_values)
    
    # Fill missing values with mode for each column
    imputed_dataset = dataset.apply(lambda x: x.fillna(x.mode()[0]))
    
    return imputed_dataset

In [13]:
X_train = impute_cols(X_train)

In [14]:
X_test = impute_cols(X_test)

#### Calculate Age of Vehicle

In [15]:
# calculate age
X_train['age'] = 2024 - X_train['year_manufactured']
X_test['age'] = 2024 - X_test['year_manufactured']

# drop year_manufactured
X_train = X_train.drop(columns = 'year_manufactured')
X_test = X_test.drop(columns = 'year_manufactured')

### Preprocessing
#### Build `ColumnTransformer`

In [16]:
# define categorical columns to be OHE
cat_cols = X_train.select_dtypes(['object']).columns

# define numeric columns to be standard scaled
num_cols = X_train.select_dtypes(['int', 'float']).columns

In [17]:
# define numeric transformation pipeline that scales the numbers
numeric_pipeline = Pipeline([('numnorm', StandardScaler())]) # apply a standard scaler

# define a nominal transformation pipeline that OHE the cats, and MaxAbsScales the set
nominal_pipeline = Pipeline([('onehotenc', OneHotEncoder(categories="auto", # ohe the cat variables
                                                         sparse_output = False))]) 

In [18]:
# transform the selected columns with nominal, and numeric pipelines
pp_ct = ColumnTransformer(transformers = 
                    [("nominalpipe", nominal_pipeline, cat_cols), 
                     ("numpipe", numeric_pipeline, num_cols)])

#### Save transformer

In [19]:
# save the transformer
dump(pp_ct, open('pkl/pp_ct.pkl', 'wb'))

In [20]:
# load the transformer
pp_ct = load(open('pkl/pp_ct.pkl', 'rb'))

### Transform the training and test sets

In [21]:
pp_X_train = pd.DataFrame(pp_ct.fit_transform(X_train))

In [22]:
pp_X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,134,135,136,137,138,139,140,141,142,143
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-0.249945,-0.073208
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-0.170850,-0.418874
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.317510,1.079010
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.486231,0.272457
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-0.371518,-0.649317
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
327667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,-0.199634,-0.188430
327668,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,-0.365929,-0.994983
327669,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.130557,-0.303652
327670,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.088960,0.157235


In [23]:
pp_X_test = pd.DataFrame(pp_ct.transform(X_test))

In [24]:
pp_X_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,134,135,136,137,138,139,140,141,142,143
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.281904,0.042014
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.053614,1.539897
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.864912,0.733345
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-0.453018,-0.649317
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-0.425587,-0.994983
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36404,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.347768,0.618123
36405,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.012175,0.042014
36406,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.012175,-0.073208
36407,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-0.537978,0.733345


In [25]:
# log transform the target variable
log_y_train = np.log(y_train)
log_y_test = np.log(y_test)

#### Metrics

In [26]:
# create function that gives performance metrics
def metrics(y_true, y_predict):
    """ 
    Calculates and returns the two performance scores between 
    true and predicted values - first R-Squared, then RMSE
    """

    # Calculate the r2 score between 'y_true' and 'y_predict'
    r2 = r2_score(y_true, y_predict)

    # Calculate the root mean squared error between 'y_true' and 'y_predict'
    rmse = mean_squared_error(np.exp(y_true), np.exp(y_predict), squared = False) # False gives RMSE

    # Return the score
    return [r2, rmse]

#### Cross Validation

In [27]:
# define a simple function that returns cross validation score for a 5 fold
def get_cv_score(model, X, y):
    
    # instatiate the model
    instantiated_model = model
    # get model name
    model_name = type(instantiated_model).__name__
    
    # R2 score CV
    cv_r2_score = np.mean(cross_val_score(instantiated_model, 
                                       X, y, 
                                       scoring = 'r2', 
                                       cv = 3))    # 3 folds
    # RMSE score CV
    cv_rmse_score = np.mean(cross_val_score(instantiated_model,
                                           X,
                                           np.exp(y),
                                           scoring = 'neg_root_mean_squared_error',
                                           cv = 3))  # 3 folds
    cross_val_r2 = round(cv_r2_score, 4)
    cross_val_rmse = -round(cv_rmse_score)
    
    print(model_name, f"Cross Validation R2: {cross_val_r2}")
    print(model_name, f"Cross Validation RMSE: {cross_val_rmse}")

### Modeling
#### Baseline Model

In [47]:
def model_results(model, X_train, y_train, X_test, y_test, early_stopping_rounds=None):
    '''
    Helper function that takes input of model, and train-test split sets 
    and returns the model R2, RMSE scores
    '''
    
    # preprocess the training and test sets
    pp_X_train = pd.DataFrame(pp_ct.fit_transform(X_train))
    pp_X_test = pd.DataFrame(pp_ct.transform(X_test))
    # log scale the y_train and y_test
    log_y_train = np.log(y_train)
    log_y_test = np.log(y_test)
    
    # instatiate the model
    instantiated_model = model
    # get model name
    model_name = type(instantiated_model).__name__
    
    # Start the timer
    start_time = timeit.default_timer()
    print("BEGIN TRAINING")
    
    # Fit the model with early stopping
    if early_stopping_rounds:
        instantiated_model.fit(pp_X_train, log_y_train,
                               early_stopping_rounds=early_stopping_rounds,
                               eval_set=[(pp_X_test, log_y_test)],
                               verbose=False)
    else:
        instantiated_model.fit(pp_X_train, log_y_train)
    
#     # fit the model
#     instantiated_model.fit(pp_X_train, log_y_train)
    
    # Stop the timer
    end_time = timeit.default_timer()
    print("Training COMPLETE; Calculating predictions...")
    
    # Make predictions on the training and test data
    y_pred_train = instantiated_model.predict(pp_X_train)
    y_pred_test = instantiated_model.predict(pp_X_test)

    # Calculate performance using the metrics() function 
    train_scores = metrics(log_y_train, y_pred_train)
    test_scores = metrics(log_y_test, y_pred_test)
    # R2
    train_r2 = round(train_scores[0],4)
    test_r2 = round(test_scores[0],4)
    #RMSE
    train_rmse = round(train_scores[1])
    test_rmse = round(test_scores[1])
    
    # Calculate elapsed time
    elapsed_time = end_time - start_time
    
    # Print elapsed time
    print(f"Elapsed time: {elapsed_time:.3f} seconds")
    print("")
    # Training
    print(model_name, f"Training R2: {train_r2}") 
    print(model_name, f"Test R2: {test_r2}") 
    print('-----' * 11)
    # Testing
    print(model_name, f"Training RMSE: {train_rmse}") 
    print(model_name, f"Model Test RMSE: {test_rmse}")
    print('-----' * 11)
    # Validation
    if elapsed_time < 30:
        get_cv_score(instantiated_model, pp_X_train, log_y_train)
        print('----' * 5, 'END OF TRAINING', '----' * 5)
    else:
        print('Cross Validation Skipped')

### Linear Regression

In [31]:
model_results(LinearRegression(), X_train, y_train, X_test, y_test)

BEGIN TRAINING
Training COMPLETE; Calculating predictions...
Elapsed time: 12.282 seconds

LinearRegression Training R2: 0.2496
LinearRegression Test R2: 0.2595
-------------------------------------------------------
LinearRegression Training RMSE: 9945
LinearRegression Model Test RMSE: 9984
-------------------------------------------------------
LinearRegression Cross Validation R2: 0.2485
LinearRegression Cross Validation RMSE: 9223
-------------------- END OF TRAINING --------------------


### XGBoost

In [33]:
import xgboost as xgb
from xgboost import XGBRegressor

In [46]:
model_results(XGBRegressor(), X_train, y_train, X_test, y_test)

BEGIN TRAINING
Training COMPLETE; Calculating predictions...
Elapsed time: 4.135 seconds

XGBRegressor Training R2: 0.5772
XGBRegressor Test R2: 0.5584
-------------------------------------------------------
XGBRegressor Training RMSE: 7062
XGBRegressor Model Test RMSE: 7218
-------------------------------------------------------
XGBRegressor Cross Validation R2: 0.537
XGBRegressor Cross Validation RMSE: 6028
4.134692624999843
-------------------- END OF TRAINING --------------------


In [35]:
from sklearn.neural_network import MLPRegressor

In [43]:
model_results(MLPRegressor(), X_train, y_train, X_test, y_test)

BEGIN TRAINING
Training COMPLETE; Calculating predictions...
Elapsed time: 177.856 seconds

MLPRegressor Training R2: 0.5664
MLPRegressor Test R2: 0.5382
-------------------------------------------------------
MLPRegressor Training RMSE: 7778
MLPRegressor Model Test RMSE: 8162
-------------------------------------------------------
Cross Validation Skipped


In [48]:
from sklearn.ensemble import GradientBoostingRegressor

In [49]:
model_results(GradientBoostingRegressor(), X_train, y_train, X_test, y_test)

BEGIN TRAINING


KeyboardInterrupt: 

### Optuna Hyperparameter Tuning

In [39]:
import optuna

In [40]:
def objective(trial):
    params = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'booster': 'gbtree',
        'lambda': trial.suggest_loguniform('lambda', 1e-8, 1.0),
        'alpha': trial.suggest_loguniform('alpha', 1e-8, 1.0),
        'max_depth': trial.suggest_int('max_depth', 3, 9),
        'eta': trial.suggest_loguniform('eta', 1e-2, 0.5),
        'gamma': trial.suggest_loguniform('gamma', 1e-8, 1.0),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
        'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
    }
    
    model = XGBRegressor(**params) # need to specify model

    # Train XGBoost model
    model_results(model, X_train, y_train, X_test, y_test, early_stopping_rounds=100)

    # Get validation RMSE
    rmse = model.best_score

    return rmse

In [41]:
# Run Optuna optimization
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=10)

# Print best parameters and best score
xgb_best_params = study.best_params
xgb_best_score = study.best_value

print("Best parameters:", xgb_best_params)
print("Best RMSE:", xgb_best_score)

[I 2024-07-06 11:14:41,114] A new study created in memory with name: no-name-bbec12ab-2dad-4e99-9ce6-dbcccfac83bc


BEGIN TRAINING
Training COMPLETE; Calculating predictions...
Elapsed time: 8.422 seconds

XGBRegressor Training R2: 0.4175
XGBRegressor Test R2: 0.4211
-------------------------------------------------------


[I 2024-07-06 11:15:20,016] Trial 0 finished with value: 0.9175340106966285 and parameters: {'lambda': 1.4437306899267388e-05, 'alpha': 0.0023524153503647763, 'max_depth': 8, 'eta': 0.016178366904966726, 'gamma': 0.12715444777124466, 'colsample_bytree': 0.594068002177957, 'subsample': 0.7864959092234824, 'min_child_weight': 7}. Best is trial 0 with value: 0.9175340106966285.


XGBRegressor Cross Validation R2: 0.4072
XGBRegressor Cross Validation RMSE: 7472
-------------------------------------------------------
XGBRegressor Training RMSE: 9390
XGBRegressor Model Test RMSE: 9465
-------------------- END OF TRAINING --------------------
BEGIN TRAINING
Training COMPLETE; Calculating predictions...
Elapsed time: 8.270 seconds

XGBRegressor Training R2: 0.4146
XGBRegressor Test R2: 0.4197
-------------------------------------------------------


[I 2024-07-06 11:15:58,209] Trial 1 finished with value: 0.9186663769137461 and parameters: {'lambda': 0.07524142228515465, 'alpha': 0.0009656595363994852, 'max_depth': 7, 'eta': 0.01695689536379905, 'gamma': 0.00015366852816393, 'colsample_bytree': 0.9572732618993416, 'subsample': 0.5944331874971034, 'min_child_weight': 4}. Best is trial 0 with value: 0.9175340106966285.


XGBRegressor Cross Validation R2: 0.4085
XGBRegressor Cross Validation RMSE: 7356
-------------------------------------------------------
XGBRegressor Training RMSE: 9157
XGBRegressor Model Test RMSE: 9218
-------------------- END OF TRAINING --------------------
BEGIN TRAINING
Training COMPLETE; Calculating predictions...
Elapsed time: 7.956 seconds

XGBRegressor Training R2: 0.5357
XGBRegressor Test R2: 0.5301
-------------------------------------------------------


[I 2024-07-06 11:16:34,784] Trial 2 finished with value: 0.8267136023740482 and parameters: {'lambda': 1.657632253711965e-06, 'alpha': 1.7523531473707687e-08, 'max_depth': 8, 'eta': 0.04898855022073932, 'gamma': 1.3610014198978879e-06, 'colsample_bytree': 0.7774800575185468, 'subsample': 0.6604015729385491, 'min_child_weight': 6}. Best is trial 2 with value: 0.8267136023740482.


XGBRegressor Cross Validation R2: 0.5132
XGBRegressor Cross Validation RMSE: 6278
-------------------------------------------------------
XGBRegressor Training RMSE: 7425
XGBRegressor Model Test RMSE: 7551
-------------------- END OF TRAINING --------------------
BEGIN TRAINING
Training COMPLETE; Calculating predictions...
Elapsed time: 6.079 seconds

XGBRegressor Training R2: 0.4457
XGBRegressor Test R2: 0.4515
-------------------------------------------------------


[I 2024-07-06 11:17:01,041] Trial 3 finished with value: 0.893175156840681 and parameters: {'lambda': 0.0029359686308163403, 'alpha': 0.03138937546427548, 'max_depth': 4, 'eta': 0.15604814823180135, 'gamma': 0.01313128205633356, 'colsample_bytree': 0.5809885859537891, 'subsample': 0.638439722547011, 'min_child_weight': 4}. Best is trial 2 with value: 0.8267136023740482.


XGBRegressor Cross Validation R2: 0.4383
XGBRegressor Cross Validation RMSE: 6718
-------------------------------------------------------
XGBRegressor Training RMSE: 7914
XGBRegressor Model Test RMSE: 7982
-------------------- END OF TRAINING --------------------
BEGIN TRAINING
Training COMPLETE; Calculating predictions...
Elapsed time: 6.828 seconds

XGBRegressor Training R2: 0.447
XGBRegressor Test R2: 0.4546
-------------------------------------------------------


[I 2024-07-06 11:17:29,450] Trial 4 finished with value: 0.8906378622994044 and parameters: {'lambda': 3.614380016000055e-05, 'alpha': 5.904717047425582e-07, 'max_depth': 5, 'eta': 0.081574650079265, 'gamma': 2.0356123006090322e-05, 'colsample_bytree': 0.7435489787833414, 'subsample': 0.9291374666823375, 'min_child_weight': 7}. Best is trial 2 with value: 0.8267136023740482.


XGBRegressor Cross Validation R2: 0.4403
XGBRegressor Cross Validation RMSE: 6708
-------------------------------------------------------
XGBRegressor Training RMSE: 7900
XGBRegressor Model Test RMSE: 7973
-------------------- END OF TRAINING --------------------
BEGIN TRAINING
Training COMPLETE; Calculating predictions...
Elapsed time: 6.447 seconds

XGBRegressor Training R2: 0.513
XGBRegressor Test R2: 0.5142
-------------------------------------------------------


[I 2024-07-06 11:17:58,063] Trial 5 finished with value: 0.8405102514001523 and parameters: {'lambda': 0.05598349216720763, 'alpha': 1.5079717565038976e-06, 'max_depth': 6, 'eta': 0.15258711535445335, 'gamma': 5.445419599442805e-08, 'colsample_bytree': 0.5182570177565855, 'subsample': 0.5689203541956267, 'min_child_weight': 5}. Best is trial 2 with value: 0.8267136023740482.


XGBRegressor Cross Validation R2: 0.4992
XGBRegressor Cross Validation RMSE: 6295
-------------------------------------------------------
XGBRegressor Training RMSE: 7423
XGBRegressor Model Test RMSE: 7517
-------------------- END OF TRAINING --------------------
BEGIN TRAINING
Training COMPLETE; Calculating predictions...
Elapsed time: 7.094 seconds

XGBRegressor Training R2: 0.6986
XGBRegressor Test R2: 0.6205
-------------------------------------------------------


[I 2024-07-06 11:18:29,495] Trial 6 finished with value: 0.742916414934958 and parameters: {'lambda': 1.468123700185511e-06, 'alpha': 0.0764807160567634, 'max_depth': 9, 'eta': 0.44155991085855784, 'gamma': 0.02760574517247062, 'colsample_bytree': 0.8056873452659543, 'subsample': 0.9591805450975721, 'min_child_weight': 6}. Best is trial 6 with value: 0.742916414934958.


XGBRegressor Cross Validation R2: 0.5938
XGBRegressor Cross Validation RMSE: 5397
-------------------------------------------------------
XGBRegressor Training RMSE: 6283
XGBRegressor Model Test RMSE: 6857
-------------------- END OF TRAINING --------------------
BEGIN TRAINING
Training COMPLETE; Calculating predictions...
Elapsed time: 6.199 seconds

XGBRegressor Training R2: 0.3439
XGBRegressor Test R2: 0.3521
-------------------------------------------------------


[I 2024-07-06 11:18:56,390] Trial 7 finished with value: 0.970707609164834 and parameters: {'lambda': 3.180782759039164e-08, 'alpha': 1.68634331393793e-06, 'max_depth': 3, 'eta': 0.03891926929694263, 'gamma': 9.154432261411867e-07, 'colsample_bytree': 0.9416417394314329, 'subsample': 0.6748131403113629, 'min_child_weight': 10}. Best is trial 6 with value: 0.742916414934958.


XGBRegressor Cross Validation R2: 0.3454
XGBRegressor Cross Validation RMSE: 7520
-------------------------------------------------------
XGBRegressor Training RMSE: 9054
XGBRegressor Model Test RMSE: 9102
-------------------- END OF TRAINING --------------------
BEGIN TRAINING
Training COMPLETE; Calculating predictions...
Elapsed time: 6.412 seconds

XGBRegressor Training R2: 0.3178
XGBRegressor Test R2: 0.3256
-------------------------------------------------------


[I 2024-07-06 11:19:24,259] Trial 8 finished with value: 0.9903527885258662 and parameters: {'lambda': 0.2764272048175947, 'alpha': 4.3034455110276314e-08, 'max_depth': 3, 'eta': 0.025827731740233836, 'gamma': 0.000147523218739415, 'colsample_bytree': 0.6375892493817061, 'subsample': 0.7429171132630243, 'min_child_weight': 5}. Best is trial 6 with value: 0.742916414934958.


XGBRegressor Cross Validation R2: 0.3175
XGBRegressor Cross Validation RMSE: 7893
-------------------------------------------------------
XGBRegressor Training RMSE: 9682
XGBRegressor Model Test RMSE: 9735
-------------------- END OF TRAINING --------------------
BEGIN TRAINING
Training COMPLETE; Calculating predictions...
Elapsed time: 6.500 seconds

XGBRegressor Training R2: 0.257
XGBRegressor Test R2: 0.2625
-------------------------------------------------------


[I 2024-07-06 11:19:52,265] Trial 9 finished with value: 1.035658288195429 and parameters: {'lambda': 1.2745285094701467e-08, 'alpha': 0.08951157649318156, 'max_depth': 3, 'eta': 0.013135019961572899, 'gamma': 6.801784281928988e-08, 'colsample_bytree': 0.7227579105855002, 'subsample': 0.7292660288161577, 'min_child_weight': 8}. Best is trial 6 with value: 0.742916414934958.


XGBRegressor Cross Validation R2: 0.2565
XGBRegressor Cross Validation RMSE: 8830
-------------------------------------------------------
XGBRegressor Training RMSE: 10872
XGBRegressor Model Test RMSE: 10925
-------------------- END OF TRAINING --------------------
Best parameters: {'lambda': 1.468123700185511e-06, 'alpha': 0.0764807160567634, 'max_depth': 9, 'eta': 0.44155991085855784, 'gamma': 0.02760574517247062, 'colsample_bytree': 0.8056873452659543, 'subsample': 0.9591805450975721, 'min_child_weight': 6}
Best RMSE: 0.742916414934958


In [42]:
# Save best params to disk using pickle
with open('pkl/xgb_best_params.pkl', 'wb') as f:
    pickle.dump(xgb_best_params, f)

print("Best parameters saved to pkl/xgb_best_params.pkl:")
print(xgb_best_params)

Best parameters saved to pkl/xgb_best_params.pkl:
{'lambda': 1.468123700185511e-06, 'alpha': 0.0764807160567634, 'max_depth': 9, 'eta': 0.44155991085855784, 'gamma': 0.02760574517247062, 'colsample_bytree': 0.8056873452659543, 'subsample': 0.9591805450975721, 'min_child_weight': 6}


In [43]:
class Car:
    # Constructor method to initialize the attributes
    def __init__(self, brand, model, year):
        self.brand = brand
        self.model = model
        self.year = year
        self.odometer_reading = 1800  # Default attribute
    
    # Method to get a formatted descriptive name
    def get_descriptive_name(self):
        long_name = f"{self.year} {self.brand} {self.model}"
        return long_name.title()
    
    # Method to read the odometer
    def read_odometer(self):
        print(f"This car has {self.odometer_reading} miles on it.")
    
    # Method to update the odometer reading
    def update_odometer(self, mileage):
        if mileage >= self.odometer_reading:
            self.odometer_reading = mileage
        else:
            print("You cannot roll back the odometer!")
    
    # Method to increment the odometer reading
    def increment_odometer(self, miles):
        self.odometer_reading += miles

# Creating instances (objects) of the Car class
my_car = Car('audi', 'a4', 2020)
my_old_car = Car('ford', 'mustang', 1969)

# Accessing attributes and calling methods
print(my_car.get_descriptive_name())  # Output: 2020 Audi A4
my_old_car.read_odometer()  # Output: This car has 0 miles on it.

# Modifying attributes using methods
my_car.update_odometer(1500)
my_car.increment_odometer(100)
my_car.read_odometer()  # Output: This car has 1600 miles on it.


2020 Audi A4
This car has 1800 miles on it.
You cannot roll back the odometer!
This car has 1900 miles on it.
