In [21]:
import os
import numpy as np
import xgboost as xb
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.pipeline import make_pipeline
from pathlib import Path
from dataclasses import dataclass
from mlAusCar.config.configuartion import ConfigurationManager
from mlAusCar.constants import *
import joblib
from mlAusCar import custom_logger

from mlAusCar.utils.common import read_yaml, create_dirs
from mlAusCar.components.data_transformation import DataTransformation

In [5]:
# Show the kernel's current working directory before it is changed in the next cell.
%pwd

'd:\\My DL Workstation\\Projects\\Australian Vehicle Prices\\research'

In [6]:
os.chdir('../')
%pwd

'd:\\My DL Workstation\\Projects\\Australian Vehicle Prices'

In [10]:
# Build the data-transformation component from the project configuration and
# run it. transform_data() is unpacked into six values: the train/test features
# and targets plus two pipeline objects. `preprocessing` is reused as the first
# step of every model pipeline below; `log_pipeline` is not used in the cells
# shown in this notebook.
transformation_config = ConfigurationManager().get_data_transformation_config()
data_transformation = DataTransformation(transformation_config)
X_train, y_train, X_test, y_test, log_pipeline, preprocessing = data_transformation.transform_data()

[2023-12-25 21:20:47,613]: INFO: common: .yaml from config\config.yaml has been loaded.
[2023-12-25 21:20:47,614]: INFO: common: .yaml from params.yaml has been loaded.
[2023-12-25 21:20:47,616]: INFO: common: .yaml from schema.yaml has been loaded.


# Modeling:

In [11]:
# Baseline model: chain the shared preprocessing step with a linear regression
# and fit the whole pipeline on the training split.
lin_reg = make_pipeline(preprocessing, LinearRegression())
lin_reg.fit(X_train, y_train)

In [None]:
# Predict on the held-out test split.
# NOTE(review): despite the `df_` prefix this looks like a NumPy array rather
# than a DataFrame (the next cell's displayed result is an `array(...)`) — the
# name is kept because later cells read it.
df_predictions = lin_reg.predict(X_test)

In [None]:
# Exploring the first five predictions of the linear regression.
# np.exp maps them back to the price scale (presumably the target was
# log-transformed upstream — TODO confirm); round(-2) rounds to the nearest 100.
np.exp(df_predictions[:5]).round(-2)

array([13900., 39800., 28300., 35100., 85900.])

In [None]:
# Calculating RMSE and R² on the original (exponentiated) values.
# The RMSE is computed as sqrt(MSE) instead of passing `squared=False`, because
# that keyword is deprecated and removed in newer scikit-learn releases.
lin_rmse = np.sqrt(mean_squared_error(y_true=np.exp(y_test), y_pred=np.exp(df_predictions)))
lin_r2 = r2_score(y_true=np.exp(y_test), y_pred=np.exp(df_predictions))
print(lin_rmse, lin_r2)

13599.039893632977 0.726817458279859


The linear regression model has an RMSE of ~$13,599 on the test set and an R² of ~0.73, i.e. it explains about 73% of the variance in price (R² is a goodness-of-fit measure, not a classification-style accuracy). Let's try other regression models.

In [22]:
# Candidate regressors, all trained with their default hyper-parameters.
models = {
    'Random Forest': RandomForestRegressor(),
    'Decision Tree': DecisionTreeRegressor(),
    'Ada Boost': AdaBoostRegressor(),
    'Gradient Boost': GradientBoostingRegressor(),
    'XG Boost': xb.XGBRegressor()
}

# Test targets on the original scale; identical for every model, so computed once.
y_test_original = np.exp(y_test)

# Best test-set R² seen so far. R² can be negative, so start from -inf rather
# than 0 to guarantee that the first scored model is always a candidate.
r2_best = -np.inf
final_model = None
for name, model in models.items():
    model_pipeline = make_pipeline(preprocessing, model)
    model_pipeline.fit(X_train, y_train)
    df_predictions = model_pipeline.predict(X_test)

    # sqrt(MSE) replaces the deprecated `squared=False` keyword.
    rmse = np.sqrt(mean_squared_error(y_true=y_test_original, y_pred=np.exp(df_predictions)))
    r2 = r2_score(y_true=y_test_original, y_pred=np.exp(df_predictions))

    # Keep this pipeline only if it beats the best score so far, and remember
    # that score — otherwise every later model would overwrite `final_model`.
    if r2 > r2_best:
        r2_best = r2
        final_model = model_pipeline

    # The value labelled "Accuracy" in the output is the R² score.
    print(f"{name} model results:", '\n', f'RMSE: {rmse}', '\n', f'Accuracy:{r2}', "\n" + "=" * 30 + "\n")

if final_model is None:
    raise RuntimeError("No model produced a valid R² score; nothing to save.")

joblib.dump(final_model, 'Australian_Vehicle_Prices_final_model.pkl')

Random Forest model results: 
 RMSE: 11055.545906606756 
 Accuracy:0.8194502753866046 

Decision Tree model results: 
 RMSE: 12320.153945970858 
 Accuracy:0.7757829083616207 

Ada Boost model results: 
 RMSE: 16627.436434613865 
 Accuracy:0.591598772930302 

Gradient Boost model results: 
 RMSE: 13293.438526639078 
 Accuracy:0.7389575667516793 

XG Boost model results: 
 RMSE: 10610.31166466812 
 Accuracy:0.8336998167771368 



['Australian_Vehicle_Prices_final_model.pkl']

The best model is XGBoost, with an R² of ~0.83 on the test set. Possible improvements (ordered from most to least promising):
+ Including the ['Title'] column and extracting the trim level of each car model from it.
+ Gathering additional data from other datasets.
+ Fine-tuning the model using cross-validation and grid search.

# Modules

In [23]:
# Update entity
@dataclass(frozen=True)
class ModelTrainerConfig:
    """Immutable settings for the model-training step.

    Instances are built by ``ConfigurationManager.get_model_trainer_config``
    from the loaded config and schema files.
    """
    # Output directory of the training step; ModelTrainer creates it and
    # saves the selected model inside it.
    root_dir: Path
    # File name under which the selected model is saved in ``root_dir``.
    model_name: str
    # Name of the target column, read from the schema's TARGET_COLUMN entry.
    target_column: str

In [24]:
# Update config manager
class ConfigurationManager:
    """Load the project's YAML files and build per-component config objects.

    NOTE: this notebook-local class shadows the ``ConfigurationManager``
    imported from ``mlAusCar.config.configuartion`` at the top of the notebook;
    cells executed after this one see this definition.
    """

    def __init__(
        self,
        config=CONFIG_FILE_PATH,
        params=PARAMS_FILE_PATH,
        schema=SCHEMA_FILE_PATH,
    ):
        """Read the three YAML files and create the artifacts root directory.

        Args:
            config: Path of the main configuration file.
            params: Path of the parameters file.
            schema: Path of the schema file.
        """
        self.config = read_yaml(config)
        self.params = read_yaml(params)
        self.schema = read_yaml(schema)

        create_dirs(self.config.artifacts_root)

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Assemble the ``ModelTrainerConfig`` from the loaded config and schema.

        Returns:
            A config carrying the trainer's root directory and model file name
            (from the ``model_trainer`` section) and the target column name
            (from the schema's ``TARGET_COLUMN`` entry).
        """
        trainer_section = self.config.model_trainer
        return ModelTrainerConfig(
            root_dir=trainer_section.root_dir,
            model_name=trainer_section.model_name,
            target_column=self.schema.TARGET_COLUMN.name,
        )

In [32]:
# Update component
class ModelTrainer:
    """Fit several regressors, compare them on the test split and save the best one.

    NOTE(review): ``train_model`` reads ``preprocessing``, ``X_train``,
    ``y_train``, ``X_test`` and ``y_test`` from the notebook's global namespace,
    so the data-transformation cell must have been executed first. These should
    become explicit inputs when the class is moved into a module.
    """

    def __init__(self, config: ModelTrainerConfig):
        """Store the configuration and create the output directory.

        Args:
            config: Settings providing ``root_dir`` and ``model_name``.
        """
        self.config = config
        create_dirs(self.config.root_dir)

    def train_model(self):
        """Train every candidate model and persist the one with the highest test R².

        Each regressor is chained after ``preprocessing``, fitted on the
        training split and scored on the test split. RMSE and R² are computed
        after applying ``np.exp`` to both targets and predictions, and logged
        through ``custom_logger``. The best pipeline is written with joblib to
        ``<root_dir>/<model_name>``.

        Raises:
            RuntimeError: If no model yields a valid (non-NaN) R² score.
        """
        models = {
            'Random Forest': RandomForestRegressor(),
            'Decision Tree': DecisionTreeRegressor(),
            'Ada Boost': AdaBoostRegressor(),
            'Gradient Boost': GradientBoostingRegressor(),
            'XG Boost': xb.XGBRegressor()
        }

        # Test targets on the original scale; identical for every model.
        y_test_original = np.exp(y_test)

        # R² can be negative, so start from -inf rather than 0.
        r2_best = -np.inf
        final_model = None
        for name, model in models.items():
            model_pipeline = make_pipeline(preprocessing, model)
            model_pipeline.fit(X_train, y_train)
            df_predictions = model_pipeline.predict(X_test)

            # sqrt(MSE) replaces the deprecated `squared=False` keyword.
            rmse = np.sqrt(mean_squared_error(y_true=y_test_original, y_pred=np.exp(df_predictions)))
            r2 = r2_score(y_true=y_test_original, y_pred=np.exp(df_predictions))

            # Remember the best score; without this update every later model
            # with a positive R² would overwrite the selection.
            if r2 > r2_best:
                r2_best = r2
                final_model = model_pipeline

            # The value labelled "Accuracy" in the log message is the R² score.
            custom_logger.info(f"{name} model results: \nRMSE: {rmse} \nAccuracy: {r2} \n{'=' * 30}")

        if final_model is None:
            raise RuntimeError("No model produced a valid R² score; nothing to save.")

        joblib.dump(final_model, os.path.join(self.config.root_dir, self.config.model_name))

In [33]:
# Update pipeline
# Build the trainer from the YAML-driven configuration and run the comparison.
# Exceptions propagate unchanged: the previous `except Exception as e: raise e`
# wrapper handled nothing and only added a frame to the traceback.
config = ConfigurationManager()
data_training_config = config.get_model_trainer_config()
trainer = ModelTrainer(data_training_config)
trainer.train_model()

[2023-12-25 22:58:00,924]: INFO: common: .yaml from config\config.yaml has been loaded.
[2023-12-25 22:58:00,925]: INFO: common: .yaml from params.yaml has been loaded.
[2023-12-25 22:58:00,926]: INFO: common: .yaml from schema.yaml has been loaded.
[2023-12-25 22:58:51,690]: INFO: 2138800250: Random Forest model results: 
RMSE: 11127.980213442253 
Accuracy: 0.8170766546161184 
[2023-12-25 22:58:52,361]: INFO: 2138800250: Decision Tree model results: 
RMSE: 12235.549636414566 
Accuracy: 0.7788517983025695 
[2023-12-25 22:58:53,290]: INFO: 2138800250: Ada Boost model results: 
RMSE: 16382.908943852817 
Accuracy: 0.6035225598198097 
[2023-12-25 22:58:55,020]: INFO: 2138800250: Gradient Boost model results: 
RMSE: 13293.576941012532 
Accuracy: 0.7389521306543327 
[2023-12-25 22:58:55,110]: INFO: 2138800250: XG Boost model results: 
RMSE: 10610.31166466812 
Accuracy: 0.8336998167771368 
