In [6]:
import os
import numpy as np
import xgboost as xb
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.pipeline import make_pipeline
from pathlib import Path
from dataclasses import dataclass
from mlAusCar.config.configuartion import ConfigurationManager
from mlAusCar.constants import *
import joblib
from mlAusCar import custom_logger
from urllib.parse import urlparse
import mlflow
import mlflow.sklearn
from mlAusCar.utils.common import read_yaml, create_dirs, save_json
from mlAusCar.components.data_transformation import DataTransformation
from mlAusCar.pipeline.stage_03_data_transformation import DataTransformationTrainingPipeline

In [7]:
# Show the kernel's current working directory (the notebook is started from the research/ folder)
%pwd

'd:\\My DL Workstation\\Projects\\Australian Vehicle Prices\\research'

In [8]:
os.chdir('../')
%pwd

'd:\\My DL Workstation\\Projects\\Australian Vehicle Prices'

In [10]:
# Build the data-transformation stage from the project configuration and run it.
# NOTE: this relies on the ConfigurationManager imported from mlAusCar; the notebook
# redefines a class with the same name further down that only provides
# get_model_trainer_config, so this cell must run before that redefinition.
transformation_config = ConfigurationManager().get_data_transformation_config()
data_transformation = DataTransformation(transformation_config)
# transform_data returns the train/test splits plus the log pipeline and the preprocessing step
X_train, y_train, X_test, y_test, log_pipeline, preprocessing = data_transformation.transform_data()

[2023-12-25 21:20:47,613]: INFO: common: .yaml from config\config.yaml has been loaded.
[2023-12-25 21:20:47,614]: INFO: common: .yaml from params.yaml has been loaded.
[2023-12-25 21:20:47,616]: INFO: common: .yaml from schema.yaml has been loaded.


# Modeling:

In [11]:
# Baseline model: the shared preprocessing step followed by a plain LinearRegression,
# fitted on the training split.
lin_reg = make_pipeline(preprocessing, LinearRegression())
lin_reg.fit(X_train, y_train)

In [None]:
# Baseline predictions for the test split (on the scale the model was trained on)
df_predictions = lin_reg.predict(X_test)

In [None]:
# Explore the first five Linear Regression predictions: exp() maps them back to the
# original scale (presumably the target was log-transformed upstream — confirm),
# and round(-2) rounds to the nearest hundred.
np.exp(df_predictions[:5]).round(-2)

array([13900., 39800., 28300., 35100., 85900.])

In [None]:
# Calculating RMSE and R² on original values (targets and predictions are passed through exp first).
# RMSE is computed as sqrt(MSE) instead of `squared=False`, because that keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6; for a single target the result is identical.
lin_rmse = np.sqrt(mean_squared_error(y_true=np.exp(y_test), y_pred=np.exp(df_predictions)))
lin_r2 = r2_score(y_true=np.exp(y_test), y_pred=np.exp(df_predictions))
print(lin_rmse, lin_r2)

13599.039893632977 0.726817458279859


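The linear regression model has a root-mean-squared error (RMSE) of ~$13,599 on the test set, i.e. its predictions are typically off by roughly that amount. Its R² is ~0.73, meaning it explains about 73% of the variance in price (R² is a goodness-of-fit score, not a classification-style accuracy). Let's attempt other regression models.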

In [22]:
# Candidate regressors, each trained with its default hyper-parameters
models = {
    'Random Forest': RandomForestRegressor(),
    'Decision Tree': DecisionTreeRegressor(),
    'Ada Boost': AdaBoostRegressor(),
    'Gradient Boost': GradientBoostingRegressor(),
    'XG Boost': xb.XGBRegressor()
}

# Targets mapped back through exp once, since they are the same for every model
y_test_original = np.exp(y_test)

# Fit every candidate and keep the pipeline with the highest R² on the test split.
# Starting from -inf (and actually updating r2_best) guarantees that final_model is
# the best pipeline rather than simply the last one whose R² happened to be positive.
r2_best = -np.inf
final_model = None
for name, model in models.items():
    model_pipeline = make_pipeline(preprocessing, model)
    model_pipeline.fit(X_train, y_train)
    df_predictions = model_pipeline.predict(X_test)
    predictions_original = np.exp(df_predictions)

    # sqrt(MSE) replaces `squared=False`, which was removed in scikit-learn 1.6
    rmse = np.sqrt(mean_squared_error(y_true=y_test_original, y_pred=predictions_original))
    r2 = r2_score(y_true=y_test_original, y_pred=predictions_original)

    if r2 > r2_best:
        r2_best = r2
        final_model = model_pipeline

    print(f"{name} model results:", '\n', f'RMSE: {rmse}', '\n', f'Accuracy:{r2}', "\n" + "=" * 30 + "\n")

# Persist only the best-scoring pipeline
joblib.dump(final_model, 'Australian_Vehicle_Prices_final_model.pkl')

Random Forest model results: 
 RMSE: 11055.545906606756 
 Accuracy:0.8194502753866046 

Decision Tree model results: 
 RMSE: 12320.153945970858 
 Accuracy:0.7757829083616207 

Ada Boost model results: 
 RMSE: 16627.436434613865 
 Accuracy:0.591598772930302 

Gradient Boost model results: 
 RMSE: 13293.438526639078 
 Accuracy:0.7389575667516793 

XG Boost model results: 
 RMSE: 10610.31166466812 
 Accuracy:0.8336998167771368 



['Australian_Vehicle_Prices_final_model.pkl']

The best model is XGBoost, with an R² of ~0.83 on the test set (it explains about 83% of the variance in price). Possible improvements include (from most to least promising):
+ Including the ['Title'] column and extracting the trim level from it for each vehicle model.
+ Gathering additional data from other datasets.
+ Fine-tuning the model's hyper-parameters using cross-validation and grid search.

# Modules

In [9]:
# MLflow / DagsHub connection settings.
# SECURITY: the access token must never be hardcoded in the notebook. It is taken from
# the MLFLOW_TRACKING_PASSWORD environment variable when already set (e.g. exported
# before launching Jupyter) and otherwise requested interactively without echoing.
# Any token that was previously committed in this cell should be treated as leaked and revoked.
from getpass import getpass

os.environ['MLFLOW_TRACKING_URI']='https://dagshub.com/dahshury/Australian-Vehicle-Price-Prediction.mlflow'
os.environ['MLFLOW_TRACKING_USERNAME']='dahshury'
if not os.environ.get('MLFLOW_TRACKING_PASSWORD'):
    os.environ['MLFLOW_TRACKING_PASSWORD'] = getpass('MLflow tracking token: ')

In [10]:
# Update entity
@dataclass(frozen=True)
class ModelTrainerConfig:
    """Immutable (frozen) settings consumed by the model-training stage.

    Instances are built by ConfigurationManager.get_model_trainer_config from
    the loaded config and schema YAML files.
    """
    # Directory the fitted pipelines are dumped into
    root_dir: Path
    # Suffix appended to each model's display name to form the dumped file name
    model_name: str
    # Name of the target column, read from schema.TARGET_COLUMN.name
    target_column: str
    # Suffix appended to each model's display name to form the metrics JSON file name.
    # NOTE(review): annotated as Path but ModelTrainer concatenates it to a str with `+` —
    # confirm the configured value is a plain string.
    metrics_file_name: Path
    # Directory the metrics JSON files are written into (config.model_evaluation.root_dir)
    eval_root_dir: Path

In [11]:
# Update config manager
class ConfigurationManager:
    """Load the project's YAML files and build the model-trainer configuration.

    NOTE: this definition shadows the ConfigurationManager imported from
    mlAusCar.config.configuartion at the top of the notebook. After this cell
    runs, the name only offers get_model_trainer_config, so cells that need
    other getters (e.g. get_data_transformation_config) must run before it.
    """

    def __init__(self,
                    config=CONFIG_FILE_PATH,
                    params=PARAMS_FILE_PATH,
                    schema=SCHEMA_FILE_PATH):
        """Read the three YAML files and make sure the artifacts root exists.

        Args:
            config: path of the main configuration YAML.
            params: path of the parameters YAML.
            schema: path of the schema YAML.
        """
        self.config = read_yaml(config)
        self.params = read_yaml(params)
        self.schema = read_yaml(schema)

        # create_dirs is called with a list of paths elsewhere in this notebook
        # (ModelTrainer.__init__), so the single root is wrapped in a list here as well.
        # NOTE(review): assumes artifacts_root is a single path string — confirm in config.yaml.
        create_dirs([self.config.artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Assemble a ModelTrainerConfig from the loaded config and schema.

        Returns:
            ModelTrainerConfig populated from the `model_trainer` and
            `model_evaluation` sections of the config and from
            `TARGET_COLUMN.name` of the schema.
        """
        trainer_section = self.config.model_trainer

        return ModelTrainerConfig(
            root_dir=trainer_section.root_dir,
            model_name=trainer_section.model_name,
            metrics_file_name=trainer_section.metrics_file_name,
            target_column=self.schema.TARGET_COLUMN.name,
            eval_root_dir=self.config.model_evaluation.root_dir,
        )

In [12]:
# Run the data-transformation stage; its result is later handed to ModelTrainer,
# which unpacks it into six values (train/test splits, log pipeline, preprocessing).
transformation_obj = DataTransformationTrainingPipeline().main()

[2023-12-26 14:49:00,876]: INFO: common: .yaml from config\config.yaml has been loaded.
[2023-12-26 14:49:00,878]: INFO: common: .yaml from params.yaml has been loaded.
[2023-12-26 14:49:00,880]: INFO: common: .yaml from schema.yaml has been loaded.


In [13]:
# Update component
class ModelTrainer:
    """Train several regressors, persist them with their metrics, and optionally log to MLflow.

    Attributes:
        config: the ModelTrainerConfig with output directories and file-name suffixes.
        transformations: the output of the data-transformation stage. Despite the
            annotation on __init__, train_model unpacks it as a 6-tuple
            (X_train, y_train, X_test, y_test, log_pipeline, preprocessing).
        scores: per-model metrics, keyed by model display name.
        models: the candidate estimators, keyed by display name.
        pipelines: every fitted pipeline, keyed by display name (filled by train_model).
        model_pipeline: the fitted pipeline with the highest R², or None before training.
    """

    def __init__(self, config: ModelTrainerConfig, transformations: DataTransformation):
        self.config = config
        self.transformations= transformations
        self.scores = {}  # Initialize an empty dictionary to store scores
        self.models = {
            'Random Forest': RandomForestRegressor(),
            'Decision Tree': DecisionTreeRegressor(),
            'Ada Boost': AdaBoostRegressor(),
            'Gradient Boost': GradientBoostingRegressor(),
            'XG Boost': xb.XGBRegressor()
        }
        create_dirs([self.config.root_dir, self.config.eval_root_dir])
        self.model_pipeline= None
        self.pipelines = {}  # Fitted pipeline per model name, needed to log the right model to MLflow

    def train_model(self, eval: bool = False):
        """Fit every candidate model, save it and its metrics, and optionally log to MLflow.

        Metrics are computed after passing targets and predictions through exp.

        Args:
            eval: when True, log every model's metrics and fitted pipeline to MLflow
                once all models have been trained. (The name shadows the builtin but
                is kept because callers pass it by keyword.)

        Returns:
            dict mapping each model name to {'RMSE', 'Accuracy', 'MAE'}, where
            'Accuracy' holds the R² score.
        """
        X_train, y_train, X_test, y_test, log_pipeline_, preprocessing = self.transformations

        # The targets are identical for every model, so transform them once
        y_test_original = np.exp(y_test)

        # -inf (not 0) so a best model is always selected, even if every R² is negative
        r2_best = -np.inf
        for name, model in self.models.items():
            model_pipeline = make_pipeline(preprocessing, model)
            model_pipeline.fit(X_train, y_train)
            predictions_original = np.exp(model_pipeline.predict(X_test))

            # sqrt(MSE) replaces `squared=False`, which was removed in scikit-learn 1.6
            rmse = np.sqrt(mean_squared_error(y_true=y_test_original, y_pred=predictions_original))
            r2 = r2_score(y_true=y_test_original, y_pred=predictions_original)
            mae = mean_absolute_error(y_true=y_test_original, y_pred=predictions_original)

            self.pipelines[name] = model_pipeline
            if r2 > r2_best:
                # Track the running best so model_pipeline ends up as the top scorer
                r2_best = r2
                self.model_pipeline = model_pipeline

            # Store scores in the dictionary and construct the metrics json.
            # NOTE(review): each per-model file receives the cumulative scores of all
            # models trained so far, not only this model's — confirm that is intended.
            self.scores[name] = {'RMSE': rmse, 'Accuracy': r2, 'MAE': mae}
            save_json(save_path=Path(os.path.join(self.config.eval_root_dir, name + " " + self.config.metrics_file_name)), data=self.scores)

            custom_logger.info(f"{name} model results: \nRMSE: {rmse} \nAccuracy: {r2} \nMAE: {mae}\n{'=' * 30}")

            joblib.dump(model_pipeline, os.path.join(self.config.root_dir, name +" " + self.config.model_name))

        # Logging happens only after the loop: every model then has scores and a fitted
        # pipeline, which avoids looking up metrics for models that were not trained yet.
        if eval:
            self._log_to_mlflow()

        return self.scores

    def _log_to_mlflow(self):
        """Log each trained model's metrics and its own fitted pipeline to MLflow."""
        mlflow.set_registry_uri("https://dagshub.com/dahshury/Australian-Vehicle-Price-Prediction.mlflow")
        tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme

        for name, pipeline in self.pipelines.items():
            model_scores = self.scores[name]

            # One run per model, so metrics with the same key do not overwrite each other
            with mlflow.start_run(run_name=name):
                mlflow.log_metric('RMSE', model_scores['RMSE'])
                mlflow.log_metric('Accuracy', model_scores['Accuracy'])
                mlflow.log_metric('MAE', model_scores['MAE'])

                # Model registry does not work with file store
                if tracking_url_type_store != "file":
                    # Register the model
                    # There are other ways to use the Model Registry, which depends on the use case,
                    # please refer to the doc for more information:
                    # https://mlflow.org/docs/latest/model-registry.html#api-workflow
                    mlflow.sklearn.log_model(pipeline, "model", registered_model_name=f"{name}")
                else:
                    mlflow.sklearn.log_model(pipeline, f"{name}")

In [45]:
# Update pipeline
# Build the trainer configuration, then train all models and log them to MLflow.
# The former `try: ... except Exception as e: raise e` wrapper was removed: it handled
# nothing and only added an extra frame to the traceback, so exceptions now propagate directly.
config= ConfigurationManager()
data_training_config= config.get_model_trainer_config()
trainer=ModelTrainer(data_training_config, transformation_obj)
scores= trainer.train_model(eval=True)

[2023-12-26 13:59:08,524]: INFO: common: .yaml from config\config.yaml has been loaded.
[2023-12-26 13:59:08,525]: INFO: common: .yaml from params.yaml has been loaded.
[2023-12-26 13:59:08,527]: INFO: common: .yaml from schema.yaml has been loaded.
[2023-12-26 13:59:54,045]: INFO: common: .json file saved at artifacts\model_evaluation\Random Forestmetrics.json
[2023-12-26 13:59:54,046]: INFO: 2331206457: Random Forest model results: 
RMSE: 11068.085579307288 
Accuracy: 0.8190404686788613 
MAE: 4467.930219753342
[2023-12-26 13:59:54,761]: INFO: common: .json file saved at artifacts\model_evaluation\Decision Treemetrics.json
[2023-12-26 13:59:54,761]: INFO: 2331206457: Decision Tree model results: 
RMSE: 12216.803865249292 
Accuracy: 0.7795289101654655 
MAE: 5449.793558541856
[2023-12-26 13:59:55,672]: INFO: common: .json file saved at artifacts\model_evaluation\Ada Boostmetrics.json
[2023-12-26 13:59:55,673]: INFO: 2331206457: Ada Boost model results: 
RMSE: 16571.638304420507 
Accurac

Registered model 'Random Forest' already exists. Creating a new version of this model...
2023/12/26 14:00:03 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: Random Forest, version 4
Created version '4' of model 'Random Forest'.
Successfully registered model 'Decision Tree'.
2023/12/26 14:00:09 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: Decision Tree, version 1
Created version '1' of model 'Decision Tree'.
Successfully registered model 'Ada Boost'.
2023/12/26 14:00:14 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: Ada Boost, version 1
Created version '1' of model 'Ada Boost'.
Successfully registered model 'Gradient Boost'.
2023/12/26 14:00:20 INFO mlflow.tracking._model_registry.client: Waiting up to 300 se