In [1]:
# Laad the transformed data
# Apply feature transformation to the data
# Write preprocessing pipelines and train the models
# Compare the performance of the models and select the best one

# Load the data
In this step we will be loading our data and splitting it into training and test sets. We will also be loading the data into a pandas dataframe.

In [2]:
import os.path

from src.utils.data_loader import DataLoader
from src.feature_engineering.feature_transformations import FeatureEngineeringTransformations
import pandas as pd
import numpy as np

from src.utils.constants import INDEX, TARGET, PROCESSED_FILE_BEGIN

In [3]:
# Get the file path
processed_dir = '../data/processed'
dl = DataLoader(dir_path=processed_dir)
file_name = dl.get_latest_file(begins_with=PROCESSED_FILE_BEGIN)
filepath = os.path.join(processed_dir, file_name)

In [4]:
df = pd.read_csv(filepath, index_col=INDEX)
df.head()

Unnamed: 0_level_0,myear,body,transmission,fuel,km_driven,ip,oem,model,variant,City,...,Cargo Volume,state,mileage_new,owner_type,Fuel Suppy System,Alloy Wheel Size,Max Power Delivered,Max Power At,Max Torque Delivered,Max Torque At
usedCarSkuId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7111bf25-97af-47f9-867b-40879190d800,2016,hatchback,manual,cng,69162,0,maruti,maruti wagon r,lxi cng,lucknow,...,180-liters,uttar pradesh,26.6,first,,,58.16,6200.0,77.0,3500.0
c309efc1-efaf-4f82-81ad-dcb38eb36665,2015,hatchback,manual,cng,45864,0,maruti,maruti celerio,green vxi,mumbai,...,235-litres,maharashtra,31.79,first,Gasoline Port Injection,,58.2,6000.0,78.0,3500.0
7609f710-0c97-4f00-9a47-9b9284b62d3a,2015,sedan,manual,cng,81506,0,honda,honda amaze,s plus i-vtec,new delhi,...,400-litres,delhi,18.0,second,,,86.7,6000.0,109.0,4500.0
278b76e3-5539-4a5e-ae3e-353a2e3b6d7d,2013,hatchback,manual,cng,115893,0,maruti,maruti wagon r,lxi cng,new delhi,...,,delhi,26.2,second,Multi-Point Fuel Injection,13.0,58.2,6200.0,77.0,3500.0
b1eab99b-a606-48dd-a75b-57feb8a9ad92,2022,muv,manual,cng,18900,0,maruti,maruti ertiga,vxi cng,mumbai,...,,maharashtra,26.11,first,,,86.63,5500.0,121.5,4200.0


# Split the data into training and test sets

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
def generate_train_test_sets(df, target, test_size=0.2):
    """
    Generate the training and test sets from the data

    Parameters
    ---------
        df:
            The dataframe containing the data
        target:
            The target column
        test_size:
            The size of the test set

    Returns
    ---------
        X_train:
            The training set
        X_test:
            The test set
        y_train:
            The training target
        y_test:
            The test target
    """
    X = df.drop(target, axis=1)
    y = df[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
    return X_train, X_test, y_train, y_test

In [7]:
X_train, X_test, y_train, y_test = generate_train_test_sets(df, TARGET, test_size=0.3)

# Print the size of the training and test sets
print(f'The size of the training set is {X_train.shape[0]}')
print(f'The size of the test set is {X_test.shape[0]}')

The size of the training set is 26360
The size of the test set is 11298


# Preprocessors and feature engineering

In [8]:
feat_eng = FeatureEngineeringTransformations()
train_combined = pd.concat([X_train, y_train], axis=1)
test_combined = pd.concat([X_test, y_test], axis=1)
train_combined = feat_eng.fit_transform(train_combined)
test_combined = feat_eng.transform(test_combined)

# Recover X_train, X_test, y_train, y_test
X_train = train_combined.drop(TARGET, axis=1)
y_train = train_combined[TARGET]
X_test = test_combined.drop(TARGET, axis=1)
y_test = test_combined[TARGET]

In [9]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from category_encoders import TargetEncoder
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.pipeline import Pipeline

In [10]:
numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X_train.select_dtypes(include=['category', 'object', 'bool']).columns
# Convert all object and bool columns to category
X_train[categorical_cols] = X_train[categorical_cols].astype('category')
target_cols = ['oem', 'model', 'variant', 'City', 'state']
onehot_cols = [col for col in categorical_cols if col not in target_cols]
# Create a ColumnTransformer to apply different preprocessing to numerical and categorical columns
preprocessor1 = ColumnTransformer(transformers=[
	('num', StandardScaler(), numerical_cols),
	('cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), categorical_cols)
],
	remainder='passthrough'
)

preprocessor2 = ColumnTransformer(transformers=[
	('num', StandardScaler(), numerical_cols),
	('cat_onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), onehot_cols),
	('cat_target', TargetEncoder(), target_cols)
],
	remainder='passthrough'
)

numerical_transformer = Pipeline(steps=[
	('imputer', KNNImputer(n_neighbors=5)),
	('scaler', StandardScaler())
])

categorical_transformer_onehot = Pipeline(steps=[
	('imputer', SimpleImputer(strategy='most_frequent')),
	('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
])

categorical_transformer_target = Pipeline(steps=[
	('imputer', SimpleImputer(strategy='most_frequent')),
	('target', TargetEncoder())
])

preprocessor3 = ColumnTransformer(transformers=[
	('num', numerical_transformer, numerical_cols),
	('cat_onehot', categorical_transformer_onehot, categorical_cols),
],
	remainder='passthrough'
)

preprocessor4 = ColumnTransformer(transformers=[
	('num', numerical_transformer, numerical_cols),
	('cat_onehot', categorical_transformer_onehot, onehot_cols),
	('cat_target', categorical_transformer_target, target_cols)
],
	remainder='passthrough'
)

# Models

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import cross_val_score, cross_validate

In [12]:
import lightgbm
import xgboost
import catboost
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.svm import SVR

In [13]:
def get_pipeline(model, preprocessor):
    pipe = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    return pipe

def evaluate_model(model, X, y, cv=5):
    y_pred = model.predict(X)
    mape = mean_absolute_percentage_error(y, y_pred)
    mae = np.mean(np.abs(y - y_pred))
    scores = {
        'mape': mape,
        'mae': mae
    }
    return scores

## CatBoost

In [42]:
cb_model = catboost.CatBoostRegressor(
	learning_rate = 0.1,
	depth = 6,
	l2_leaf_reg = 3,
	loss_function = 'MAE',
	bootstrap_type = 'Bernoulli',
	subsample = 0.7,
	od_type = 'Iter',
	od_wait = 50,
	random_seed = 42,
	allow_writing_files = False,
	verbose = 2,
	n_estimators = 1000,
)

In [43]:
X_train_cat = X_train.copy()
X_test_cat = X_test.copy()
y_train_cat = y_train.copy()
y_test_cat = y_test.copy()

In [44]:
cat_categorical_cols = X_train_cat.select_dtypes(include=['category', 'object']).columns
categorical_cols_with_nan = [col for col in cat_categorical_cols if X_train_cat[col].isnull().any()]
X_train_cat[categorical_cols] = X_train_cat[cat_categorical_cols].astype('category')
X_train_cat[categorical_cols] = X_train_cat[cat_categorical_cols].astype('category')

si = SimpleImputer(strategy='most_frequent', fill_value='missing')
X_train_cat[categorical_cols_with_nan] = si.fit_transform(X_train_cat[categorical_cols_with_nan])
X_test_cat[categorical_cols_with_nan] = si.transform(X_test_cat[categorical_cols_with_nan])

pool_train = catboost.Pool(X_train_cat, y_train_cat, cat_features=cat_categorical_cols.to_list())
pool_test = catboost.Pool(X_test_cat, cat_features=cat_categorical_cols.to_list())

In [45]:
cb_pipe_1 = get_pipeline(cb_model,ColumnTransformer(transformers=[
	('num', numerical_transformer, numerical_cols),
],
	remainder='passthrough'
))
cb_model.fit(pool_train)

0:	learn: 435085.8527566	total: 55.3ms	remaining: 55.2s
2:	learn: 385412.0323704	total: 151ms	remaining: 50.2s
4:	learn: 348235.7977525	total: 243ms	remaining: 48.4s
6:	learn: 315386.2905680	total: 341ms	remaining: 48.4s
8:	learn: 288436.5924730	total: 433ms	remaining: 47.7s
10:	learn: 269309.7087129	total: 512ms	remaining: 46s
12:	learn: 248585.3010941	total: 594ms	remaining: 45.1s
14:	learn: 232504.6426143	total: 701ms	remaining: 46s
16:	learn: 217226.5676838	total: 796ms	remaining: 46s
18:	learn: 206964.2405824	total: 895ms	remaining: 46.2s
20:	learn: 197086.5316493	total: 990ms	remaining: 46.1s
22:	learn: 188596.3701482	total: 1.1s	remaining: 46.6s
24:	learn: 181470.6250960	total: 1.18s	remaining: 46s
26:	learn: 176390.4332900	total: 1.25s	remaining: 45.1s
28:	learn: 171017.2278428	total: 1.33s	remaining: 44.6s
30:	learn: 166045.1766631	total: 1.4s	remaining: 43.8s
32:	learn: 158936.9135287	total: 1.48s	remaining: 43.3s
34:	learn: 154792.8974132	total: 1.55s	remaining: 42.9s
36:	le

<catboost.core.CatBoostRegressor at 0x132134850>

In [46]:
# Predict on test set
scores = evaluate_model(cb_model, pool_test, y_test)
scores

{'mape': 0.11202561169983177, 'mae': 81904.20719290896}

In [None]:
import optuna
from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error, mean_squared_error
from sklearn.model_selection import KFold
from optuna.integration import CatBoostPruningCallback
from sklearn.model_selection import cross_validate

def objective(trial, pool_train):
	param_grid = {
	"loss_function": "MAE",
	"n_estimators": trial.suggest_categorical("n_estimators", [500, 1000, 2000]),
	"learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
	"depth": trial.suggest_categorical("depth", [5,7,9,11,13,15,17]),
	"colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.1, 1),
	"l2_leaf_reg": trial.suggest_float('l2_leaf_reg', 1e-3, 10.0, log=True),
	"boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
	"bootstrap_type": trial.suggest_categorical("bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]),
	"random_seed": 42,
	"eval_metric": "MAE",
	}

	if param_grid["bootstrap_type"] == "Bayesian":
		param_grid["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
	elif param["bootstrap_type"] == "Bernoulli":
		param_grid["subsample"] = trial.suggest_float("subsample", 0.1, 1)
	
	scores = catboost.cv(
		pool_train,
		param_grid,
		fold_count=5,
		early_stopping_rounds=100,
		verbose=0,
	)

	return scores['test-MAE-mean'].min()

In [None]:
from catboost import Pool, cv

cv_data = X_train_cat.copy()

labels = y_train_cat.copy()

cat_features = cat_categorical_cols.tolist()

cv_dataset = Pool(data=cv_data,
                  label=labels,
                  cat_features=cat_features)

params = {"iterations": 1000,
          "depth": 2,
          "loss_function": "MAE",
          "verbose": False}

scores = cv(cv_dataset,
            params,
            fold_count=2,
            early_stopping_rounds=100
            )
# Get the score for the best iteration
best_score = np.min(scores['test-MAE-mean'])

## LightGBM

In [67]:
lgb_model = lightgbm.LGBMRegressor(
	learning_rate = 0.08363996779482333,
	num_leaves = 26,
	max_depth = 7,
	min_child_samples = 14,
	subsample = 0.8130687216963774,
	colsample_bytree = 0.726149859230546,
	reg_alpha = 6.495685321153756,
	reg_lambda = 0.004206014748968054,
	n_estimators = 1000,
	metric = 'mape',
	importance_type = 'gain',
	boosting_type = 'gbdt',
	verbose = 1,
	min_split_gain = 0.0,
	random_state=42,
	n_jobs=-1
)

In [68]:
lgb_pipe_1 = get_pipeline(lgb_model, preprocessor1)
# Train the pipeline
lgb_pipe_1.fit(X_train, y_train)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6260
[LightGBM] [Info] Number of data points in the train set: 26360, number of used features: 1366
[LightGBM] [Info] Start training from score 781376.551252


In [69]:
scores = evaluate_model(lgb_pipe_1, X_test, y_test)
scores

{'mape': 0.11767337448199067, 'mae': 80567.89290751299}

## XGBoost

In [80]:
xgb_model = xgboost.XGBRegressor(
	objective = 'reg:absoluteerror',
	learning_rate = 0.08363996779482333,
	max_depth = 7,
	min_child_samples = 14,
	subsample = 0.8130687216963774,
	colsample_bytree = 0.726149859230546,
	reg_alpha = 6.495685321153756,
	reg_lambda = 0.004206014748968054,
	n_estimators = 1000,
	importance_type = 'gain',
	verbose = 1,
	min_split_gain = 0.0,
	random_state=42,
	tree_method = "hist",
	single_precision_histogram=True,
	n_jobs=-1
)

In [81]:
xgb_pipe_1 = get_pipeline(xgb_model, preprocessor1)
# Train the pipeline
xgb_pipe_1.fit(X_train, y_train)

Parameters: { "min_child_samples", "min_split_gain", "single_precision_histogram", "verbose" } are not used.



In [82]:
scores = evaluate_model(xgb_pipe_1, X_test, y_test)
scores

{'mape': 0.19746590157611224, 'mae': 132037.78464419863}

## Random Forest

In [96]:
extra_tree_model = ExtraTreesRegressor(
	n_estimators = 300,
	max_depth = 7,
	min_samples_split = 14,
	min_samples_leaf = 1,
	bootstrap = True,
	oob_score = True,
	random_state=42,
	n_jobs=-1,
	verbose=1
)

In [97]:
extra_tree_pipe_1 = get_pipeline(extra_tree_model, preprocessor3)
# Train the pipeline
extra_tree_pipe_1.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  7.5min


KeyboardInterrupt: 

In [None]:
scores = evaluate_model(extra_tree_pipe_1, X_test, y_test)
scores

## Linear Regression