In [4]:
import numpy as np 
import pandas as pd 
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/transformed-data/transformed.csv


In [5]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/transformed-data/transformed.csv


In [6]:
df = pd.read_csv('/kaggle/input/transformed-data/transformed.csv')
INDEX = 'usedCarSkuId'
TARGET = 'listed_price'

df.head()

Unnamed: 0,usedCarSkuId,myear,body,transmission,fuel,km_driven,ip,oem,model,variant,...,Cargo Volume,state,mileage_new,owner_type,Fuel Suppy System,Alloy Wheel Size,Max Power Delivered,Max Power At,Max Torque Delivered,Max Torque At
0,7111bf25-97af-47f9-867b-40879190d800,2016,hatchback,manual,cng,69162,0,maruti,maruti wagon r,lxi cng,...,180-liters,uttar pradesh,26.6,first,,,58.16,6200.0,77.0,3500.0
1,c309efc1-efaf-4f82-81ad-dcb38eb36665,2015,hatchback,manual,cng,45864,0,maruti,maruti celerio,green vxi,...,235-litres,maharashtra,31.79,first,Gasoline Port Injection,,58.2,6000.0,78.0,3500.0
2,7609f710-0c97-4f00-9a47-9b9284b62d3a,2015,sedan,manual,cng,81506,0,honda,honda amaze,s plus i-vtec,...,400-litres,delhi,18.0,second,,,86.7,6000.0,109.0,4500.0
3,278b76e3-5539-4a5e-ae3e-353a2e3b6d7d,2013,hatchback,manual,cng,115893,0,maruti,maruti wagon r,lxi cng,...,,delhi,26.2,second,Multi-Point Fuel Injection,13.0,58.2,6200.0,77.0,3500.0
4,b1eab99b-a606-48dd-a75b-57feb8a9ad92,2022,muv,manual,cng,18900,0,maruti,maruti ertiga,vxi cng,...,,maharashtra,26.11,first,,,86.63,5500.0,121.5,4200.0


---

# Feature Engineering

In [7]:
from sklearn.base import BaseEstimator
from ast import literal_eval


class FeatureEngineeringTransformations(BaseEstimator):
    """
    This class contains all the recommended feature engineering transformations for the dataset.

    :parameter
        df: pd.DataFrame
            The dataframe to be transformed
        object_cols: list
            The list of columns that contain the car features like top_features, comfort_features, etc. that need to be transformed
    """

    def __init__(
            self,
            df: pd.DataFrame,
            object_cols=None
    ):
        if object_cols is None:
            object_cols = [
                'top_features',
                'comfort_features',
                'interior_features',
                'exterior_features',
                'safety_features'
            ]
        self.df = df.copy()
        self.object_cols = object_cols
        self.feature_prices = None
        self.is_fitted = False

    def _car_object_feature_dict(self) -> dict:
        unique_feature_scores = dict()
        for col in self.object_cols:
            for _, row in self.df.iterrows():
                feature_list = literal_eval(row[col])
                for feature in feature_list:
                    if feature in unique_feature_scores.keys():
                        unique_feature_scores[feature][1] += 1
                        unique_feature_scores[feature][0] += row[TARGET]
                    else:
                        unique_feature_scores[feature] = [row[TARGET], 1]

        return unique_feature_scores

    def _map_object_cols_to_scores(self, x: str) -> float:
        feature_list = literal_eval(x)
        feature_score = 0
        for feature in feature_list:
            if feature in self.feature_prices.keys():
                feature_score += self.feature_prices[feature][0] / self.feature_prices[feature][1]
            else:
                feature_score += 0
        return feature_score

    def _car_object_feature_transformation(self, df) -> pd.DataFrame:
        if self.feature_prices is None:
            raise Exception('Please fit the transformer first')

        for col in self.object_cols:
            df[f'{col}_score'] = df[col].apply(self._map_object_cols_to_scores)
            df.drop(col, axis=1, inplace=True)
            # Replace zero scores with nan
            df[f'{col}_score'] = df[f'{col}_score'].replace(0, np.nan)
        return df

    def fit(self, X=None, y=None):
        self.feature_prices = self._car_object_feature_dict()
        self.df = self._car_object_feature_transformation(self.df)
        self.is_fitted = True
        return self

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        if not self.is_fitted:
            raise Exception('Please fit the transformer first')
        
        df = df.copy()
        # Transform the object columns to scores
        df = self._car_object_feature_transformation(df)
        return df

---

# Training and Testing Data

In [8]:
from sklearn.model_selection import train_test_split

def get_training_testing_validation_data(df: pd.DataFrame) -> tuple:
    keep, val = train_test_split(df, test_size=0.1, random_state=42)
    # Do not touch the val data, it will be used for the final final evalutation
    # The val data would be split into 2 parts:
    # 1. The first part val_1 will be used to determine the best model out of the HP optimized models
    # 2. The second part val_2 will serve as our final scores

    train, test = train_test_split(keep, test_size=0.2, random_state=42)
    return train, test, val

In [9]:
train, test, valid = get_training_testing_validation_data(df)

# Do the feature engineering transformations
fe = FeatureEngineeringTransformations(train)
fe = fe.fit()

train = fe.transform(train)
test = fe.transform(test)
valid = fe.transform(valid)

train.shape, test.shape, valid.shape

((27113, 51), (6779, 51), (3766, 51))

In [10]:
X_train = train.drop(columns=[TARGET], axis=1).reset_index(drop=True)
y_train = train[TARGET].reset_index(drop=True)
X_test = test.drop(columns=[TARGET], axis=1).reset_index(drop=True)
y_test = test[TARGET].reset_index(drop=True)

---

# Pre Processor

In [11]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns
bool_cols = X_train.select_dtypes(include=['bool']).columns
categorical_cols = X_train.select_dtypes(include=['object']).columns
onehot_cols = [col for col in categorical_cols if X_train[col].nunique() < 50]
target_cols = [col for col in categorical_cols if col not in onehot_cols]
all_cols = X_train.columns

numerical_transformer = Pipeline(
    [
        ('scaler', StandardScaler()),
    ]
)
boolean_transformer = Pipeline(
    [
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)
categorical_transformer = Pipeline(
    [
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

preprocessor = ColumnTransformer(
    [
        ('num', numerical_transformer, numerical_cols),
        ('bool', boolean_transformer, bool_cols),
        ('cat', categorical_transformer, categorical_cols),
    ],
    remainder='drop',
)

---

# Model

In [12]:
import xgboost

xgb_model = xgboost.XGBRegressor(
    objective = 'reg:linear',
    learning_rate = 0.08363996779482333,
    max_depth = 7,
    min_child_samples = 14,
    subsample = 0.8130687216963774,
    colsample_bytree = 0.726149859230546,
    reg_alpha = 6.495685321153756,
    reg_lambda = 0.004206014748968054,
    n_estimators = 1000,
    importance_type = 'gain',
    verbose = 1,
    min_split_gain = 0.0,
    random_state=42,
    tree_method = "hist", # change to gpu_hist if a gpu is available
    single_precision_histogram=True,
    n_jobs=-1
)

---

# Pipeline

In [13]:
# Create a pipeline to preprocess the data and train the model
pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', xgb_model)
])

### Fit the pipeline

In [14]:
pipe.fit(X_train, y_train)

Parameters: { "min_child_samples", "min_split_gain", "verbose" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  Index(['myear', 'km_driven', 'ip', 'Displacement', 'No of Cylinder',
       'Valves per Cylinder', 'Width', 'Wheel Base', 'Front Tread',
       'Kerb Weight', 'Seats', 'Turning Radius', 'Top Speed', 'Acceleration',
       'Doors', 'mileage_new', 'Alloy Wheel Size', 'Max Power Deliver...
                              importance_type='gain',
                              interaction_constraints='',
                              learning_rate=0.08363996779482333, max_bin=256,
                              max_cat_to_onehot=4, max_delta_step=0,
                              max_depth=7, max_leaves=0, min_child_samples=14,
                              min_child_weight=1, min_split_gain=0.0,

### Make some predictions

In [15]:
from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error

# Make predictions on the test data
y_pred = pipe.predict(X_test)

# Calculate the MAPE and MAE
mape = mean_absolute_percentage_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print(f"MAPE: {mape}")
print(f"MAE: {mae}")

MAPE: 0.11695463710664752
MAE: 78025.79261436974


---

# Hyperparamter Optimization usin Optuna HyperBand

In [31]:
import optuna
from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error, mean_squared_error
from sklearn.model_selection import KFold

def objective(trial, pipeline, X, y):
    param_grid = {
        "tree_method": "hist", # change to gpu_hist if a gpu is available
        "objective": "reg:squarederror",
        "single_precision_histogram": True,
        "importance_type": "gain",
        "n_estimators": trial.suggest_categorical("n_estimators", [1000, 2000]),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "max_depth": trial.suggest_categorical("max_depth", [5,7,9,11,13,15,17]),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 300),
        "subsample": trial.suggest_float("subsample", 0.1, 1),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 1),
        "reg_lambda": trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
        "reg_alpha": trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        "random_state": 42,
    }
    cv = KFold(n_splits=5, shuffle=True, random_state=42)

    cv_scores = np.empty(5)
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        pipeline['model'].set_params(
            n_jobs= -1,
            **param_grid
        )
        pipeline.fit(X_train, y_train)

        preds = pipeline.predict(X_test)
        cv_scores[idx] = mean_absolute_percentage_error(y_test, preds)

    return np.mean(cv_scores)

In [33]:
# Define the XGBoost model
xgb_model = xgboost.XGBRegressor()

# Define the full pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', xgb_model)
])

# Define the Optuna search algorithm
study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(lambda trial: objective(trial, pipeline, X_train, y_train), n_trials=50, timeout=None)

[32m[I 2023-04-04 17:50:25,340][0m A new study created in memory with name: no-name-b0054549-12f7-4a89-9843-8ec7aa8fe37f[0m
[32m[I 2023-04-04 18:01:09,757][0m Trial 0 finished with value: 0.12576865870555248 and parameters: {'n_estimators': 2000, 'learning_rate': 0.1205712628744377, 'max_depth': 13, 'min_child_weight': 7, 'subsample': 0.9729188669457949, 'colsample_bytree': 0.8491983767203796, 'reg_lambda': 0.0070689749506246055, 'reg_alpha': 0.005337032762603957}. Best is trial 0 with value: 0.12576865870555248.[0m
[32m[I 2023-04-04 18:04:28,657][0m Trial 1 finished with value: 0.1420403798131069 and parameters: {'n_estimators': 2000, 'learning_rate': 0.05958389350068958, 'max_depth': 9, 'min_child_weight': 236, 'subsample': 0.2797064039425238, 'colsample_bytree': 0.5628109945722505, 'reg_lambda': 0.23423849847112907, 'reg_alpha': 0.0015339162591163618}. Best is trial 0 with value: 0.12576865870555248.[0m
[32m[I 2023-04-04 18:05:38,114][0m Trial 2 finished with value: 0.155

KeyboardInterrupt: 

In [35]:
trial = study.best_trial
best_params = trial.params

best_xgbmodel = xgboost.XGBRegressor(
    **best_params, 
    random_state=42, 
    n_jobs=-1, 
    tree_method="hist", 
    objective="reg:squarederror", 
    single_precision_histogram=True,
    importance_type="gain"
)

# Define the full pipeline
best_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', best_xgbmodel)
])

best_pipeline.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  Index(['myear', 'km_driven', 'ip', 'Displacement', 'No of Cylinder',
       'Valves per Cylinder', 'Width', 'Wheel Base', 'Front Tread',
       'Kerb Weight', 'Seats', 'Turning Radius', 'Top Speed', 'Acceleration',
       'Doors', 'mileage_new', 'Alloy Wheel Size', 'Max Power Deliver...
                              importance_type='gain',
                              interaction_constraints='',
                              learning_rate=0.04766173848322245, max_bin=256,
                              max_cat_to_onehot=4, max_delta_step=0,
                              max_depth=11, max_leaves=0, min_child_weight=25,
                              missing=nan, monotone_constraints='()',

In [36]:
# Make predictions on the test data
y_pred = best_pipeline.predict(X_test)

# Calculate the MAPE and MAE
mape = mean_absolute_percentage_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print(f"MAPE: {mape}")
print(f"MAE: {mae}")

MAPE: 0.11816291390223506
MAE: 80835.50288459765
