In [7]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [8]:
df = pd.read_csv('updated_car_data.csv')
df.head()

Unnamed: 0,Brand,Insurance Validity,Fuel Type,Seats,Kms Driven,Ownership,Engine Displacement (in cc),Transmission,Mileage (in kmpl),Max Power (in bhp),Wheel Size,Registration Month,Registration Year,Price (in lacs),City
0,Maruti,Third Party,CNG,7,51012.0,First Owner,1373,Manual,22.8,80.9,16,Jun,2015,7.04,Mumbai
1,Renault,Third Party,Petrol,5,31782.0,First Owner,999,Manual,21.74,67.0,16,Jun,2020,4.7,Hyderabad
2,Hyundai,Third Party,Petrol,5,102028.0,First Owner,1591,Manual,17.01,121.3,16,Oct,2012,4.9,Bangalore
3,Ford,Not Available,Petrol,5,27807.0,First Owner,1196,Manual,18.16,86.8,14,Sept,2017,3.96,Kolkata
4,Maruti,Comprehensive,Diesel,5,60680.0,First Owner,1248,Manual,24.3,88.5,16,Oct,2019,9.5,Ahmedabad


In [9]:
price = "Price (in lacs)"

In [10]:
X = df.drop(price, axis = 1)
y = df[price]

In [11]:
col_index_mapper = {}
cnt = 0

for col in X.columns:
    col_index_mapper[col] = cnt
    cnt += 1;

col_index_mapper

{'Brand': 0,
 'Insurance Validity': 1,
 'Fuel Type': 2,
 'Seats': 3,
 'Kms Driven': 4,
 'Ownership': 5,
 'Engine Displacement (in cc)': 6,
 'Transmission': 7,
 'Mileage (in kmpl)': 8,
 'Max Power (in bhp)': 9,
 'Wheel Size': 10,
 'Registration Month': 11,
 'Registration Year': 12,
 'City': 13}

In [12]:
num_features_temp = X.select_dtypes(exclude="object").columns

num_features = []
for feature in num_features_temp:
    num_features.append(col_index_mapper[feature])

num_features

[3, 4, 6, 8, 9, 10, 12]

In [13]:
ohe_features = [col_index_mapper['Brand'],
col_index_mapper['Fuel Type'],
col_index_mapper['Transmission'],
col_index_mapper['Registration Month'],
col_index_mapper['City']]

ohe_features

[0, 2, 7, 11, 13]

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [15]:
transformer = ColumnTransformer(
    transformers = [
        ('oe_ownership', OrdinalEncoder(categories=[['Third or More Owners', 'Second Owner', 'First Owner']]), [col_index_mapper['Ownership']]),
        ('oe_insurance', OrdinalEncoder(categories=[['Others', 'Not Available', 'Third Party', 'Comprehensive']]), [col_index_mapper['Insurance Validity']]),
        ('ohe', OneHotEncoder(sparse_output=False, drop='first'), ohe_features),
        ('scale', StandardScaler(), num_features)
    ],
    remainder = "passthrough"
)

In [16]:
xgboost_regressor = xgb.XGBRegressor()

In [17]:
xgboost_regressor = xgb.XGBRegressor()

pipe = Pipeline([
    ('transformer', transformer),
    ('xgboost_regressor', xgboost_regressor)
])

In [18]:
pipe.named_steps

{'transformer': ColumnTransformer(remainder='passthrough',
                   transformers=[('oe_ownership',
                                  OrdinalEncoder(categories=[['Third or More '
                                                              'Owners',
                                                              'Second Owner',
                                                              'First Owner']]),
                                  [5]),
                                 ('oe_insurance',
                                  OrdinalEncoder(categories=[['Others',
                                                              'Not Available',
                                                              'Third Party',
                                                              'Comprehensive']]),
                                  [1]),
                                 ('ohe',
                                  OneHotEncoder(drop='first',
                                       

In [19]:
pipe.fit(X_train, y_train)

In [20]:
y_predict = pipe.predict(X_test)
y_predict

array([7.5257688, 6.7886925, 6.1412163, ..., 9.151995 , 5.8999767,
       7.227758 ], dtype=float32)

In [21]:
r2_score(y_test, y_predict)

0.9214788458398705

In [22]:
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    
    print(f'mse: {mse}')
    print(f'mae: {mae}')
    print(f'rmse: {rmse}')
    print(f'r2_score: {r2_square}')

In [23]:
evaluate_model(y_test, y_predict)

mse: 7.2302086035813655
mae: 1.1215241132450373
rmse: 2.68890472192329
r2_score: 0.9214788458398705


In [29]:
pipe2 = pipe
pipe2

## Random Search CV

In [None]:
random_grid ={
 "xgboost_regressor__learning_rate"    : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
 "xgboost_regressor__max_depth"        : [ 3, 4, 5, 6, 8, 10, 12, 15],
 "xgboost_regressor__min_child_weight" : [ 1, 3, 5, 7 ],
 "xgboost_regressor__gamma"            : [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
 "xgboost_regressor__colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7 ]
}


xgb_random = RandomizedSearchCV(estimator = pipe2, param_distributions = random_grid, n_iter = 50, cv = 3, verbose=2, random_state=42, n_jobs = -1)

In [None]:
xgb_random.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


In [None]:
params_random = xgb_random.best_params_
params_random

{'xgboost_regressor__min_child_weight': 1,
 'xgboost_regressor__max_depth': 10,
 'xgboost_regressor__learning_rate': 0.05,
 'xgboost_regressor__gamma': 0.3,
 'xgboost_regressor__colsample_bytree': 0.5}

In [30]:
xgboost_regressor = xgb.XGBRegressor(
    colsample_bytree = 0.5,
    gamma = 0.3,
    learning_rate = 0.05,
    max_depth = 10,
    min_child_weight = 1,
    n_estimators = 300
)

pipe2 = Pipeline([
    ('transformer', transformer),
    ('xgboost_regressor', xgboost_regressor)
])

In [33]:
pipe2.fit(X_train, y_train)

In [34]:
y_pred = pipe2.predict(X_test)

In [None]:
evaluate_model(y_test, y_pred)

mse: 7.2302086035813655
mae: 1.1215241132450373
rmse: 2.68890472192329
r2_score: 0.9214788458398705


## Grid Search CV

In [None]:
params_grid ={
    'xgboost_regressor__min_child_weight' : [1,2],
    'xgboost_regressor__max_depth': [10,12],
    'xgboost_regressor__learning_rage': [0.04, 0.05],
    'xgboost_regressor__gamma': [0.2,0.3],
    'xgboost_regressor__colsample_bytree': [0.5, 0.6],
    'xgboost_regressor__n_estimators': [300, 400]
}

xgb_grid = GridSearchCV(estimator=pipe2,param_grid=params_grid,cv=5,n_jobs=-1,verbose=2)

In [None]:
xgb_grid.fit(X_train, y_train)

Fitting 5 folds for each of 64 candidates, totalling 320 fits


In [None]:
xgb_grid.best_params_

{'xgboost_regressor__colsample_bytree': 0.5,
 'xgboost_regressor__gamma': 0.3,
 'xgboost_regressor__learning_rage': 0.04,
 'xgboost_regressor__max_depth': 10,
 'xgboost_regressor__min_child_weight': 2,
 'xgboost_regressor__n_estimators': 400}

In [None]:
import xgboost as xgb

In [35]:
xgboost_regressor = xgb.XGBRegressor(
    colsample_bytree = 0.3,
    gamma = 0.1,
    learning_rate = 0.03,
    max_depth = 10,
    min_child_weight = 3,
    n_estimators = 600
)

pipe3 = Pipeline([
    ('transformer', transformer),
    ('xgboost_regressor', xgboost_regressor)
])

In [36]:
pipe3.fit(X_train, y_train)

In [37]:
y_pred = pipe3.predict(X_test)
y_pred

array([8.108395 , 6.6385274, 6.24682  , ..., 8.973387 , 6.002624 ,
       7.133637 ], dtype=float32)

In [38]:
evaluate_model(y_test, y_pred)

mse: 6.349301599349086
mae: 1.077603649000292
rmse: 2.519782053938214
r2_score: 0.9310456285528614


# Creating a Pickle File

In [40]:
import pickle
pickle.dump(pipe3,open('model.pkl','wb'))

In [41]:
model = pickle.load(open('model.pkl','rb'))