In [3]:
import numpy as np
import pandas as pd
import pickle

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.pipeline import Pipeline

from sklearn.metrics import r2_score

from category_encoders.target_encoder import TargetEncoder

In [7]:
# Seed for the stochastic parts of the notebook; it is passed as `random_state`
# to the gradient-boosting regressor so that refitting reproduces the same model.
RANDOM_STATE = 42

In [8]:
# Load the car listings; the path is resolved relative to the current working directory.
# NOTE(review): the displayed sample has an 'Unnamed: 0' column, which looks like a row
# index that was written into the CSV when it was saved — confirm against the file.
df = pd.read_csv('clean_cars.csv')

In [9]:
# Peek at a few random rows. The draw is seeded so the rows shown here are the
# same after every Restart & Run All instead of changing on each execution.
df.sample(3, random_state=RANDOM_STATE)

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats,torque_1,torque_2
249,Hyundai Verna 1.6 SX VTVT (O),2014,500000,70000,Petrol,Individual,Manual,First Owner,17.1,1591,121.3,5,154.9,4200
2223,Maruti Eeco 5 Seater AC BSIV,2020,415000,15000,Petrol,Individual,Manual,First Owner,15.37,1196,73.0,5,101.0,3000
4278,Toyota Corolla H2,2006,150000,90000,Petrol,Individual,Manual,Third Owner,13.4,1794,125.0,5,157.89,4200


In [10]:
# Columns that must not reach the model as features:
#   * 'selling_price' is the prediction target;
#   * 'Unnamed: 0' (visible in the sample output) appears to be the row index that
#     was written into the CSV. It carries no information about a car, and leaving
#     it in X lets the regressor fit on row order.
# NOTE(review): data passed to the fitted pipeline for prediction must omit
# 'Unnamed: 0' as well, since it is no longer part of the training columns.
NON_FEATURE_COLS = ['selling_price', 'Unnamed: 0']

# Only drop the columns that are actually present, so a CSV saved without the
# stray index column still loads; a missing target still fails on the next line.
X = df.drop(columns=[col for col in NON_FEATURE_COLS if col in df.columns])
y = df['selling_price']

# The model is trained on log1p(price); predictions therefore come back in
# log space and need np.expm1 to be converted to prices.
y_log = np.log1p(y)

In [11]:
# Integer codes for the ownership history ('Test Drive Car' is 0, each further
# owner level is one higher).
owner = {
    'Test Drive Car': 0,
    'First Owner': 1,
    'Second Owner': 2,
    'Third Owner': 3,
    'Fourth & Above Owner': 4,
}
# Strict lookup: a label that is not in the mapping raises KeyError rather than
# silently turning into NaN.
X['owner'] = X['owner'].apply(owner.__getitem__)

# One-hot encode the nominal columns; drop_first=True omits the first level of
# each so the dummies are not redundant.
# NOTE(review): this cell rewrites X, so it cannot be executed twice in a row —
# the second run fails because 'owner' already holds integers and the three
# source columns are gone. Re-run the cell that builds X first.
one_hot_cols = ['fuel', 'seller_type', 'transmission']
X = pd.get_dummies(data=X, columns=one_hot_cols, drop_first=True)

In [12]:
# Columns that the pipeline's TargetEncoder is told to encode (passed as its `cols` argument).
cat_cols = ['name']

In [13]:
# Stages of the final model, in execution order:
#   1. target-encode the columns listed in `cat_cols`;
#   2. standardise every feature;
#   3. fit a gradient-boosting regressor.
# NOTE(review): the fixed hyperparameters (max_depth=4, n_estimators=561) are
# presumably the outcome of an earlier search — confirm where they came from.
gb_steps = [
    ('encoder_', TargetEncoder(cols=cat_cols, smoothing=1)),
    ('scaler_', StandardScaler()),
    (
        'gb',
        GradientBoostingRegressor(
            n_estimators=561,
            max_depth=4,
            random_state=RANDOM_STATE,
        ),
    ),
]
p_gb_best = Pipeline(steps=gb_steps)

In [14]:
# Fit every pipeline stage on the feature matrix X against the log1p-transformed
# price, so the fitted model predicts in log space (np.expm1 recovers prices).
p_gb_best.fit(X, y_log)

In [11]:
# Persist the whole fitted pipeline (encoder, scaler and regressor together) so
# it can be reloaded without refitting. The path is relative to the current
# working directory. Serialising to bytes first and then writing them yields
# the same file content as pickle.dump.
with open('model_cars.pkl', mode='wb') as file:
    serialized_pipeline = pickle.dumps(p_gb_best)
    file.write(serialized_pipeline)