In [1]:
#import libraries
import pandas as pd
import numpy as np

In [2]:
#import dataset
df = pd.read_csv("market data clean for ML.csv")
df.shape

(169, 9)

In [3]:
#drop XE variants because of low representation (4 rows)
df = df.loc[~(df.XE == 1),:]
df = df.drop(["XE"], axis = 1)
df.shape

(165, 8)

In [4]:
#check for outliers in price
Q1 = df.price.quantile(0.25)
Q3 = df.price.quantile(0.75)
IQR = Q3 - Q1
lower_limit = Q1 - 1.5*IQR
upper_limit = Q3 + 1.5*IQR
df[(df.price<lower_limit)|(df.price>upper_limit)]
#no outliers

Unnamed: 0,year,transmission,price,mileage,E,G,J,XLE


In [5]:
#check for outliers in mileage
Q1 = df.mileage.quantile(0.25)
Q3 = df.mileage.quantile(0.75)
IQR = Q3 - Q1
lower_limit = Q1 - 1.5*IQR
upper_limit = Q3 + 1.5*IQR
df[(df.mileage<lower_limit)|(df.mileage>upper_limit)]
#although we detected outliers using IQR these outliers are natural and therefore we will keep them

Unnamed: 0,year,transmission,price,mileage,E,G,J,XLE
7,2016,Manual,320000,195000,0,0,1,0
48,2008,Manual,268000,145000,1,0,0,0
58,2013,Manual,280000,195000,0,1,0,0
146,2008,Manual,260000,155000,0,1,0,0
159,2016,Manual,330000,155000,0,0,1,0
166,2016,Manual,365000,175000,1,0,0,0


In [6]:
#get_dummies for transmission
dummy = pd.get_dummies(df.transmission)
df = pd.concat([df,dummy],axis=1)
df.head()

Unnamed: 0,year,transmission,price,mileage,E,G,J,XLE,Automatic,Manual
0,2021,Automatic,598000,5000,0,0,0,1,1,0
1,2021,Automatic,579000,15000,0,0,0,1,1,0
2,2021,Automatic,680000,15000,0,0,0,1,1,0
3,2021,Automatic,628000,5000,0,0,0,1,1,0
4,2020,Automatic,698000,15000,0,1,0,0,1,0


In [7]:
#We will change year values into age values
df["age"] = [2023-x for x in df.year]

In [8]:
#drop original features
df = df.drop(["transmission","year"], axis = "columns")
df.head()

Unnamed: 0,price,mileage,E,G,J,XLE,Automatic,Manual,age
0,598000,5000,0,0,0,1,1,0,2
1,579000,15000,0,0,0,1,1,0,2
2,680000,15000,0,0,0,1,1,0,2
3,628000,5000,0,0,0,1,1,0,2
4,698000,15000,0,1,0,0,1,0,3


In [9]:
#drop one variable per one hot encoded feature to avoid multicollinearity
df = df.drop(["XLE","Manual"], axis = "columns")
df.head()

Unnamed: 0,price,mileage,E,G,J,Automatic,age
0,598000,5000,0,0,0,1,2
1,579000,15000,0,0,0,1,2
2,680000,15000,0,0,0,1,2
3,628000,5000,0,0,0,1,2
4,698000,15000,0,1,0,1,3


In [10]:
#split into dependent and independent variables
X = df.drop('price', axis = 1)
y = df['price']

In [11]:
#get column names of X
X_columns = X.columns.values.tolist()
X_columns

['mileage', 'E ', 'G ', 'J ', 'Automatic', 'age']

In [12]:
#use Standardscaler to scale the independent features
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaled_X = scaler.fit_transform(X)

In [13]:
X = pd.DataFrame(scaled_X,columns = X_columns)
X

Unnamed: 0,mileage,E,G,J,Automatic,age
0,-0.916585,-0.890774,-0.402457,-0.371391,0.879883,-0.932363
1,-0.665153,-0.890774,-0.402457,-0.371391,0.879883,-0.932363
2,-0.665153,-0.890774,-0.402457,-0.371391,0.879883,-0.932363
3,-0.916585,-0.890774,-0.402457,-0.371391,0.879883,-0.932363
4,-0.665153,-0.890774,2.484736,-0.371391,0.879883,-0.663412
...,...,...,...,...,...,...
160,-0.413720,-0.890774,-0.402457,2.692582,-1.136515,-0.125510
161,0.089144,1.122619,-0.402457,-0.371391,-1.136515,0.143440
162,3.357763,1.122619,-0.402457,-0.371391,-1.136515,0.412391
163,0.843441,1.122619,-0.402457,-0.371391,-1.136515,0.143440


In [14]:
#split into train and test sets
from sklearn.model_selection import train_test_split

X_train, X_test,y_train, y_test = train_test_split(X,y, 
                                   test_size=0.25,
                                   random_state=104)

In [15]:
#import linear regression models and metrics
from sklearn.metrics import make_scorer, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import ElasticNet, SGDRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

In [16]:
# Define the hyperparameters
elastic_param = {'alpha': [0.1, 1, 10], 'l1_ratio': [0.1, 0.5, 0.9]}
sgd_param = {'alpha': [0.0001, 0.001, 0.01], 'penalty': ['l1', 'l2']}
svr_param = {'C': [0.1, 1, 10], 'gamma': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
rf_param = {'n_estimators': [50, 100, 200], 'max_depth': [3, 5, 10],'bootstrap': [True, False]}
lgbm_param = {'n_estimators': [50, 100, 200], 'max_depth': [3, 5, 10],'learning_rate': [0.01, 0.1]}
xgb_param = {'n_estimators': [50, 100, 200], 'max_depth': [3, 5, 10],'learning_rate': [0.01, 0.1]}

In [17]:
# Create the models
elastic_model = ElasticNet()
sgd_model = SGDRegressor()
svr_model = SVR()
rf_model = RandomForestRegressor()
lgbm_model = LGBMRegressor()
xgb_model = XGBRegressor()

# Define the scoring function
scorer = make_scorer(r2_score)

In [18]:
# Create the GridSearchCV objects
elastic_search = GridSearchCV(elastic_model, elastic_param, cv=5, scoring=scorer)
sgd_search = GridSearchCV(sgd_model, sgd_param, cv=5, scoring=scorer)
svr_search = GridSearchCV(svr_model, svr_param, cv=5, scoring=scorer)
rf_search = GridSearchCV(rf_model, rf_param, cv=5, scoring=scorer)
lgbm_search = GridSearchCV(lgbm_model, lgbm_param, cv=5, scoring=scorer)
xgb_search = GridSearchCV(xgb_model, xgb_param, cv=5, scoring=scorer)

In [19]:
# Fit the GridSearchCV objects to the training data
elastic_search.fit(X_train, y_train)
sgd_search.fit(X_train, y_train)
svr_search.fit(X_train, y_train)
rf_search.fit(X_train, y_train)
lgbm_search.fit(X_train, y_train)
xgb_search.fit(X_train, y_train)

In [20]:
# Print the best r2 score for each model on the testing set
print("Best r2 score for ElasticNet: ", r2_score(y_test, elastic_search.predict(X_test)))
print("Best r2 score for SGDRegressor: ", r2_score(y_test, sgd_search.predict(X_test)))
print("Best r2 score for SVR: ", r2_score(y_test, svr_search.predict(X_test)))
print("Best r2 score for RandomForestRegressor: ", r2_score(y_test, rf_search.predict(X_test)))
print("Best r2 score for LGBMRegressor: ", r2_score(y_test, lgbm_search.predict(X_test)))
print("Best r2 score for XGBRegressor: ", r2_score(y_test, xgb_search.predict(X_test)))

Best r2 score for ElasticNet:  0.7884783480345467
Best r2 score for SGDRegressor:  0.7869958811152423
Best r2 score for SVR:  0.020896800544334404
Best r2 score for RandomForestRegressor:  0.724301943399978
Best r2 score for LGBMRegressor:  0.7551585215129235
Best r2 score for XGBRegressor:  0.75225047297114


In [21]:
# For ElasticNet
best_params_elasticnet = elastic_search.best_params_
print("Best parameters for ElasticNet:", best_params_elasticnet)

# For SGDRegressor
best_params_sgd = sgd_search.best_params_
print("Best parameters for SGDRegressor:", best_params_sgd)

# For SVR
best_params_svr = svr_search.best_params_
print("Best parameters for SVR:", best_params_svr)

# For RandomForestRegressor
best_params_randomforest = rf_search.best_params_
print("Best parameters for RandomForestRegressor:", best_params_randomforest)

# For LGBMRegressor
best_params_lgbm = lgbm_search.best_params_
print("Best parameters for LGBMRegressor:", best_params_lgbm)

# For XGBRegressor
best_params_xgb = xgb_search.best_params_
print("Best parameters for XGBRegressor:", best_params_xgb)

Best parameters for ElasticNet: {'alpha': 1, 'l1_ratio': 0.9}
Best parameters for SGDRegressor: {'alpha': 0.01, 'penalty': 'l2'}
Best parameters for SVR: {'C': 10, 'gamma': 0.1, 'kernel': 'linear'}
Best parameters for RandomForestRegressor: {'bootstrap': True, 'max_depth': 3, 'n_estimators': 50}
Best parameters for LGBMRegressor: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50}
Best parameters for XGBRegressor: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50}


In [31]:
#fit data using best parameters
SGDR_model = SGDRegressor(alpha= 0.01, penalty= 'l2')
SGDR_model.fit(X_train, y_train)


In [33]:
#save trained model
import joblib
joblib.dump(SGDR_model,"Toyota Vios price estimator")

['Toyota Vios price estimator']

In [34]:
#import test set 
sample = pd.read_csv("sample test set.csv")
scaler2 = StandardScaler()
scaled_sample = scaler2.fit_transform(sample)
results = SGDR_model.predict(scaled_sample)



In [35]:
#test set with predicted Price results
sample["Predicted Price"] = results.astype(int)
sample

Unnamed: 0,mileage,E,G,J,Automatic,age,Predicted Price
0,55000,1,0,0,1,5,470194
1,82000,1,0,0,0,6,327096
2,16000,1,0,0,1,3,550646
3,12000,0,1,0,1,2,609019
4,37000,0,1,0,1,3,566560
5,94000,0,1,0,0,5,378974
6,8000,0,0,1,0,1,509703
7,88000,0,0,1,0,7,283370
8,46000,0,0,1,1,5,464867
9,23000,0,0,0,0,4,455514
