In [1]:
#import libraries
import pandas as pd

In [2]:
# Load the cleaned market data; the "Unnamed: 0" column is discarded right away
# (presumably a row index written out when the CSV was saved - confirm).
df = pd.read_csv("market data clean for ML.csv").drop(columns=["Unnamed: 0"])
df.shape

(169, 9)

In [3]:
# XE variants are too thinly represented (4 rows), so keep only the non-XE rows
df = df[df["XE"] != 1]
df.shape

(165, 9)

In [4]:
# Outlier check on price using the 1.5 * IQR fences
Q1, Q3 = df["price"].quantile([0.25, 0.75])
IQR = Q3 - Q1
lower_limit, upper_limit = Q1 - 1.5*IQR, Q3 + 1.5*IQR
below_fence = df["price"] < lower_limit
above_fence = df["price"] > upper_limit
df[below_fence | above_fence]
# the selection is empty: no price outliers

Unnamed: 0,year,transmission,price,mileage,E,G,J,XE,XLE


In [5]:
# Outlier check on mileage using the 1.5 * IQR fences
mileage = df["mileage"]
Q1 = mileage.quantile(0.25)
Q3 = mileage.quantile(0.75)
IQR = Q3 - Q1
lower_limit = Q1 - 1.5*IQR
upper_limit = Q3 + 1.5*IQR
df.loc[mileage.lt(lower_limit) | mileage.gt(upper_limit)]
# the IQR rule flags some rows, but these mileages are natural values,
# so the rows are kept

Unnamed: 0,year,transmission,price,mileage,E,G,J,XE,XLE
7,2016,Manual,320000,195000,0,0,1,0,0
48,2008,Manual,268000,145000,1,0,0,0,0
58,2013,Manual,280000,195000,0,1,0,0,0
146,2008,Manual,260000,155000,0,1,0,0,0
159,2016,Manual,330000,155000,0,0,1,0,0
166,2016,Manual,365000,175000,1,0,0,0,0


In [6]:
# One-hot encode the transmission column and append the indicator columns to df
dummy = pd.get_dummies(df["transmission"])
df = pd.concat((df, dummy), axis=1)
df.head()

Unnamed: 0,year,transmission,price,mileage,E,G,J,XE,XLE,Automatic,Manual
0,2021,Automatic,598000,5000,0,0,0,0,1,1,0
1,2021,Automatic,579000,15000,0,0,0,0,1,1,0
2,2021,Automatic,680000,15000,0,0,0,0,1,1,0
3,2021,Automatic,628000,5000,0,0,0,0,1,1,0
4,2020,Automatic,698000,15000,0,1,0,0,0,1,0


In [7]:
# Replace the model year by the vehicle's age relative to a fixed reference year.
# NOTE(review): 2023 is presumably the year the listings were collected - confirm
# before reusing this notebook on newer data.
REFERENCE_YEAR = 2023
# Vectorized subtraction instead of a Python-level loop over the column
df["age"] = REFERENCE_YEAR - df["year"]

In [8]:
# Remove columns that are not used as features: the XE indicator (XE rows were
# filtered out earlier), the text transmission column (already one-hot encoded)
# and year (replaced by age).
# NOTE(review): both the Automatic and Manual indicators are still kept after
# this step, so the transmission dummies remain fully collinear - confirm that
# this is intended.
df = df.drop(columns=["XE", "transmission", "year"])
df.head()

Unnamed: 0,price,mileage,E,G,J,XLE,Automatic,Manual,age
0,598000,5000,0,0,0,1,1,0,2
1,579000,15000,0,0,0,1,1,0,2
2,680000,15000,0,0,0,1,1,0,2
3,628000,5000,0,0,0,1,1,0,2
4,698000,15000,0,1,0,0,1,0,3


In [9]:
# Split into the feature matrix X and the target vector y
X = df.drop(columns=["price"])
# Keep the target 1-D, shape (n_samples,): an (n_samples, 1) column vector makes
# predict() return 2-D output, which then has to be squeezed into a single
# DataFrame column downstream.
y = df["price"].to_numpy()

In [10]:
#Try Stratified CV first to deal with imbalanced features
#Gridsearch cv=5
#Try removing the imbalanced feature and test accuracy
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from xgboost import XGBRegressor


# Candidate estimators and the hyperparameter grid searched for each one.
# An empty grid means the estimator is evaluated with its default settings only.
model_params = dict(
    linear_regression=dict(model=LinearRegression(), params={}),
    lasso=dict(model=Lasso(), params={}),
    ridge=dict(model=Ridge(), params={"alpha": [1,5,10]}),
    elasticnet=dict(model=ElasticNet(), params={}),
    XGBRegressor=dict(model=XGBRegressor(), params={"n_estimators" : [10, 50, 100]}),
)


In [11]:
# Run a 3-fold grid search for every candidate and collect its best
# cross-validated score together with the winning parameters.
scores = []

for model_name, mp in model_params.items():
    clf = GridSearchCV(
        estimator=mp['model'],
        param_grid=mp['params'],
        cv=3,
        return_train_score=False,
    ).fit(X, y)
    scores.append(dict(
        model=model_name,
        best_score=clf.best_score_,
        best_params=clf.best_params_,
    ))

score = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
score
# best model from our grid search is ridge with alpha=10 as parameter
# !!! hyperparameter tuning is still in progress, so this choice may change

Unnamed: 0,model,best_score,best_params
0,linear_regression,0.633863,{}
1,lasso,0.633869,{}
2,ridge,0.640415,{'alpha': 10}
3,elasticnet,0.616311,{}
4,XGBRegressor,0.475324,{'n_estimators': 10}


In [12]:
# Cross-check the ridge model with a plain 3-fold cross_val_score
from sklearn.model_selection import cross_val_score

ridge_fold_scores = cross_val_score(Ridge(alpha=5), X, y, cv=3)
ridge_fold_scores

array([0.76172642, 0.59075693, 0.56839933])

In [13]:
# Train the final model on the full dataset (saving happens in the next cell).
# NOTE(review): alpha=5 is used here although the grid search reported
# alpha=10 as the best ridge setting - confirm which value is intended.
model = Ridge(alpha=5).fit(X, y)
model


Ridge(alpha=5)

In [14]:
# Persist the fitted model to disk with joblib
import joblib
joblib.dump(value=model, filename="Toyota Vios price estimator")

['Toyota Vios price estimator']

In [15]:
# Score the sample listings with the trained model
sample = pd.read_csv("sample test set.csv")
# Select the training features by name, in training order, instead of trusting
# the CSV layout: a reordered or extended file would otherwise be fed to
# predict() as-is, and a missing feature now fails loudly with a KeyError.
results = model.predict(sample[X.columns])

In [23]:
# Attach the predictions, truncated to integers, as a new column for display
sample = sample.assign(Results=results.astype(int))
sample

Unnamed: 0,mileage,E,G,J,XLE,Automatic,Manual,age,Results
0,55000,1,0,0,0,1,0,5,499036
1,82000,1,0,0,0,0,1,6,398025
2,16000,1,0,0,0,1,0,3,549192
3,12000,0,1,0,0,1,0,2,585442
4,37000,0,1,0,0,1,0,3,558566
5,94000,0,1,0,0,0,1,5,429044
6,8000,0,0,1,0,0,1,1,483631
7,88000,0,0,1,0,0,1,7,345262
8,46000,0,0,1,0,1,0,5,469879
9,23000,0,0,0,1,0,1,4,495447
