In [1]:
#import libraries
import pandas as pd

In [2]:
#load the cleaned market data; the CSV carries a saved index column
#("Unnamed: 0") that is discarded right after reading
df = pd.read_csv("market data clean for ML.csv").drop(columns=["Unnamed: 0"])
df.shape

(174, 9)

In [3]:
#the XE variant is too thinly represented (4 rows), so keep only the rows
#where the XE flag is not set
df = df.loc[df["XE"].ne(1)]
df.shape

(170, 9)

In [4]:
#look for price outliers with the 1.5*IQR rule:
#a row is flagged when its price lies outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
Q1, Q3 = df["price"].quantile([0.25, 0.75])
IQR = Q3 - Q1
lower_limit, upper_limit = Q1 - 1.5*IQR, Q3 + 1.5*IQR
df.loc[df["price"].lt(lower_limit) | df["price"].gt(upper_limit)]
#the result is empty: no price outliers

Unnamed: 0,year,transmission,price,mileage,XLE,XE,J,E,G


In [5]:
#same 1.5*IQR rule, applied to mileage this time
mileage = df["mileage"]
Q1 = mileage.quantile(0.25)
Q3 = mileage.quantile(0.75)
IQR = Q3 - Q1
lower_limit = Q1 - IQR*1.5
upper_limit = Q3 + IQR*1.5
outside_fences = (mileage > upper_limit) | (mileage < lower_limit)
df[outside_fences]
#the IQR rule does flag some rows, but these values are natural for the data,
#so they are kept

Unnamed: 0,year,transmission,price,mileage,XLE,XE,J,E,G
7,2016,Manual,320000,195000,0,0,1,0,0
48,2008,Manual,268000,145000,0,0,0,1,0
58,2013,Manual,280000,195000,0,0,0,0,1
150,2008,Manual,260000,155000,0,0,0,0,1
164,2016,Manual,330000,155000,0,0,1,0,0
171,2016,Manual,365000,175000,0,0,0,1,0


In [6]:
#one-hot encode transmission: one indicator column per distinct value,
#appended to the right of the existing columns (rows matched on the index)
dummy = pd.get_dummies(df["transmission"])
df = df.join(dummy)
df.head()

Unnamed: 0,year,transmission,price,mileage,XLE,XE,J,E,G,Automatic,Manual
0,2021,Automatic,598000,5000,1,0,0,1,0,1,0
1,2021,Automatic,579000,15000,1,0,0,1,0,1,0
2,2021,Automatic,680000,15000,1,0,0,1,0,1,0
3,2021,Automatic,628000,5000,1,0,0,1,0,1,0
4,2020,Automatic,698000,15000,0,0,0,0,1,1,0


In [7]:
#We will change year values into age values
#REFERENCE_YEAR is the year that ages are counted back from
REFERENCE_YEAR = 2023
#vectorized subtraction on the whole column instead of a Python-level loop
df["age"] = REFERENCE_YEAR - df["year"]

In [8]:
#remove the columns that are not used as features:
#  XE           - the XE rows were filtered out earlier
#  transmission - the text column, already encoded as indicator columns
#  year         - replaced by age
#note: every transmission indicator column is kept; none of them is dropped here
df = df.drop(columns=["XE", "transmission", "year"])
df.head()

Unnamed: 0,price,mileage,XLE,J,E,G,Automatic,Manual,age
0,598000,5000,1,0,1,0,1,0,2
1,579000,15000,1,0,1,0,1,0,2
2,680000,15000,1,0,1,0,1,0,2
3,628000,5000,1,0,1,0,1,0,2
4,698000,15000,0,0,0,1,1,0,3


In [9]:
# Scale price and mileage
# Each column gets its own StandardScaler. Re-fitting one shared scaler on
# mileage overwrites the price mean/std it had learned, which leaves no way to
# turn a standardized price back into an actual price.
# price_scaler.inverse_transform(...) undoes the price scaling.
from sklearn.preprocessing import StandardScaler
price_scaler = StandardScaler()
mileage_scaler = StandardScaler()
# fit_transform returns an (n, 1) array; ravel() flattens it to one value per row
df["price"] = price_scaler.fit_transform(df[["price"]]).ravel()
df["mileage"] = mileage_scaler.fit_transform(df[["mileage"]]).ravel()
# keep the original name bound to the scaler that was fitted last (mileage)
scaler = mileage_scaler
# NOTE(review): both scalers are fitted on the full dataset before any
# cross-validation split, so held-out folds contribute to the scaling statistics.

In [10]:
#split into the feature matrix X (every column except price) and the target y
X = df.drop(columns="price")
#the double brackets keep y two-dimensional: a single price column
y = df[["price"]].to_numpy()

In [11]:
#Ideas to try:
# - stratified CV first, to deal with the imbalanced features
# - GridSearchCV with cv=5
# - removing the imbalanced feature and comparing accuracy
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from xgboost import XGBRegressor


#candidate regressors as (name, estimator, hyper-parameter grid);
#an empty grid means the estimator is evaluated with its default settings only
candidates = [
    ("linear_regression", LinearRegression(), {}),
    ("lasso", Lasso(), {}),
    ("ridge", Ridge(), {"alpha": [1, 5, 10]}),
    ("elasticnet", ElasticNet(), {}),
    ("XGBRegressor", XGBRegressor(), {"n_estimators": [10, 50, 100]}),
]

#name -> {'model': estimator, 'params': grid}, in the order listed above
model_params = {
    name: {"model": estimator, "params": grid}
    for name, estimator, grid in candidates
}


In [12]:
def _grid_search_summary(name, estimator, param_grid):
    """Run a 3-fold grid search for one candidate and summarise the winner.

    Fits GridSearchCV on the notebook-level X and y and returns a dict with
    the candidate's name, the best mean cross-validated score (default
    scorer of the estimator) and the parameter combination that achieved it.
    """
    search = GridSearchCV(estimator, param_grid, cv=3, return_train_score=False)
    search.fit(X, y)
    return {
        'model': name,
        'best_score': search.best_score_,
        'best_params': search.best_params_,
    }

#one summary row per candidate, in the order of model_params
scores = [
    _grid_search_summary(name, spec['model'], spec['params'])
    for name, spec in model_params.items()
]

score = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
score

Unnamed: 0,model,best_score,best_params
0,linear_regression,0.659692,{}
1,lasso,0.499756,{}
2,ridge,0.664217,{'alpha': 5}
3,elasticnet,0.539926,{}
4,XGBRegressor,0.577309,{'n_estimators': 10}


In [13]:
from sklearn.model_selection import cross_val_score
#per-fold scores for Ridge with alpha=5, again using 3 folds
cross_val_score(estimator=Ridge(alpha=5), X=X, y=y, cv=3)

array([0.76162852, 0.64355963, 0.58746242])

In [14]:
#train the final model: Ridge with alpha=5, fitted on the full X and y
model = Ridge(alpha=5).fit(X, y)
model


Ridge(alpha=5)

In [15]:
#write the fitted model to disk with joblib
#NOTE(review): only the estimator is persisted here. It was trained on
#standardized price and mileage, so whoever loads this file needs the same
#scaling to build inputs and to read predictions as prices - confirm how the
#consumer handles that.
import joblib
MODEL_PATH = "Toyota Vios price estimator"
joblib.dump(model, MODEL_PATH)

['Toyota Vios price estimator']