### Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tqdm
import math
import collections
import pickle


from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error


from sklearn.metrics import mean_squared_error, mean_absolute_error


import category_encoders as ce

from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor


import xgboost
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor, AdaBoostRegressor
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import GridSearchCV, StratifiedKFold, KFold

from sklearn.compose import TransformedTargetRegressor

### Import drive, dataset, helper functions

In [2]:
import os
import sys
from pathlib import Path

# Put the directory two levels above the current working directory on sys.path
# so that the Helper_Functions package can be imported (assumed to live there).
directory_to_extract_to = os.getcwd()
sys.path.append(os.fspath(Path(directory_to_extract_to).parents[1]))

from Helper_Functions import common_utils

# Ask the shared helper to fetch the data set into the working directory.
# NOTE(review): the two string arguments look like lookup keys for the download
# location — confirm against common_utils.
common_utils.load_data_from_one_drive(
    directory_to_extract_to,
    "regression_paths",
    "car_price_dataset_path",
)

BadZipFile: File is not a zip file

### Read dataset

In [15]:
# on_bad_lines='skip' makes pandas drop malformed lines (lines with too many
# fields) without raising or warning, so the loaded frame may contain fewer
# rows than the file has lines.
df = pd.read_csv("car_prices.csv", on_bad_lines='skip')
df.head()

Unnamed: 0,year,make,model,trim,body,transmission,vin,state,condition,odometer,color,interior,seller,mmr,sellingprice,saledate
0,2015,Kia,Sorento,LX,SUV,automatic,5xyktca69fg566472,ca,5.0,16639.0,white,black,"kia motors america, inc",20500,21500,Tue Dec 16 2014 12:30:00 GMT-0800 (PST)
1,2015,Kia,Sorento,LX,SUV,automatic,5xyktca69fg561319,ca,5.0,9393.0,white,beige,"kia motors america, inc",20800,21500,Tue Dec 16 2014 12:30:00 GMT-0800 (PST)
2,2014,BMW,3 Series,328i SULEV,Sedan,automatic,wba3c1c51ek116351,ca,4.5,1331.0,gray,black,financial services remarketing (lease),31900,30000,Thu Jan 15 2015 04:30:00 GMT-0800 (PST)
3,2015,Volvo,S60,T5,Sedan,automatic,yv1612tb4f1310987,ca,4.1,14282.0,white,black,volvo na rep/world omni,27500,27750,Thu Jan 29 2015 04:30:00 GMT-0800 (PST)
4,2014,BMW,6 Series Gran Coupe,650i,Sedan,automatic,wba6b2c57ed129731,ca,4.3,2641.0,gray,black,financial services remarketing (lease),66000,67000,Thu Dec 18 2014 12:30:00 GMT-0800 (PST)


In [None]:
common_utils.get_dataset_info(df)

In [None]:
# Delete columns : 'vin'
# We are dropping this column, because it just contains ID for the car, and has no importance in predictions
df = df.drop(columns = ['vin'])

### **Data Insights**

#### Extracting Categorical and Numerical columns from data

In [None]:
# Object-dtype columns are treated as categorical. A non-object column counts as
# numerical only when it holds more than 20 distinct values; non-object columns
# with 20 or fewer distinct values end up in neither list.
categorical_columns = [name for name in df.columns if df[name].dtypes == 'O']
numerical_columns = [
  name
  for name in df.columns
  if not (df[name].dtypes == 'O') and df[name].nunique()>20
]

In [None]:
categorical_columns

In [None]:
numerical_columns

#### Value counts in categorical columns

Shows how often each category occurs in the categorical columns

In [None]:
df['make'].value_counts()

In [None]:
df['model'].value_counts()

In [None]:
df['trim'].value_counts()

In [None]:
df['body'].value_counts()

In [None]:
df['transmission'].value_counts()

In [None]:
df['state'].value_counts()

In [None]:
df['color'].value_counts()

In [None]:
df['interior'].value_counts()

In [None]:
df['seller'].value_counts()

In [None]:
df['saledate'].value_counts()

#### Boxplots for numerical columns

Shows the spread of each numerical column and highlights its outliers

In [None]:
common_utils.plot_boxplot(df, ['year'])

In [None]:
common_utils.plot_boxplot(df, ['condition'])

In [None]:
common_utils.plot_boxplot(df, ['odometer'])

In [None]:
common_utils.plot_boxplot(df, ['mmr'])

In [None]:
common_utils.plot_boxplot(df, ['sellingprice'])

#### Missing values in data

In [None]:
common_utils.get_count_and_percentage_missing_values(df)

#### Heatmap
Gives the correlation between the target and features

In [None]:
common_utils.plot_heatmap(df)

#### Using SelectKBest to decide which numeric columns to use

In [None]:
# Work on an independent copy: a plain alias (df_temp = df) would let any
# in-place cleaning applied to df_temp modify df through the shared reference.
df_temp = df.copy()

In [None]:
# NOTE(review): fill_missing_values is defined further down in this notebook
# (section "Handle missing and null values"). On a fresh kernel run top-to-bottom
# this line raises NameError unless that definition cell has been executed first.
df_temp = fill_missing_values(df_temp)

In [None]:
x_num = df_temp[['year', 'condition', 'odometer', 'mmr']]
y = df_temp['sellingprice']

In [None]:
from sklearn.feature_selection import SelectKBest, chi2, f_regression
selector1 = SelectKBest(score_func = f_regression, k = 3)
selector1.fit(x_num, y)

In [None]:
cols = selector1.get_support(indices=True)

In [None]:
cols

No need to drop any numeric column

### **Data preprocessing**
Creating functions that can directly be called when making the ML model

#### Duplicates

In [None]:
def remove_duplicates(df):
  """Return a new DataFrame with fully duplicated rows removed.

  The first occurrence of each duplicated row is kept and the surviving rows
  keep their original index labels. The input frame is left untouched.
  """
  deduplicated = df.drop_duplicates()
  return deduplicated

#### Handle missing and null values

In [None]:
def fill_missing_values(df, cat_columns=None):
  """Impute missing values and drop rows that lack an odometer reading.

  Args:
    df: DataFrame to clean. It is not modified; a cleaned copy is returned.
    cat_columns: Names of the categorical columns whose NaNs are replaced by
      the column's most frequent value. Defaults to the notebook-level
      ``categorical_columns`` list, so existing ``fill_missing_values(df)``
      calls behave as before.

  Returns:
    A new DataFrame in which the listed categorical columns are mode-filled,
    ``condition`` NaNs are replaced by the column mean, and rows whose
    ``odometer`` is NaN have been removed.
  """
  if cat_columns is None:
    cat_columns = categorical_columns

  # Copy first so the caller's frame is never mutated; assigning the filled
  # column back (instead of chained ``df[x].fillna(..., inplace=True)``) also
  # keeps this working under pandas Copy-on-Write.
  cleaned = df.copy()

  for col in cat_columns:
    modes = cleaned[col].mode()
    # mode() is empty when the whole column is NaN; there is nothing to fill with.
    if not modes.empty:
      cleaned[col] = cleaned[col].fillna(modes.iloc[0])

  cleaned['condition'] = cleaned['condition'].fillna(cleaned['condition'].mean())

  # Rows without an odometer value are dropped rather than imputed.
  cleaned = cleaned[cleaned['odometer'].notna()]

  return cleaned

#### Removing outliers

In [None]:
def remove_outlier(df, col_name):
  """Drop rows whose ``col_name`` value lies outside the 1.5*IQR fences.

  Args:
    df: DataFrame to filter. It is not modified.
    col_name: Name of the numeric column used to compute the fences.

  Returns:
    The rows of ``df`` whose value satisfies
    ``q1 - 1.5*iqr <= value <= q3 + 1.5*iqr``. Rows with NaN in ``col_name``
    are dropped because NaN fails both comparisons.
  """
  values = df[col_name]
  q1 = values.quantile(0.25)
  q3 = values.quantile(0.75)
  iqr = q3 - q1  # Interquartile range
  fence_low = q1 - 1.5 * iqr
  fence_high = q3 + 1.5 * iqr
  # Inclusive bounds: a value sitting exactly on a fence is not an outlier.
  # With strict comparisons a column whose IQR is 0 (e.g. a constant column)
  # would lose every row, because both fences collapse onto the value itself.
  keep = (values >= fence_low) & (values <= fence_high)
  return df.loc[keep]

#### Binary Encoding

In [None]:
def apply_encoding(df, columns, be=True):
  """Binary-encode ``columns`` of ``df`` and return the transformed DataFrame.

  A fresh ``ce.BinaryEncoder`` is created and fitted on ``df`` on every call.
  The ``be`` flag is accepted but is not used by the function body.
  """
  return ce.BinaryEncoder(cols=columns, return_df=True).fit_transform(df)

### Download df as csv

In [None]:
df1 = fill_missing_values(df)

In [None]:
df1.to_csv('car_prices_pred.csv', index=False)

### **Random Forest**

In [None]:
# Preprocessing pipeline: de-duplicate, impute missing values, then
# binary-encode the categorical columns.
df_new = df.pipe(remove_duplicates).pipe(fill_missing_values)
# Helper call on the imputed frame (going by its name, it reports the
# remaining missing-value counts and percentages).
common_utils.get_count_and_percentage_missing_values(df_new)
df_new = df_new.pipe(apply_encoding, categorical_columns)
df_new.head()

In [None]:
# Data splitting into train and test 70:30 ratio
df_new_train, df_new_test = train_test_split(df_new, test_size = .30, random_state=1)

In [None]:
print(df_new_train.shape, df_new_test.shape)

Removing outliers from numerical columns

In [None]:
df_new_train = remove_outlier(df_new_train, 'mmr')
df_new_test = remove_outlier(df_new_test, 'mmr')

In [None]:
df_new_train = remove_outlier(df_new_train, 'odometer')
df_new_test = remove_outlier(df_new_test, 'odometer')

In [None]:
df_new_train = remove_outlier(df_new_train, 'year')
df_new_test = remove_outlier(df_new_test, 'year')

In [None]:
print(df_new_train.shape, df_new_test.shape)

Split the train and test sets into features (x) and target (y)

In [None]:
x_train = df_new_train.copy()
x_train = x_train.drop(columns = ['sellingprice'])
y_train = df_new_train['sellingprice']

In [None]:
x_test = df_new_test.copy()
x_test = x_test.drop(columns = ['sellingprice'])
y_test = df_new_test['sellingprice']

Checking which columns to normalize and which to scale

In [None]:
def preprocess_numeric_column_data(data):
  """Group numeric columns by skewness into "scale" and "normalise" lists.

  Args:
    data: DataFrame expected to contain only numeric columns.

  Returns:
    A tuple ``(scale_list, normalise_list)``. ``scale_list`` holds the column
    names whose skewness lies in [-0.5, 0.5] (treated as roughly normal);
    ``normalise_list`` holds all remaining column names. If any column has
    object dtype, a message is printed and ``None`` is returned instead.
  """
  columns = data.columns
  # ``np.object`` was removed in NumPy 1.24; the builtin ``object`` performs
  # the same dtype comparison on every NumPy version.
  if any(data.dtypes[col] == object for col in columns):
    print("Cannot process object data")
    return

  # Skewness within [-0.5, 0.5] is treated as approximately normal (use a
  # standard scaler); anything else goes to the min-max normalisation list.
  data_skew = data.skew()
  scale_list = []
  normalise_list = []
  for col in columns:
    if -0.5 <= data_skew[col] <= 0.5:
      scale_list.append(col)
    else:
      normalise_list.append(col)
  return (scale_list, normalise_list)

In [None]:
scale_list, normalise_list = preprocess_numeric_column_data(x_train[['year', 'condition', 'odometer', 'mmr']])

In [None]:
scale_list

In [None]:
normalise_list

In [None]:
# Standardise the roughly symmetric columns. The scaler is fitted on the
# training split only and then applied to both splits.
standard_scaler = preprocessing.StandardScaler()
# Skip when no column qualified: scikit-learn refuses to fit on a frame with
# zero feature columns and would raise ValueError here.
if scale_list:
  standard_scaler.fit(x_train[scale_list])
  x_train[scale_list] = standard_scaler.transform(x_train[scale_list])
  x_test[scale_list] = standard_scaler.transform(x_test[scale_list])

In [None]:
# Min-max normalise the skewed columns. The scaler is fitted on the training
# split only and then applied to both splits.
normal_scaler = preprocessing.MinMaxScaler()
# Skip when no column qualified: scikit-learn refuses to fit on a frame with
# zero feature columns and would raise ValueError here.
if normalise_list:
  normal_scaler.fit(x_train[normalise_list])
  x_train[normalise_list] = normal_scaler.transform(x_train[normalise_list])
  x_test[normalise_list] = normal_scaler.transform(x_test[normalise_list])

In [None]:
# Persist the fitted scaler; the context manager guarantees the file handle is
# flushed and closed even if pickling fails.
with open("standard_scaler.pkl", "wb") as scaler_file:
  pickle.dump(standard_scaler, scaler_file)

In [None]:
# Persist the fitted scaler; the context manager guarantees the file handle is
# flushed and closed even if pickling fails.
with open("normal_scaler.pkl", "wb") as scaler_file:
  pickle.dump(normal_scaler, scaler_file)

In [None]:
# define model
model_randomforest = RandomForestRegressor(n_estimators=100, random_state=0)
model_randomforest.fit(x_train, y_train)

In [None]:
file_name = "model_randomforest.pkl"

In [None]:
# Persist the trained model; the context manager guarantees the file handle is
# flushed and closed even if pickling fails.
with open(file_name, "wb") as model_file:
  pickle.dump(model_randomforest, model_file)

In [None]:
y_hat = model_randomforest.predict(x_test)

In [None]:
y_test_arr = y_test.to_numpy()

In [None]:
plt.figure(figsize=(15, 8))
plt.plot(y_hat[0:50], label='Predicted', color="blue")
plt.plot(y_test_arr[0:50], label='Actual', color="red")
plt.title("Predictions vs Actual")
plt.legend()

In [None]:
mdape = np.median(np.abs(y_hat - y_test)/np.abs(y_test)) # Median Absolute Percentage Error
acc = 1-mdape
print("Median Absolute Percentage Error: ",mdape)
print("Accuracy: ", acc)

### **Decision Tree**

In [None]:
# define model
model_decisiontree = DecisionTreeRegressor()
model_decisiontree.fit(x_train, y_train)

In [None]:
file_name = "model_decisiontree.pkl"

In [None]:
# Persist the trained model; the context manager guarantees the file handle is
# flushed and closed even if pickling fails.
with open(file_name, "wb") as model_file:
  pickle.dump(model_decisiontree, model_file)

In [None]:
y_hat = model_decisiontree.predict(x_test)

In [None]:
y_test_arr = y_test.to_numpy()

In [None]:
plt.figure(figsize=(15, 8))
plt.plot(y_hat[0:50], label='Predicted', color="blue")
plt.plot(y_test_arr[0:50], label='Actual', color="red")
plt.title("Predictions vs Actual")
plt.legend()

In [None]:
mdape = np.median(np.abs(y_hat - y_test)/np.abs(y_test))
acc = 1-mdape
print("Median Absolute Percentage Error: ",mdape)
print("Accuracy: ", acc)

### **XGBoost**

In [None]:
model_xgb = XGBRegressor(learning_rate = 0.01, n_estimators = 1000, max_depth = 10)
model_xgb.fit(x_train, y_train)

In [None]:
y_hat = model_xgb.predict(x_test)

In [None]:
file_name = "model_xgboost.pkl"

In [None]:
# Persist the trained model; the context manager guarantees the file handle is
# flushed and closed even if pickling fails.
with open(file_name, "wb") as model_file:
  pickle.dump(model_xgb, model_file)

In [None]:
y_test_arr = y_test.to_numpy()

In [None]:
plt.figure(figsize=(15, 8))
plt.plot(y_hat[0:50], label='Predicted', color="blue")
plt.plot(y_test_arr[0:50], label='Actual', color="red")
plt.title("Predictions vs Actual")
plt.legend()

In [None]:
mdape = np.median(np.abs(y_hat - y_test)/np.abs(y_test))
acc = 1-mdape
print("Median Absolute Percentage Error: ",mdape)
print("Accuracy: ", acc)

### Sampling

Creating a sample of 10,000 rows to run with kfold and gridsearchcv.
Running for 50,00,000+ rows is computationally expensive

In [None]:
# Fixed seed so the subsample (and every result derived from it) is
# reproducible across kernel restarts; the size is capped at the number of
# available rows so sampling cannot fail on a smaller frame.
df_sample = df.sample(n = min(10000, len(df)), random_state = 1)

In [None]:
df_sample.shape

### **RandomForest with kfold**

In [None]:
# Same preprocessing pipeline as for the full data set, applied to the sample:
# de-duplicate, impute missing values, then binary-encode categorical columns.
df_new_sampled = df_sample.pipe(remove_duplicates).pipe(fill_missing_values)
# Helper call on the imputed sample (going by its name, it reports the
# remaining missing-value counts and percentages).
common_utils.get_count_and_percentage_missing_values(df_new_sampled)
df_new_sampled = df_new_sampled.pipe(apply_encoding, categorical_columns)
df_new_sampled.head()

In [None]:
# 10-fold cross-validation on the 10,000-row sample.
# NOTE(review): the section heading says RandomForest, but this loop trains an
# XGBRegressor — confirm which model is intended.
kf = KFold(n_splits=10,random_state=1,shuffle=True)
for i, (train_index, test_index) in enumerate(kf.split(df_new_sampled), start=1):
  print('\n{} of kfold {}'.format(i,kf.n_splits))

  # The positional indices were produced by splitting df_new_sampled, so they
  # must index df_new_sampled too. Indexing df_new (the full data set) with
  # them would select rows from the start of the full frame instead of the
  # sampled rows.
  df_new_train = df_new_sampled.iloc[train_index]
  df_new_test = df_new_sampled.iloc[test_index]

  # IQR-based outlier removal, computed separately within each split.
  for col in ('mmr', 'odometer', 'year'):
    df_new_train = remove_outlier(df_new_train, col)
    df_new_test = remove_outlier(df_new_test, col)

  # Separate features from the target column.
  x_train = df_new_train.drop(columns = ['sellingprice'])
  y_train = df_new_train['sellingprice']

  x_test = df_new_test.drop(columns = ['sellingprice'])
  y_test = df_new_test['sellingprice']

  model_onehot = XGBRegressor(learning_rate = 0.01, n_estimators = 1000, max_depth = 10)
  model_onehot.fit(x_train, y_train)
  y_hat = model_onehot.predict(x_test)

  # Median absolute percentage error; "accuracy" is reported as 1 - MdAPE.
  mdape = np.median(np.abs(y_hat - y_test)/np.abs(y_test))
  acc = 1-mdape
  print("Median Absolute Percentage Error: ",mdape)
  print("Accuracy: ", acc)

### **XGBoost with kfold and gridsearchcv**

In [None]:
# 10-fold cross-validation on the sample, with a grid search over XGBoost
# hyper-parameters inside every fold.
kf = KFold(n_splits=10,random_state=1,shuffle=True)
for i, (train_index, test_index) in enumerate(kf.split(df_new_sampled), start=1):
  print('\n{} of kfold {}'.format(i,kf.n_splits))

  # The positional indices were produced by splitting df_new_sampled, so they
  # must index df_new_sampled too. Indexing df_new (the full data set) with
  # them would select rows from the start of the full frame instead of the
  # sampled rows.
  df_new_train = df_new_sampled.iloc[train_index]
  df_new_test = df_new_sampled.iloc[test_index]

  # IQR-based outlier removal, computed separately within each split.
  for col in ('mmr', 'odometer', 'year'):
    df_new_train = remove_outlier(df_new_train, col)
    df_new_test = remove_outlier(df_new_test, col)

  # Separate features from the target column.
  x_train = df_new_train.drop(columns = ['sellingprice'])
  y_train = df_new_train['sellingprice']

  x_test = df_new_test.drop(columns = ['sellingprice'])
  y_test = df_new_test['sellingprice']

  xgb1 = XGBRegressor()
  parameters = {'nthread':[4], #when use hyperthread, xgboost may become slower
                'learning_rate': [0.01, 0.05, 0.1, 0.5], #so called `eta` value
                'max_depth': [5, 6, 7, 8],
                'min_child_weight': [4],
                # `silent` was deprecated and later removed from XGBoost;
                # verbosity=0 is the replacement for silent mode.
                'verbosity': [0],
                'subsample': [0.7],
                'colsample_bytree': [0.7],
                'n_estimators': [500]}

  xgb_grid = GridSearchCV(xgb1, parameters, cv = 2, n_jobs = 5, verbose=True)

  xgb_grid.fit(x_train, y_train)
  # predict() on the fitted search uses the best estimator found (refit default).
  y_hat = xgb_grid.predict(x_test)

  # Median absolute percentage error; "accuracy" is reported as 1 - MdAPE.
  mdape = np.median(np.abs(y_hat - y_test)/np.abs(y_test))
  acc = 1-mdape
  print("Median Absolute Percentage Error: ",mdape)
  print("Accuracy: ", acc)
  print("Best parameters: ", xgb_grid.best_params_)

### Script to run all regressors

In [None]:
# Fit every candidate regressor on the current x_train / y_train and compare
# its predictions against the current x_test / y_test.
regressors = ['XGBRegressor', 'RandomForestRegressor', 'DecisionTreeRegressor', 'ExtraTreesRegressor', 'AdaBoostRegressor']

# Lookup table from label to a configured estimator (replaces the if-chain).
regressor_models = {
  'XGBRegressor': XGBRegressor(),
  'RandomForestRegressor': RandomForestRegressor(n_estimators=100, random_state=0),
  'DecisionTreeRegressor': DecisionTreeRegressor(),
  'ExtraTreesRegressor': ExtraTreesRegressor(),
  'AdaBoostRegressor': AdaBoostRegressor(),
}

# Actual targets for the split currently held in y_test. Derived here instead of
# reusing y_test_arr, which was computed in an earlier cell from a different
# y_test and therefore would not line up with these predictions.
y_test_actual = y_test.to_numpy()

for reg in regressors:
  # The target is min-max scaled for fitting and mapped back on prediction.
  regressor = TransformedTargetRegressor(regressor=regressor_models[reg], transformer=MinMaxScaler())
  regressor.fit(x_train, y_train)
  y_pred = regressor.predict(x_test)

  plt.figure(figsize=(15, 8))
  plt.plot(y_pred[0:50], label='Predicted', color="blue")
  plt.plot(y_test_actual[0:50], label='Actual', color="red")
  plt.title(reg)
  plt.legend()

  print("\n")
  print("Accuracy for ", reg)
  # "Accuracy" is reported as 1 - median absolute percentage error.
  acc = 1 - np.median(np.abs(y_pred - y_test)/np.abs(y_test))
  print(acc)