In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import numpy as np

#https://www.kaggle.com/datasets/adhurimquku/ford-car-price-prediction

def load_car_data():
    csv_path = os.path.join("Data", "ford.csv")
    return pd.read_csv(csv_path)

#https://datascience.stackexchange.com/questions/74962/evaluating-model-accuracy-on-a-testing-data-set-for-a-decisiontreereegressor-mod
#this code was taken from stack exchange to test the accuracy of my models
def reg_metrics(y_test, y_pred, X_train):
    from sklearn.metrics import mean_squared_error, r2_score 

    rmse = np.sqrt(mean_squared_error(y_test,y_pred))
    r2 = r2_score(y_test,y_pred)

    # Scikit-learn doesn't have adjusted r-square, hence custom code
    n = y_pred.shape[0]
    k = X_train.shape[1]
    adj_r_sq = 1 - (1 - r2)*(n-1)/(n-1-k)

    print(rmse, r2, adj_r_sq)

def standardscaler(df):
    from sklearn.preprocessing import StandardScaler

    ss = StandardScaler()
    car_ss = pd.DataFrame(ss.fit_transform(df),columns = car.columns)
    return car_ss

car = load_car_data()
car.head()

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
0,Fiesta,2017,12000,Automatic,15944,Petrol,150,57.7,1.0
1,Focus,2018,14000,Manual,9083,Petrol,150,57.7,1.0
2,Focus,2017,13000,Manual,12456,Petrol,150,57.7,1.0
3,Fiesta,2019,17500,Manual,10460,Petrol,145,40.3,1.5
4,Fiesta,2019,16500,Automatic,1482,Petrol,145,48.7,1.0


In [16]:
car.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17914 entries, 10257 to 13041
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   model         17914 non-null  object 
 1   year          17914 non-null  int64  
 2   price         17914 non-null  int64  
 3   transmission  17914 non-null  object 
 4   mileage       17914 non-null  int64  
 5   fuelType      17914 non-null  object 
 6   tax           17914 non-null  int64  
 7   mpg           17914 non-null  float64
 8   engineSize    17914 non-null  float64
dtypes: float64(2), int64(4), object(3)
memory usage: 1.4+ MB


In [3]:
car.describe()

Unnamed: 0,year,price,mileage,tax,mpg,engineSize
count,17966.0,17966.0,17966.0,17966.0,17966.0,17966.0
mean,2016.86647,12279.534844,23362.608761,113.329456,57.90698,1.350807
std,2.050336,4741.343657,19472.054349,62.012456,10.125696,0.432367
min,1996.0,495.0,1.0,0.0,20.8,0.0
25%,2016.0,8999.0,9987.0,30.0,52.3,1.0
50%,2017.0,11291.0,18242.5,145.0,58.9,1.2
75%,2018.0,15299.0,31060.0,145.0,65.7,1.5
max,2060.0,54995.0,177644.0,580.0,201.8,5.0


### Cleaning the Dataset

Not much has to be done in terms of cleaning, from the car.info() output we can see that no columns contain null values. The only problem with the data is that the year column contains some values that are too high. The max for year column is 2060 which is impossible. This needs to be investigated and I will likely remove any values above 2023. The price column has a minimum value of 495. This seems low but the car could have been bought used for very cheap. The mileage column has some very low values. I assume this just means the car is very new and in this context the values seem reasonable. The mpg column has a max of 201, this seems high and should be investigated as well. 

In [15]:
car = car.sort_values('engineSize')
car.head()

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
10257,B-MAX,2017,9495,Manual,32442,Petrol,145,55.4,1.0
12855,Focus,2019,18200,Automatic,11115,Petrol,150,41.5,1.0
11705,Fiesta,2020,15199,Manual,50,Petrol,145,56.5,1.0
12863,Fiesta,2019,15500,Manual,11486,Petrol,150,60.1,1.0
3083,Puma,2020,20890,Manual,2548,Petrol,150,50.4,1.0
3220,Fiesta,2018,11260,Manual,7891,Petrol,145,65.7,1.0
12077,Fiesta,2019,14600,Manual,11273,Petrol,145,60.1,1.0
3211,Fiesta,2018,10191,Manual,11462,Petrol,145,65.7,1.0
14988,Puma,2020,21999,Manual,2500,Petrol,150,50.4,1.0
3091,Puma,2020,21990,Manual,1255,Petrol,145,50.4,1.0


In [14]:
car = car.drop(car.index[car['year'] > 2023], axis = 0)
car = car.drop(car.index[car['engineSize'] < 1], axis = 0)
car['model'] = car['model'].str.strip()

### One Hot Encoding

This is necessary to prepare the categorical attributes to be accurately processed in the model. I use one hot encoding over ordinal encoding because there is no relationship between the values in the categorical columns. 

In [6]:
fuel_encode = pd.get_dummies(car.fuelType, prefix='fuel')
trans_encode = pd.get_dummies(car.transmission, prefix='trans')
model_encode = pd.get_dummies(car.model, prefix='model')
car_ohe = car.join(fuel_encode)
car_ohe = car_ohe.join(trans_encode)
car_ohe = car_ohe.join(model_encode)
car_ohe = car_ohe.drop(['model', 'transmission', 'fuelType'], axis = 1)
#car_ohe.columns = car_ohe.columns.str.replace(' ', '')
car_ohe.head()

Unnamed: 0,year,price,mileage,tax,mpg,engineSize,fuel_Diesel,fuel_Electric,fuel_Hybrid,fuel_Other,...,model_Kuga,model_Mondeo,model_Mustang,model_Puma,model_Ranger,model_S-MAX,model_Streetka,model_Tourneo Connect,model_Tourneo Custom,model_Transit Tourneo
16878,1996,3000,50000,265,34.4,1.8,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13544,1998,2699,37000,160,41.5,1.2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17282,2000,1995,43000,160,41.5,1.3,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16899,2002,2195,108000,230,38.2,1.7,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
14367,2002,895,136784,300,36.2,1.8,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


### Train Test Split

In [7]:
from sklearn.model_selection import train_test_split

X = car_ohe.drop(['price'], axis = 1)
y = car_ohe['price'].to_numpy()
X_train, X_test, y_train, y_test = (train_test_split(X, y, test_size=0.33))

### Linear Regression

In [8]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

LinearRegression()

In [9]:
thing1 = lin_reg.predict(X_test).tolist()
thing2 = y_test.tolist()

In [10]:
data = {'Prediction':thing1,
        'Actual':thing2}
df = pd.DataFrame(data)
df.head()

Unnamed: 0,Prediction,Actual
0,13231.578454,12599
1,8635.106835,6470
2,16084.832276,16000
3,17377.185312,18500
4,15547.956153,17900


In [11]:
x = reg_metrics(y_test, lin_reg.predict(X_test), X_train)
print(x)

1859.5933546020908 0.8510046995378572 0.850094341286561
None


### Decision Tree Regressor

In [12]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()
tree_reg.fit(X_train, y_train)
thing1 = tree_reg.predict(X_test).tolist()
data = {'Prediction':thing1,
        'Actual':thing2}
df = pd.DataFrame(data)
df.head()

Unnamed: 0,Prediction,Actual
0,11200.0,12599
1,3995.0,6470
2,16200.0,16000
3,17299.0,18500
4,17900.0,17900


In [13]:
x = reg_metrics(y_test, tree_reg.predict(X_test), X_train)
print(x)

1609.9758637022687 0.8883200194131929 0.8876376570063489
None


### Random Forest Regressor

In [14]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
forest_reg = RandomForestRegressor()
forest_reg.fit(X_train, y_train)

RandomForestRegressor()

In [15]:
x = reg_metrics(y_test, forest_reg.predict(X_test), X_train)
print(x)

1250.1488399082562 0.932662026299637 0.9322505926517732
None


In [16]:
print(car_ohe.columns.values)

['year' 'price' 'mileage' 'tax' 'mpg' 'engineSize' 'fuel_Diesel'
 'fuel_Electric' 'fuel_Hybrid' 'fuel_Other' 'fuel_Petrol'
 'trans_Automatic' 'trans_Manual' 'trans_Semi-Auto' 'model_B-MAX'
 'model_C-MAX' 'model_EcoSport' 'model_Edge' 'model_Escort' 'model_Fiesta'
 'model_Focus' 'model_Fusion' 'model_Galaxy' 'model_Grand C-MAX'
 'model_Grand Tourneo Connect' 'model_KA' 'model_Ka+' 'model_Kuga'
 'model_Mondeo' 'model_Mustang' 'model_Puma' 'model_Ranger' 'model_S-MAX'
 'model_Streetka' 'model_Tourneo Connect' 'model_Tourneo Custom'
 'model_Transit Tourneo']


In [17]:
test = car_ohe.drop(['price'], axis = 1)
keyList = test.columns.values.tolist()
my_dict = {}

for i in keyList:
    my_dict[i] = 0
    
my_dict['year'] = 1996
my_dict['mileage'] = 50000
my_dict['tax'] = 265
my_dict['mpg'] = 34.4
my_dict['engineSize'] = 1.8
my_dict['fuel_Petrol'] = 1
my_dict['trans_Manual'] = 1
my_dict['model_Escort'] = 1

usr_input = pd.DataFrame(my_dict, index = [0])
usr_input.head()
print(forest_reg.predict(usr_input))

[2819.78]


In [18]:
car.head(20)

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
16878,Escort,1996,3000,Manual,50000,Petrol,265,34.4,1.8
13544,Fiesta,1998,2699,Manual,37000,Petrol,160,41.5,1.2
17282,Fiesta,2000,1995,Manual,43000,Petrol,160,41.5,1.3
16899,Puma,2002,2195,Manual,108000,Petrol,230,38.2,1.7
14367,Mondeo,2002,895,Manual,136784,Petrol,300,36.2,1.8
16900,Puma,2002,2695,Manual,69000,Petrol,220,38.7,1.6
16819,Focus,2003,3999,Manual,56064,Petrol,325,31.0,2.0
16970,Focus,2003,1695,Manual,89630,Petrol,200,40.9,1.6
17103,Focus,2003,495,Manual,177644,Petrol,200,41.5,1.6
17455,Fusion,2004,1500,Automatic,59300,Petrol,200,42.8,1.4


In [18]:
car['transmission'].unique()

array(['Manual', 'Automatic', 'Semi-Auto'], dtype=object)