In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import numpy as np

#https://www.kaggle.com/datasets/adhurimquku/ford-car-price-prediction

def load_car_data():
    csv_path = os.path.join("Data", "ford.csv")
    return pd.read_csv(csv_path)

car = load_car_data()
car.head()

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
0,Fiesta,2017,12000,Automatic,15944,Petrol,150,57.7,1.0
1,Focus,2018,14000,Manual,9083,Petrol,150,57.7,1.0
2,Focus,2017,13000,Manual,12456,Petrol,150,57.7,1.0
3,Fiesta,2019,17500,Manual,10460,Petrol,145,40.3,1.5
4,Fiesta,2019,16500,Automatic,1482,Petrol,145,48.7,1.0


In [2]:
car.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17966 entries, 0 to 17965
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   model         17966 non-null  object 
 1   year          17966 non-null  int64  
 2   price         17966 non-null  int64  
 3   transmission  17966 non-null  object 
 4   mileage       17966 non-null  int64  
 5   fuelType      17966 non-null  object 
 6   tax           17966 non-null  int64  
 7   mpg           17966 non-null  float64
 8   engineSize    17966 non-null  float64
dtypes: float64(2), int64(4), object(3)
memory usage: 1.2+ MB


In [3]:
car.describe()

Unnamed: 0,year,price,mileage,tax,mpg,engineSize
count,17966.0,17966.0,17966.0,17966.0,17966.0,17966.0
mean,2016.86647,12279.534844,23362.608761,113.329456,57.90698,1.350807
std,2.050336,4741.343657,19472.054349,62.012456,10.125696,0.432367
min,1996.0,495.0,1.0,0.0,20.8,0.0
25%,2016.0,8999.0,9987.0,30.0,52.3,1.0
50%,2017.0,11291.0,18242.5,145.0,58.9,1.2
75%,2018.0,15299.0,31060.0,145.0,65.7,1.5
max,2060.0,54995.0,177644.0,580.0,201.8,5.0


### Cleaning the Dataset

Not much has to be done in terms of cleaning, from the car.info() output we can see that no columns contain null values. The only problem with the data is that the year column contains some values that are too high. The max for year column is 2060 which is impossible. This needs to be investigated and I will likely remove any values above 2023. The price column has a minimum value of 495. This seems low but the car could have been bought used for very cheap. The mileage column has some very low values. I assume this just means the car is very new and in this context the values seem reasonable. The mpg column has a max of 201, this seems high and should be investigated as well. 

In [4]:
car = car.sort_values('year')
car.tail()

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
7315,Fiesta,2020,18200,Manual,10,Petrol,145,49.6,1.0
12600,Mustang,2020,41000,Manual,100,Petrol,145,22.8,5.0
9240,Focus,2020,22000,Manual,50,Petrol,145,48.7,1.0
347,Focus,2020,19998,Manual,4000,Diesel,145,74.3,1.5
17726,Fiesta,2060,6495,Automatic,54807,Petrol,205,42.8,1.4


In [5]:
car = car.drop(car.index[car['year'] > 2023], axis = 0)

### One Hot Encode

In [6]:
fuel_encode = pd.get_dummies(car.fuelType, prefix='fuel')
trans_encode = pd.get_dummies(car.transmission, prefix='trans')
model_encode = pd.get_dummies(car.model, prefix='model')

In [7]:
car = car.join(fuel_encode)
car = car.join(trans_encode)
car = car.join(model_encode)
car = car.drop(['model', 'transmission', 'fuelType'], axis = 1)
car.head()

Unnamed: 0,year,price,mileage,tax,mpg,engineSize,fuel_Diesel,fuel_Electric,fuel_Hybrid,fuel_Other,...,model_ Mondeo,model_ Mustang,model_ Puma,model_ Ranger,model_ S-MAX,model_ Streetka,model_ Tourneo Connect,model_ Tourneo Custom,model_ Transit Tourneo,model_Focus
16878,1996,3000,50000,265,34.4,1.8,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13544,1998,2699,37000,160,41.5,1.2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17282,2000,1995,43000,160,41.5,1.3,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16899,2002,2195,108000,230,38.2,1.7,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
14367,2002,895,136784,300,36.2,1.8,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


### Standardize Features

In [8]:
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()
car_ss = pd.DataFrame(ss.fit_transform(car),columns = car.columns)
car_ss.head()

Unnamed: 0,year,price,mileage,tax,mpg,engineSize,fuel_Diesel,fuel_Electric,fuel_Hybrid,fuel_Other,...,model_ Mondeo,model_ Mustang,model_ Puma,model_ Ranger,model_ S-MAX,model_ Streetka,model_ Tourneo Connect,model_ Tourneo Custom,model_ Transit Tourneo,model_Focus
0,-10.303641,-1.957302,1.36817,2.446039,-2.321744,1.038922,-0.687153,-0.010552,-0.035016,-0.007461,...,-0.173673,-0.056418,-0.066881,-0.007461,-0.129431,-0.010552,-0.042899,-0.062094,-0.007461,-0.007461
1,-9.315948,-2.020789,0.700498,0.752728,-1.620515,-0.348788,-0.687153,-0.010552,-0.035016,-0.007461,...,-0.173673,-0.056418,-0.066881,-0.007461,-0.129431,-0.010552,-0.042899,-0.062094,-0.007461,-0.007461
2,-8.328256,-2.169276,1.008654,0.752728,-1.620515,-0.117503,-0.687153,-0.010552,-0.035016,-0.007461,...,-0.173673,-0.056418,-0.066881,-0.007461,-0.129431,-0.010552,-0.042899,-0.062094,-0.007461,-0.007461
3,-7.340564,-2.127092,4.347014,1.881602,-1.946438,0.807637,-0.687153,-0.010552,-0.035016,-0.007461,...,-0.173673,-0.056418,14.952007,-0.007461,-0.129431,-0.010552,-0.042899,-0.062094,-0.007461,-0.007461
4,-7.340564,-2.401287,5.825342,3.010476,-2.143968,1.038922,-0.687153,-0.010552,-0.035016,-0.007461,...,5.75795,-0.056418,-0.066881,-0.007461,-0.129431,-0.010552,-0.042899,-0.062094,-0.007461,-0.007461


### Train/Test Split

In [9]:
from sklearn.model_selection import train_test_split

X = car_ss.drop(['price'], axis = 1)
y = car['price'].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

In [10]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

LinearRegression()

In [17]:
thing1 = lin_reg.predict(X_test).tolist()
thing2 = y_test.tolist()

In [18]:
data = {'Prediction':thing1,
        'Actual':thing2}
df = pd.DataFrame(data)
df.head()

Unnamed: 0,Prediction,Actual
0,11504.551025,8800
1,10234.051025,8499
2,13734.551025,12000
3,16325.551025,15991
4,16347.551025,18398


In [21]:
r2_score = lin_reg.score(X_test,y_test)
print(r2_score*100,'%')

-3.7400109977841795e+22 %


In [None]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()
tree_reg.fit(X_train, y_train)