In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import plotly.express as px

In [2]:
df = pd.read_csv("Car_Price_Prediction.csv")
df

Unnamed: 0,Make,Model,Year,Engine Size,Mileage,Fuel Type,Transmission,Price
0,Honda,Model B,2015,3.9,74176,Petrol,Manual,30246.207931
1,Ford,Model C,2014,1.7,94799,Electric,Automatic,22785.747684
2,BMW,Model B,2006,4.1,98385,Electric,Manual,25760.290347
3,Honda,Model B,2015,2.6,88919,Electric,Automatic,25638.003491
4,Honda,Model C,2004,3.4,138482,Petrol,Automatic,21021.386657
...,...,...,...,...,...,...,...,...
995,Toyota,Model D,2002,1.9,5445,Petrol,Manual,22765.597091
996,Honda,Model B,2020,3.1,149112,Diesel,Manual,30392.575567
997,Ford,Model C,2008,1.9,195387,Petrol,Automatic,16446.892292
998,Toyota,Model A,2003,4.4,246,Petrol,Automatic,27396.156708


In [3]:
df.sort_values(by='Year', inplace=True)
df

Unnamed: 0,Make,Model,Year,Engine Size,Mileage,Fuel Type,Transmission,Price
105,Honda,Model E,2000,2.9,49725,Electric,Automatic,20092.423460
480,Ford,Model A,2000,1.3,183176,Electric,Automatic,14986.715073
442,Ford,Model C,2000,3.9,25714,Petrol,Automatic,25517.225710
430,BMW,Model C,2000,2.4,93254,Petrol,Automatic,19288.553557
355,Honda,Model C,2000,2.1,117202,Petrol,Manual,20199.755581
...,...,...,...,...,...,...,...,...
206,Toyota,Model E,2021,3.8,158497,Electric,Automatic,32061.750020
518,Audi,Model C,2021,3.4,6163,Electric,Manual,34551.564800
477,Toyota,Model E,2021,3.2,128256,Diesel,Automatic,29764.975043
388,Audi,Model A,2021,4.2,90815,Electric,Automatic,35063.598363


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, 105 to 391
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Make          1000 non-null   object 
 1   Model         1000 non-null   object 
 2   Year          1000 non-null   int64  
 3   Engine Size   1000 non-null   float64
 4   Mileage       1000 non-null   int64  
 5   Fuel Type     1000 non-null   object 
 6   Transmission  1000 non-null   object 
 7   Price         1000 non-null   float64
dtypes: float64(2), int64(2), object(4)
memory usage: 70.3+ KB


In [5]:
from sklearn.model_selection import train_test_split

In [6]:
train_val, test_set = train_test_split(df,test_size=0.2,random_state=42)
train_set,val_set = train_test_split(train_val,test_size=0.25,random_state=42)

In [7]:
input_cols = list(train_set)[1:-1]
target_cols = 'Price'

In [8]:
train_inputs = train_set[input_cols].copy()
val_inputs = val_set[input_cols].copy()
test_inputs = test_set[input_cols].copy()
train_targets = train_set[target_cols].copy()
val_targets = val_set[target_cols].copy()
test_targets = test_set[target_cols].copy()

In [9]:
numeric_cols = train_inputs.select_dtypes(include=np.number).columns.to_list()
categorical_cols = train_inputs.select_dtypes(include='object').columns.to_list()

In [10]:
train_inputs[numeric_cols].describe()

Unnamed: 0,Year,Engine Size,Mileage
count,600.0,600.0,600.0
mean,2010.81,2.812333,99622.12
std,6.321432,1.020106,59855.217958
min,2000.0,1.0,56.0
25%,2005.0,2.0,49218.0
50%,2011.0,2.8,95538.0
75%,2016.0,3.7,154782.0
max,2021.0,4.5,199521.0


In [11]:
from sklearn.preprocessing import OneHotEncoder
encode = OneHotEncoder(sparse_output = False)

In [12]:
encode.fit(df[categorical_cols])

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [13]:
encode.categories_

[array(['Model A', 'Model B', 'Model C', 'Model D', 'Model E'],
       dtype=object),
 array(['Diesel', 'Electric', 'Petrol'], dtype=object),
 array(['Automatic', 'Manual'], dtype=object)]

In [14]:
encoded_cols = list(encode.get_feature_names_out(categorical_cols))
encoded_cols

['Model_Model A',
 'Model_Model B',
 'Model_Model C',
 'Model_Model D',
 'Model_Model E',
 'Fuel Type_Diesel',
 'Fuel Type_Electric',
 'Fuel Type_Petrol',
 'Transmission_Automatic',
 'Transmission_Manual']

In [15]:
train_inputs[encoded_cols] = encode.transform(train_inputs[categorical_cols])
val_inputs[encoded_cols] = encode.transform(val_inputs[categorical_cols])
test_inputs[encoded_cols] = encode.transform(test_inputs[categorical_cols])

In [16]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

In [17]:
scaler.fit(df[numeric_cols])

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False


In [18]:
train_inputs[numeric_cols] = scaler.transform(train_inputs[numeric_cols])
val_inputs[numeric_cols] = scaler.transform(val_inputs[numeric_cols])
test_inputs[numeric_cols] = scaler.transform(test_inputs[numeric_cols])

In [19]:
train_inputs[numeric_cols].describe()

Unnamed: 0,Year,Engine Size,Mileage
count,600.0,600.0,600.0
mean,0.514762,0.51781,0.498301
std,0.301021,0.291459,0.299559
min,0.0,0.0,0.0
25%,0.238095,0.285714,0.246043
50%,0.52381,0.514286,0.477862
75%,0.761905,0.771429,0.774362
max,1.0,1.0,0.998268


In [20]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()


In [21]:
model.fit(train_inputs[numeric_cols+encoded_cols], train_targets)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [22]:
X_train = train_inputs[numeric_cols+encoded_cols]
X_test = test_inputs[numeric_cols+encoded_cols]

In [23]:
train_preds = (model.predict(X_train))

In [24]:
(train_targets)

414    28481.884015
685    20953.472512
997    16446.892292
568    26303.696095
224    27885.186712
           ...     
464    16288.721955
658    16792.046977
146    25348.294592
663    25599.172135
112    19619.339512
Name: Price, Length: 600, dtype: float64

In [25]:
def rmse(targets, predictions):
    return np.sqrt(np.mean(np.square(targets - predictions)))

In [26]:
rmse(train_targets, train_preds)

np.float64(2012.068140543629)

In [27]:
pd.DataFrame(train_preds).describe()

Unnamed: 0,0
count,600.0
mean,25081.779925
std,4657.05239
min,12387.328528
25%,21941.49102
50%,25190.577701
75%,28332.623919
max,36434.84157


In [28]:
test_preds = model.predict(X_test)

In [29]:
pd.DataFrame(test_preds).describe()

Unnamed: 0,0
count,200.0
mean,25183.937381
std,4587.924647
min,15239.03949
25%,21767.408753
50%,24983.233893
75%,28559.245869
max,35567.23988


In [30]:
rmse(test_targets, test_preds)

np.float64(2195.301310984327)

In [31]:
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score


In [32]:
mean = mean_absolute_error(test_targets, test_preds)
print('Mean Absolute Error:', mean)
print()
score = r2_score(test_targets,test_preds)
print('Score:', score)

Mean Absolute Error: 1679.3421441756911

Score: 0.8311956260491302
