In [76]:
import pandas as pd
import numpy as np

In [77]:
dataset = pd.read_csv('car_price_prediction_.csv')

In [78]:
# Drop the columns that are not in use 
dataset = dataset.drop(['Car ID','Brand','Condition','Model'],axis=1)

In [79]:
x = dataset.iloc[:,:-1].values
y = dataset.iloc[:,-1:]

In [80]:
print(x)

[[2016 2.3 'Petrol' 'Manual' 114832]
 [2018 4.4 'Electric' 'Manual' 143190]
 [2013 4.5 'Electric' 'Manual' 181601]
 ...
 [2021 1.1 'Hybrid' 'Manual' 272827]
 [2002 4.5 'Diesel' 'Manual' 229164]
 [2005 4.6 'Diesel' 'Automatic' 80978]]


In [81]:
print(y)

         Price
0     26613.92
1     14679.61
2     44402.61
3     86374.33
4     73577.10
...        ...
2495  61384.10
2496  24710.35
2497  29902.45
2498  46085.67
2499  16594.14

[2500 rows x 1 columns]


In [82]:
# Encoding the independent variable 
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder',OneHotEncoder(),[2,3])],remainder='passthrough')
# ct = ColumnTransformer(transformers = [('encoder',OneHotEncoder(),[0])],remainder='passthrough')
x =np.array(ct.fit_transform(x))

In [83]:
print(x)

[[0.0 0.0 0.0 ... 2016 2.3 114832]
 [0.0 1.0 0.0 ... 2018 4.4 143190]
 [0.0 1.0 0.0 ... 2013 4.5 181601]
 ...
 [0.0 0.0 1.0 ... 2021 1.1 272827]
 [1.0 0.0 0.0 ... 2002 4.5 229164]
 [1.0 0.0 0.0 ... 2005 4.6 80978]]


In [84]:
dataset

Unnamed: 0,Year,Engine Size,Fuel Type,Transmission,Mileage,Price
0,2016,2.3,Petrol,Manual,114832,26613.92
1,2018,4.4,Electric,Manual,143190,14679.61
2,2013,4.5,Electric,Manual,181601,44402.61
3,2011,4.1,Diesel,Automatic,68682,86374.33
4,2009,2.6,Diesel,Manual,223009,73577.10
...,...,...,...,...,...,...
2495,2020,2.4,Petrol,Automatic,22650,61384.10
2496,2001,5.7,Hybrid,Manual,77701,24710.35
2497,2021,1.1,Hybrid,Manual,272827,29902.45
2498,2002,4.5,Diesel,Manual,229164,46085.67


In [85]:
# Splitting the dataset into training and testing data 
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [86]:
print(x_train)

[[0.0 1.0 0.0 ... 2005 3.4 73401]
 [1.0 0.0 0.0 ... 2004 1.6 252234]
 [1.0 0.0 0.0 ... 2002 5.8 89882]
 ...
 [1.0 0.0 0.0 ... 2014 3.2 80636]
 [0.0 0.0 0.0 ... 2011 4.8 6336]
 [1.0 0.0 0.0 ... 2003 4.4 212139]]


In [87]:
print(x_test)

[[1.0 0.0 0.0 ... 2010 3.0 211864]
 [0.0 0.0 1.0 ... 2020 4.4 18781]
 [1.0 0.0 0.0 ... 2013 4.3 55547]
 ...
 [0.0 1.0 0.0 ... 2010 3.8 201264]
 [0.0 0.0 0.0 ... 2018 4.5 58480]
 [0.0 1.0 0.0 ... 2009 3.4 253301]]


In [88]:
print(y_train)

         Price
2055  97600.01
1961   9212.70
1864  89909.81
2326  38235.97
461   77675.22
...        ...
1638  73142.61
1095  82138.86
1130  74003.92
1294  14457.06
860   34382.84

[2000 rows x 1 columns]


In [89]:
print(y_test)

         Price
1447  17494.90
1114  75919.94
1064  87474.10
2287  13522.58
1537  77070.57
...        ...
2375  18249.22
1609  94121.24
596   72013.84
84    84585.18
2213  16960.31

[500 rows x 1 columns]


In [90]:
# Frature scaling 
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train[:,-1:]=sc.fit_transform(x_train[:,-1:])
x_test[:,-1:] = sc.transform(x_test[:,-1:])

In [91]:
print(x_train)

[[0.0 1.0 0.0 ... 2005 3.4 -0.8831454283603284]
 [1.0 0.0 0.0 ... 2004 1.6 1.1533627290778816]
 [1.0 0.0 0.0 ... 2002 5.8 -0.6954636753128531]
 ...
 [1.0 0.0 0.0 ... 2014 3.2 -0.8007549494271032]
 [0.0 0.0 0.0 ... 2011 4.8 -1.6468658802824767]
 [1.0 0.0 0.0 ... 2003 4.4 0.6967702960678386]]


In [92]:
print(x_test)

[[1.0 0.0 0.0 ... 2010 3.0 0.6936386607248342]
 [0.0 0.0 1.0 ... 2020 4.4 -1.5051451463054226]
 [1.0 0.0 0.0 ... 2013 4.3 -1.0864625825930583]
 ...
 [0.0 1.0 0.0 ... 2010 3.8 0.5729283529581187]
 [0.0 0.0 0.0 ... 2018 4.5 -1.0530622681893058]
 [0.0 1.0 0.0 ... 2009 3.4 1.1655134742087387]]


In [93]:
from sklearn.linear_model import LinearRegression

In [94]:
model = LinearRegression()

In [95]:
model.fit(x_train,y_train)

In [98]:
y_pred = model.predict(x_test)

In [99]:
# Evaluate the model 
from sklearn.metrics import mean_squared_error,r2_score

In [102]:
mse = mean_squared_error(y_test,y_pred)
r2 = r2_score(y_test,y_pred)

In [103]:
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

# Print the coefficients (Beta values) and intercept
print(f'Intercept (Beta 0): {model.intercept_}')
print(f'Coefficients (Beta values): {model.coef_}')

Mean Squared Error: 762714235.4568865
R-squared: -0.006812593850535453
Intercept (Beta 0): [3.35608682e+17]
Coefficients (Beta values): [[-3.32240856e+17 -3.32240856e+17 -3.32240856e+17 -3.32240856e+17
  -3.36782589e+15 -3.36782589e+15 -1.76000000e+02 -1.44000000e+02
  -9.77500000e+01]]


In [105]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)
print(f'Mean Squared Error (MSE): {mse}')
print(f'Root Mean Squared Error (RMSE): {rmse}')
print(f'R-squared: {r2}')

Mean Squared Error (MSE): 762714235.4568865
Root Mean Squared Error (RMSE): 27617.28146391108
R-squared: -0.006812593850535453
