In [75]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from datetime import datetime

In [76]:
df = pd.read_csv("./CAR DETAILS FROM CAR DEKHO.csv")
df

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Maruti 800 AC,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,Datsun RediGO T Option,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,Individual,Manual,Second Owner
...,...,...,...,...,...,...,...,...
4335,Hyundai i20 Magna 1.4 CRDi (Diesel),2014,409999,80000,Diesel,Individual,Manual,Second Owner
4336,Hyundai i20 Magna 1.4 CRDi,2014,409999,80000,Diesel,Individual,Manual,Second Owner
4337,Maruti 800 AC BSIII,2009,110000,83000,Petrol,Individual,Manual,Second Owner
4338,Hyundai Creta 1.6 CRDi SX Option,2016,865000,90000,Diesel,Individual,Manual,First Owner


In [77]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4340 entries, 0 to 4339
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name           4340 non-null   object
 1   year           4340 non-null   int64 
 2   selling_price  4340 non-null   int64 
 3   km_driven      4340 non-null   int64 
 4   fuel           4340 non-null   object
 5   seller_type    4340 non-null   object
 6   transmission   4340 non-null   object
 7   owner          4340 non-null   object
dtypes: int64(3), object(5)
memory usage: 271.4+ KB


In [78]:
df.isnull().sum()

name             0
year             0
selling_price    0
km_driven        0
fuel             0
seller_type      0
transmission     0
owner            0
dtype: int64

In [79]:
df["Car_Age"] = datetime.now().year - df["year"]
df.drop(columns=["year", "name"], inplace=True)
df

Unnamed: 0,selling_price,km_driven,fuel,seller_type,transmission,owner,Car_Age
0,60000,70000,Petrol,Individual,Manual,First Owner,18
1,135000,50000,Petrol,Individual,Manual,First Owner,18
2,600000,100000,Diesel,Individual,Manual,First Owner,13
3,250000,46000,Petrol,Individual,Manual,First Owner,8
4,450000,141000,Diesel,Individual,Manual,Second Owner,11
...,...,...,...,...,...,...,...
4335,409999,80000,Diesel,Individual,Manual,Second Owner,11
4336,409999,80000,Diesel,Individual,Manual,Second Owner,11
4337,110000,83000,Petrol,Individual,Manual,Second Owner,16
4338,865000,90000,Diesel,Individual,Manual,First Owner,9


In [80]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4340 entries, 0 to 4339
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   selling_price  4340 non-null   int64 
 1   km_driven      4340 non-null   int64 
 2   fuel           4340 non-null   object
 3   seller_type    4340 non-null   object
 4   transmission   4340 non-null   object
 5   owner          4340 non-null   object
 6   Car_Age        4340 non-null   int64 
dtypes: int64(3), object(4)
memory usage: 237.5+ KB


In [81]:
df['fuel'].unique()

array(['Petrol', 'Diesel', 'CNG', 'LPG', 'Electric'], dtype=object)

In [82]:
df['seller_type'].unique()

array(['Individual', 'Dealer', 'Trustmark Dealer'], dtype=object)

In [83]:
df['transmission'].unique()

array(['Manual', 'Automatic'], dtype=object)

In [84]:
df['owner'].unique()

array(['First Owner', 'Second Owner', 'Fourth & Above Owner',
       'Third Owner', 'Test Drive Car'], dtype=object)

In [85]:
df["owner"] = LabelEncoder().fit_transform(df["owner"])
df

Unnamed: 0,selling_price,km_driven,fuel,seller_type,transmission,owner,Car_Age
0,60000,70000,Petrol,Individual,Manual,0,18
1,135000,50000,Petrol,Individual,Manual,0,18
2,600000,100000,Diesel,Individual,Manual,0,13
3,250000,46000,Petrol,Individual,Manual,0,8
4,450000,141000,Diesel,Individual,Manual,2,11
...,...,...,...,...,...,...,...
4335,409999,80000,Diesel,Individual,Manual,2,11
4336,409999,80000,Diesel,Individual,Manual,2,11
4337,110000,83000,Petrol,Individual,Manual,2,16
4338,865000,90000,Diesel,Individual,Manual,0,9


In [86]:
df = pd.get_dummies(df, columns=["fuel", "seller_type", "transmission"], drop_first=True).astype(int)
df

Unnamed: 0,selling_price,km_driven,owner,Car_Age,fuel_Diesel,fuel_Electric,fuel_LPG,fuel_Petrol,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Manual
0,60000,70000,0,18,0,0,0,1,1,0,1
1,135000,50000,0,18,0,0,0,1,1,0,1
2,600000,100000,0,13,1,0,0,0,1,0,1
3,250000,46000,0,8,0,0,0,1,1,0,1
4,450000,141000,2,11,1,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...
4335,409999,80000,2,11,1,0,0,0,1,0,1
4336,409999,80000,2,11,1,0,0,0,1,0,1
4337,110000,83000,2,16,0,0,0,1,1,0,1
4338,865000,90000,0,9,1,0,0,0,1,0,1


In [87]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4340 entries, 0 to 4339
Data columns (total 11 columns):
 #   Column                        Non-Null Count  Dtype
---  ------                        --------------  -----
 0   selling_price                 4340 non-null   int64
 1   km_driven                     4340 non-null   int64
 2   owner                         4340 non-null   int64
 3   Car_Age                       4340 non-null   int64
 4   fuel_Diesel                   4340 non-null   int64
 5   fuel_Electric                 4340 non-null   int64
 6   fuel_LPG                      4340 non-null   int64
 7   fuel_Petrol                   4340 non-null   int64
 8   seller_type_Individual        4340 non-null   int64
 9   seller_type_Trustmark Dealer  4340 non-null   int64
 10  transmission_Manual           4340 non-null   int64
dtypes: int64(11)
memory usage: 373.1 KB


In [88]:
X = df.drop(columns=["selling_price"])
y = df["selling_price"]

In [89]:
X

Unnamed: 0,km_driven,owner,Car_Age,fuel_Diesel,fuel_Electric,fuel_LPG,fuel_Petrol,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Manual
0,70000,0,18,0,0,0,1,1,0,1
1,50000,0,18,0,0,0,1,1,0,1
2,100000,0,13,1,0,0,0,1,0,1
3,46000,0,8,0,0,0,1,1,0,1
4,141000,2,11,1,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...
4335,80000,2,11,1,0,0,0,1,0,1
4336,80000,2,11,1,0,0,0,1,0,1
4337,83000,2,16,0,0,0,1,1,0,1
4338,90000,0,9,1,0,0,0,1,0,1


In [90]:
y

0        60000
1       135000
2       600000
3       250000
4       450000
         ...  
4335    409999
4336    409999
4337    110000
4338    865000
4339    225000
Name: selling_price, Length: 4340, dtype: int64

In [91]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [92]:
model = LinearRegression()
model.fit(X_train, y_train)

In [93]:
y_pred = model.predict(X_test)
y_pred

array([ 4.24067873e+05,  5.04287763e+05,  7.25279817e+04,  4.88505118e+05,
        1.53365585e+06,  2.40569237e+05,  4.46360257e+05,  2.70751512e+05,
        6.64088008e+05,  3.09359958e+05,  7.01675431e+05,  5.99617730e+05,
        5.99448450e+05,  7.92731698e+04,  7.17219824e+05,  4.50042202e+05,
        8.54594497e+05,  1.34789577e+06,  9.80858501e+04,  1.48857346e+06,
        6.61867089e+05,  7.44536903e+05,  4.59889787e+05,  7.64957359e+05,
        3.51439006e+05,  6.35847524e+05,  7.06759060e+05,  2.61035279e+05,
        4.08344652e+05,  5.27771287e+05, -4.68333228e+04,  1.44429464e+06,
        6.13900502e+05,  7.29748589e+05,  5.06329808e+05,  4.65896776e+05,
        7.80272702e+05,  4.70100015e+05,  1.05905878e+05,  4.91541494e+05,
        6.05216840e+05,  4.68568481e+05,  7.18517338e+05,  4.51602457e+05,
        4.13688365e+05,  3.67248343e+05,  3.99682474e+05,  4.81536832e+05,
        7.24116447e+05,  4.76285877e+05,  2.39653526e+05,  3.09865500e+05,
        5.71535725e+05,  

In [94]:
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [95]:
print(f"Mean Absolute Error: {mae:.2f}")
print(f"Mean Squared Error: {mse:.2f}")
print(f"R² Score: {r2:.2f}")

Mean Absolute Error: 222105.55
Mean Squared Error: 182521565071.41
R² Score: 0.40
