In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import max_error,mean_squared_error


In [2]:
# Read the listings and discard exact duplicate rows, keeping the last copy of each.
db = pd.read_csv('used_cars_data.csv').drop_duplicates(keep='last')

# Share of missing values per column, expressed in percent
db.isna().sum().div(len(db)).mul(100)

S.No.                 0.000000
Name                  0.000000
Location              0.000000
Year                  0.000000
Kilometers_Driven     0.000000
Fuel_Type             0.000000
Transmission          0.000000
Owner_Type            0.000000
Mileage               0.027575
Engine                0.634220
Power                 0.634220
Seats                 0.730732
New_Price            86.129877
Price                17.013650
dtype: float64

In [3]:
# Preview the first rows of the de-duplicated frame before any cleaning.
db.head()

Unnamed: 0,S.No.,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price
0,0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6 km/kg,998 CC,58.16 bhp,5.0,,1.75
1,1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,,12.5
2,2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,8.61 Lakh,4.5
3,3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,,6.0
4,4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,,17.74


In [4]:
def cvt_to_float(n):
    """Convert a unit-suffixed string such as '19.67 kmpl' to a float.

    Recognised suffixes are 'kmpl', 'km/kg', 'CC', 'bhp' and 'Lakh'; the
    numeric text in front of the suffix is parsed with ``float``.

    Parameters
    ----------
    n : str, number or None
        A raw cell value.

    Returns
    -------
    float or None
        * the parsed number for a string ending in a recognised suffix;
        * ``float(n)`` for a value that is already numeric, so applying the
          function twice to the same column is harmless;
        * ``None`` for NaN/None, for strings starting with 'null'
          (e.g. 'null bhp') and for strings with no recognised suffix.

    Raises
    ------
    ValueError
        If a recognised suffix is preceded by text that is not a number.

    NOTE(review): strings carrying any other unit suffix return None rather
    than a number -- confirm no such values occur in the converted columns.
    """
    if not isinstance(n, str):
        # Already-converted numbers pass through unchanged; anything that
        # cannot be read as a number (None, containers, ...) is missing.
        try:
            value = float(n)
        except (TypeError, ValueError):
            return None
        # NaN is the only float that differs from itself.
        return None if value != value else value

    text = n.strip()
    # The 'null' placeholder must be tested before the suffixes, otherwise
    # 'null kmpl' / 'null CC' would reach float() and raise.
    if text.startswith('null'):
        return None

    for unit in ('kmpl', 'km/kg', 'CC', 'bhp', 'Lakh'):
        if text.endswith(unit):
            return float(text[:-len(unit)])
    return None
    

# Strip the unit suffixes (kmpl, km/kg, CC, bhp, Lakh) so these columns hold floats
for unit_col in ('Mileage', 'Engine', 'Power', 'New_Price'):
    db[unit_col] = db[unit_col].apply(cvt_to_float)

# Replace each text column with integer labels; the one encoder is refitted per column
le = LabelEncoder()
for text_col in ('Name', 'Location', 'Fuel_Type', 'Transmission', 'Owner_Type'):
    db[text_col] = le.fit_transform(db[text_col])

# Keep only rows with no missing value in any column, then preview the result
db = db.dropna()
db.head()


Unnamed: 0,S.No.,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price
2,2,529,2,2011,46000,4,1,0,18.2,1199.0,88.7,5.0,8.61,4.5
7,7,1933,9,2016,36000,1,0,0,11.36,2755.0,171.5,8.0,21.0,17.5
10,10,1139,7,2018,25692,4,1,0,21.56,1462.0,103.25,5.0,10.65,9.95
15,15,1489,4,2014,110000,1,1,0,13.5,2477.0,175.56,7.0,32.01,15.0
20,20,76,7,2014,32982,1,0,0,22.69,1995.0,190.0,5.0,47.87,18.55


In [5]:
# Column dtypes and non-null counts after unit parsing, label encoding and dropna.
db.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 806 entries, 2 to 6014
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   S.No.              806 non-null    int64  
 1   Name               806 non-null    int64  
 2   Location           806 non-null    int64  
 3   Year               806 non-null    int64  
 4   Kilometers_Driven  806 non-null    int64  
 5   Fuel_Type          806 non-null    int64  
 6   Transmission       806 non-null    int64  
 7   Owner_Type         806 non-null    int64  
 8   Mileage            806 non-null    float64
 9   Engine             806 non-null    float64
 10  Power              806 non-null    float64
 11  Seats              806 non-null    float64
 12  New_Price          806 non-null    float64
 13  Price              806 non-null    float64
dtypes: float64(6), int64(8)
memory usage: 94.5 KB


In [6]:
# Target is the Price column; the features are every other column except the S.No. row id.
Y = db['Price']
X = db.drop(columns=['Price', 'S.No.'])

# Hold out a third of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.33,
    random_state=42,
)
x_train.head()

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price
2665,1062,7,2018,46805,4,1,0,24.07,998.0,67.1,5.0,4.09
5715,495,7,2019,22321,4,0,0,18.0,1497.0,117.3,5.0,13.4
4147,1146,7,2017,54758,1,1,0,28.4,1248.0,73.75,5.0,8.58
4780,566,1,2016,93000,1,1,0,20.5,1582.0,126.2,5.0,19.18
111,1151,3,2017,38053,4,1,0,15.1,1196.0,73.0,5.0,5.13


In [7]:
# Column dtypes and non-null counts of the training features.
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 540 entries, 2665 to 746
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Name               540 non-null    int64  
 1   Location           540 non-null    int64  
 2   Year               540 non-null    int64  
 3   Kilometers_Driven  540 non-null    int64  
 4   Fuel_Type          540 non-null    int64  
 5   Transmission       540 non-null    int64  
 6   Owner_Type         540 non-null    int64  
 7   Mileage            540 non-null    float64
 8   Engine             540 non-null    float64
 9   Power              540 non-null    float64
 10  Seats              540 non-null    float64
 11  New_Price          540 non-null    float64
dtypes: float64(5), int64(7)
memory usage: 54.8 KB


In [8]:
# Fit a linear regression on the training split, then predict the held-out rows.
model = LinearRegression().fit(x_train, y_train)
y_pred = model.predict(x_test)

In [9]:
# Score the fitted model on the held-out split (R^2 for a scikit-learn regressor).
model.score(x_test , y_test)

0.9169702501315686

In [10]:
# Largest single residual and root-mean-square residual on the held-out split.
worst_residual = max_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred) ** 0.5

print("Max Error", worst_residual)
print("Root Mean Square Error", rmse)

Max Error 19.796305391737132
Root Mean Square Error 3.8588445401969773


In [11]:
# Report the fitted intercept, then one coefficient per feature in column order.
print("Intercept of the linear equation:", model.intercept_)
for feature, weight in zip(x_train.columns, model.coef_):
    print("The coefficient for {} is {}".format(feature, weight))

Intercept of the linear equation: -2337.71818997019
The coefficient for Name is 0.002018936426830757
The coefficient for Location is -0.052613210148336594
The coefficient for Year is 1.162874962818607
The coefficient for Kilometers_Driven is -4.044962040113814e-05
The coefficient for Fuel_Type is -0.1759859894787871
The coefficient for Transmission is 1.2235463766127268
The coefficient for Owner_Type is -0.22019509253300487
The coefficient for Mileage is -0.20978014706492681
The coefficient for Engine is 0.0017564623914412237
The coefficient for Power is -0.01527567818525839
The coefficient for Seats is -0.5615998363960318
The coefficient for New_Price is 0.5546999840651158
