# Creating a Linear Regression Model


The objective of this project is to create a linear regression model to predict the price of used cars.

In [163]:
from sklearn.linear_model import LinearRegression
import pandas as pd
import numpy as np
import re

In [164]:
data = pd.read_csv('data/cars.csv')
display(data.head())

categorical_data = data.select_dtypes(include=['object'])
numerical_data = data.select_dtypes(include=['int64', 'float64'])

print(f"Categorical data: {categorical_data.columns.tolist()}\nNumerical data: {numerical_data.columns.tolist()}")

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,190Nm@ 2000rpm,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,250Nm@ 1500-2500rpm,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,"12.7@ 2,700(kgm@ rpm)",5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp,22.4 kgm at 1750-2750rpm,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp,"11.5@ 4,500(kgm@ rpm)",5.0


Categorical data: ['name', 'fuel', 'seller_type', 'transmission', 'owner', 'mileage', 'engine', 'max_power', 'torque']
Numerical data: ['year', 'selling_price', 'km_driven', 'seats']


### Converting units

Some categorical columns can be easily converted to numerical columns, like mileage '23.4 kmpl' can be converted to 23.4.
But looking at the data, we can see that the mileage has two different units, 'kmpl' and 'km/kg', so we need to convert km/kg to kmpl.

1Kg of petrol weighs roughly 0.740 kg, so we need to multiply the mileage by 1/0.740 to get the correct value.

In [165]:
mileage_units = data['mileage'].str[-5:].unique()
engine_size_units = data['engine'].str[-3:].unique()
max_power_units = data['max_power'].str[-3:].unique()

print(mileage_units.tolist())
print(engine_size_units.tolist())
print(max_power_units.tolist())

[' kmpl', 'km/kg', nan]
[' CC', nan]
['bhp', nan, '0']


In [166]:
# Converting km/kg to kmpl
corrected_mileage = list()

for i in data['mileage']:
    if str(i).endswith('km/kg'):
        mileage_whithout_unit = float(i.replace('km/kg', ''))
        mileage_in_kmpl = mileage_whithout_unit * 1/0.740
        corrected_mileage.append(mileage_in_kmpl)
    elif str(i).endswith('kmpl'):
        mileage_whithout_unit = float(i.replace('kmpl', ''))
        corrected_mileage.append(mileage_whithout_unit)
    else:
        corrected_mileage.append(i)

data['mileage (Km/l)'] = corrected_mileage
data['mileage (Km/l)'].replace(0, np.nan, inplace=True)
data.drop('mileage', axis=1, inplace=True)

# Removing CC from engine
data['engine_size (CC)'] = data['engine'].str.replace('CC', '').astype(float)
data['engine_size (CC)'].replace(0, np.nan, inplace=True)
data.drop('engine', axis=1, inplace=True)


# Removing bhp from max_power
corrected_max_power = list()
for i in data['max_power']:
    if str(i) == ' bhp':
        corrected_max_power.append(np.nan)
    elif str(i).endswith('bhp'):
        max_power_whithout_unit = float(i.replace('bhp', ''))
        corrected_max_power.append(max_power_whithout_unit)
    else:
        corrected_max_power.append(i)

data['max_power (bhp)'] = corrected_max_power
data['max_power (bhp)'].replace(0, np.nan, inplace=True)
data.drop('max_power', axis=1, inplace=True)


data.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,torque,seats,mileage (Km/l),engine_size (CC),max_power (bhp)
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,190Nm@ 2000rpm,5.0,23.4,1248.0,74.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,250Nm@ 1500-2500rpm,5.0,21.14,1498.0,103.52
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,"12.7@ 2,700(kgm@ rpm)",5.0,17.7,1497.0,78.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,22.4 kgm at 1750-2750rpm,5.0,23.0,1396.0,90.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,"11.5@ 4,500(kgm@ rpm)",5.0,16.1,1298.0,88.2


In [167]:
data.torque.unique()

array(['190Nm@ 2000rpm', '250Nm@ 1500-2500rpm', '12.7@ 2,700(kgm@ rpm)',
       '22.4 kgm at 1750-2750rpm', '11.5@ 4,500(kgm@ rpm)',
       '113.75nm@ 4000rpm', '7.8@ 4,500(kgm@ rpm)', '59Nm@ 2500rpm',
       '170Nm@ 1800-2400rpm', '160Nm@ 2000rpm', '248Nm@ 2250rpm',
       '78Nm@ 4500rpm', nan, '84Nm@ 3500rpm', '115Nm@ 3500-3600rpm',
       '200Nm@ 1750rpm', '62Nm@ 3000rpm', '219.7Nm@ 1500-2750rpm',
       '114Nm@ 3500rpm', '115Nm@ 4000rpm', '69Nm@ 3500rpm',
       '172.5Nm@ 1750rpm', '6.1kgm@ 3000rpm', '114.7Nm@ 4000rpm',
       '60Nm@ 3500rpm', '90Nm@ 3500rpm', '151Nm@ 4850rpm',
       '104Nm@ 4000rpm', '320Nm@ 1700-2700rpm', '250Nm@ 1750-2500rpm',
       '145Nm@ 4600rpm', '146Nm@ 4800rpm', '343Nm@ 1400-3400rpm',
       '200Nm@ 1400-3400rpm', '200Nm@ 1250-4000rpm',
       '400Nm@ 2000-2500rpm', '138Nm@ 4400rpm', '360Nm@ 1200-3400rpm',
       '200Nm@ 1200-3600rpm', '380Nm@ 1750-2500rpm', '173Nm@ 4000rpm',
       '400Nm@ 1750-3000rpm', '400Nm@ 1400-2800rpm',
       '200Nm@ 1750-3000rp

As we can see, the torque column has a lot of different patterns, we can filter torque with a regex pattern, like:

Nm regex:
`(\d+[\.\,]?\d*)[Nn]m`

Kgm regex:
`(\d+[\.\,]?\d*)[Kk]gm`

Different patterns:
`\([Kk][Gg][Mm]@\srpm\)` -> torque will be the first `(\d+[\.\,]?\d*)` match
`\([Nn][Mm]@\srpm\)` -> torque will be the first `(\d+[\.\,]?\d*)` match

In [168]:
def kmg_to_nm(x: float):
    return round(x * 9.80665, 1)

def extract_first_number(x: str):
    return float(re.match(r'(\d+[\.\,]?\d*)', x)[0])

corrected_torque = list()
for torque in data.torque:
    if isinstance(torque, float):
        corrected_torque.append(torque)

    elif re.search(r'\([Kk][Gg][Mm]@\srpm\)', torque):
        torque_kgm = extract_first_number(torque)
        torque_nm = kmg_to_nm(torque_kgm)
        corrected_torque.append(torque_nm)

    elif re.search(r'\([Nn][Mm]@\srpm\)', torque):
        torque_nm = extract_first_number(torque)
        corrected_torque.append(torque_nm)

    elif re.search(r'(\d+[\.\,]?\d*)\s?[Nn]m', torque):
        torque_nm = extract_first_number(torque)
        corrected_torque.append(torque_nm)

    elif re.search(r'(\d+[\.\,]?\d*)\s?[Kk][Gg][Mm]', torque):
        torque_kgm = extract_first_number(torque)
        torque_nm = kmg_to_nm(torque_kgm)
        corrected_torque.append(torque_nm)

    elif re.search(r'(\d+[\.\,]?\d*)@', torque):
        torque_nm = extract_first_number(torque)
        corrected_torque.append(torque_nm)

    else:
        torque_nm = extract_first_number(torque)
        corrected_torque.append(torque_nm)

data['torque (Nm)'] = corrected_torque
data['torque (Nm)'].astype(float)
data['torque (Nm)'].replace(0, np.nan, inplace=True)
data.drop('torque', axis=1, inplace=True)
data.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,seats,mileage (Km/l),engine_size (CC),max_power (bhp),torque (Nm)
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,5.0,23.4,1248.0,74.0,190.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,5.0,21.14,1498.0,103.52,250.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,5.0,17.7,1497.0,78.0,124.5
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,5.0,23.0,1396.0,90.0,219.7
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,5.0,16.1,1298.0,88.2,112.8


In [169]:
# Handling empty values
data.isna().sum()

name                  0
year                  0
selling_price         0
km_driven             0
fuel                  0
seller_type           0
transmission          0
owner                 0
seats               221
mileage (Km/l)      238
engine_size (CC)    221
max_power (bhp)     216
torque (Nm)         222
dtype: int64

In [177]:
data.describe().round(2)

Unnamed: 0,year,selling_price,km_driven,seats,mileage (Km/l),engine_size (CC),torque (Nm)
count,8128.0,8128.0,8128.0,7907.0,7890.0,7907.0,7906.0
mean,2013.8,638271.81,69819.51,5.42,19.55,1458.63,180.6
std,4.04,806253.4,56550.55,0.96,4.11,503.92,109.25
min,1983.0,29999.0,1.0,2.0,9.0,624.0,47.1
25%,2011.0,254999.0,35000.0,5.0,16.8,1197.0,111.8
50%,2015.0,450000.0,60000.0,5.0,19.33,1248.0,170.0
75%,2017.0,675000.0,98000.0,5.0,22.36,1582.0,209.0
max,2020.0,10000000.0,2360457.0,14.0,45.19,3604.0,1863.3


In [179]:
data

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,seats,mileage (Km/l),engine_size (CC),max_power (bhp),torque (Nm)
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,5.0,23.40,1248.0,74.0,190.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,5.0,21.14,1498.0,103.52,250.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,5.0,17.70,1497.0,78.0,124.5
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,5.0,23.00,1396.0,90.0,219.7
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,5.0,16.10,1298.0,88.2,112.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai i20 Magna,2013,320000,110000,Petrol,Individual,Manual,First Owner,5.0,18.50,1197.0,82.85,113.7
8124,Hyundai Verna CRDi SX,2007,135000,119000,Diesel,Individual,Manual,Fourth & Above Owner,5.0,16.80,1493.0,110.0,235.4
8125,Maruti Swift Dzire ZDi,2009,382000,120000,Diesel,Individual,Manual,First Owner,5.0,19.30,1248.0,73.9,190.0
8126,Tata Indigo CR4,2013,290000,25000,Diesel,Individual,Manual,First Owner,5.0,23.57,1396.0,70.0,140.0
