In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor

In [2]:
df = pd.read_csv("train-data.csv")

In [3]:
df

Unnamed: 0.1,Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
0,0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6 km/kg,998 CC,58.16 bhp,5.0,1.75
1,1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,12.50
2,2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,4.50
3,3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,6.00
4,4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,17.74
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6014,6014,Maruti Swift VDI,Delhi,2014,27365,Diesel,Manual,First,28.4 kmpl,1248 CC,74 bhp,5.0,4.75
6015,6015,Hyundai Xcent 1.1 CRDi S,Jaipur,2015,100000,Diesel,Manual,First,24.4 kmpl,1120 CC,71 bhp,5.0,4.00
6016,6016,Mahindra Xylo D4 BSIV,Jaipur,2012,55000,Diesel,Manual,Second,14.0 kmpl,2498 CC,112 bhp,8.0,2.90
6017,6017,Maruti Wagon R VXI,Kolkata,2013,46000,Petrol,Manual,First,18.9 kmpl,998 CC,67.1 bhp,5.0,2.65


In [4]:
df.drop(["Unnamed: 0"], axis=1, inplace=True)

In [5]:
df

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6 km/kg,998 CC,58.16 bhp,5.0,1.75
1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,12.50
2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,4.50
3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,6.00
4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,17.74
...,...,...,...,...,...,...,...,...,...,...,...,...
6014,Maruti Swift VDI,Delhi,2014,27365,Diesel,Manual,First,28.4 kmpl,1248 CC,74 bhp,5.0,4.75
6015,Hyundai Xcent 1.1 CRDi S,Jaipur,2015,100000,Diesel,Manual,First,24.4 kmpl,1120 CC,71 bhp,5.0,4.00
6016,Mahindra Xylo D4 BSIV,Jaipur,2012,55000,Diesel,Manual,Second,14.0 kmpl,2498 CC,112 bhp,8.0,2.90
6017,Maruti Wagon R VXI,Kolkata,2013,46000,Petrol,Manual,First,18.9 kmpl,998 CC,67.1 bhp,5.0,2.65


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6019 entries, 0 to 6018
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Name               6019 non-null   object 
 1   Location           6019 non-null   object 
 2   Year               6019 non-null   int64  
 3   Kilometers_Driven  6019 non-null   int64  
 4   Fuel_Type          6019 non-null   object 
 5   Transmission       6019 non-null   object 
 6   Owner_Type         6019 non-null   object 
 7   Mileage            6017 non-null   object 
 8   Engine             5983 non-null   object 
 9   Power              5983 non-null   object 
 10  Seats              5977 non-null   float64
 11  Price              6019 non-null   float64
dtypes: float64(2), int64(2), object(8)
memory usage: 564.4+ KB


In [7]:
df.describe()

Unnamed: 0,Year,Kilometers_Driven,Seats,Price
count,6019.0,6019.0,5977.0,6019.0
mean,2013.358199,58738.38,5.278735,9.479468
std,3.269742,91268.84,0.80884,11.187917
min,1998.0,171.0,0.0,0.44
25%,2011.0,34000.0,5.0,3.5
50%,2014.0,53000.0,5.0,5.64
75%,2016.0,73000.0,5.0,9.95
max,2019.0,6500000.0,10.0,160.0


In [8]:
df.isna().sum()

Name                  0
Location              0
Year                  0
Kilometers_Driven     0
Fuel_Type             0
Transmission          0
Owner_Type            0
Mileage               2
Engine               36
Power                36
Seats                42
Price                 0
dtype: int64

In [9]:
df["Mileage"].unique()

array(['26.6 km/kg', '19.67 kmpl', '18.2 kmpl', '20.77 kmpl', '15.2 kmpl',
       '21.1 km/kg', '23.08 kmpl', '11.36 kmpl', '20.54 kmpl',
       '22.3 kmpl', '21.56 kmpl', '16.8 kmpl', '25.2 kmpl', '12.7 kmpl',
       '0.0 kmpl', '13.5 kmpl', '25.8 kmpl', '28.4 kmpl', '20.45 kmpl',
       '14.84 kmpl', '22.69 kmpl', '23.65 kmpl', '13.53 kmpl',
       '18.5 kmpl', '14.4 kmpl', '20.92 kmpl', '17.5 kmpl', '12.8 kmpl',
       '19.01 kmpl', '14.53 kmpl', '11.18 kmpl', '12.4 kmpl',
       '16.09 kmpl', '14.0 kmpl', '24.3 kmpl', '18.15 kmpl', '11.74 kmpl',
       '22.07 kmpl', '19.7 kmpl', '25.4 kmpl', '25.32 kmpl', '14.62 kmpl',
       '14.28 kmpl', '14.9 kmpl', '11.25 kmpl', '24.4 kmpl', '16.55 kmpl',
       '17.11 kmpl', '22.9 kmpl', '17.8 kmpl', '18.9 kmpl', '15.04 kmpl',
       '25.17 kmpl', '20.36 kmpl', '13.29 kmpl', '13.68 kmpl',
       '20.0 kmpl', '15.8 kmpl', '25.0 kmpl', '16.4 kmpl', '24.52 kmpl',
       '22.1 kmpl', '8.5 kmpl', '15.1 kmpl', '16.95 kmpl', '19.64 kmpl',
       '16.

In [10]:
df["Fuel_Type"].unique()

array(['CNG', 'Diesel', 'Petrol', 'LPG', 'Electric'], dtype=object)

In [11]:
density = {
    "CNG": 0.5,
    "Diesel": 0.85,
    "Petrol": 0.7,
    "LPG": 0.6
}

In [12]:
df["Mileage"] = df["Mileage"].fillna("0 kmpl")

In [13]:
df["Mileage"].isna().sum()

0

In [14]:
df[["mileage", "unit"]] = df["Mileage"].str.split(expand=True)

In [15]:
df["mileage"] = df["mileage"].astype("float")

In [16]:
def apply_density(df):
    if df["unit"] == "km/kg":
        return df["mileage"] * density[df["Fuel_Type"]]
    return df["mileage"]

In [17]:
df["mileage"] = df.apply(apply_density, axis=1)

In [18]:
df

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price,mileage,unit
0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6 km/kg,998 CC,58.16 bhp,5.0,1.75,13.30,km/kg
1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,12.50,19.67,kmpl
2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,4.50,18.20,kmpl
3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,6.00,20.77,kmpl
4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,17.74,15.20,kmpl
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6014,Maruti Swift VDI,Delhi,2014,27365,Diesel,Manual,First,28.4 kmpl,1248 CC,74 bhp,5.0,4.75,28.40,kmpl
6015,Hyundai Xcent 1.1 CRDi S,Jaipur,2015,100000,Diesel,Manual,First,24.4 kmpl,1120 CC,71 bhp,5.0,4.00,24.40,kmpl
6016,Mahindra Xylo D4 BSIV,Jaipur,2012,55000,Diesel,Manual,Second,14.0 kmpl,2498 CC,112 bhp,8.0,2.90,14.00,kmpl
6017,Maruti Wagon R VXI,Kolkata,2013,46000,Petrol,Manual,First,18.9 kmpl,998 CC,67.1 bhp,5.0,2.65,18.90,kmpl


In [19]:
df["Power"].unique()

array(['58.16 bhp', '126.2 bhp', '88.7 bhp', '88.76 bhp', '140.8 bhp',
       '55.2 bhp', '63.1 bhp', '171.5 bhp', '103.6 bhp', '74 bhp',
       '103.25 bhp', '116.3 bhp', '187.7 bhp', '115 bhp', '175.56 bhp',
       '98.6 bhp', '83.8 bhp', '167.62 bhp', '190 bhp', '88.5 bhp',
       '177.01 bhp', '80 bhp', '67.1 bhp', '102 bhp', '108.45 bhp',
       '138.1 bhp', '184 bhp', '179.5 bhp', '103.5 bhp', '64 bhp',
       '82 bhp', '254.8 bhp', '73.9 bhp', '46.3 bhp', '37.5 bhp',
       '77 bhp', '82.9 bhp', '149.92 bhp', '138.03 bhp', '112.2 bhp',
       '163.7 bhp', '71 bhp', '105 bhp', '174.33 bhp', '75 bhp',
       '103.2 bhp', '53.3 bhp', '78.9 bhp', '147.6 bhp', '147.8 bhp',
       '68 bhp', '186 bhp', '170 bhp', '69 bhp', '140 bhp', '78 bhp',
       '194 bhp', '500 bhp', '108.5 bhp', '86.8 bhp', '187.74 bhp',
       'null bhp', '132 bhp', '86.7 bhp', '73.94 bhp', '117.3 bhp',
       '218 bhp', '168.5 bhp', '89.84 bhp', '110 bhp', '90 bhp',
       '82.85 bhp', '67 bhp', '241.4 bhp', '3

In [20]:
df["Power"].isna().sum()

36

In [21]:
df["Power"].unique()

array(['58.16 bhp', '126.2 bhp', '88.7 bhp', '88.76 bhp', '140.8 bhp',
       '55.2 bhp', '63.1 bhp', '171.5 bhp', '103.6 bhp', '74 bhp',
       '103.25 bhp', '116.3 bhp', '187.7 bhp', '115 bhp', '175.56 bhp',
       '98.6 bhp', '83.8 bhp', '167.62 bhp', '190 bhp', '88.5 bhp',
       '177.01 bhp', '80 bhp', '67.1 bhp', '102 bhp', '108.45 bhp',
       '138.1 bhp', '184 bhp', '179.5 bhp', '103.5 bhp', '64 bhp',
       '82 bhp', '254.8 bhp', '73.9 bhp', '46.3 bhp', '37.5 bhp',
       '77 bhp', '82.9 bhp', '149.92 bhp', '138.03 bhp', '112.2 bhp',
       '163.7 bhp', '71 bhp', '105 bhp', '174.33 bhp', '75 bhp',
       '103.2 bhp', '53.3 bhp', '78.9 bhp', '147.6 bhp', '147.8 bhp',
       '68 bhp', '186 bhp', '170 bhp', '69 bhp', '140 bhp', '78 bhp',
       '194 bhp', '500 bhp', '108.5 bhp', '86.8 bhp', '187.74 bhp',
       'null bhp', '132 bhp', '86.7 bhp', '73.94 bhp', '117.3 bhp',
       '218 bhp', '168.5 bhp', '89.84 bhp', '110 bhp', '90 bhp',
       '82.85 bhp', '67 bhp', '241.4 bhp', '3

In [22]:
df["Power"] = df["Power"].fillna("0 bhp")

In [23]:
df[["power", "unit"]] = df["Power"].str.split(expand=True)

In [24]:
df["power"].unique()

array(['58.16', '126.2', '88.7', '88.76', '140.8', '55.2', '63.1',
       '171.5', '103.6', '74', '103.25', '116.3', '187.7', '115',
       '175.56', '98.6', '83.8', '167.62', '190', '88.5', '177.01', '80',
       '67.1', '102', '108.45', '138.1', '184', '179.5', '103.5', '64',
       '82', '254.8', '73.9', '46.3', '37.5', '77', '82.9', '149.92',
       '138.03', '112.2', '163.7', '71', '105', '174.33', '75', '103.2',
       '53.3', '78.9', '147.6', '147.8', '68', '186', '170', '69', '140',
       '78', '194', '500', '108.5', '86.8', '187.74', 'null', '132',
       '86.7', '73.94', '117.3', '218', '168.5', '89.84', '110', '90',
       '82.85', '67', '241.4', '35', '270.9', '126.32', '73', '130',
       '100.6', '150', '75.94', '215', '107.3', '37.48', '120', '178',
       '152', '91.1', '85.80', '362.07', '121.3', '143', '81.80', '171',
       '76.8', '103.52', '444', '362.9', '67.06', '120.7', '258', '81.86',
       '112', '88.73', '57.6', '157.75', '102.5', '201.1', '83.1',
       '6

In [25]:
df["power"] = df["power"].replace("null", 0)

In [26]:
df["power"] = df["power"].astype("float")

In [27]:
df["power"] = df["power"].replace(0.0, round(df["power"].mean(), 2))

In [28]:
df["power"].unique()

array([ 58.16 , 126.2  ,  88.7  ,  88.76 , 140.8  ,  55.2  ,  63.1  ,
       171.5  , 103.6  ,  74.   , 103.25 , 116.3  , 187.7  , 115.   ,
       175.56 ,  98.6  ,  83.8  , 167.62 , 190.   ,  88.5  , 177.01 ,
        80.   ,  67.1  , 102.   , 108.45 , 138.1  , 184.   , 179.5  ,
       103.5  ,  64.   ,  82.   , 254.8  ,  73.9  ,  46.3  ,  37.5  ,
        77.   ,  82.9  , 149.92 , 138.03 , 112.2  , 163.7  ,  71.   ,
       105.   , 174.33 ,  75.   , 103.2  ,  53.3  ,  78.9  , 147.6  ,
       147.8  ,  68.   , 186.   , 170.   ,  69.   , 140.   ,  78.   ,
       194.   , 500.   , 108.5  ,  86.8  , 187.74 , 110.56 , 132.   ,
        86.7  ,  73.94 , 117.3  , 218.   , 168.5  ,  89.84 , 110.   ,
        90.   ,  82.85 ,  67.   , 241.4  ,  35.   , 270.9  , 126.32 ,
        73.   , 130.   , 100.6  , 150.   ,  75.94 , 215.   , 107.3  ,
        37.48 , 120.   , 178.   , 152.   ,  91.1  ,  85.8  , 362.07 ,
       121.3  , 143.   ,  81.8  , 171.   ,  76.8  , 103.52 , 444.   ,
       362.9  ,  67.

In [29]:
df.isna().sum()

Name                  0
Location              0
Year                  0
Kilometers_Driven     0
Fuel_Type             0
Transmission          0
Owner_Type            0
Mileage               0
Engine               36
Power                 0
Seats                42
Price                 0
mileage               0
unit                  0
power                 0
dtype: int64

In [30]:
df["Engine"].unique()

array(['998 CC', '1582 CC', '1199 CC', '1248 CC', '1968 CC', '814 CC',
       '1461 CC', '2755 CC', '1598 CC', '1462 CC', '1497 CC', '2179 CC',
       '2477 CC', '1498 CC', '2143 CC', '1995 CC', '1984 CC', '1197 CC',
       '2494 CC', '1798 CC', '2696 CC', '2698 CC', '1061 CC', '1198 CC',
       '2987 CC', '796 CC', '624 CC', '1999 CC', '1991 CC', '2694 CC',
       '1120 CC', '2498 CC', '799 CC', '2393 CC', '1399 CC', '1796 CC',
       '2148 CC', '1396 CC', '1950 CC', '4806 CC', '1998 CC', '1086 CC',
       '1193 CC', '2982 CC', '1493 CC', '2967 CC', '2993 CC', '1196 CC',
       '1799 CC', '2497 CC', '2354 CC', '1373 CC', '2996 CC', '1591 CC',
       '2894 CC', '5461 CC', '1595 CC', '936 CC', '1997 CC', nan,
       '1896 CC', '1390 CC', '1364 CC', '2199 CC', '993 CC', '999 CC',
       '1405 CC', '2956 CC', '1794 CC', '995 CC', '2496 CC', '1599 CC',
       '2400 CC', '1495 CC', '2523 CC', '793 CC', '4134 CC', '1596 CC',
       '1395 CC', '2953 CC', '1586 CC', '2362 CC', '1496 CC', '1368

In [31]:
df[["engine", "unit"]] = df["Engine"].str.split(expand=True)

In [32]:
df["engine"] = df["engine"].fillna("0")

In [33]:
df["engine"] = df["engine"].astype("int")

In [34]:
df["engine"] = df["engine"].replace(0, round(df["engine"].mean()))

In [35]:
df["engine"].unique()

array([ 998, 1582, 1199, 1248, 1968,  814, 1461, 2755, 1598, 1462, 1497,
       2179, 2477, 1498, 2143, 1995, 1984, 1197, 2494, 1798, 2696, 2698,
       1061, 1198, 2987,  796,  624, 1999, 1991, 2694, 1120, 2498,  799,
       2393, 1399, 1796, 2148, 1396, 1950, 4806, 1998, 1086, 1193, 2982,
       1493, 2967, 2993, 1196, 1799, 2497, 2354, 1373, 2996, 1591, 2894,
       5461, 1595,  936, 1997, 1612, 1896, 1390, 1364, 2199,  993,  999,
       1405, 2956, 1794,  995, 2496, 1599, 2400, 1495, 2523,  793, 4134,
       1596, 1395, 2953, 1586, 2362, 1496, 1368, 1298, 1956, 1299, 3498,
       2835, 1150, 3198, 1343, 1499, 1186, 1590, 2609, 2499, 2446, 1978,
       2360, 3436, 2198, 4367, 2706, 1422, 2979, 1969, 1489, 2489, 1242,
       1388, 1172, 2495, 1194, 3200, 1781, 1341, 2773, 3597, 1985, 2147,
       1047, 2999, 2995, 2997, 1948, 2359, 4395, 2349, 2720, 1468, 3197,
       2487, 1597, 2771,   72, 4951,  970, 2925, 2200, 5000, 2149, 5998,
       2092, 5204, 2112, 1797])

In [36]:
df = df.drop(["Mileage", "Engine", "Power", "unit"], axis=1)

In [37]:
df

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Seats,Price,mileage,power,engine
0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,5.0,1.75,13.30,58.16,998
1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,5.0,12.50,19.67,126.20,1582
2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,5.0,4.50,18.20,88.70,1199
3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,7.0,6.00,20.77,88.76,1248
4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,5.0,17.74,15.20,140.80,1968
...,...,...,...,...,...,...,...,...,...,...,...,...
6014,Maruti Swift VDI,Delhi,2014,27365,Diesel,Manual,First,5.0,4.75,28.40,74.00,1248
6015,Hyundai Xcent 1.1 CRDi S,Jaipur,2015,100000,Diesel,Manual,First,5.0,4.00,24.40,71.00,1120
6016,Mahindra Xylo D4 BSIV,Jaipur,2012,55000,Diesel,Manual,Second,8.0,2.90,14.00,112.00,2498
6017,Maruti Wagon R VXI,Kolkata,2013,46000,Petrol,Manual,First,5.0,2.65,18.90,67.10,998


In [38]:
df.isna().sum()

Name                  0
Location              0
Year                  0
Kilometers_Driven     0
Fuel_Type             0
Transmission          0
Owner_Type            0
Seats                42
Price                 0
mileage               0
power                 0
engine                0
dtype: int64

In [39]:
df = df.fillna(0)

In [40]:
df["Seats"] = df["Seats"].replace(0, round(df["Seats"].mean()))

In [41]:
df["Seats"].unique()

array([ 5.,  7.,  8.,  4.,  6.,  2., 10.,  9.])

In [42]:
df

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Seats,Price,mileage,power,engine
0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,5.0,1.75,13.30,58.16,998
1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,5.0,12.50,19.67,126.20,1582
2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,5.0,4.50,18.20,88.70,1199
3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,7.0,6.00,20.77,88.76,1248
4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,5.0,17.74,15.20,140.80,1968
...,...,...,...,...,...,...,...,...,...,...,...,...
6014,Maruti Swift VDI,Delhi,2014,27365,Diesel,Manual,First,5.0,4.75,28.40,74.00,1248
6015,Hyundai Xcent 1.1 CRDi S,Jaipur,2015,100000,Diesel,Manual,First,5.0,4.00,24.40,71.00,1120
6016,Mahindra Xylo D4 BSIV,Jaipur,2012,55000,Diesel,Manual,Second,8.0,2.90,14.00,112.00,2498
6017,Maruti Wagon R VXI,Kolkata,2013,46000,Petrol,Manual,First,5.0,2.65,18.90,67.10,998


In [43]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6019 entries, 0 to 6018
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Name               6019 non-null   object 
 1   Location           6019 non-null   object 
 2   Year               6019 non-null   int64  
 3   Kilometers_Driven  6019 non-null   int64  
 4   Fuel_Type          6019 non-null   object 
 5   Transmission       6019 non-null   object 
 6   Owner_Type         6019 non-null   object 
 7   Seats              6019 non-null   float64
 8   Price              6019 non-null   float64
 9   mileage            6019 non-null   float64
 10  power              6019 non-null   float64
 11  engine             6019 non-null   int32  
dtypes: float64(4), int32(1), int64(2), object(5)
memory usage: 540.9+ KB


In [44]:
label_encoder = LabelEncoder()

In [45]:
df["Name"] = label_encoder.fit_transform(df["Name"])
df["Location"] = label_encoder.fit_transform(df["Location"])
df["Fuel_Type"] = label_encoder.fit_transform(df["Fuel_Type"])
df["Transmission"] = label_encoder.fit_transform(df["Transmission"])
df["Owner_Type"] = label_encoder.fit_transform(df["Owner_Type"])

In [46]:
scaler = MinMaxScaler()

In [47]:
df

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Seats,Price,mileage,power,engine
0,1200,9,2010,72000,0,1,0,5.0,1.75,13.30,58.16,998
1,512,10,2015,41000,1,1,0,5.0,12.50,19.67,126.20,1582
2,486,2,2011,46000,4,1,0,5.0,4.50,18.20,88.70,1199
3,1059,2,2012,87000,1,1,0,7.0,6.00,20.77,88.76,1248
4,23,3,2013,40670,1,0,2,5.0,17.74,15.20,140.80,1968
...,...,...,...,...,...,...,...,...,...,...,...,...
6014,1159,4,2014,27365,1,1,0,5.0,4.75,28.40,74.00,1248
6015,668,6,2015,100000,1,1,0,5.0,4.00,24.40,71.00,1120
6016,932,6,2012,55000,1,1,2,8.0,2.90,14.00,112.00,2498
6017,1207,8,2013,46000,4,1,0,5.0,2.65,18.90,67.10,998


In [48]:
X = df.drop("Price", axis=1)
Y = df["Price"]

In [49]:
X = scaler.fit_transform(X)

In [50]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, shuffle=True ,random_state=101)

In [51]:
Y

0        1.75
1       12.50
2        4.50
3        6.00
4       17.74
        ...  
6014     4.75
6015     4.00
6016     2.90
6017     2.65
6018     2.50
Name: Price, Length: 6019, dtype: float64

In [52]:
model = RandomForestRegressor(n_estimators=100, random_state=101)

In [53]:
model.fit(X_train, y_train)

RandomForestRegressor(random_state=101)

In [54]:
y_pred = model.predict(X_test)

In [55]:
model.score(X_test,y_test)

0.8743139401037199

# Prediction on test file

In [56]:
test = pd.read_csv("test-data.csv")

In [57]:
test

Unnamed: 0.1,Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats
0,0,Maruti Alto K10 LXI CNG,Delhi,2014,40929,CNG,Manual,First,32.26 km/kg,998 CC,58.2 bhp,4.0
1,1,Maruti Alto 800 2016-2019 LXI,Coimbatore,2013,54493,Petrol,Manual,Second,24.7 kmpl,796 CC,47.3 bhp,5.0
2,2,Toyota Innova Crysta Touring Sport 2.4 MT,Mumbai,2017,34000,Diesel,Manual,First,13.68 kmpl,2393 CC,147.8 bhp,7.0
3,3,Toyota Etios Liva GD,Hyderabad,2012,139000,Diesel,Manual,First,23.59 kmpl,1364 CC,null bhp,5.0
4,4,Hyundai i20 Magna,Mumbai,2014,29000,Petrol,Manual,First,18.5 kmpl,1197 CC,82.85 bhp,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1229,1229,Volkswagen Vento Diesel Trendline,Hyderabad,2011,89411,Diesel,Manual,First,20.54 kmpl,1598 CC,103.6 bhp,5.0
1230,1230,Volkswagen Polo GT TSI,Mumbai,2015,59000,Petrol,Automatic,First,17.21 kmpl,1197 CC,103.6 bhp,5.0
1231,1231,Nissan Micra Diesel XV,Kolkata,2012,28000,Diesel,Manual,First,23.08 kmpl,1461 CC,63.1 bhp,5.0
1232,1232,Volkswagen Polo GT TSI,Pune,2013,52262,Petrol,Automatic,Third,17.2 kmpl,1197 CC,103.6 bhp,5.0


In [58]:
test.drop(["Unnamed: 0"], axis=1, inplace=True)

In [59]:
test["Mileage"] = test["Mileage"].fillna("0 kmpl")

In [60]:
test[["mileage", "unit"]] = test["Mileage"].str.split(expand=True)

In [61]:
test["mileage"] = test["mileage"].astype("float")

In [62]:
def apply_density(test):
    if test["unit"] == "km/kg":
        return test["mileage"] * density[test["Fuel_Type"]]
    return test["mileage"]

In [63]:
test["mileage"] = test.apply(apply_density, axis=1)

In [64]:
test["Power"] = test["Power"].fillna("0 bhp")

In [65]:
test[["power", "unit"]] = test["Power"].str.split(expand=True)

In [66]:
test["power"] = test["power"].replace("null", 0)

In [67]:
test["power"] = test["power"].astype("float")

In [68]:
test["power"] = test["power"].replace(0.0, round(test["power"].mean(), 2))

In [69]:
test[["engine", "unit"]] = test["Engine"].str.split(expand=True)

In [70]:
test["engine"] = test["engine"].fillna("0")

In [71]:
test["engine"] = test["engine"].astype("int")

In [72]:
test["engine"] = test["engine"].replace(0, round(test["engine"].mean()))

In [73]:
test = test.drop(["Mileage", "Engine", "Power", "unit"], axis=1)

In [74]:
test = test.fillna(0)

In [75]:
test["Seats"] = test["Seats"].replace(0, round(test["Seats"].mean()))

In [76]:
test

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Seats,mileage,power,engine
0,Maruti Alto K10 LXI CNG,Delhi,2014,40929,CNG,Manual,First,4.0,16.13,58.20,998
1,Maruti Alto 800 2016-2019 LXI,Coimbatore,2013,54493,Petrol,Manual,Second,5.0,24.70,47.30,796
2,Toyota Innova Crysta Touring Sport 2.4 MT,Mumbai,2017,34000,Diesel,Manual,First,7.0,13.68,147.80,2393
3,Toyota Etios Liva GD,Hyderabad,2012,139000,Diesel,Manual,First,5.0,23.59,107.52,1364
4,Hyundai i20 Magna,Mumbai,2014,29000,Petrol,Manual,First,5.0,18.50,82.85,1197
...,...,...,...,...,...,...,...,...,...,...,...
1229,Volkswagen Vento Diesel Trendline,Hyderabad,2011,89411,Diesel,Manual,First,5.0,20.54,103.60,1598
1230,Volkswagen Polo GT TSI,Mumbai,2015,59000,Petrol,Automatic,First,5.0,17.21,103.60,1197
1231,Nissan Micra Diesel XV,Kolkata,2012,28000,Diesel,Manual,First,5.0,23.08,63.10,1461
1232,Volkswagen Polo GT TSI,Pune,2013,52262,Petrol,Automatic,Third,5.0,17.20,103.60,1197


In [77]:
X = test.copy()

In [78]:
X["Name"] = label_encoder.fit_transform(X["Name"])
X["Location"] = label_encoder.fit_transform(X["Location"])
X["Fuel_Type"] = label_encoder.fit_transform(X["Fuel_Type"])
X["Transmission"] = label_encoder.fit_transform(X["Transmission"])
X["Owner_Type"] = label_encoder.fit_transform(X["Owner_Type"])

In [79]:
X = scaler.transform(X)

In [80]:
price = model.predict(X)

In [81]:
test["Price"] = price

In [82]:
test

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Seats,mileage,power,engine,Price
0,Maruti Alto K10 LXI CNG,Delhi,2014,40929,CNG,Manual,First,4.0,16.13,58.20,998,3.10110
1,Maruti Alto 800 2016-2019 LXI,Coimbatore,2013,54493,Petrol,Manual,Second,5.0,24.70,47.30,796,2.67400
2,Toyota Innova Crysta Touring Sport 2.4 MT,Mumbai,2017,34000,Diesel,Manual,First,7.0,13.68,147.80,2393,15.19810
3,Toyota Etios Liva GD,Hyderabad,2012,139000,Diesel,Manual,First,5.0,23.59,107.52,1364,4.57880
4,Hyundai i20 Magna,Mumbai,2014,29000,Petrol,Manual,First,5.0,18.50,82.85,1197,3.71615
...,...,...,...,...,...,...,...,...,...,...,...,...
1229,Volkswagen Vento Diesel Trendline,Hyderabad,2011,89411,Diesel,Manual,First,5.0,20.54,103.60,1598,4.25240
1230,Volkswagen Polo GT TSI,Mumbai,2015,59000,Petrol,Automatic,First,5.0,17.21,103.60,1197,6.43820
1231,Nissan Micra Diesel XV,Kolkata,2012,28000,Diesel,Manual,First,5.0,23.08,63.10,1461,2.75220
1232,Volkswagen Polo GT TSI,Pune,2013,52262,Petrol,Automatic,Third,5.0,17.20,103.60,1197,4.21780
