# Objective:
To predict the costs of used cars given the data collected from various sources and distributed across various locations in India.
# Features:
Name: The brand and model of the car.
Location: The location in which the car is being sold or is available for purchase.
Year: The year or edition of the model.
Kilometers_Driven: The total kilometres driven in the car by the previous owner(s) in KM.
Fuel_Type: The type of fuel used by the car.
Transmission: The type of transmission used by the car.
Owner_Type: Whether the ownership is Firsthand, Second hand or other.
Mileage: The standard mileage offered by the car company in kmpl or km/kg
Engine: The displacement volume of the engine in cc.
Power: The maximum power of the engine in bhp.
Seats: The number of seats in the car.
New_Price: The price of a new car of the same model.
Price: The price of the used car in INR Lakhs

In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
%matplotlib inline
sns.set_style('darkgrid')
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression  
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.linear_model import Lasso

In [2]:
train = pd.read_csv('train-data.csv')

In [3]:
train.head()

Unnamed: 0.1,Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price
0,0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6 km/kg,998 CC,58.16 bhp,5.0,,1.75
1,1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,,12.5
2,2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,8.61 Lakh,4.5
3,3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,,6.0
4,4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,,17.74


In [4]:
train.tail()

Unnamed: 0.1,Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price
6014,6014,Maruti Swift VDI,Delhi,2014,27365,Diesel,Manual,First,28.4 kmpl,1248 CC,74 bhp,5.0,7.88 Lakh,4.75
6015,6015,Hyundai Xcent 1.1 CRDi S,Jaipur,2015,100000,Diesel,Manual,First,24.4 kmpl,1120 CC,71 bhp,5.0,,4.0
6016,6016,Mahindra Xylo D4 BSIV,Jaipur,2012,55000,Diesel,Manual,Second,14.0 kmpl,2498 CC,112 bhp,8.0,,2.9
6017,6017,Maruti Wagon R VXI,Kolkata,2013,46000,Petrol,Manual,First,18.9 kmpl,998 CC,67.1 bhp,5.0,,2.65
6018,6018,Chevrolet Beat Diesel,Hyderabad,2011,47000,Diesel,Manual,First,25.44 kmpl,936 CC,57.6 bhp,5.0,,2.5


In [5]:
train.shape

(6019, 14)

In [6]:
train.columns

Index(['Unnamed: 0', 'Name', 'Location', 'Year', 'Kilometers_Driven',
       'Fuel_Type', 'Transmission', 'Owner_Type', 'Mileage', 'Engine', 'Power',
       'Seats', 'New_Price', 'Price'],
      dtype='object')

In [7]:
train.isnull().sum()

Unnamed: 0              0
Name                    0
Location                0
Year                    0
Kilometers_Driven       0
Fuel_Type               0
Transmission            0
Owner_Type              0
Mileage                 2
Engine                 36
Power                  36
Seats                  42
New_Price            5195
Price                   0
dtype: int64

In [8]:
import re
print(train.Mileage.str.replace('\d+\.?\d+','').value_counts())
print(train.Engine.str.replace('\d+','').value_counts())
print(train.Power.str.replace('\d+\.?\d+','').value_counts())

 kmpl     5951
 km/kg      66
Name: Mileage, dtype: int64
 CC    5983
Name: Engine, dtype: int64
 bhp        5876
null bhp     107
Name: Power, dtype: int64


  print(train.Mileage.str.replace('\d+\.?\d+','').value_counts())
  print(train.Engine.str.replace('\d+','').value_counts())
  print(train.Power.str.replace('\d+\.?\d+','').value_counts())


In [9]:
train.Name = train.Name.str.split().str[0]

In [10]:
group = train.groupby('Name').Price.describe()['50%']

In [11]:
def myfunc(car):
    if group[car]<10:
        return 1
    elif group[car]>=10 and group[car]<20:
        return 2
    elif group[car]>=20 and group[car]<50:
        return 3
    else:
        return 4

train['Car_Group'] = train.Name.apply(myfunc)

In [12]:
train['Mileage'] = train['Mileage'].str.replace(' km/kg', '')
train['Mileage'] = train['Mileage'].str.replace(' kmpl', '')
train['Mileage'] = pd.to_numeric(train['Mileage'], errors='coerce')

In [13]:
train['Engine'] = train['Engine'].str.replace(' CC', '')
train['Engine'] = pd.to_numeric(train['Engine'], errors='coerce')

In [14]:
train['Power'] = train['Power'].str.replace(' bhp', '')
train['Power'] = train['Power'].str.replace('null', '0.0')
train['Power'] = pd.to_numeric(train['Power'], errors='coerce')

In [16]:
train['Mileage'].replace(0.0,np.nan,inplace=True)
train['Engine'].replace(0.0,np.nan,inplace=True)
train['Seats'].replace(0.0,np.nan,inplace=True)

In [17]:
train.corr()

Unnamed: 0.1,Unnamed: 0,Year,Kilometers_Driven,Mileage,Engine,Power,Seats,Price,Car_Group
Unnamed: 0,1.0,0.002354,-0.008734,0.017214,-0.004171,-0.007907,-0.010247,-0.020275,-0.030434
Year,0.002354,1.0,-0.173048,0.296718,-0.052197,0.071181,0.011909,0.305327,0.015788
Kilometers_Driven,-0.008734,-0.173048,1.0,-0.064283,0.091068,0.024208,0.084209,-0.011493,0.018795
Mileage,0.017214,0.296718,-0.064283,1.0,-0.646476,-0.528542,-0.348206,-0.333263,-0.394367
Engine,-0.004171,-0.052197,0.091068,-0.646476,1.0,0.840934,0.397848,0.658354,0.644116
Power,-0.007907,0.071181,0.024208,-0.528542,0.840934,1.0,0.106954,0.759136,0.721312
Seats,-0.010247,0.011909,0.084209,-0.348206,0.397848,0.106954,1.0,0.053247,0.016032
Price,-0.020275,0.305327,-0.011493,-0.333263,0.658354,0.759136,0.053247,1.0,0.729917
Car_Group,-0.030434,0.015788,0.018795,-0.394367,0.644116,0.721312,0.016032,0.729917,1.0


In [18]:
train.var()

Unnamed: 0           3.019532e+06
Year                 1.069121e+01
Kilometers_Driven    8.330002e+09
Mileage              1.743459e+01
Engine               3.616281e+05
Power                3.075914e+03
Seats                6.496665e-01
Price                1.251695e+02
Car_Group            5.623745e-01
dtype: float64

In [19]:
train.Fuel_Type = train.Fuel_Type.astype('category')

In [20]:
train.Transmission.unique()

array(['Manual', 'Automatic'], dtype=object)

In [21]:
train.Transmission = train.Transmission.astype('category')

In [22]:
train.Owner_Type = train.Owner_Type.astype('category')

In [23]:
train.drop(['Name','New_Price','Unnamed: 0','Seats','Kilometers_Driven'],axis=1,inplace=True)

In [24]:
train.dropna(inplace=True)

In [25]:
temp_df = pd.get_dummies(train.Location,drop_first=True)
train = pd.concat([train,temp_df],axis=1)
train.drop(['Location'],axis=1,inplace=True)

In [26]:
temp_df = pd.get_dummies(train.Transmission,drop_first=True)
train = pd.concat([train,temp_df],axis=1)
train.drop(['Transmission'],axis=1,inplace=True)
temp_df = pd.get_dummies(train.Owner_Type,drop_first=True)
train = pd.concat([train,temp_df],axis=1)
train.drop(['Owner_Type'],axis=1,inplace=True)
temp_df = pd.get_dummies(train.Fuel_Type,drop_first=True)
train = pd.concat([train,temp_df],axis=1)
train.drop(['Fuel_Type'],axis=1,inplace=True)

In [27]:
train.drop(['Electric'],axis=1,inplace=True)

In [33]:
from sklearn.ensemble import RandomForestRegressor
features = list(train.columns)
features.remove('Price')
target = ['Price']
x_rf = train[features]
y_rf = train[target]
X_train, X_test, y_train, y_test = train_test_split(x_rf, y_rf, test_size=0.25, random_state=1)

In [34]:
regressor = RandomForestRegressor(n_estimators=100)
regressor.fit(X_train,y_train)
regressor.fit(X_train,y_train)
test_data_prediction = regressor.predict(X_test)
error_score = metrics.r2_score(y_test,test_data_prediction)
print("R squared error : ", error_score)

  regressor.fit(X_train,y_train)
  regressor.fit(X_train,y_train)


R squared error :  0.9152907369947201


In [35]:
regressor= LinearRegression()  
regressor.fit(X_train, y_train)  

LinearRegression()

In [36]:
y_pred= regressor.predict(X_test)

In [37]:
print('Train Score: ', regressor.score(X_train, y_train)) 

Train Score:  0.7312706723815185


In [38]:
model = Lasso(alpha=1.0)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
print('Train Score: ', model.score(X_train, y_train))  

Train Score:  0.692689580706427


# Conclusion:
Hence, accuracy of random forest regression is more, we would choose it for building the project.