In [1]:
import os

import numpy as np
import pandas as pd

# preprocessing and pipeline utilities
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder,StandardScaler,MinMaxScaler

# models algorithms
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor

from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error



In [2]:
# Load the car listings; the path is relative to the notebook's working directory.
df=pd.read_csv('data/clean_car_data.csv')

In [3]:
# Preview the first rows.
# NOTE(review): the preview shows an unnamed, index-like column with gaps
# (0, 1, 3, 4, 6, ...) next to the real fields — presumably an index written by
# an earlier to_csv call; confirm, and consider dropping it when loading.
df.head()

Unnamed: 0.1,Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,0,Hyundai Santro Xing,Hyundai,2007,80000,45000,Petrol
1,1,Mahindra Jeep CL550,Mahindra,2006,425000,40,Diesel
2,3,Hyundai Grand i10,Hyundai,2014,325000,28000,Petrol
3,4,Ford EcoSport Titanium,Ford,2014,575000,36000,Diesel
4,6,Ford Figo,Ford,2012,175000,41000,Diesel


Preparing the X and y variables


In [4]:
# Predictors: three categorical columns (name, company, fuel_type) and two
# numeric ones (year, kms_driven). The target is the Price column.
X=df[['name','company','year','kms_driven','fuel_type']]
y=df['Price']

In [5]:
# Display the feature frame and the target series together as a tuple.
X,y

(                       name   company  year  kms_driven fuel_type
 0       Hyundai Santro Xing   Hyundai  2007       45000    Petrol
 1       Mahindra Jeep CL550  Mahindra  2006          40    Diesel
 2         Hyundai Grand i10   Hyundai  2014       28000    Petrol
 3    Ford EcoSport Titanium      Ford  2014       36000    Diesel
 4                 Ford Figo      Ford  2012       41000    Diesel
 ..                      ...       ...   ...         ...       ...
 806      Maruti Suzuki Ritz    Maruti  2011       50000    Petrol
 807          Tata Indica V2      Tata  2009       30000    Diesel
 808    Toyota Corolla Altis    Toyota  2009      132000    Petrol
 809            Tata Zest XM      Tata  2018       27000    Diesel
 810      Mahindra Quanto C8  Mahindra  2013       40000    Diesel
 
 [811 rows x 5 columns],
 0       80000
 1      425000
 2      325000
 3      575000
 4      175000
         ...  
 806    270000
 807    110000
 808    300000
 809    260000
 810    390000
 Nam

Creating the train/test split

In [6]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Inspect the training features; the non-consecutive index labels show that
# the split shuffled the rows.
X_train

Unnamed: 0,name,company,year,kms_driven,fuel_type
239,Mini Cooper S,Mini,2013,13000,Petrol
445,Tata Indigo CS,Tata,2015,100000,Diesel
333,Hyundai Grand i10,Hyundai,2017,6821,Petrol
292,Mahindra Scorpio S4,Mahindra,2015,30000,Diesel
628,Maruti Suzuki Ertiga,Maruti,2013,48000,Diesel
...,...,...,...,...,...
71,Datsun Redi GO,Datsun,2017,16000,Petrol
106,Hyundai Grand i10,Hyundai,2014,49000,Diesel
270,Hyundai Grand i10,Hyundai,2014,41000,Petrol
435,Ford Figo Petrol,Ford,2011,75000,Petrol


In [37]:
# data Preprocessing
#
# Learn the category vocabulary of the three categorical columns. The learned
# lists are passed to the OneHotEncoder inside the column transformer,
# presumably so that a label occurring only in the test split does not raise
# an "unknown category" error at predict time.
# NOTE(review): the encoder is fitted on the full X (train + test rows), not
# on X_train. Only label names are learned here, no target information, but
# confirm this is intended; OneHotEncoder(handle_unknown='ignore') fitted on
# X_train alone would be the stricter alternative.
oh_transformar=OneHotEncoder()
oh_transformar.fit(X[['name','company','fuel_type']])





In [38]:
# Fit a min-max scaler on the two numeric columns of the full feature frame.
# NOTE(review): despite its name, `std` is a MinMaxScaler, and no later cell
# visible here references it — the column transformer builds its own
# MinMaxScaler(). This cell looks like a leftover; confirm before removing.
std=MinMaxScaler()
std.fit(X[['year','kms_driven']])

In [39]:
# Column-wise preprocessing shared by every model pipeline:
#   * categorical columns -> one-hot, using the category lists learned earlier
#     by `oh_transformar` instead of re-deriving them from the data being fitted
#   * numeric columns     -> scaled to [0, 1]
# Any column not listed would be passed through unchanged.
column_trans = make_column_transformer(
    (
        OneHotEncoder(categories=oh_transformar.categories_),
        ['name', 'company', 'fuel_type'],
    ),
    (
        MinMaxScaler(),
        ['year', 'kms_driven'],
    ),
    remainder='passthrough',
)

In [28]:
# Inspect the category arrays learned by the fitted encoder.
oh_transformar.categories_

[array(['Audi A3 Cabriolet', 'Audi A4 1.8', 'Audi A4 2.0', 'Audi A6 2.0',
        'Audi A8', 'Audi Q3 2.0', 'Audi Q5 2.0', 'Audi Q7', 'BMW 3 Series',
        'BMW 5 Series', 'BMW 7 Series', 'BMW X1', 'BMW X1 sDrive20d',
        'BMW X1 xDrive20d', 'Chevrolet Beat', 'Chevrolet Beat Diesel',
        'Chevrolet Beat LS', 'Chevrolet Beat LT', 'Chevrolet Beat PS',
        'Chevrolet Cruze LTZ', 'Chevrolet Enjoy', 'Chevrolet Enjoy 1.4',
        'Chevrolet Sail 1.2', 'Chevrolet Sail UVA', 'Chevrolet Spark',
        'Chevrolet Spark 1.0', 'Chevrolet Spark LS', 'Chevrolet Spark LT',
        'Chevrolet Tavera LS', 'Chevrolet Tavera Neo', 'Datsun GO T',
        'Datsun Go Plus', 'Datsun Redi GO', 'Fiat Linea Emotion',
        'Fiat Petra ELX', 'Fiat Punto Emotion', 'Force Motors Force',
        'Force Motors One', 'Ford EcoSport', 'Ford EcoSport Ambiente',
        'Ford EcoSport Titanium', 'Ford EcoSport Trend',
        'Ford Endeavor 4x4', 'Ford Fiesta', 'Ford Fiesta SXi', 'Ford Figo',
        '

In [19]:
# Baseline estimator: linear regression with default settings.
lr = LinearRegression()

In [20]:
# Chain the column transformer and the estimator into one fit/predict object.
pipe = make_pipeline(column_trans, lr)

In [36]:
# Fit the preprocessing steps and the regressor on the training split.
pipe.fit(X_train, y_train)

In [22]:
# Baseline score: R2 of the linear-regression pipeline on the held-out split.
y_pred = pipe.predict(X_test)
r2_score(y_test,y_pred)

0.6069648458334795

In [10]:
# Sanity check: (rows, feature columns) of the full feature frame.
X.shape

(811, 5)

In [11]:
# Sanity check: number of training targets left after the 20% hold-out.
y_train.shape

(648,)

In [12]:
def model_evaluation(true,predict):
    """Compute the regression metrics reported for every model.

    Args:
        true: Ground-truth target values.
        predict: Predicted target values, aligned element-wise with ``true``.

    Returns:
        A tuple ``(mse, mae, rmse, score)``: mean squared error, mean absolute
        error, root mean squared error, and the R2 score, in that order.
    """
    mse = mean_squared_error(true, predict)
    mae = mean_absolute_error(true, predict)
    # RMSE is the square root of the MSE computed above; there is no need to
    # run mean_squared_error over the data a second time.
    rmse = np.sqrt(mse)
    score = r2_score(true, predict)

    return mse, mae, rmse, score



In [29]:
# Candidate regressors, all compared behind the same preprocessing.
# The estimators whose fitting involves randomness are seeded so that the
# comparison is repeatable across kernel restarts, matching the seeded
# train/test split.
# NOTE(review): the recorded run output shows a warning from the
# coordinate-descent solver while fitting Lasso — presumably a convergence
# warning; consider a larger max_iter if it persists.
models={
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest Regressor": RandomForestRegressor(random_state=42),
    "XGBRegressor": XGBRegressor(random_state=42),
    "AdaBoost Regressor": AdaBoostRegressor(random_state=42)
}

In [30]:
# Repeat of the earlier training-target shape check.
y_train.shape

(648,)

In [40]:
# Fit every candidate in `models` behind the shared preprocessing, print its
# train and test metrics, and collect the test-set R2 for ranking.
model_list=[]
r2_list=[]


# Iterate over the dict directly instead of rebuilding key/value lists and
# indexing into them on every pass.
for model_name, model in models.items():
    # The same `column_trans` object is reused; each pipe.fit() fits it again
    # on X_train before fitting the estimator.
    pipe=make_pipeline(column_trans,model)

    pipe.fit(X_train,y_train)

    y_train_pred=pipe.predict(X_train)
    y_test_pred=pipe.predict(X_test)

    model_train_mse,model_train_mae,model_train_rmse,model_train_score=model_evaluation(y_train,y_train_pred)
    model_test_mse,model_test_mae,model_test_rmse,model_test_score = model_evaluation(y_test,y_test_pred)

    print(model_name)
    model_list.append(model_name)

    print('Model performance for Training set')
    print("- Mean Squared Error: {:.4f}".format(model_train_mse))
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_score))

    print('----------------------------------')

    print('Model performance for Test set')
    print("- Mean Squared Error: {:.4f}".format(model_test_mse))
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_score))
    # Only the held-out R2 feeds the ranking.
    r2_list.append(model_test_score)

    print('='*35)
    print('\n')





Linear Regression
Model performance for Training set
- Mean Squared Error: 6794907771.1692
- Root Mean Squared Error: 82431.2306
- Mean Absolute Error: 46231.3882
- R2 Score: 0.9531
----------------------------------
Model performance for Test set
- Mean Squared Error: 60333150056.7144
- Root Mean Squared Error: 245628.0726
- Mean Absolute Error: 104545.5505
- R2 Score: 0.5921




  model = cd_fast.sparse_enet_coordinate_descent(


Lasso
Model performance for Training set
- Mean Squared Error: 6795015726.7750
- Root Mean Squared Error: 82431.8854
- Mean Absolute Error: 46355.9643
- R2 Score: 0.9531
----------------------------------
Model performance for Test set
- Mean Squared Error: 65970592337.2421
- Root Mean Squared Error: 256847.4106
- Mean Absolute Error: 108373.4457
- R2 Score: 0.5540


Ridge
Model performance for Training set
- Mean Squared Error: 13051940804.3153
- Root Mean Squared Error: 114245.0909
- Mean Absolute Error: 74759.9725
- R2 Score: 0.9099
----------------------------------
Model performance for Test set
- Mean Squared Error: 55638091120.2830
- Root Mean Squared Error: 235877.2798
- Mean Absolute Error: 110606.1970
- R2 Score: 0.6239


K-Neighbors Regressor
Model performance for Training set
- Mean Squared Error: 35678961758.6949
- Root Mean Squared Error: 188888.7550
- Mean Absolute Error: 88157.5275
- R2 Score: 0.7536
----------------------------------
Model performance for Test set
- Me

In [41]:
# Rank the models by their test-set R2, best first.
(
    pd.DataFrame(
        list(zip(model_list, r2_list)),
        columns=['Model Name', 'R2_Score'],
    )
    .sort_values(by='R2_Score', ascending=False)
)

Unnamed: 0,Model Name,R2_Score
2,Ridge,0.623855
6,XGBRegressor,0.613559
0,Linear Regression,0.592114
3,K-Neighbors Regressor,0.576081
4,Decision Tree,0.5603
5,Random Forest Regressor,0.557105
1,Lasso,0.554001
7,AdaBoost Regressor,0.012999


In [34]:
# Refit a Ridge pipeline on its own and report its test metrics.
# Note that `lr` holds a Ridge estimator from here on, not LinearRegression.
# NOTE(review): the R2 recorded under this cell differs slightly from Ridge's
# value in the comparison table; the execution counts suggest the cells were
# run out of order — re-run the notebook top to bottom to reconcile them.
lr = Ridge()
pipe = make_pipeline(column_trans, lr)
pipe.fit(X_train, y_train)

y_test_pred = pipe.predict(X_test)
print(
    "Mean Absolute Error:",
    mean_absolute_error(y_test, y_test_pred),
)
print(
    "R2_SCORE : ",
    r2_score(y_test, y_test_pred),
)

Mean Absolute Error: 110033.14608952138
R2_SCORE :  0.628596990242439


In [35]:
# Side-by-side view of actual prices, predictions and residuals
# (actual minus predicted) for the held-out rows.
pred_df = pd.DataFrame(
    {
        'Actual Value': y_test,
        'Predicted Value': y_test_pred,
        'Difference': y_test - y_test_pred,
    }
)
pred_df

Unnamed: 0,Actual Value,Predicted Value,Difference
247,255000,325954.073439,-70954.073439
575,295000,349732.323739,-54732.323739
227,320000,206993.793391,113006.206609
290,230000,299980.923451,-69980.923451
538,215000,235925.056566,-20925.056566
...,...,...,...
675,125000,238880.255500,-113880.255500
439,90000,87849.436283,2150.563717
755,425000,345226.106621,79773.893379
517,110000,177774.863111,-67774.863111
