In [48]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [2]:
column_names = ['symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration', 'num-of-doors', 'body-style', 'drive-wheels', 'engine-location', 'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type', 'num-of-cylinders',  'engine-size', 'fuel-system', 'bore', 'stroke', 'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg', 'price'    ]

In [3]:
data = pd.read_csv('data1.data')
data.columns = column_names

In [4]:
data

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
1,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
2,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.40,10.0,102,5500,24,30,13950
3,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.40,8.0,115,5500,18,22,17450
4,2,?,audi,gas,std,two,sedan,fwd,front,99.8,...,136,mpfi,3.19,3.40,8.5,110,5500,19,25,15250
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199,-1,95,volvo,gas,std,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,9.5,114,5400,23,28,16845
200,-1,95,volvo,gas,turbo,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,8.7,160,5300,19,25,19045
201,-1,95,volvo,gas,std,four,sedan,rwd,front,109.1,...,173,mpfi,3.58,2.87,8.8,134,5500,18,23,21485
202,-1,95,volvo,diesel,turbo,four,sedan,rwd,front,109.1,...,145,idi,3.01,3.40,23.0,106,4800,26,27,22470


replacing all the ? to NaN

In [5]:
data = data.replace('?', np.NaN)

In [6]:
data.isnull().sum()

symboling             0
normalized-losses    40
make                  0
fuel-type             0
aspiration            0
num-of-doors          2
body-style            0
drive-wheels          0
engine-location       0
wheel-base            0
length                0
width                 0
height                0
curb-weight           0
engine-type           0
num-of-cylinders      0
engine-size           0
fuel-system           0
bore                  4
stroke                4
compression-ratio     0
horsepower            2
peak-rpm              2
city-mpg              0
highway-mpg           0
price                 4
dtype: int64

Checking the data description

In [7]:
data.describe()

Unnamed: 0,symboling,wheel-base,length,width,height,curb-weight,engine-size,compression-ratio,city-mpg,highway-mpg
count,204.0,204.0,204.0,204.0,204.0,204.0,204.0,204.0,204.0,204.0
mean,0.823529,98.806373,174.075,65.916667,53.74902,2555.602941,126.892157,10.148137,25.240196,30.769608
std,1.239035,5.994144,12.362123,2.146716,2.424901,521.96082,41.744569,3.981,6.551513,6.898337
min,-2.0,86.6,141.1,60.3,47.8,1488.0,61.0,7.0,13.0,16.0
25%,0.0,94.5,166.3,64.075,52.0,2145.0,97.0,8.575,19.0,25.0
50%,1.0,97.0,173.2,65.5,54.1,2414.0,119.5,9.0,24.0,30.0
75%,2.0,102.4,183.2,66.9,55.5,2939.25,142.0,9.4,30.0,34.5
max,3.0,120.9,208.1,72.3,59.8,4066.0,326.0,23.0,49.0,54.0


In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 204 entries, 0 to 203
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          204 non-null    int64  
 1   normalized-losses  164 non-null    object 
 2   make               204 non-null    object 
 3   fuel-type          204 non-null    object 
 4   aspiration         204 non-null    object 
 5   num-of-doors       202 non-null    object 
 6   body-style         204 non-null    object 
 7   drive-wheels       204 non-null    object 
 8   engine-location    204 non-null    object 
 9   wheel-base         204 non-null    float64
 10  length             204 non-null    float64
 11  width              204 non-null    float64
 12  height             204 non-null    float64
 13  curb-weight        204 non-null    int64  
 14  engine-type        204 non-null    object 
 15  num-of-cylinders   204 non-null    object 
 16  engine-size        204 non

Checking out the make feature

In [9]:
data['make'].nunique()

22

The make features has 22 unique values and they are categorical in nature, it should be converted into numerical format but curse of dimensionality should also be avoided so not all the 22 values will be in the dataset

In [10]:
data['make'].value_counts()

toyota           32
nissan           18
mazda            17
mitsubishi       13
honda            13
subaru           12
volkswagen       12
peugot           11
volvo            11
dodge             9
mercedes-benz     8
bmw               8
plymouth          7
audi              7
saab              6
porsche           5
isuzu             4
chevrolet         3
jaguar            3
alfa-romero       2
renault           2
mercury           1
Name: make, dtype: int64

In [11]:
#Selection the 10 most frequent car make.
make_10 = data['make'].value_counts().head(10)
make_10 = list(make_10.index)

In [12]:
for sample in data['make']: 
    for value in make_10: 
        data[value] = np.where(sample == value, 1, 0)
        


In [13]:
#Dropping the make column

data = data.drop(columns = ['make'])

In [14]:
data['normalized-losses'] = pd.to_numeric(data['normalized-losses'])

The fuel type columns has only two values, gas and diesel, trying to convert it them to numeric values

In [15]:
data['fuel-type'] = data['fuel-type'].map({'gas': 0, 'diesel': 1})

Checking out the aspiration column, the aspiration column has two values which are 'std' and 'turbo'

In [16]:
data['aspiration'] = data['aspiration'].map({'std': 0, 'turbo':1})

In [17]:
data['aspiration'].value_counts()

0    167
1     37
Name: aspiration, dtype: int64

Checking the num-of-doors column

In [18]:
data['num-of-doors'] = data['num-of-doors'].map({'two': 0, 'four': 1})

Checking out the body style of the vehicle

In [19]:
#Encoding the unique values of the body-style feature
body_style_map = {'convertible': 0, 'hatchback': 1, 'sedan': 2, 'wagon':3, 'hardtop':4}
data['body-style'] = data['body-style'].map(body_style_map)

Checking the drive-wheels feature.

In [20]:
#Encoding the drive-wheels feature
mapping = {'rwd' : 0, 'fwd': 1, '4wd' : 2}
data['drive-wheels'] = data['drive-wheels'].map(mapping) 

Checking out the engine-location feature

In [21]:
mapping = {'front' : 0, 'rear' : 1}
data['engine-location'] = data['engine-location'].map(mapping)

Checking out the fuel-system feature

In [22]:
#Encoding the fuel-system feature.
mapping = {'mpfi': 0, '2bbl': 1, 'mfi': 2, '1bbl':3, 'spfi':4, '4bbl' :5, 'idi':6, 'spdi':7}
data['fuel-system'] = data['fuel-system'].map(mapping)

Converting the following features to numeric data form

In [23]:
#Converting the bore, stroke,  numeric
data['bore'] = pd.to_numeric(data['bore'])
data['stroke'] = pd.to_numeric(data['stroke'])
data['price'] = pd.to_numeric(data['price'])
data['horsepower'] = pd.to_numeric(data['horsepower'])
data['peak-rpm'] = pd.to_numeric(data['peak-rpm'])

Converting the number-of-cylinders  column  to numeric

In [24]:
mapping = {'four':4, 'six':6, 'five':5, 'three':3, 'twelve':12, 'two':2, 'eight':8}
data['num-of-cylinders'] = data['num-of-cylinders'].map(mapping)

Converting the engine-type column into a numeric form

In [25]:
#Encoding the engine-type column
mapping = {'dohc':0, 'ohcv':1, 'ohc':2, 'l':3, 'rotor':4, 'ohcf':5, 'dohcv':6}
data['engine-type'] = data['engine-type'].map(mapping)

Handling all the missing values using KNNImputer

In [26]:
imputer = KNNImputer()

In [27]:
columns = data.columns
data = imputer.fit_transform(data)

In [28]:
data.shape, len(columns)

((204, 35), 35)

In [29]:
data = pd.DataFrame(data)
data.columns = columns
data.head(5)

Unnamed: 0,symboling,normalized-losses,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,...,toyota,nissan,mazda,mitsubishi,honda,subaru,volkswagen,peugot,volvo,dodge
0,3.0,144.8,0.0,0.0,0.0,0.0,0.0,0.0,88.6,168.8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1.0,144.8,0.0,0.0,0.0,1.0,0.0,0.0,94.5,171.2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,2.0,164.0,0.0,0.0,1.0,2.0,1.0,0.0,99.8,176.6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,2.0,164.0,0.0,0.0,1.0,2.0,2.0,0.0,99.4,176.6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,2.0,131.0,0.0,0.0,0.0,2.0,1.0,0.0,99.8,177.3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


Confirming that there are no null values present in the data

In [30]:
data.isna().sum()

symboling            0
normalized-losses    0
fuel-type            0
aspiration           0
num-of-doors         0
body-style           0
drive-wheels         0
engine-location      0
wheel-base           0
length               0
width                0
height               0
curb-weight          0
engine-type          0
num-of-cylinders     0
engine-size          0
fuel-system          0
bore                 0
stroke               0
compression-ratio    0
horsepower           0
peak-rpm             0
city-mpg             0
highway-mpg          0
price                0
toyota               0
nissan               0
mazda                0
mitsubishi           0
honda                0
subaru               0
volkswagen           0
peugot               0
volvo                0
dodge                0
dtype: int64

Splitting the data into dependent and independent variables.

In [31]:
X = data.drop(columns = ['price'])
y = data['price']

Splitting the data into training and testing test.

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.3, random_state=4)

Scaling the data using the scikit learn standardscaler 

In [33]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

Setting the list of algorithms to be used to build the model.

In [34]:
algorithms = {
    
    'Linear Regression' : {
        'Model' : LinearRegression(),
        'Params' : {
            'normalize' : [True, False]
        }
    }, 
    
    'Lasso' : {
        'Model': Lasso(),
        'Params' : {
            'alpha' : [i for i in range(1,6)],
            'normalize' : [True, False]
        }
    }, 
    
    'Ridge' : {
        'Model': Ridge(),
        'Params' : {
            'alpha' : [i for i in range(1,6)],
            'normalize' : [True, False]
        }
    }, 
    
    'SVR' : {
        'Model': SVR(),
        'Params' : {
            'kernel' : ['poly', 'linear', 'rbf', 'sigmoid' ],
            'degree' : [i for i in range(1,6)],
        }
    }, 
    
    'Decision Tree Regressor' : {
        'Model': DecisionTreeRegressor(),
        'Params' : {
            
            'criterion' : ["mse", "mae" ]
        }
    }, 
    
    'Random Forest Regressor' : {
        'Model': RandomForestRegressor(),
        'Params' : {
            'n_estimators' : [i for i in range(50, 300, 50)],
            'criterion' : ["mse", "mae" ]
        }
    }, 
    
    'Extra Trees Regressor' : {
        'Model': ExtraTreesRegressor(),
        'Params' : {
            'n_estimators' : [i for i in range(50, 300, 50)],
            'criterion' : ["mse", "mae" ]
        }
    }, 
    
}

In [35]:
acc = []
for model_name, params in algorithms.items():
    grid= GridSearchCV(params['Model'], param_grid = params['Params'], verbose=10, cv=5 )
    grid.fit(X_train, y_train)
    acc.append({'Model' : model_name, 'Best Score' : grid.best_score_, 'Best Parameters': grid.best_params_ })
    
    

Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV] normalize=True ..................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ...................... normalize=True, score=0.570, total=   0.3s
[CV] normalize=True ..................................................
[CV] ...................... normalize=True, score=0.817, total=   0.0s
[CV] normalize=True ..................................................
[CV] ...................... normalize=True, score=0.903, total=   0.0s
[CV] normalize=True ..................................................
[CV] ...................... normalize=True, score=0.929, total=   0.0s
[CV] normalize=True ..................................................
[CV] ...................... normalize=True, score=0.817, total=   0.0s
[CV] normalize=False .................................................
[CV] ..................... normalize=False, score=0.570, total=   0.0s
[CV] normalize=False .................................................
[CV] ..................... normalize=False, score=0.817, total=   0.0s
[CV] normalize=False .................................................
[CV] .

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elaps


[CV] alpha=2, normalize=True .........................................
[CV] ............. alpha=2, normalize=True, score=0.580, total=   0.0s
[CV] alpha=2, normalize=True .........................................
[CV] ............. alpha=2, normalize=True, score=0.807, total=   0.0s
[CV] alpha=2, normalize=True .........................................
[CV] ............. alpha=2, normalize=True, score=0.929, total=   0.0s
[CV] alpha=2, normalize=True .........................................
[CV] ............. alpha=2, normalize=True, score=0.934, total=   0.0s
[CV] alpha=2, normalize=True .........................................
[CV] ............. alpha=2, normalize=True, score=0.796, total=   0.0s
[CV] alpha=2, normalize=False ........................................
[CV] ............ alpha=2, normalize=False, score=0.574, total=   0.0s
[CV] alpha=2, normalize=False ........................................
[CV] ............ alpha=2, normalize=False, score=0.816, total=   0.0s
[CV] 

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.0s remaining:    0.0s


[CV] ............ alpha=4, normalize=False, score=0.743, total=   0.0s
[CV] alpha=5, normalize=True .........................................
[CV] ............. alpha=5, normalize=True, score=0.525, total=   0.0s
[CV] alpha=5, normalize=True .........................................
[CV] ............. alpha=5, normalize=True, score=0.515, total=   0.0s
[CV] alpha=5, normalize=True .........................................
[CV] ............. alpha=5, normalize=True, score=0.693, total=   0.0s
[CV] alpha=5, normalize=True .........................................
[CV] ............. alpha=5, normalize=True, score=0.697, total=   0.0s
[CV] alpha=5, normalize=True .........................................
[CV] ............. alpha=5, normalize=True, score=0.650, total=   0.0s
[CV] alpha=5, normalize=False ........................................
[CV] ............ alpha=5, normalize=False, score=0.588, total=   0.0s
[CV] alpha=5, normalize=False ........................................
[CV] .

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.0s remaining:    0.0s


[CV] .............. degree=2, kernel=poly, score=-0.087, total=   0.0s
[CV] degree=2, kernel=linear .........................................
[CV] ............. degree=2, kernel=linear, score=0.133, total=   0.0s
[CV] degree=2, kernel=linear .........................................
[CV] ............ degree=2, kernel=linear, score=-0.311, total=   0.0s
[CV] degree=2, kernel=linear .........................................
[CV] ............. degree=2, kernel=linear, score=0.024, total=   0.0s
[CV] degree=2, kernel=linear .........................................
[CV] ............. degree=2, kernel=linear, score=0.074, total=   0.0s
[CV] degree=2, kernel=linear .........................................
[CV] ............. degree=2, kernel=linear, score=0.064, total=   0.0s
[CV] degree=2, kernel=rbf ............................................
[CV] ............... degree=2, kernel=rbf, score=-0.060, total=   0.0s
[CV] degree=2, kernel=rbf ............................................
[CV] .

[CV] ............... degree=5, kernel=rbf, score=-0.086, total=   0.0s
[CV] degree=5, kernel=sigmoid ........................................
[CV] ........... degree=5, kernel=sigmoid, score=-0.056, total=   0.0s
[CV] degree=5, kernel=sigmoid ........................................
[CV] ........... degree=5, kernel=sigmoid, score=-0.463, total=   0.0s
[CV] degree=5, kernel=sigmoid ........................................
[CV] ........... degree=5, kernel=sigmoid, score=-0.108, total=   0.0s
[CV] degree=5, kernel=sigmoid ........................................
[CV] ........... degree=5, kernel=sigmoid, score=-0.066, total=   0.0s
[CV] degree=5, kernel=sigmoid ........................................
[CV] ........... degree=5, kernel=sigmoid, score=-0.082, total=   0.0s
Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV] criterion=mse ...................................................
[CV] ....................... criterion=mse, score=0.634, total=   0.0s
[CV] criterion=ms

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 

[CV] ...... criterion=mse, n_estimators=50, score=0.583, total=   0.2s
[CV] criterion=mse, n_estimators=50 ..................................
[CV] ...... criterion=mse, n_estimators=50, score=0.879, total=   0.1s
[CV] criterion=mse, n_estimators=50 ..................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.2s remaining:    0.0s


[CV] ...... criterion=mse, n_estimators=50, score=0.930, total=   0.2s
[CV] criterion=mse, n_estimators=50 ..................................
[CV] ...... criterion=mse, n_estimators=50, score=0.939, total=   0.2s
[CV] criterion=mse, n_estimators=50 ..................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.6s remaining:    0.0s


[CV] ...... criterion=mse, n_estimators=50, score=0.919, total=   0.2s
[CV] criterion=mse, n_estimators=100 .................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.8s remaining:    0.0s


[CV] ..... criterion=mse, n_estimators=100, score=0.568, total=   0.3s
[CV] criterion=mse, n_estimators=100 .................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    1.1s remaining:    0.0s


[CV] ..... criterion=mse, n_estimators=100, score=0.871, total=   0.3s
[CV] criterion=mse, n_estimators=100 .................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    1.4s remaining:    0.0s


[CV] ..... criterion=mse, n_estimators=100, score=0.929, total=   0.3s
[CV] criterion=mse, n_estimators=100 .................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    1.7s remaining:    0.0s


[CV] ..... criterion=mse, n_estimators=100, score=0.938, total=   0.3s
[CV] criterion=mse, n_estimators=100 .................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    2.1s remaining:    0.0s


[CV] ..... criterion=mse, n_estimators=100, score=0.915, total=   0.3s
[CV] criterion=mse, n_estimators=150 .................................
[CV] ..... criterion=mse, n_estimators=150, score=0.573, total=   0.5s
[CV] criterion=mse, n_estimators=150 .................................
[CV] ..... criterion=mse, n_estimators=150, score=0.874, total=   0.6s
[CV] criterion=mse, n_estimators=150 .................................
[CV] ..... criterion=mse, n_estimators=150, score=0.938, total=   0.5s
[CV] criterion=mse, n_estimators=150 .................................
[CV] ..... criterion=mse, n_estimators=150, score=0.938, total=   0.6s
[CV] criterion=mse, n_estimators=150 .................................
[CV] ..... criterion=mse, n_estimators=150, score=0.919, total=   0.5s
[CV] criterion=mse, n_estimators=200 .................................
[CV] ..... criterion=mse, n_estimators=200, score=0.609, total=   0.8s
[CV] criterion=mse, n_estimators=200 .................................
[CV] .

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:   41.7s finished


Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] criterion=mse, n_estimators=50 ..................................
[CV] ...... criterion=mse, n_estimators=50, score=0.583, total=   0.1s
[CV] criterion=mse, n_estimators=50 ..................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s


[CV] ...... criterion=mse, n_estimators=50, score=0.816, total=   0.1s
[CV] criterion=mse, n_estimators=50 ..................................
[CV] ...... criterion=mse, n_estimators=50, score=0.961, total=   0.1s
[CV] criterion=mse, n_estimators=50 ..................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.3s remaining:    0.0s


[CV] ...... criterion=mse, n_estimators=50, score=0.939, total=   0.1s
[CV] criterion=mse, n_estimators=50 ..................................
[CV] ...... criterion=mse, n_estimators=50, score=0.889, total=   0.1s
[CV] criterion=mse, n_estimators=100 .................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.5s remaining:    0.0s


[CV] ..... criterion=mse, n_estimators=100, score=0.553, total=   0.3s
[CV] criterion=mse, n_estimators=100 .................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.8s remaining:    0.0s


[CV] ..... criterion=mse, n_estimators=100, score=0.825, total=   0.2s
[CV] criterion=mse, n_estimators=100 .................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    1.0s remaining:    0.0s


[CV] ..... criterion=mse, n_estimators=100, score=0.965, total=   0.2s
[CV] criterion=mse, n_estimators=100 .................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    1.3s remaining:    0.0s


[CV] ..... criterion=mse, n_estimators=100, score=0.938, total=   0.3s
[CV] criterion=mse, n_estimators=100 .................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    1.5s remaining:    0.0s


[CV] ..... criterion=mse, n_estimators=100, score=0.891, total=   0.3s
[CV] criterion=mse, n_estimators=150 .................................
[CV] ..... criterion=mse, n_estimators=150, score=0.565, total=   0.4s
[CV] criterion=mse, n_estimators=150 .................................
[CV] ..... criterion=mse, n_estimators=150, score=0.828, total=   0.4s
[CV] criterion=mse, n_estimators=150 .................................
[CV] ..... criterion=mse, n_estimators=150, score=0.966, total=   0.4s
[CV] criterion=mse, n_estimators=150 .................................
[CV] ..... criterion=mse, n_estimators=150, score=0.945, total=   0.4s
[CV] criterion=mse, n_estimators=150 .................................
[CV] ..... criterion=mse, n_estimators=150, score=0.891, total=   0.4s
[CV] criterion=mse, n_estimators=200 .................................
[CV] ..... criterion=mse, n_estimators=200, score=0.567, total=   0.5s
[CV] criterion=mse, n_estimators=200 .................................
[CV] .

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:   39.0s finished


In [36]:
result = pd.DataFrame(acc)
result.sort_values('Best Score', ascending = False)

Unnamed: 0,Model,Best Score,Best Parameters
5,Random Forest Regressor,0.857369,"{'criterion': 'mae', 'n_estimators': 150}"
6,Extra Trees Regressor,0.854464,"{'criterion': 'mae', 'n_estimators': 250}"
4,Decision Tree Regressor,0.821123,{'criterion': 'mse'}
1,Lasso,0.812622,"{'alpha': 1, 'normalize': True}"
2,Ridge,0.809449,"{'alpha': 1, 'normalize': False}"
0,Linear Regression,0.807033,{'normalize': False}
3,SVR,-0.003126,"{'degree': 1, 'kernel': 'linear'}"


In [45]:
trees = RandomForestRegressor(criterion='mae', n_estimators = 150)
trees.fit(X_train, y_train)
pred = trees.predict(X_test)

In [46]:
train_pred = trees.predict(X_train)
train_score = r2_score(y_train, train_pred)
# test_score =r2_score(y_test, pred)
test_score = r2_score(y_test, pred)
print(f'This is the train score {train_score} ')
print(f'This is the test score {test_score} ')

This is the train score 0.9849990711205399 
This is the test score 0.917971185809812 


Checking for the root mean squared error and the mean absolute error

In [49]:
mse = mean_squared_error(y_test, pred)
mae = mean_absolute_error(y_test, pred)
rmse = np.sqrt(mse)

print(f'The root mean squared error of the model is {rmse}')
print(f'The mean squared error of the model is {mse}')
print(f'The mean absolute error of the model is {mae}')

The root mean squared error of the model is 2388.823403181636
The mean squared error of the model is 5706477.251588292
The mean absolute error of the model is 1450.7719032258067
