In [None]:
'''
Topic: Apply the Random Forest ensemble learning method to a regression problem on the petrol consumption dataset.
Dataset: Petrol Consumption Dataset

METHODOLOGY
Part 1: 
Read and parse the initial dataset
Load and check the data
Display the dataset with feature information

Part 2: 
Create Random forest model for regression
Pick N random records from the dataset
Build a decision tree based on these N records
Choose the number of trees you want in the forest and repeat the two steps above once per tree

Part 3:
In case of a regression problem, for a new record, each tree in the forest predicts a value for Y (output)
The final value can be calculated by taking the average of all the values predicted by all the trees in forest
Train and evaluate a Random forest regression model
'''

In [1]:
# Importing all necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

In [2]:
# Read the petrol consumption CSV (expected in the working directory) into a DataFrame
df = pd.read_csv(filepath_or_buffer='petrol_consumption.csv')
# Print the column dtypes / non-null counts, then preview the first five rows
df.info()
df.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48 entries, 0 to 47
Data columns (total 5 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Petrol_tax                    48 non-null     float64
 1   Average_income                48 non-null     int64  
 2   Paved_Highways                48 non-null     int64  
 3   Population_Driver_licence(%)  48 non-null     float64
 4   Petrol_Consumption            48 non-null     int64  
dtypes: float64(2), int64(3)
memory usage: 2.0 KB


Unnamed: 0,Petrol_tax,Average_income,Paved_Highways,Population_Driver_licence(%),Petrol_Consumption
0,9.0,3571,1976,0.525,541
1,9.0,4092,1250,0.572,524
2,9.0,3865,1586,0.58,561
3,7.5,4870,2351,0.529,414
4,8.0,4399,431,0.544,410


In [3]:
# Feature matrix: the first four columns (positions 0-3) as a NumPy array
X = df.iloc[:, :4].to_numpy()
# Target vector: the column at position 4 ('Petrol_Consumption') as a NumPy array
y = df.iloc[:, 4].to_numpy()

In [4]:
# Split into 80% training / 20% test rows; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [5]:
# Standardise the features: the scaler is fitted on the training split only,
# and the same fitted transform is then applied to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [6]:
# Fit a seeded 20-tree random forest on the training split (fit() returns the
# estimator itself), then predict the targets of the test split.
regressor = RandomForestRegressor(random_state=0, n_estimators=20).fit(X_train, y_train)
y_pred = regressor.predict(X_test)

In [7]:
# Compare the test-split predictions with the true targets
mse = metrics.mean_squared_error(y_test, y_pred)
mae = metrics.mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE

# Report the three error metrics, one per line
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)

Mean Absolute Error (MAE): 51.76500000000001
Mean Squared Error (MSE): 4216.166749999999
Root Mean Squared Error (RMSE): 64.93201637097064


In [8]:
# Sweep n_estimators (number of trees in the forest) from 1 through 499 --
# range's stop value, 500, is excluded -- and record the test-split errors
# for each forest size in one row of `out`.
#
# Sweep-specific names are used inside the loop so that the 20-tree
# `regressor`, `y_pred`, `mae`, `mse` and `rmse` produced by the earlier cells
# are not silently overwritten when this cell runs.
#
# NOTE(review): ranking forest sizes by test-split error tunes the
# hyperparameter on the test data; a separate validation split or
# cross-validation would give an unbiased choice -- confirm intent.
sweep_rows = []
for n_trees in range(1, 500):
    sweep_model = RandomForestRegressor(n_estimators=n_trees, random_state=0)
    sweep_model.fit(X_train, y_train)
    sweep_pred = sweep_model.predict(X_test)
    sweep_mae = metrics.mean_absolute_error(y_test, sweep_pred)
    sweep_mse = metrics.mean_squared_error(y_test, sweep_pred)
    sweep_rmse = np.sqrt(sweep_mse)  # or sweep_mse**(0.5)
    sweep_rows.append([n_trees, sweep_mae, sweep_mse, sweep_rmse])
# One row per forest size, in the order the sizes were tried
out=pd.DataFrame(sweep_rows,columns=['TREES','MAE','MSE','RMSE'])
print(out)

     TREES        MAE          MSE       RMSE
0        1  58.200000  5000.600000  70.714921
1        2  47.700000  3944.550000  62.805653
2        3  53.133333  4814.711111  69.388119
3        4  53.475000  4686.931250  68.461166
4        5  58.720000  5408.984000  73.545795
..     ...        ...          ...        ...
494    495  47.724444  3462.226867  58.840691
495    496  47.736290  3466.473249  58.876763
496    497  47.738431  3465.085919  58.864980
497    498  47.788554  3474.157127  58.941981
498    499  47.726854  3468.928857  58.897613

[499 rows x 4 columns]


In [9]:
# Rank the forest sizes from lowest to highest error: MAE first, with MSE and
# then RMSE breaking ties. The sorted copy is displayed; `out` itself is unchanged.
out.sort_values(by=['MAE', 'MSE', 'RMSE'], ascending=True)

Unnamed: 0,TREES,MAE,MSE,RMSE
109,110,47.045455,3517.132529,59.305417
115,116,47.090517,3559.534029,59.661831
106,107,47.133645,3533.955952,59.447085
110,111,47.134234,3529.579149,59.410261
111,112,47.140179,3523.506449,59.359131
...,...,...,...,...
6,7,54.500000,5448.708163,73.815365
14,15,54.773333,4808.293333,69.341858
0,1,58.200000,5000.600000,70.714921
4,5,58.720000,5408.984000,73.545795
