# Ensemble Learning

Ensemble learning is a machine learning technique that enhances accuracy and resilience in forecasting by merging predictions from multiple models. It aims to mitigate errors or biases that may exist in individual models by leveraging the collective intelligence of the ensemble.

## Importing libraries

In [1]:
# Import important libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.metrics import accuracy_score

# Import models
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

# Import warnings
import warnings
warnings.filterwarnings('ignore')

## Max Voting

Max voting, which is generally used for classification problems, is one of the simplest ways of combining predictions from multiple machine learning algorithms. Each base model predicts a class for every sample, and each prediction counts as one vote. The class that receives the most votes for a sample becomes the ensemble's final prediction for that sample.

### Import data

In [2]:
# Load the cleaned classification dataset from the local datasets folder
data = pd.read_csv(filepath_or_buffer='datasets/data_cleaned.csv')

# Preview the first five rows
data.head(n=5)

Unnamed: 0,Survived,Age,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,SibSp_0,SibSp_1,...,Parch_0,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Embarked_C,Embarked_Q,Embarked_S
0,0,22.0,7.25,0,0,1,0,1,0,1,...,1,0,0,0,0,0,0,0,0,1
1,1,38.0,71.2833,1,0,0,1,0,0,1,...,1,0,0,0,0,0,0,1,0,0
2,1,26.0,7.925,0,0,1,1,0,1,0,...,1,0,0,0,0,0,0,0,0,1
3,1,35.0,53.1,1,0,0,1,0,0,1,...,1,0,0,0,0,0,0,0,0,1
4,0,35.0,8.05,0,0,1,0,1,1,0,...,1,0,0,0,0,0,0,0,0,1


In [3]:
# Report the dimensions and the column labels of the loaded frame
for label, value in (('Shape:', data.shape), ('Columns:', data.columns)):
    print(label, value)

Shape: (891, 25)
Columns: Index(['Survived', 'Age', 'Fare', 'Pclass_1', 'Pclass_2', 'Pclass_3',
       'Sex_female', 'Sex_male', 'SibSp_0', 'SibSp_1', 'SibSp_2', 'SibSp_3',
       'SibSp_4', 'SibSp_5', 'SibSp_8', 'Parch_0', 'Parch_1', 'Parch_2',
       'Parch_3', 'Parch_4', 'Parch_5', 'Parch_6', 'Embarked_C', 'Embarked_Q',
       'Embarked_S'],
      dtype='object')


In [4]:
# Separate the independent variables (every column except the target)
# from the dependent variable ('Survived')
x = data.drop(columns=['Survived'])
y = data.loc[:, 'Survived']

# Confirm both pieces have the same number of rows
print(x.shape, y.shape)

(891, 24) (891,)


In [5]:
# Hold out a test set, stratified on the target so that both partitions
# keep the same class proportions; the seed makes the split repeatable
train_x, test_x, train_y, test_y = train_test_split(
    x,
    y,
    stratify=y,
    random_state=101,
)

# Show the resulting partition sizes
print(train_x.shape, train_y.shape)
print(test_x.shape, test_y.shape)

(668, 24) (668,)
(223, 24) (223,)


In [6]:
# Fit a logistic regression classifier (default hyper-parameters) on the training set
model1 = LogisticRegression().fit(train_x, train_y)

# Predict the test set; show the first ten labels and the test accuracy
pred1 = model1.predict(test_x)
print(pred1[:10], round(model1.score(test_x, test_y), 3))

[0 0 0 0 1 1 0 1 0 0] 0.776


In [7]:
# Fit a k-nearest-neighbours classifier that votes over the 5 closest training rows
model2 = KNeighborsClassifier(n_neighbors=5).fit(train_x, train_y)

# Predict the test set; show the first ten labels and the test accuracy
pred2 = model2.predict(test_x)
print(pred2[:10], round(model2.score(test_x, test_y), 3))

[1 0 0 0 0 1 0 0 1 0] 0.74


In [8]:
# Build a Decision Tree model
# random_state is fixed because scikit-learn permutes the candidate features
# at random at every split; without a seed the fitted tree (and therefore the
# score and the ensemble built on it) can differ from run to run.
model3 = DecisionTreeClassifier(max_depth = 7, random_state = 101)
model3.fit(train_x, train_y)

# Predict the test set; show the first ten labels and the test accuracy
pred3 = model3.predict(test_x)
print(pred3[:10], round(model3.score(test_x, test_y), 3))

[1 0 0 1 1 1 0 0 0 0] 0.785


In [9]:
# Put every model's test-set predictions side by side with the true labels
df = pd.DataFrame({
    'M1': pred1,
    'M2': pred2,
    'M3': pred3,
    'Actual': np.array(test_y),
})

# Preview the comparison table
df.head()

Unnamed: 0,M1,M2,M3,Actual
0,0,1,1,0
1,0,0,0,0
2,0,0,0,0
3,0,0,1,1
4,1,0,1,1


In [10]:
# Perform Max Voting operation
from statistics import mode

# For every test sample, take the most common label among the three models.
# The array is built in a single pass: calling np.append inside a loop would
# copy the whole array on every iteration.
# dtype=float stores the labels as 0. / 1., the dtype the printed output shows.
final_pred = np.array(
    [mode(votes) for votes in zip(pred1, pred2, pred3)],
    dtype=float,
)

print(final_pred)

[1. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0.
 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 1. 0. 1.
 1. 0. 1. 0. 0. 0. 0. 1. 0. 1. 1. 0. 0. 0. 0. 0. 1. 0. 1. 0. 1. 0. 0. 0.
 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 1. 1. 0. 1. 0. 0.
 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0.
 1. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 1. 0. 1. 1. 0. 0. 1. 0. 1. 0. 1. 0.
 1. 1. 1. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 1. 1. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 1. 0. 0. 0. 1. 0. 0. 1. 0. 1. 1. 0.
 0. 1. 1. 0. 1. 1. 0.]


In [11]:
# Accuracy of the max-voting ensemble on the test set
print('Accuracy:', round(accuracy_score(test_y, final_pred), 3))

# Accuracy of each individual model (M1, M2, M3) for comparison
print('Accuracy:', *(round(accuracy_score(test_y, p), 3) for p in (pred1, pred2, pred3)))

Accuracy: 0.807
Accuracy: 0.776 0.74 0.785


## Averaging

Ensemble averaging trains a group of models, each with low bias and high variance, and averages their predictions to obtain a combined model with (hopefully) low bias and low variance.

In [12]:
# Load the cleaned regression dataset from the local datasets folder
# (this rebinds `data`, replacing the classification frame loaded earlier)
data = pd.read_csv(filepath_or_buffer='datasets/train_cleaned.csv')

# Preview the first five rows
data.head(n=5)

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Item_Outlet_Sales,Item_Fat_Content_LF,Item_Fat_Content_Low Fat,Item_Fat_Content_Regular,Item_Fat_Content_low fat,Item_Fat_Content_reg,...,Outlet_Size_High,Outlet_Size_Medium,Outlet_Size_Small,Outlet_Location_Type_Tier 1,Outlet_Location_Type_Tier 2,Outlet_Location_Type_Tier 3,Outlet_Type_Grocery Store,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3
0,9.3,0.016047,249.8092,1999,3735.138,0,1,0,0,0,...,0,1,0,1,0,0,0,1,0,0
1,5.92,0.019278,48.2692,2009,443.4228,0,0,1,0,0,...,0,1,0,0,0,1,0,0,1,0
2,17.5,0.01676,141.618,1999,2097.27,0,1,0,0,0,...,0,1,0,1,0,0,0,1,0,0
3,19.2,0.0,182.095,1998,732.38,0,0,1,0,0,...,0,0,0,0,0,1,1,0,0,0
4,8.93,0.0,53.8614,1987,994.7052,0,1,0,0,0,...,1,0,0,0,0,1,0,1,0,0


In [13]:
# Report the dimensions and the column labels of the loaded frame
for label, value in (('Shape:', data.shape), ('Columns:', data.columns)):
    print(label, value)

Shape: (8523, 46)
Columns: Index(['Item_Weight', 'Item_Visibility', 'Item_MRP',
       'Outlet_Establishment_Year', 'Item_Outlet_Sales', 'Item_Fat_Content_LF',
       'Item_Fat_Content_Low Fat', 'Item_Fat_Content_Regular',
       'Item_Fat_Content_low fat', 'Item_Fat_Content_reg',
       'Item_Type_Baking Goods', 'Item_Type_Breads', 'Item_Type_Breakfast',
       'Item_Type_Canned', 'Item_Type_Dairy', 'Item_Type_Frozen Foods',
       'Item_Type_Fruits and Vegetables', 'Item_Type_Hard Drinks',
       'Item_Type_Health and Hygiene', 'Item_Type_Household', 'Item_Type_Meat',
       'Item_Type_Others', 'Item_Type_Seafood', 'Item_Type_Snack Foods',
       'Item_Type_Soft Drinks', 'Item_Type_Starchy Foods',
       'Outlet_Identifier_OUT010', 'Outlet_Identifier_OUT013',
       'Outlet_Identifier_OUT017', 'Outlet_Identifier_OUT018',
       'Outlet_Identifier_OUT019', 'Outlet_Identifier_OUT027',
       'Outlet_Identifier_OUT035', 'Outlet_Identifier_OUT045',
       'Outlet_Identifier_OUT046', 'Out

In [14]:
# Separate the independent variables (every column except the target)
# from the dependent variable ('Item_Outlet_Sales')
x = data.drop(columns=['Item_Outlet_Sales'])
y = data.loc[:, 'Item_Outlet_Sales']

In [15]:
# Splitting the data into training and testing set
# shuffle = False keeps the original row order: the leading rows become the
# training set and the trailing rows the test set.
# random_state is not passed because it only seeds the shuffling and has no
# effect on an unshuffled split.
train_x, test_x, train_y, test_y = train_test_split(x, y, shuffle = False)

# Show the resulting partition sizes
print(train_x.shape, train_y.shape)
print(test_x.shape, test_y.shape)

(6392, 45) (6392,)
(2131, 45) (2131,)


In [16]:
# Fit an ordinary linear regression on the training set
model1 = LinearRegression().fit(train_x, train_y)

# Predict the test set; show the first ten predictions and the test-set score
pred1 = model1.predict(test_x)
print(pred1[:10], round(model1.score(test_x, test_y), 3))

[ 2443.32513354  2788.36657095  2169.73477778  2385.33394477
  2716.58813899  3771.41850023  5119.35227101  4133.40997069
 -1010.95412123  1365.9929378 ] 0.555


In [17]:
# Fit a k-nearest-neighbours regressor that uses the 9 closest training rows
model2 = KNeighborsRegressor(n_neighbors=9).fit(train_x, train_y)

# Predict the test set; show the first ten predictions and the test-set score
pred2 = model2.predict(test_x)
print(pred2[:10], round(model2.score(test_x, test_y), 3))

[1859.06155556 2519.09128889 2620.88471111 2277.33191111 3801.12617778
 4099.10866667 5151.88642222 5378.62831111  476.7128     1501.15706667] 0.501


In [18]:
# Build a Decision Tree model
# random_state is fixed because scikit-learn permutes the candidate features
# at random at every split; without a seed the fitted tree (and therefore the
# score and the ensembles built on it) can differ from run to run.
model3 = DecisionTreeRegressor(max_depth = 7, random_state = 101)
model3.fit(train_x, train_y)

# Predict the test set; show the first ten predictions and the test-set score
pred3 = model3.predict(test_x)
print(pred3[:10], round(model3.score(test_x, test_y), 3))

[2413.81776312 2822.71913115  585.22631071 2348.86112052 3153.22486811
 3544.38935413 5815.65707727 5048.681504    105.25188333 1595.93215694] 0.573


In [19]:
# Put every model's test-set predictions side by side with the true values
df = pd.DataFrame({
    'M1': pred1,
    'M2': pred2,
    'M3': pred3,
    'Actual': np.array(test_y),
})

# Preview the comparison table
df.head()

Unnamed: 0,M1,M2,M3,Actual
0,2443.325134,1859.061556,2413.817763,4277.765
1,2788.366571,2519.091289,2822.719131,1070.6064
2,2169.734778,2620.884711,585.226311,1001.3632
3,2385.333945,2277.331911,2348.861121,2871.5954
4,2716.588139,3801.126178,3153.224868,4287.752


In [20]:
from sklearn.metrics import r2_score

# Simple average of the three models' predictions for every test sample.
# The prediction arrays are combined element-wise in one vectorised
# expression, so no Python-level loop or repeated np.append copy is needed.
final_pred = (pred1 + pred2 + pred3) / 3

print(final_pred)

[2238.7348174  2710.05899699 1791.94859987 ... 1307.04481371 1610.81987463
 1252.57719966]


In [21]:
# R2 of the averaged ensemble on the test set
print('R2-Score:', round(r2_score(test_y, final_pred), 3))

# R2 of each individual model (M1, M2, M3) for comparison
print('R2-Score:', *(round(r2_score(test_y, p), 3) for p in (pred1, pred2, pred3)))

R2-Score: 0.576
R2-Score: 0.555 0.501 0.573


## Weighted Averaging

Weighted average ensembles assume that some models in the ensemble have more skill than others and give them a larger contribution when making predictions. The weighted average (or weighted sum) ensemble is an extension of voting ensembles, which assume that all models are equally skillful and make the same proportional contribution to the ensemble's predictions.

In [23]:
from sklearn.metrics import r2_score

# Weight of each model in the average: model 1 and model 3 count twice,
# model 2 counts once.
w1, w2, w3 = 2, 1, 2

# Weighted average of the three prediction arrays, computed element-wise in
# one vectorised expression (no Python-level loop or repeated np.append copy).
final_pred = (w1 * pred1 + w2 * pred2 + w3 * pred3) / (w1 + w2 + w3)

print(final_pred)

[2314.66946977 2748.25253862 1626.16137762 ... 1354.45086089 1565.447454
 1258.38894626]


In [24]:
# R2 of the weighted-average ensemble on the test set
print('R2-Score:', round(r2_score(test_y, final_pred), 3))

# R2 of each individual model (M1, M2, M3) for comparison
print('R2-Score:', *(round(r2_score(test_y, p), 3) for p in (pred1, pred2, pred3)))

R2-Score: 0.581
R2-Score: 0.555 0.501 0.573


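## Rank Averaging

Rank averaging orders the models by their score and gives each model a weight proportional to its rank, so the best-scoring model contributes the most to the final prediction.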

In [35]:
# Score every fitted model on the test set
# NOTE(review): these test-set scores are used below to derive the ensemble
# weights, and the ensemble is then evaluated on the same test set — consider
# deriving the weights from a separate validation split.
m1_score, m2_score, m3_score = (
    fitted.score(test_x, test_y) for fitted in (model1, model2, model3)
)
print(m1_score, m2_score, m3_score)

0.5551582433748833 0.5007936087911619 0.5734975681543739


In [36]:
# Tabulate the scores; the row labels 1, 2, 3 identify model1, model2, model3
index_ = list(range(1, 4))
test_r2 = [m1_score, m2_score, m3_score]
rank_eval = pd.DataFrame(data={'Score': test_r2}, index=index_)

# Show the table
rank_eval.head()

Unnamed: 0,Score
1,0.555158
2,0.500794
3,0.573498


In [37]:
# Order the models from the lowest score to the highest
sorted_rank = rank_eval.sort_values(by='Score', ascending=True)

# Show the ordered table
sorted_rank.head()

Unnamed: 0,Score
2,0.500794
1,0.555158
3,0.573498


In [38]:
# Create a new column rank
# The rows are already sorted by ascending score, so the row position
# (1 = lowest score) is the rank. The upper bound comes from the frame length
# rather than a hard-coded model count, so adding a model needs no change here.
sorted_rank['Rank'] = list(range(1, len(sorted_rank) + 1))

# Check the data
sorted_rank.head()

Unnamed: 0,Score,Rank
2,0.500794,1
1,0.555158,2
3,0.573498,3


In [39]:
# Create a weight column: each rank divided by the sum of all ranks,
# so the weights add up to 1
rank_total = sorted_rank['Rank'].sum()
sorted_rank['Weight'] = sorted_rank['Rank'] / rank_total

# Show the table with the new column
sorted_rank.head()

Unnamed: 0,Score,Rank,Weight
2,0.500794,1,0.166667
1,0.555158,2,0.333333
3,0.573498,3,0.5


In [42]:
# Calculate the weighted predictions based on rank
# Row labels 1, 2, 3 identify model1, model2, model3. .loc[label, 'Weight']
# returns the scalar weight directly; wrapping a (1, 1) array in float() is a
# deprecated conversion in recent NumPy releases.
wt_pred1 = pred1 * sorted_rank.loc[1, 'Weight']
wt_pred2 = pred2 * sorted_rank.loc[2, 'Weight']
wt_pred3 = pred3 * sorted_rank.loc[3, 'Weight']

# Sum of the weighted predictions = rank-weighted ensemble prediction
ranked_prediction = wt_pred1 + wt_pred2 + wt_pred3
ranked_prediction[:10]

array([2331.19418533, 2760.66363737, 1452.6721998 , 2349.0971937 ,
       3115.66284335, 3712.51895492, 5472.92703268, 4798.58212741,
       -204.90663208, 1503.48990218])

In [43]:
# R2 of the rank-weighted ensemble on the test set
rank_r2 = r2_score(test_y, ranked_prediction)
print('R2-Score:', round(rank_r2, 3))

R2-Score: 0.583
