# Ensemble Learning
Ensemble Learning adalah pendekatan meta umum untuk pembelajaran mesin yang mencari kinerja prediktif yang lebih baik dengan menggabungkan prediksi dari beberapa model.

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from time import time
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Imputer
import statistics as st
import warnings
warnings.filterwarnings('ignore')

In [6]:
# Download the breast-cancer-wisconsin data file from the UCI repository.
# It is read with header=None, so column names are assigned explicitly below.
data = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data',
    header=None,
)
data.columns = [
    'Id number',
    'Clump Thickness',
    'Uniformity of Cell Size',
    'Uniformity of Cell Shape',
    'Marginal Adhesion',
    'Single Epithelial Cell Size',
    'Bare Nuclei',
    'Bland Chromatin',
    'Normal Nucleoli',
    'Mitoses',
    'Class',
]

# Coerce every column to a numeric dtype; values that cannot be parsed
# become NaN instead of raising.
data2 = data.apply(pd.to_numeric, errors='coerce')

# Rebind `data` to the complete rows only (any row containing NaN is dropped).
data = data2.dropna()

# Display the first few records
data.head(5)

Unnamed: 0,Id number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1.0,3,1,1,2
1,1002945,5,4,4,5,7,10.0,3,2,1,2
2,1015425,3,1,1,1,2,2.0,3,1,1,2
3,1016277,6,8,8,1,3,4.0,3,7,1,2
4,1017023,4,1,1,3,2,1.0,3,1,1,2


In [7]:
# The record identifier is not a predictive feature; remove it in place.
data.drop(columns=['Id number'], inplace=True)

In [8]:
# Preview the first rows of the frame after the 'Id number' column was dropped.
data.head()

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,5,1,1,1,2,1.0,3,1,1,2
1,5,4,4,5,7,10.0,3,2,1,2
2,3,1,1,1,2,2.0,3,1,1,2
3,6,8,8,1,3,4.0,3,7,1,2
4,4,1,1,3,2,1.0,3,1,1,2


In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
data.describe()

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
count,683.0,683.0,683.0,683.0,683.0,683.0,683.0,683.0,683.0,683.0
mean,4.442167,3.150805,3.215227,2.830161,3.234261,3.544656,3.445095,2.869693,1.603221,2.699854
std,2.820761,3.065145,2.988581,2.864562,2.223085,3.643857,2.449697,3.052666,1.732674,0.954592
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0
25%,2.0,1.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0,2.0
50%,4.0,1.0,1.0,1.0,2.0,1.0,3.0,1.0,1.0,2.0
75%,6.0,5.0,5.0,4.0,4.0,6.0,5.0,4.0,1.0,4.0
max,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,4.0


In [10]:
# Print the index range, per-column non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 683 entries, 0 to 698
Data columns (total 10 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Clump Thickness              683 non-null    int64  
 1   Uniformity of Cell Size      683 non-null    int64  
 2   Uniformity of Cell Shape     683 non-null    int64  
 3   Marginal Adhesion            683 non-null    int64  
 4   Single Epithelial Cell Size  683 non-null    int64  
 5   Bare Nuclei                  683 non-null    float64
 6   Bland Chromatin              683 non-null    int64  
 7   Normal Nucleoli              683 non-null    int64  
 8   Mitoses                      683 non-null    int64  
 9   Class                        683 non-null    int64  
dtypes: float64(1), int64(9)
memory usage: 58.7 KB


In [11]:
# Inspect the 'Bare Nuclei' column before the '?' replacement below.
data['Bare Nuclei']

0       1.0
1      10.0
2       2.0
3       4.0
4       1.0
       ... 
694     2.0
695     1.0
696     3.0
697     4.0
698     5.0
Name: Bare Nuclei, Length: 683, dtype: float64

In [12]:
# Replace any literal '?' placeholder with 0, modifying `data` in place.
# NOTE(review): `data` was already passed through
# pd.to_numeric(errors='coerce') and dropna() when it was loaded, so no '?'
# strings should remain and this is expected to be a no-op - confirm and
# consider removing.
data.replace('?',0, inplace=True)

In [13]:
# Inspect the 'Bare Nuclei' column again after the '?' replacement above.
data['Bare Nuclei']

0       1.0
1      10.0
2       2.0
3       4.0
4       1.0
       ... 
694     2.0
695     1.0
696     3.0
697     4.0
698     5.0
Name: Bare Nuclei, Length: 683, dtype: float64

In [18]:
# `Imputer` was deprecated and later removed from sklearn.preprocessing;
# `SimpleImputer` from sklearn.impute is its replacement. With default
# arguments it fills NaN entries with the column mean, which matches the
# defaults of the old `Imputer()`.
from sklearn.impute import SimpleImputer

# Work on the underlying NumPy array of the frame (all columns, including
# the 'Class' label column).
values = data.values

# Fill any missing entries column by column.
# NOTE(review): rows containing NaN were already removed with dropna() when
# the data was loaded, so this step is expected to leave the values
# unchanged - confirm whether imputation is still needed.
imputer = SimpleImputer()
imputedData = imputer.fit_transform(values)

Pada kumpulan data yang ada, rentang nilai antarfitur tidak sama. Hal ini dapat menyebabkan masalah: perubahan kecil pada satu fitur mungkin tidak memengaruhi yang lain. Untuk mengatasi masalah ini, rentang fitur perlu dinormalkan ke rentang yang seragam, dalam hal ini 0 - 1.

In [15]:
# Rescale every column of the imputed array to the [0, 1] interval.
# `imputedData` still contains the 'Class' column as its last column, so the
# labels are rescaled as well (column minimum -> 0, column maximum -> 1).
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(imputedData)
normalizedData = scaler.transform(imputedData)

Setelah melakukan semua prapemrosesan yang diperlukan untuk melakukan eksperimen Ensembling, selanjutnya dimulai dengan Ensembling berbasis Bagging. Dalam hal ini akan digunakan Bagged Decision Tree.

In [17]:
from sklearn import model_selection
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

> Mengimpor dependensi untuk Bagged Decision Trees

In [19]:
# Split the normalized array column-wise: the first nine columns are the
# features, the tenth column (index 9) is the rescaled 'Class' label.
X, Y = normalizedData[:, :9], normalizedData[:, 9]

In [20]:
# 10-fold cross-validation over consecutive folds (KFold's default is
# shuffle=False). No random_state is passed: without shuffling it has no
# effect on the splits, and newer scikit-learn releases raise a ValueError
# when it is supplied with shuffle=False.
kfold = model_selection.KFold(n_splits=10)

# Bagging ensemble of 100 decision trees. The base learner is passed
# positionally because its keyword was renamed from `base_estimator` to
# `estimator` in newer scikit-learn releases; the positional form works
# with both names.
cart = DecisionTreeClassifier()
num_trees = 100
model = BaggingClassifier(cart, n_estimators=num_trees, random_state=7)

# Score the ensemble on each fold and report the mean score
# (accuracy is the default score for classifiers).
results = model_selection.cross_val_score(model, X, Y, cv=kfold)
print(results.mean())

0.9576726342710998


Langkah pertama yaitu menginisialisasi validasi silang 10 kali lipat. Setelah itu, membuat pengklasifikasi decision tree dan membungkusnya dalam Ensemble berbasis Bagging yang terdiri dari 100 tree.

Akurasi yang dihasilkan sebesar 95,77%.

In [21]:
# AdaBoost Classification

from sklearn.ensemble import AdaBoostClassifier

seed = 7
num_trees = 70

# 10-fold cross-validation over consecutive folds (KFold's default is
# shuffle=False). `seed` is deliberately not passed to KFold: without
# shuffling it has no effect on the splits, and newer scikit-learn releases
# raise a ValueError when random_state is supplied with shuffle=False.
kfold = model_selection.KFold(n_splits=10)

# Boosting ensemble with 70 estimators; the seed keeps the model reproducible.
model = AdaBoostClassifier(n_estimators=num_trees, random_state=seed)

# Score the ensemble on each fold and report the mean score.
results = model_selection.cross_val_score(model, X, Y, cv=kfold)
print(results.mean())

0.961999147485081


Langkah selanjutnya, melakukan klasifikasi AdaBoost (dengan 70 tree) yang didasarkan pada jenis Ensembling Boosting. Akurasi yang dihasilkan yaitu 96,20% untuk validasi silang 10 kali lipat.

> Penerapan teknik Ensemble berbasis Voting

In [22]:
# Voting Ensemble for Classification

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

# 10-fold cross-validation over consecutive folds (KFold's default is
# shuffle=False). No random_state is passed: without shuffling it has no
# effect on the splits, and newer scikit-learn releases raise a ValueError
# when it is supplied with shuffle=False.
kfold = model_selection.KFold(n_splits=10)

# create the sub models: three different classifiers, each with default
# hyper-parameters, paired with the name used inside the ensemble
model1 = LogisticRegression()
model2 = DecisionTreeClassifier()
model3 = SVC()
estimators = [
    ('logistic', model1),
    ('cart', model2),
    ('svm', model3),
]

# create the ensemble model that combines the sub models' predictions by vote
ensemble = VotingClassifier(estimators)

# Score the ensemble on each fold and report the mean score.
results = model_selection.cross_val_score(ensemble, X, Y, cv=kfold)
print(results.mean())

0.9620204603580564


Telah diimplementasikan model Ensemble berbasis Voting yang menggabungkan Logistic Regression, Decision Tree, dan Support Vector Machine untuk pemungutan suara. Model ini memberikan hasil terbaik sejauh ini dengan akurasi 96,20% untuk validasi silang 10 kali lipat, terpaut sangat tipis di atas AdaBoost.