# **Ensemble Methods- Bagging and Boosting** #

In [1]:
import warnings
from decimal import Decimal
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier,GradientBoostingClassifier
from xgboost import XGBClassifier
warnings.simplefilter("ignore")


In [2]:
df_comb = pd.read_csv("Dataset/dis_sym_dataset_comb.csv")
X = df_comb.iloc[:, 1:]
Y = df_comb.iloc[:, 0:1]

In [3]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.10)

In [4]:
accuracy_list = []
cross_accuracy_list = []
model_list = []

## 1. Decision Tree

In [22]:
dt = DecisionTreeClassifier()
dt = dt.fit(X, Y)
# prediction of labels for the test data
dt_pred = dt.predict(x_test)
acc_dt = round(Decimal(accuracy_score(y_test, dt_pred) * 100), 2)
accuracy_list.append(acc_dt)
model_list.append("DT")
print(f"Accuracy (DT) : {acc_dt}%")

# Cross Validation Accuracy DT
# performing cross validation with 5 fold splitting
scores_dt = cross_val_score(dt, X, Y, cv=5)
# mean of cross val score (accuracy)
score = round(Decimal(scores_dt.mean() * 100), 2)
cross_accuracy_list.append(score)
print(f"Cross Validation Accuracy (DT): {score}%")

Accuracy (DT) : 92.31%
Cross Validation Accuracy (DT): 83.58%


### 1.1 Bagging for DT

In [23]:
bg = BaggingClassifier(DecisionTreeClassifier(),n_estimators = 10, oob_score = True)
bg.fit(X,Y)
bg_pred = bg.predict(x_test)
acc_bg = round(Decimal(accuracy_score(y_test, bg_pred) * 100), 2)
accuracy_list.append(acc_bg)
model_list.append("BG")
print(f"Accuracy (BG) : {acc_bg}%")
scores_bg = cross_val_score(bg, X, Y, cv=5)
# mean of cross val score (accuracy)
score = round(Decimal(scores_bg.mean() * 100), 2)
cross_accuracy_list.append(score)
print(f"Cross Validation Accuracy (DT): {score}%")

Accuracy (BG) : 92.99%
Cross Validation Accuracy (DT): 87.44%


In [16]:
accuracy_list

[Decimal('91.97'),
 Decimal('91.97'),
 Decimal('91.74'),
 Decimal('91.97'),
 Decimal('91.97'),
 Decimal('92.42')]

### 1.2 Boosting for DT

### 1.2.1 AdaBoost

In [24]:
adb = AdaBoostClassifier(DecisionTreeClassifier(),n_estimators = 10, algorithm="SAMME")
adb.fit(X,Y)
adb_pred = adb.predict(x_test)
acc_adb = round(Decimal(accuracy_score(y_test, adb_pred) * 100), 2)
accuracy_list.append(acc_adb)
model_list.append("ADB")
print(f"Accuracy (ADB) : {acc_adb}%")
scores_adb = cross_val_score(adb, X, Y, cv=5)
# mean of cross val score (accuracy)
score = round(Decimal(scores_adb.mean() * 100), 2)
cross_accuracy_list.append(score)
print(f"Cross Validation Accuracy (ADB): {score}%")

Accuracy (ADB) : 92.19%
Cross Validation Accuracy (ADB): 85.40%


In [22]:
accuracy_list

[Decimal('91.97'),
 Decimal('91.97'),
 Decimal('91.74'),
 Decimal('91.97'),
 Decimal('91.97'),
 Decimal('92.42'),
 Decimal('92.31'),
 Decimal('92.31')]

## *GradientBoosting

In [29]:
gdb = GradientBoostingClassifier(n_estimators = 200, learning_rate = 0.1)
gdb.fit(X,Y)
gdb_pred = gdb.predict(x_test)
acc_gdb = round(Decimal(accuracy_score(y_test, gdb_pred) * 100), 2)
accuracy_list.append(acc_gdb)
model_list.append("GDB")
print(f"Accuracy (GDB) : {acc_gdb}%")
scores_gdb = cross_val_score(gdb, X, Y, cv=5)
# mean of cross val score (accuracy)2
score = round(Decimal(scores_gdb.mean() * 100), 2)
cross_accuracy_list.append(score)
print(f"Cross Validation Accuracy (GDB): {score}%")

Accuracy (GDB) : 68.89%


KeyboardInterrupt: 

## *XGB Classifier

In [34]:
xgb = XGBClassifier(n_estimators = 100, n_jobs = 4)
xgb.fit(X,Y)
xgb_pred = xgb.predict(x_test)
acc_xgb = round(Decimal(accuracy_score(y_test, xgb_pred) * 100), 2)
accuracy_list.append(acc_xgb)
model_list.append("XGB")
print(f"Accuracy (XGB) : {acc_xgb}%")
scores_xgb = cross_val_score(xgb, X, Y, cv=5)
# mean of cross val score (accuracy)2
score = round(Decimal(scores_xgb.mean() * 100), 2)
cross_accuracy_list.append(score)
print(f"Cross Validation Accuracy (XGB): {score}%")

Accuracy (XGB) : 84.62%
Cross Validation Accuracy (XGB): 80.25%


## 2. Logistic Regression 
Note: Bagging and Boosting won't make much effect on LR, since, LR is by default a stable model unlike decision trees and overfitting is generally avoided. 

In [25]:
lr = LogisticRegression()
lr = lr.fit(X, Y)
# prediction of labels for the test data
lr_pred = lr.predict(x_test)
acc_lr = round(Decimal(accuracy_score(y_test, lr_pred) * 100), 2)
accuracy_list.append(acc_lr)
model_list.append("LR")
print(f"Accuracy (LR) : {acc_lr}%")

# Cross Validation Accuracy LR
# performing cross validation with 5 different splits
scores_lr = cross_val_score(lr, X, Y, cv=5)
# mean of cross val score (accuracy)
score = round(Decimal(scores_lr.mean() * 100), 2)
cross_accuracy_list.append(score)
print(f"Cross Validation Accuracy (LR): {score}%")

Accuracy (LR) : 92.08%
Cross Validation Accuracy (LR): 89.19%


### 2.1 Bagging for LR

In [26]:
bg2 = BaggingClassifier(LogisticRegression(), oob_score = True)
bg2.fit(X,Y)
bg2_pred = bg2.predict(x_test)
acc_bg2 = round(Decimal(accuracy_score(y_test, bg2_pred) * 100), 2)
accuracy_list.append(acc_bg2)
model_list.append("BG_LR")
print(f"Accuracy (BG_LR) : {acc_bg2}%")
scores_bg2 = cross_val_score(bg2, X, Y, cv=5)
# mean of cross val score (accuracy)
score2 = round(Decimal(scores_bg2.mean() * 100), 2)
cross_accuracy_list.append(score2)
print(f"Cross Validation Accuracy (LR): {score2}%")

Accuracy (BG_LR) : 91.74%
Cross Validation Accuracy (LR): 89.30%


### 2.2 Boosting for LR  AdaBoost/LogitBoost

In [43]:
pip install logitboost


The following command must be run outside of the IPython shell:

    $ pip install logitboost

The Python package manager (pip) can only be used from outside of IPython.
Please reissue the `pip` command in a separate terminal or command prompt.

See the Python documentation for more information on how to install packages:

    https://docs.python.org/3/installing/


In [53]:
adb2 = AdaBoostClassifier(LogisticRegression(),n_estimators = 200, algorithm="SAMME")
adb2.fit(X,Y)
adb_pred2 = adb2.predict(x_test)
acc_adb2 = round(Decimal(accuracy_score(y_test, adb_pred2) * 100), 2)
accuracy_list.append(acc_adb2)
model_list.append("ADB_LR")
print(f"Accuracy (ADB_LR) : {acc_adb2}%")
scores_adb2 = cross_val_score(adb2, X, Y, cv=5)
# mean of cross val score (accuracy)
score2 = round(Decimal(scores_adb2.mean() * 100), 2)
cross_accuracy_list.append(score2)
print(f"Cross Validation Accuracy (ADB): {score2}%")

Accuracy (ADB_LR) : 75.11%
Cross Validation Accuracy (ADB): 70.22%


## 3. Support Vector Machines 

In [6]:
svm = SVC()
svm = svm.fit(X, Y)
# prediction of labels for the test data
svm_pred = svm.predict(x_test)
acc_svm = round(Decimal(accuracy_score(y_test, svm_pred) * 100), 2)
accuracy_list.append(acc_svm)
model_list.append("SVM")
print(f"Accuracy (SVM) : {acc_svm}%")

# Cross Validation Accuracy SVM
# performing cross validation with 5 different splits
scores_svm = cross_val_score(svm, X, Y, cv=5)
# mean of cross val score (accuracy)
score = round(Decimal(scores_svm.mean() * 100), 2)
cross_accuracy_list.append(score)
print(f"Cross Validation Accuracy (SVM): {score}%")

Accuracy (SVM) : 91.40%
Cross Validation Accuracy (SVM): 88.62%


### 3.1 Bagging for SVC

In [8]:
bg3 = BaggingClassifier(SVC(), n_estimators = 10)
bg3.fit(X,Y)
bg3_pred = bg3.predict(x_test)
acc_bg3 = round(Decimal(accuracy_score(y_test, bg3_pred) * 100), 2)
accuracy_list.append(acc_bg3)
model_list.append("BG_SVC")
print(f"Accuracy (BG_SVC) : {acc_bg3}%")
scores_bg3 = cross_val_score(bg3, X, Y, cv=5)
# mean of cross val score (accuracy)
score3 = round(Decimal(scores_bg3.mean() * 100), 2)
cross_accuracy_list.append(score3)
print(f"Cross Validation Accuracy (BG_SVC): {score3}%")

Accuracy (BG_SVC) : 0.00%


KeyboardInterrupt: 

## 4.  Multilayer Perceptron Classifier

In [13]:
mlp = MLPClassifier(hidden_layer_sizes=(32, 32, 32), activation='relu', solver='adam', max_iter=50)
mlp = mlp.fit(X,Y)
# prediction of labels for the test data
mlp_pred = mlp.predict(x_test)
acc_mlp = round(Decimal(accuracy_score(y_test, mlp_pred) * 100), 2)
accuracy_list.append(acc_mlp)
model_list.append("MLP")
print(f"Accuracy (MLP) : {acc_mlp}%")

# Cross Validation Accuracy MLP
# performing cross validation with 5 different splits
scores_mlp = cross_val_score(mlp, X, Y, cv=5)
# mean of cross val score (accuracy)
score = round(Decimal(scores_mlp.mean() * 100), 2)
cross_accuracy_list.append(score)
print(f"Cross Validation Accuracy (MLP): {score}%")

Accuracy (MLP) : 92.53%
Cross Validation Accuracy (MLP): 87.20%


### 4.1 Bagging for MLP

In [7]:
bg4 = BaggingClassifier(MLPClassifier(hidden_layer_sizes=(32, 32, 32), activation='relu', solver='adam', max_iter=50), n_estimators = 10)
bg4.fit(X,Y)
bg4_pred = bg4.predict(x_test)
acc_bg4 = round(Decimal(accuracy_score(y_test, bg4_pred) * 100), 2)
accuracy_list.append(acc_bg4)
model_list.append("BG_SVC")
print(f"Accuracy (BG_SVC) : {acc_bg4}%")
scores_bg4 = cross_val_score(bg4, X, Y, cv=5)
# mean of cross val score (accuracy)
score4 = round(Decimal(scores_bg4.mean() * 100), 2)
cross_accuracy_list.append(score4)
print(f"Cross Validation Accuracy (BG_MLP): {score4}%")

Accuracy (BG_SVC) : 91.63%
Cross Validation Accuracy (BG_MLP): 88.66%


### 4.2 Boosting for MLP

In [29]:
import numpy as np
class customMLPClassifer(MLPClassifier):
    def fit(self, X, y, sample_weight=None):
        return self._fit(X, y, incremental=(self.warm_start and
                                            hasattr(self)))

adabooster = AdaBoostClassifier(base_estimator=customMLPClassifer())

adabooster.fit(X,Y)
adb_pred4 = adb4.predict(x_test)
acc_adb4 = round(Decimal(accuracy_score(y_test, adb_pred4) * 100), 2)
accuracy_list.append(acc_adb4)
model_list.append("ADB_MLP")
print(f"Accuracy (ADB_MLP) : {acc_adb4}%")
scores_adb4 = cross_val_score(adb4, X, Y, cv=5)
# mean of cross val score (accuracy)
score4 = round(Decimal(scores_adb4.mean() * 100), 2)
cross_accuracy_list.append(score4)
print(f"Cross Validation Accuracy (ADB): {score4}%")

AttributeError: 'AdaBoostClassifier' object has no attribute 'n_classes_'

## 5. K nearest Neighbors

In [14]:
knn = KNeighborsClassifier(n_neighbors=7, weights='distance', n_jobs=4)
knn = knn.fit(X, Y)
# prediction of labels for the test data
knn_pred = knn.predict(x_test)
acc_knn = round(Decimal(accuracy_score(y_test, knn_pred) * 100), 2)
accuracy_list.append(acc_knn)
model_list.append("KNN")
print(f"Accuracy (KNN) : {acc_knn}%")

# Cross Validation Accuracy KNN
# performing cross validation with 5 different splits
scores_knn = cross_val_score(knn, X, Y, cv=5)
# mean of cross val score (accuracy)
score = round(Decimal(scores_knn.mean() * 100), 2)
cross_accuracy_list.append(score)
print(f"Cross Validation Accuracy (KNN): {score}%")

Accuracy (KNN) : 91.63%
Cross Validation Accuracy (KNN): 85.25%


### 5.1 Bagging for KNN

In [16]:
bg5 = BaggingClassifier(KNeighborsClassifier(n_neighbors=7, weights='distance', n_jobs=4), n_estimators = 10)
bg5.fit(X,Y)
bg5_pred = bg5.predict(x_test)
acc_bg5 = round(Decimal(accuracy_score(y_test, bg5_pred) * 100), 2)
accuracy_list.append(acc_bg5)
model_list.append("BG_KNN")
print(f"Accuracy (BG_KNN) : {acc_bg5}%")
scores_bg5 = cross_val_score(bg5, X, Y, cv=5)
# mean of cross val score (accuracy)
score5 = round(Decimal(scores_bg5.mean() * 100), 2)
cross_accuracy_list.append(score5)
print(f"Cross Validation Accuracy (BG_MLP): {score5}%")

Accuracy (BG_KNN) : 92.42%
Cross Validation Accuracy (BG_MLP): 87.78%


### 5.2 Boosting for KNN

In [18]:
adb5 = AdaBoostClassifier(KNeighborsClassifier(n_neighbors=7, weights='distance', n_jobs=4),n_estimators = 10, algorithm="SAMME")
adb5.fit(X,Y)
adb_pred5 = adb5.predict(x_test)
acc_adb5 = round(Decimal(accuracy_score(y_test, adb_pred5) * 100), 2)
accuracy_list.append(acc_adb5)
model_list.append("ADB_KNN")
print(f"Accuracy (ADB_KNN) : {acc_adb5}%")
scores_adb5 = cross_val_score(adb5, X, Y, cv=5)
# mean of cross val score (accuracy)
score5 = round(Decimal(scores_adb5.mean() * 100), 2)
cross_accuracy_list.append(score5)
print(f"Cross Validation Accuracy (ADB_KNN): {score5}%")

ValueError: KNeighborsClassifier doesn't support sample_weight.

## 6. Multinomial Naive Bayes

In [19]:
mnb = MultinomialNB()
mnb = mnb.fit(X, Y)
# prediction of labels for the test data
mnb_pred = mnb.predict(x_test)
# calculation of accuracy score based on predictions performed
# converting to Decimal as rounding with float is inaccurate
acc_mnb = round(Decimal(accuracy_score(y_test, mnb_pred) * 100), 2)
accuracy_list.append(acc_mnb)
model_list.append("MNB")
print(f"Accuracy (MNB) : {acc_mnb}%")

# Cross Validation Accuracy MNB
# performing cross validation with 5 different splits
scores_mnb = cross_val_score(mnb, X, Y, cv=5)
# mean of cross val score (accuracy)
score = round(Decimal(scores_mnb.mean() * 100), 2)
cross_accuracy_list.append(score)
print(f"Cross Validation Accuracy (MNB): {score}%")

Accuracy (MNB) : 86.43%
Cross Validation Accuracy (MNB): 84.50%


### 6.1 Bagging for MNB

In [20]:
bg6 = BaggingClassifier(MultinomialNB(), n_estimators = 10)
bg6.fit(X,Y)
bg6_pred = bg6.predict(x_test)
acc_bg6 = round(Decimal(accuracy_score(y_test, bg6_pred) * 100), 2)
accuracy_list.append(acc_bg6)
model_list.append("BG_KNN")
print(f"Accuracy (BG_KNN) : {acc_bg6}%")
scores_bg6 = cross_val_score(bg6, X, Y, cv=5)
# mean of cross val score (accuracy)
score6 = round(Decimal(scores_bg6.mean() * 100), 2)
cross_accuracy_list.append(score6)
print(f"Cross Validation Accuracy (BG_MNB): {score6}%")

Accuracy (BG_KNN) : 87.33%
Cross Validation Accuracy (BG_MNB): 84.19%


### 6.2 Boosting for MNB

In [22]:
adb6 = AdaBoostClassifier(MultinomialNB(),n_estimators = 10, algorithm="SAMME")
adb6.fit(X,Y)
adb_pred6 = adb6.predict(x_test)
acc_adb6 = round(Decimal(accuracy_score(y_test, adb_pred6) * 100), 2)
accuracy_list.append(acc_adb6)
model_list.append("ADB_MNB")
print(f"Accuracy (ADB_MNB) : {acc_adb6}%")
scores_adb6 = cross_val_score(adb6, X, Y, cv=5)
# mean of cross val score (accuracy)
score6 = round(Decimal(scores_adb6.mean() * 100), 2)
cross_accuracy_list.append(score6)
print(f"Cross Validation Accuracy (ADB_MNB): {score6}%")

Accuracy (ADB_KNN) : 21.04%
Cross Validation Accuracy (ADB_KNN): NaN%


In [5]:
df_comb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8835 entries, 0 to 8834
Columns: 490 entries, label_dis to yellowish skin crust
dtypes: int64(489), object(1)
memory usage: 33.0+ MB


In [6]:
df_comb

Unnamed: 0,label_dis,abdominal cramp,abdominal distention,abnormal behavior,abnormal bleeding,abnormal sensation,abnormally frequent,abscess,aching,acne,...,wet,wheezing,white patch vaginal discharge,widespread pain,withdrawal occurring stopping,worrying,yellow skin,yellowish coloration skin white eye,yellowish skin,yellowish skin crust
0,Abscess,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Abscess,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Abscess,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Abscess,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Abscess,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,Abscess,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,Abscess,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,Acquired Capillary Haemangioma of Eyelid,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,Acquired Immuno Deficiency Syndrome,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,Acute encephalitis syndrome,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
