# Ensemble - Bagging & Boosting (Homogeneous Model)

In [4]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

In [5]:
# Load the iris dataset and separate the feature matrix from the class labels.
iris = load_iris()
x, y = iris.data[:, :4], iris.target

# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.20, random_state=42
)

# Bagging

In [7]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 100 trees (n_estimators spelled out explicitly) and a fixed seed.
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# fit() returns the fitted estimator, so training and prediction can be chained.
predrf = rf.fit(x_train, y_train).predict(x_test)

# Report accuracy, the confusion matrix and the per-class report, in that order.
for metric in (accuracy_score, confusion_matrix, classification_report):
    print(metric(y_test, predrf))


1.0
[[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



# Boosting

In [8]:
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost with library defaults. The original note listed them as
# base_estimator=DecisionTreeClassifier(), n_estimators=50, learning_rate=1.0,
# algorithm=SAMME.R -- NOTE(review): defaults depend on the installed
# scikit-learn version; confirm against its documentation.
ad = AdaBoostClassifier()
ad_pred = ad.fit(x_train, y_train).predict(x_test)

# Report accuracy, the confusion matrix and the per-class report, in that order.
for metric in (accuracy_score, confusion_matrix, classification_report):
    print(metric(y_test, ad_pred))

1.0
[[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



In [9]:
from sklearn.svm import SVC
# AdaBoost using a support vector classifier as the weak learner.
svc=SVC()
# Alternative weak learner kept from the original notebook:
# svc=SVC(probability=True,kernel='linear')
# NOTE(review): in the recorded run this cell scored 0.3 accuracy with every
# test sample predicted as class 1, so the default SVC() is a poor weak learner
# here; the linear-kernel variant above may be worth comparing.

# The weak learner is passed positionally: the keyword was called
# `base_estimator` in older scikit-learn releases and `estimator` in newer ones
# (the old keyword was later removed), while the first positional slot is the
# weak learner in both. algorithm='SAMME' is kept because SVC() without
# probability=True exposes no predict_proba.
ad=AdaBoostClassifier(svc,n_estimators=50,algorithm='SAMME')
ad.fit(x_train,y_train)
ad_pred=ad.predict(x_test)

print(accuracy_score(y_test,ad_pred))
print(confusion_matrix(y_test,ad_pred))
print(classification_report(y_test,ad_pred))

0.3
[[ 0 10  0]
 [ 0  9  0]
 [ 0 11  0]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        10
           1       0.30      1.00      0.46         9
           2       0.00      0.00      0.00        11

    accuracy                           0.30        30
   macro avg       0.10      0.33      0.15        30
weighted avg       0.09      0.30      0.14        30



In [10]:
from sklearn.ensemble import GradientBoostingClassifier
# Gradient boosting with library defaults, trained on the iris training split.
gb = GradientBoostingClassifier()
gb_pred = gb.fit(x_train, y_train).predict(x_test)

# Report accuracy, the confusion matrix and the per-class report, in that order.
for metric in (accuracy_score, confusion_matrix, classification_report):
    print(metric(y_test, gb_pred))

1.0
[[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



# Voting Classifier - Heterogeneous Model

In [11]:
from sklearn.ensemble import VotingClassifier

In [13]:
# (name, model) pairs that make up the heterogeneous voting ensemble.
# NOTE(review): the label 'LR' is attached to a KNeighborsClassifier, not a
# logistic regression -- confirm which one was intended.
estimator = [
    ('LR', KNeighborsClassifier()),
    # probability=True gives the SVC a predict_proba method.
    ('SVC', SVC(gamma='auto', probability=True)),
    ('DTC', DecisionTreeClassifier()),
]

estimator

[('LR', KNeighborsClassifier()),
 ('SVC', SVC(gamma='auto', probability=True)),
 ('DTC', DecisionTreeClassifier())]

In [14]:
# VotingClassifier with hard voting
vot_hard=VotingClassifier(estimators=estimator,voting='hard')
vot_hard.fit(x_train,y_train)
y_pred=vot_hard.predict(x_test)

print(y_pred)
# Score the predictions against the held-out labels.
score=accuracy_score(y_test,y_pred)
# accuracy_score returns a float; the previous '% d' format truncated it to an
# integer (anything below 1.0 would print as 0), so format it as a float.
print('Hard Voting Score %.4f' % score)

[1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0]
Hard Voting Score  1


In [15]:
# VotingClassifier with soft voting
# Stored under its own name so the hard-voting model from the previous cell is
# not overwritten by a soft-voting one.
vot_soft=VotingClassifier(estimators=estimator,voting='soft')
vot_soft.fit(x_train,y_train)
y_pred=vot_soft.predict(x_test)

print(y_pred)
# Score the predictions against the held-out labels.
score=accuracy_score(y_test,y_pred)
# accuracy_score returns a float; the previous '% d' format truncated it to an
# integer (anything below 1.0 would print as 0), so format it as a float.
print('Soft Voting Score %.4f' % score)

[1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0]
Soft Voting Score  1


# GRIDSEARCHCV

In [19]:
#Grid Search for algorithm tuning

import warnings
warnings.filterwarnings('ignore')
import numpy as np
from sklearn import datasets
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# Load the diabetes dataset that ships with scikit-learn.
dataset = datasets.load_diabetes()

# Candidate values for Ridge's alpha parameter; the grid search tries each one.
alphavalue = dict(alpha=[1, 0.1, 0.01, 0.001, 0.0001, 0])

# Search over the alpha grid with a Ridge model.
model = Ridge()
grid = GridSearchCV(estimator=model, param_grid=alphavalue)
grid.fit(dataset.data, dataset.target)

# Show the search object, then the best score, the winning alpha and the
# winning parameter dict.
for summary in (
    grid,
    grid.best_score_,
    grid.best_estimator_.alpha,
    grid.best_params_,
):
    print(summary)

GridSearchCV(estimator=Ridge(),
             param_grid={'alpha': [1, 0.1, 0.01, 0.001, 0.0001, 0]})
0.48232313841634866
0.0001
{'alpha': 0.0001}


In [21]:
from sklearn.tree import DecisionTreeClassifier
# Compare the two split criteria for a decision tree with 5-fold
# cross-validated accuracy on the full iris data.
dtc = DecisionTreeClassifier()
grid_param = dict(criterion=['gini', 'entropy'])

gd_sr = GridSearchCV(
    estimator=dtc, param_grid=grid_param, scoring='accuracy', cv=5
)
gd_sr.fit(iris.data, iris.target)

# Best parameter setting and its mean cross-validated score.
best_parameters, best_result = gd_sr.best_params_, gd_sr.best_score_
print(best_parameters)
print(best_result)

{'criterion': 'gini'}
0.9600000000000002


In [22]:
# Refit a tree on all iris samples with the criterion hard-coded to 'gini',
# the value printed by the grid search in the recorded run.
dtc = DecisionTreeClassifier(criterion='gini')
dtc.fit(X=iris.data, y=iris.target)

DecisionTreeClassifier()

In [23]:
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV
# Reload iris and grid-search an SVC over kernel type and C.
iris = datasets.load_iris()
svc = svm.SVC()
parameters = dict(kernel=['linear', 'rbf'], C=[1, 10])
clf = GridSearchCV(svc, parameters)
clf.fit(iris.data, iris.target)


GridSearchCV(estimator=SVC(),
             param_grid={'C': [1, 10], 'kernel': ['linear', 'rbf']})

In [24]:
# Show the kernel/C combination selected by the SVC grid search.
print(clf.best_params_)

{'C': 1, 'kernel': 'linear'}


In [25]:
# Refit an SVC on all iris samples with kernel and C hard-coded to the values
# printed by the grid search in the recorded run.
sv = svm.SVC(C=1, kernel='linear')
sv.fit(X=iris.data, y=iris.target)

SVC(C=1, kernel='linear')

In [None]:
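#RandomizedSearchCV is used the same way as GridSearchCV, but it samples a fixed number (n_iter) of parameter settings instead of trying every combination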

In [1]:
#Pipeline
from pandas import read_csv

from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
import warnings
warnings.filterwarnings('ignore')

#load data
# NOTE(review): the recorded run of this cell failed with
# "HTTP Error 404: Not Found". This looks like a repository page path rather
# than a direct raw-file link -- confirm the correct download URL (or point at
# a local copy) before relying on this cell.
url="https://github.com/eliza1210/Practice_project/diabetes.csv"

# NOTE(review): eight labels are given, the last being the target 'class'.
# If the CSV actually has nine columns (a 'skin' column is presumably missing
# between 'pres' and 'test'), pandas would treat the first column as the index
# and shift every label -- verify against the file.
names=['preg','plas','pres','test','mass','pedi','age','class']

#dataframe=read_csv("csv name",names=names)

dataframe=read_csv(url, names=names)

array=dataframe.values

# Features are every column except the last; the last column is the target.
# The previous slice array[:,0:8] covered all eight named columns, so the
# target was also fed to the model as a feature (label leakage).
X=array[:,:-1]
Y=array[:,-1]

#create pipeline: standardise the features, then fit a decision tree
estimators=[]
estimators.append(('standardise',StandardScaler()))
estimators.append(('dtc',DecisionTreeClassifier()))
model=Pipeline(estimators)

#evaluate pipeline with 5-fold cross-validation and report the mean score
results=cross_val_score(model,X,Y,cv=5)
print(results.mean())

HTTPError: HTTP Error 404: Not Found

In [None]:
# (empty cell: the stray `s` that was here referenced an undefined name and
# would raise NameError when the notebook is run top to bottom)