# Bagging

In [88]:
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier

In [89]:
from sklearn.model_selection import train_test_split

In [90]:
from sklearn import datasets
iris = datasets.load_iris()
x_train, x_test, y_train, y_test  = train_test_split(iris.data[:, 0:4], iris.target, test_size = 0.3)

In [91]:
bagging = BaggingClassifier(KNeighborsClassifier(), max_samples = 0.5, max_features = 0.5)

In [92]:
scores = cross_val_score(bagging, x_train, y_train)

In [93]:
scores.mean()

0.97058823529411764

In [94]:
bagging.fit(x_train, y_train)

BaggingClassifier(base_estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
         bootstrap=True, bootstrap_features=False, max_features=0.5,
         max_samples=0.5, n_estimators=10, n_jobs=1, oob_score=False,
         random_state=None, verbose=0, warm_start=False)

In [95]:
pred = bagging.predict(x_test)

In [96]:
from sklearn.metrics import classification_report, confusion_matrix

In [97]:
print(confusion_matrix(y_test, pred))

[[14  0  0]
 [ 0 16  0]
 [ 0  1 14]]


In [98]:
print(classification_report(y_test, pred))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        14
          1       0.94      1.00      0.97        16
          2       1.00      0.93      0.97        15

avg / total       0.98      0.98      0.98        45



In [87]:
# regressione

In [1]:
from sklearn.ensemble import BaggingRegressor

In [None]:
bagging = BaggingRegressor(SGDRegressor(), n_jobs=-1, n_estimators=1000, random_state=101, max_features=0.8)
bagging.fit(x_train, y_train)
mean_absolute_error(y_test, bagging.predict(x_test))

# Boosting

In [99]:
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier

In [100]:
from sklearn.model_selection import train_test_split

In [101]:
from sklearn import datasets
iris = datasets.load_iris()
x_train, x_test, y_train, y_test  = train_test_split(iris.data[:, 0:4], iris.target, test_size = 0.3)

In [102]:
# costruiamo uno stimatore a partire da 100 weak learners

In [103]:
clf = AdaBoostClassifier(n_estimators=100)

In [104]:
scores = cross_val_score(clf, x_train, y_train)

In [105]:
scores.mean()

0.95286195286195285

In [106]:
clf.fit(x_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=100, random_state=None)

In [107]:
pred = clf.predict(x_test)

In [108]:
from sklearn.metrics import classification_report, confusion_matrix

In [109]:
print(confusion_matrix(y_test, pred))

[[15  0  0]
 [ 0 14  1]
 [ 0  0 15]]


In [110]:
print(classification_report(y_test, pred))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        15
          1       1.00      0.93      0.97        15
          2       0.94      1.00      0.97        15

avg / total       0.98      0.98      0.98        45



In [None]:
# regressione

In [None]:
from sklearn.ensemble import AdaBoostRegressor

In [None]:
booster = AdaBoostRegressor(SGDRegressor(), random_state=101, n_estimators=100, learning_rate=0.01)
booster.fit(x_train, y_train)
mean_absolute_error(y_test, booster.predict(x_test))

# Gradient Boosting

In [None]:
# il gradient boostin è una versione potenziata del boosting, basata sulla discesa del gradiente

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
regr = GradientBoostingRegressor(n_estimators=500, learning_rate=0.01, random_state=101)
regr.fit(X_train, y_train)
mean_absolute_error(y_test, regr.predict(X_test))

# Random Forest

In [71]:
from sklearn.ensemble import RandomForestClassifier

In [73]:
forest = RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=1, n_jobs=2)
forest.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=2, oob_score=False, random_state=1,
            verbose=0, warm_start=False)

In [75]:
pred = forest.predict(x_test)

In [76]:
print(classification_report(y_test, pred))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        17
          1       0.79      1.00      0.88        15
          2       1.00      0.69      0.82        13

avg / total       0.93      0.91      0.91        45



In [None]:
# Mettiamo a confronto i modelli (esempio tratto dalla documentazione di scikit-learn)

In [14]:
from sklearn import datasets
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier

iris = datasets.load_iris()
x, y = iris.data[:, 1:3], iris.target

clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GaussianNB()

eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard')

for clf, label in zip([clf1, clf2, clf3, eclf], ['Logistic Regression', 'Random Forest', 'naive Bayes', 'Ensemble']):
    scores = cross_val_score(clf, x, y, cv=5, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

Accuracy: 0.90 (+/- 0.05) [Logistic Regression]
Accuracy: 0.93 (+/- 0.05) [Random Forest]
Accuracy: 0.91 (+/- 0.04) [naive Bayes]
Accuracy: 0.95 (+/- 0.05) [Ensemble]
