In [1]:
from sklearn import datasets
import numpy as np
import pandas as pd
from scipy import stats

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

from IPython.display import Image
from IPython.core.display import HTML

from sklearn.model_selection import cross_val_score, train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import StackingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import VotingClassifier

In [2]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Global plot appearance: ggplot base style and a 15x10 default figure size.
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (15, 10)
# NOTE(review): sns.set() runs last and may override some of the settings
# above, depending on the seaborn version — confirm the final look is intended.
sns.set(style="ticks")

# Iris Dataset

In [3]:
# Load the iris dataset bundled with scikit-learn; later cells read its
# `data`, `target`, `target_names` and `feature_names` fields.
iris = datasets.load_iris()

In [4]:
# Tabular view of the dataset: one float column per feature plus a 'class'
# column holding the species name. The frame is built from the numeric matrix
# directly so the measurements are never coerced to strings and parsed back.
iris_df = pd.DataFrame(iris['data'], columns=iris['feature_names'])
# Map the integer targets to their human-readable species names.
iris_df['class'] = iris.target_names[iris.target]

In [5]:
# Sanity check: (rows, columns) of the assembled frame.
iris_df.shape

(150, 5)

In [6]:
# Summary statistics (count, mean, std, quartiles) of the numeric feature columns.
iris_df.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [None]:
# Number of samples per class, to check how balanced the dataset is.
iris_df.groupby('class').size()

In [None]:
# Pairwise scatter plots of the features, coloured by class, with KDE curves
# on the diagonal.
sns.pairplot(iris_df, hue="class", diag_kind="kde")
plt.show()

# Treinamento dos modelos isoladamente

In [None]:
# Training data for the experiments below. Only the first two feature columns
# (sepal length and sepal width) are kept; y holds the integer class labels.
X, y = iris.data[:, 0:2], iris.target

In [None]:
def train_and_report(models, X, y, cv=5, scoring='accuracy'):
    """Cross-validate every model and print one summary line per model.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator accepted by ``cross_val_score``.
    X, y : array-like
        Feature matrix and target labels passed to ``cross_val_score``.
    cv : int, default 5
        Cross-validation setting forwarded to ``cross_val_score``.
    scoring : str, default 'accuracy'
        Scoring metric forwarded to ``cross_val_score``.

    Returns
    -------
    None
        Results are only printed, so calling this as the last expression of a
        cell does not add an extra output.
    """
    for name, model in models.items():
        scores = cross_val_score(model, X, y, cv=cv, scoring=scoring)
        # The reported spread is the standard deviation of the per-fold scores.
        print("Accuracy: %.3f (+/- %.3f) [%s]" % (scores.mean(), scores.std(), name))

In [None]:
# Registry of the individual classifiers, keyed by a short display name.
models = {}
models['LR'] = LogisticRegression()
models['LDA'] = LinearDiscriminantAnalysis()
models['KNN'] = KNeighborsClassifier()
models['CART'] = DecisionTreeClassifier(random_state=13)
models['NB'] = GaussianNB()
# probability=True enables predict_proba (needed for soft voting and stacking);
# it is seeded like the other stochastic models so results are reproducible.
models['SVC'] = SVC(probability=True, random_state=13)
# The wrapped estimator is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` across scikit-learn releases, while the
# first positional slot works on both.
models['KNN_BAGGING'] = BaggingClassifier(models['KNN'], n_estimators=10, random_state=13)
models['CART_BAGGING'] = BaggingClassifier(models['CART'], n_estimators=10, random_state=13)
models['RF'] = RandomForestClassifier(random_state=13)
# NOTE(review): boosting an unpruned decision tree may add little over the
# tree itself — consider a depth-limited base tree.
models['ADABOOST_10'] = AdaBoostClassifier(models['CART'], n_estimators=10, random_state=13)

In [None]:
# Baseline: cross-validated accuracy of each individual model.
train_and_report(models, X, y)

# Votação

In [None]:
# Snapshot of the (name, estimator) pairs currently registered in `models`,
# in the list-of-tuples form expected by VotingClassifier.
clfs = list(models.items())

## Maioria

In [None]:
# Hard voting: the ensemble predicts the class chosen by most of the
# estimators in `clfs`. It is registered in `models`, so later evaluations of
# `models` include it.
models['MAJORITY_VOTING_STACKING'] = VotingClassifier(estimators=clfs, voting='hard')

In [None]:
# Re-evaluate every registered model, now including the hard-voting ensemble.
train_and_report(models, X, y)

In [None]:
# Soft voting: the ensemble combines the predicted class probabilities of the
# estimators in `clfs` instead of counting their votes.
models['WEIGHT_AVG_STACKING'] = VotingClassifier(estimators=clfs, voting='soft')

In [None]:
# Re-evaluate every registered model, now including the soft-voting ensemble.
train_and_report(models, X, y)

In [None]:
def models_correlation(models, X=None, y=None, validation_size=0.20, seed=13):
    """Plot how strongly the models' hold-out predictions correlate.

    Each model is fitted on a training split and asked to predict the held-out
    validation split; the pairwise correlation of those prediction vectors is
    drawn as an annotated heatmap.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator exposing ``fit`` and ``predict``.
        The estimators are fitted in place.
    X, y : array-like, optional
        Data to split. When either is omitted, the full iris dataset
        (``iris.data`` / ``iris.target``) is used.
    validation_size : float, default 0.20
        Value passed to ``train_test_split`` as ``test_size``.
    seed : int, default 13
        Random state of the train/validation split.

    Returns
    -------
    None
        The heatmap is displayed with ``plt.show()``.
    """
    if X is None or y is None:
        X, y = iris.data, iris.target
    # The validation labels are not needed: only the models' predictions are
    # compared with each other.
    X_train, X_validation, y_train, _ = train_test_split(
        X, y, test_size=validation_size, random_state=seed)

    predictions = {}
    for name, model in models.items():
        model.fit(X_train, y_train)
        predictions[name] = model.predict(X_validation)

    # One column per model, in the insertion order of `models`.
    predictions_df = pd.DataFrame(predictions)
    sns.heatmap(predictions_df.corr(), annot=True, cmap='coolwarm')
    plt.show()

In [None]:
# Show the correlation between the predictions of all registered models, to
# help pick models for the ensembles below.
models_correlation(models)

In [None]:
#stacking_models_names = ['CART', 'LR', 'NB']
# Base learners for the ensembles below. By this point `models` already holds
# the two VotingClassifier ensembles; they are excluded so that ensembles are
# not nested inside each other (the hard-voting one has no predict_proba and
# would break soft voting). To try a subset instead, replace this with an
# explicit list, e.g. ['CART', 'LR', 'NB'].
stacking_models_names = [name for name, clf in models.items()
                         if not isinstance(clf, VotingClassifier)]
stacking_models = {name: models[name] for name in stacking_models_names}
stacking_clfs = [(name, models[name]) for name in stacking_models_names]

In [None]:
# Hard-voting ensemble built from the selected (name, estimator) pairs.
stacking_models['MAJORITY_VOTING_STACKING'] = VotingClassifier(estimators=stacking_clfs, voting='hard')

In [None]:
# Evaluate the selected models together with the hard-voting ensemble.
train_and_report(stacking_models, X, y)

## Ponderado

In [None]:
# Soft-voting ensemble built from the selected (name, estimator) pairs; every
# estimator in `stacking_clfs` must be able to produce class probabilities.
stacking_models['WEIGHT_AVG_STACKING'] = VotingClassifier(estimators=stacking_clfs, voting='soft')

In [None]:
# Evaluate the selected models together with both voting ensembles.
train_and_report(stacking_models, X, y)

# Meta-Classificador

In [None]:
# Show the stacking overview diagram from the mlxtend user guide. The image
# is referenced by URL rather than stored with the notebook.
Image(url= "https://rasbt.github.io/mlxtend/user_guide/classifier/StackingClassifier_files/stackingclassification_overview.png")

In [None]:
# Level-0 learners of the stack. The voting ensembles registered in
# `stacking_models` are left out: with use_probas=True every level-0 learner
# must provide predict_proba, which the hard-voting ensemble does not, and the
# soft-voting ensemble only re-wraps the same base models.
sclf_base_classifiers = [clf for clf in stacking_models.values()
                         if not isinstance(clf, VotingClassifier)]

# Meta-classifier stacking: a logistic regression is trained on the
# (non-averaged) class probabilities predicted by the level-0 learners.
sclf = StackingClassifier(classifiers=sclf_base_classifiers,
                          use_probas=True,
                          average_probas=False,
                          meta_classifier=models['LR'])

In [None]:
# Register the stacked classifier so it is evaluated alongside the others.
stacking_models['SCLF'] = sclf

In [None]:
train_and_report(stacking_models, bc_X, bc_y)

In [None]:
bc = datasets.load_breast_cancer()
bc_X, bc_y = bc.data, bc.target

In [None]:
# Evaluate every model registered in `models` on the breast cancer data.
train_and_report(models, bc_X, bc_y)