# Breast Cancer - Ensemble

Carregando dados:

In [87]:
import numpy as np
import pandas as pd

scaler = StandardScaler()

data = pd.read_csv('dataR2.csv')
data, labels = data.iloc[:, :-1], data.iloc[:, -1]

display(data.head(4))
display(labels.head(4))
display(labels.unique())

Unnamed: 0,Age,BMI,Glucose,Insulin,HOMA,Leptin,Adiponectin,Resistin,MCP.1
0,48,23.5,70,2.707,0.467409,8.8071,9.7024,7.99585,417.114
1,83,20.690495,92,3.115,0.706897,8.8438,5.429285,4.06405,468.786
2,82,23.12467,91,4.498,1.009651,17.9393,22.43204,9.27715,554.697
3,68,21.367521,77,3.226,0.612725,9.8827,7.16956,12.766,928.22


0    1
1    1
2    1
3    1
Name: Classification, dtype: int64

array([1, 2])

Escalando dados e transformando labels para 0 e 1:

In [88]:
from sklearn.preprocessing import StandardScaler, LabelEncoder

data[:] = scaler.fit_transform(data)

lenc = LabelEncoder()

labels[:] = lenc.fit_transform(labels)
display(lenc.transform(lenc.classes_))


array([0, 1])

Separando os dados para treino e teste:

In [89]:
from sklearn import datasets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test =\
    train_test_split(data, labels,
                     test_size=0.2,
                     random_state=42,
                     stratify=labels)


Criando as pipelines:

In [90]:
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

pipe1 = Pipeline([
    ['sc', StandardScaler()],
    ['clf', DecisionTreeClassifier(
        random_state=43,
    )]
])
pipe2 = Pipeline([
    ['sc', StandardScaler()],
    ['clf', RandomForestClassifier(
        criterion='gini',
        random_state=44,
    )]
])
pipe3 = Pipeline([
    ['sc', StandardScaler()],
    ['clf', AdaBoostClassifier(
        algorithm='SAMME.R',
        random_state=45,
    )]
])


Buscando a pipeline com melhores métricas:

In [91]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=46)

gs1 = GridSearchCV(pipe1, cv=kfold, n_jobs=-1, param_grid=[{
    'clf__max_depth': [30]
}])
gs2 = GridSearchCV(pipe2, cv=kfold, n_jobs=-1, param_grid=[{
    'clf__n_estimators': [50, 100, 200],
    'clf__max_depth': [3, 5, 7]
}])
gs3 = GridSearchCV(pipe3, cv=kfold, n_jobs=-1, param_grid=[{
    'clf__n_estimators': [50, 100, 200],
    'clf__estimator': [DecisionTreeClassifier(max_depth=i, random_state=47) for i in [3, 5, 7]],
    'clf__learning_rate': [0.01, 0.05, 0.1, 0.5]
}])

gs1.fit(X_train, y_train)
gs2.fit(X_train, y_train)
gs3.fit(X_train, y_train)

display(gs1.best_score_)
display(gs1.best_params_)
display(gs2.best_score_)
display(gs2.best_params_)
display(gs3.best_score_)
display(gs3.best_params_)


0.6514619883040936

{'clf__max_depth': 30}

0.7584795321637428

{'clf__max_depth': 3, 'clf__n_estimators': 50}

0.7263157894736841

{'clf__estimator': DecisionTreeClassifier(max_depth=3, random_state=47),
 'clf__learning_rate': 0.01,
 'clf__n_estimators': 100}

Treinando novamente o modelo com melhores hiperparâmetros e
calculando métricas usando dataset de teste:

In [92]:
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score, precision_recall_fscore_support

pipe = Pipeline([
    ['sc', StandardScaler()],
    ['clf', RandomForestClassifier(
        max_depth=3,
        n_estimators=50,
        criterion='gini',
        random_state=48,
    )]
])

pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

lenc_labels = lenc.transform(lenc.classes_)

acc = np.array([accuracy_score(y_test[y_test == i], y_pred[y_test == i])
               for i in lenc_labels])
rec = recall_score(y_test, y_pred, average=None, labels=lenc_labels)
f1 = f1_score(y_test, y_pred, average=None, labels=lenc_labels)
prec = precision_score(y_test, y_pred, average=None, labels=lenc_labels)

print('accuracy (each class): ', acc)
print('recall (each class): ', rec)
print('f1-score (each class): ', f1)
print('precision (each class): ', prec)

print('accuracy (average): ', np.average(acc))
print('recall (average): ', np.average(rec))
print('f1-score (average): ', np.average(f1))
print('precision (average): ', np.average(prec))


accuracy (each class):  [0.72727273 0.61538462]
recall (each class):  [0.72727273 0.61538462]
f1-score (each class):  [0.66666667 0.66666667]
precision (each class):  [0.61538462 0.72727273]
accuracy (average):  0.6713286713286714
recall (average):  0.6713286713286714
f1-score (average):  0.6666666666666667
precision (average):  0.6713286713286714
