# Imports

In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier

from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

import pickle

# Load Data

In [2]:
dbc = load_breast_cancer(as_frame=True)

## Features

In [3]:
features = ["mean texture", "mean symmetry"]
X = dbc["data"][features]
X_all = dbc["data"]
X_all.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


## Target

In [4]:
y = dbc["target"]
y.head()

0    0
1    0
2    0
3    0
4    0
Name: target, dtype: int64

# Split data

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

In [6]:
X_all_train, X_all_test, y_all_train, y_all_test = train_test_split(X_all, y, test_size=.2)

# Classifiers

In [7]:
tree_clf = DecisionTreeClassifier(random_state=42)
log_reg = LogisticRegression(random_state=42)
knn_clf = KNeighborsClassifier(n_neighbors=3)
vsoft_clf = VotingClassifier(
    estimators=[
        ('tree', tree_clf),
        ('log', log_reg),
        ('knn', knn_clf)
    ],
    voting='soft'
)
vhard_clf = VotingClassifier(
    estimators=[
        ('tree', tree_clf),
        ('log', log_reg),
        ('knn', knn_clf)
    ],
    voting='hard'
)

clfs = [tree_clf, log_reg, knn_clf, vsoft_clf, vhard_clf]

## fit estimators

In [8]:
for clf in clfs:
    clf.fit(X_train, y_train)

## get accuracy scores

In [9]:
def get_pred(clf, X_train, X_test):
    y_pred_train = clf.predict(X_train)
    y_pred_test = clf.predict(X_test)
    return y_pred_train, y_pred_test

In [10]:
def get_acc_score(clf, y_train, y_test, y_pred_train, y_pred_test):
    acc_train = accuracy_score(y_train, y_pred_train)
    acc_test = accuracy_score(y_test, y_pred_test)
    return acc_train, acc_test

In [11]:
acc_scores = dict()

In [12]:
for clf in clfs:
    acc = get_acc_score(clf, y_train, y_test, get_pred(clf, X_train, X_test)[0], get_pred(clf, X_train, X_test)[1])
    acc_scores[f"{clf}"] = (clf.__class__.__name__, acc)

for s in acc_scores:
    print(acc_scores[s])

('DecisionTreeClassifier', (1.0, 0.6403508771929824))
('LogisticRegression', (0.6703296703296703, 0.7719298245614035))
('KNeighborsClassifier', (0.8197802197802198, 0.6666666666666666))
('VotingClassifier', (0.9934065934065934, 0.6666666666666666))
('VotingClassifier', (0.9032967032967033, 0.7192982456140351))


## pickle accuracies

In [13]:
vote_acc_list = []

In [14]:
for clf in clfs:
    acc = get_acc_score(clf, y_train, y_test, get_pred(clf, X_train, X_test)[0], get_pred(clf, X_train, X_test)[1])
    vote_acc_list.append(acc)

vote_acc_list

[(1.0, 0.6403508771929824),
 (0.6703296703296703, 0.7719298245614035),
 (0.8197802197802198, 0.6666666666666666),
 (0.9934065934065934, 0.6666666666666666),
 (0.9032967032967033, 0.7192982456140351)]

In [15]:
with open("acc_vote.pkl", "wb") as f:
    pickle.dump(vote_acc_list, f, pickle.HIGHEST_PROTOCOL)

In [16]:
with open("vote.pkl", "wb") as f:
    pickle.dump(clfs, f, pickle.HIGHEST_PROTOCOL)

# Bagging, Pasting, AdaBoost...

In [17]:
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=30,
    bootstrap=True, random_state=42, max_features=2
)

bag_clf50 = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=30,
    max_samples=.5, bootstrap=True, random_state=42, max_features=2
)

pas_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=30,
    bootstrap=False, random_state=42, max_features=2
)

pas_clf50 = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=30,
    max_samples=.5, bootstrap=False, random_state=42, max_features=2
)

frst_clf = RandomForestClassifier(
    n_estimators=30, random_state=42
)

ada_clf = AdaBoostClassifier(n_estimators=30)

grad_clf = GradientBoostingClassifier(n_estimators=30, random_state=42)

clfs2 = [bag_clf, bag_clf50, pas_clf, pas_clf50, frst_clf, ada_clf, grad_clf]

## fit classifiers

In [18]:
for clf in clfs2:
    clf.fit(X_train, y_train)

## get accuracy scores

In [19]:
acc_scores2 = dict()

In [20]:
for clf in clfs2:
    acc = get_acc_score(clf, y_train, y_test, get_pred(clf, X_train, X_test)[0], get_pred(clf, X_train, X_test)[1])
    acc_scores2[f"{clf}"] = (clf.__class__.__name__, acc)

for s in acc_scores:
    print(acc_scores[s])

('DecisionTreeClassifier', (1.0, 0.6403508771929824))
('LogisticRegression', (0.6703296703296703, 0.7719298245614035))
('KNeighborsClassifier', (0.8197802197802198, 0.6666666666666666))
('VotingClassifier', (0.9934065934065934, 0.6666666666666666))
('VotingClassifier', (0.9032967032967033, 0.7192982456140351))


## pickle accuracies

In [21]:
bag_acc_list = []

In [22]:
for clf in clfs2:
    acc = get_acc_score(clf, y_train, y_test, get_pred(clf, X_train, X_test)[0], get_pred(clf, X_train, X_test)[1])
    bag_acc_list.append(acc)

bag_acc_list

[(0.9912087912087912, 0.7017543859649122),
 (0.9164835164835164, 0.6929824561403509),
 (1.0, 0.631578947368421),
 (0.967032967032967, 0.7192982456140351),
 (0.9912087912087912, 0.6929824561403509),
 (0.8, 0.7631578947368421),
 (0.8153846153846154, 0.7543859649122807)]

In [23]:
with open("acc_bag.pkl", "wb") as f:
    pickle.dump(bag_acc_list, f, pickle.HIGHEST_PROTOCOL)

In [24]:
with open("bag.pkl", "wb") as f:
    pickle.dump(clfs2, f, pickle.HIGHEST_PROTOCOL)

# Max Features

In [25]:
fea_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators = 30,
    bootstrap = True, bootstrap_features = False,
    max_samples = 0.5, max_features = 2)

fea = [fea_clf]

## fit the classifier

In [26]:
fea_clf.fit(X_all_train, y_all_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_features=2,
                  max_samples=0.5, n_estimators=30)

## get predictions

In [27]:
y_pred_train_fea = fea_clf.predict(X_all_train)
y_pred_test_fea = fea_clf.predict(X_all_test)

## get accuracies

In [28]:
acc_train_fea = accuracy_score(y_all_train, y_pred_train_fea)
acc_test_fea = accuracy_score(y_all_test, y_pred_test_fea)
acc_train_fea

0.9934065934065934

## pickle the results

In [29]:
fea_acc_list = [acc_train_fea, acc_test_fea]
fea_acc_list

[0.9934065934065934, 0.9649122807017544]

In [30]:
with open("acc_fea.pkl", "wb") as f:
    pickle.dump(fea_acc_list, f, pickle.HIGHEST_PROTOCOL)

In [31]:
with open("fea.pkl", "wb") as f:
    pickle.dump(fea, f, pickle.HIGHEST_PROTOCOL)

# DataFrame

In [32]:
fea_accuracies = []
df_fea = pd.DataFrame(columns=["acc_train", "acc_test", "features"])
df_fea

Unnamed: 0,acc_train,acc_test,features


In [33]:
names = []

for ft in fea_clf.estimators_features_:
    names.append([X_all_train.columns[ft[0]], X_all_train.columns[ft[1]]])

In [34]:
for i in range(len(fea_clf.estimators_)):
    
    X = dbc["data"][names[i]]
    y = dbc["target"]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)
    
    clf.fit(X_train, y_train)
    
    y_pred_train = clf.predict(X_train)
    y_pred_test = clf.predict(X_test)
    
    acc_train = accuracy_score(y_train, y_pred_train)
    acc_test = accuracy_score(y_test, y_pred_test)
    
    df_fea.loc[len(df_fea.index)] = [acc_train, acc_test, names[i]]


## sort dataframe

In [35]:
df_sorted = df_fea.sort_values(by=["acc_test", "acc_train"], ascending=False)

## pickle results

In [36]:
with open("acc_fea_rank.pkl", "wb") as f:
    pickle.dump(df_sorted, f, pickle.HIGHEST_PROTOCOL)