In [1]:
from sklearn import datasets
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
import pickle

def saveToFile(ll, filename):
    """Serialize ``ll`` with pickle into the file at ``filename``.

    Parameters
    ----------
    ll : object
        Any picklable object to store.
    filename : str or path-like
        Destination path. The file is opened in binary write mode, so any
        existing content is replaced.
    """
    with open(filename, mode='wb') as out_stream:
        pickle.dump(ll, out_stream)

def f1_acc(clf, X, y):
    """Return the accuracy of ``clf`` on samples ``X`` with true labels ``y``.

    Despite the name, only the accuracy is returned. The F1 score that used
    to be computed here was never returned or stored, so that dead
    computation has been removed.

    Parameters
    ----------
    clf : fitted estimator
        Any object exposing ``predict(X)``.
    X : array-like
        Samples to predict on.
    y : array-like
        True labels for ``X``.

    Returns
    -------
    The value of ``accuracy_score(y, clf.predict(X))``.
    """
    return accuracy_score(y, clf.predict(X))

# Load the breast-cancer dataset. as_frame=True is requested so that the
# 'data' entry can be indexed by column name when the split is built.
data_breast_cancer = datasets.load_breast_cancer(as_frame=True)

In [2]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for testing, using only two of the features.
# random_state pins the shuffle so a fresh "Restart & Run All" reproduces the
# same split, which keeps the accuracies saved below comparable between runs.
cancer_X_training, cancer_X_test, cancer_y_training, cancer_y_test = train_test_split(
    data_breast_cancer['data'][['mean symmetry', 'mean texture']],
    data_breast_cancer['target'],
    test_size=0.2,
    random_state=42,
)

In [3]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

In [4]:
# Accumulators for the voting experiment:
#   dokladnosc - (train accuracy, test accuracy) pairs, one per model
#   classif    - the corresponding fitted models, in the same order
dokladnosc = []
classif = []

log_clf = LogisticRegression()
dt_clf = DecisionTreeClassifier()
kn_clf = KNeighborsClassifier()

# Each base model is fitted on its own here because the following cells
# score log_clf / dt_clf / kn_clf individually.
for base_model in (log_clf, dt_clf, kn_clf):
    base_model.fit(cancer_X_training, cancer_y_training)

# Hard voting ensemble over the three base models (fitted in a later cell).
hard_voting_members = [('lr', log_clf), ('dt', dt_clf), ('kn', kn_clf)]
voting_clf_hard = VotingClassifier(estimators=hard_voting_members, voting='hard')

In [5]:
# Train the hard-voting ensemble on the training split.
voting_clf_hard.fit(cancer_X_training, cancer_y_training)

VotingClassifier(estimators=[('lr', LogisticRegression()),
                             ('dt', DecisionTreeClassifier()),
                             ('kn', KNeighborsClassifier())])

In [6]:
# Decision tree: record (train accuracy, test accuracy) and keep the model.
dt_train_acc = f1_acc(dt_clf, cancer_X_training, cancer_y_training)
dt_test_acc = f1_acc(dt_clf, cancer_X_test, cancer_y_test)
dokladnosc.append((dt_train_acc, dt_test_acc))
classif.append(dt_clf)

In [7]:
# Logistic regression: record (train accuracy, test accuracy) and keep the model.
log_train_acc = f1_acc(log_clf, cancer_X_training, cancer_y_training)
log_test_acc = f1_acc(log_clf, cancer_X_test, cancer_y_test)
dokladnosc.append((log_train_acc, log_test_acc))
classif.append(log_clf)

In [8]:
# k-nearest neighbours: record (train accuracy, test accuracy) and keep the model.
kn_train_acc = f1_acc(kn_clf, cancer_X_training, cancer_y_training)
kn_test_acc = f1_acc(kn_clf, cancer_X_test, cancer_y_test)
dokladnosc.append((kn_train_acc, kn_test_acc))
classif.append(kn_clf)

In [9]:
# Hard-voting ensemble: record (train accuracy, test accuracy) and keep the model.
hard_train_acc = f1_acc(voting_clf_hard, cancer_X_training, cancer_y_training)
hard_test_acc = f1_acc(voting_clf_hard, cancer_X_test, cancer_y_test)
dokladnosc.append((hard_train_acc, hard_test_acc))
classif.append(voting_clf_hard)

In [10]:
# Soft-voting ensemble over the same three base models as the hard-voting one.
soft_voting_members = [('lr', log_clf), ('dt', dt_clf), ('kn', kn_clf)]
voting_clf_soft = VotingClassifier(estimators=soft_voting_members, voting='soft')
# Kept as the last expression so the fitted estimator's repr is the cell output.
voting_clf_soft.fit(cancer_X_training, cancer_y_training)

VotingClassifier(estimators=[('lr', LogisticRegression()),
                             ('dt', DecisionTreeClassifier()),
                             ('kn', KNeighborsClassifier())],
                 voting='soft')

In [11]:
# Soft-voting ensemble: record (train accuracy, test accuracy) and keep the model.
soft_train_acc = f1_acc(voting_clf_soft, cancer_X_training, cancer_y_training)
soft_test_acc = f1_acc(voting_clf_soft, cancer_X_test, cancer_y_test)
dokladnosc.append((soft_train_acc, soft_test_acc))
classif.append(voting_clf_soft)

In [12]:
# Persist the voting experiment: accuracy pairs first, then the fitted models.
for vote_payload, vote_path in ((dokladnosc, 'acc_vote.pkl'), (classif, 'vote.pkl')):
    saveToFile(vote_payload, vote_path)

In [13]:
# Accumulators for the bagging/boosting experiment:
#   ll        - (train accuracy, test accuracy) pairs, one per model
#   classlist - the corresponding fitted models, in the same order
ll, classlist = [], []

In [14]:
from sklearn.ensemble import BaggingClassifier
# Bagging: 30 decision trees, sampling with replacement (bootstrap=True) and
# max_samples=1.0.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=30,
    max_samples=1.0,
    bootstrap=True,
).fit(cancer_X_training, cancer_y_training)

classlist.append(bag_clf)
bag_scores = (
    f1_acc(bag_clf, cancer_X_training, cancer_y_training),
    f1_acc(bag_clf, cancer_X_test, cancer_y_test),
)
ll.append(bag_scores)

In [15]:
# Bagging with max_samples=0.5: 30 decision trees, sampling with replacement.
half_bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=30,
    max_samples=0.5,
    bootstrap=True,
).fit(cancer_X_training, cancer_y_training)

classlist.append(half_bag_clf)
half_bag_scores = (
    f1_acc(half_bag_clf, cancer_X_training, cancer_y_training),
    f1_acc(half_bag_clf, cancer_X_test, cancer_y_test),
)
ll.append(half_bag_scores)

In [16]:
# Pasting: same ensemble as bagging, but each tree's training subset is drawn
# WITHOUT replacement, which for BaggingClassifier means bootstrap=False.
# (With bootstrap=True this model was an exact duplicate of the bagging one.)
# With max_samples=1.0 and no replacement every tree sees the whole training
# set, so the trees differ only through their own internal randomness.
pasting_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=30,
                                max_samples=1.0, bootstrap=False)
pasting_clf.fit(cancer_X_training, cancer_y_training)

# Keep the fitted model and its (train accuracy, test accuracy) pair.
classlist.append(pasting_clf)
ll.append((f1_acc(pasting_clf, cancer_X_training, cancer_y_training),
           f1_acc(pasting_clf, cancer_X_test, cancer_y_test)))

In [17]:
# Pasting with max_samples=0.5: each of the 30 trees is trained on half of the
# training samples drawn WITHOUT replacement (bootstrap=False).
# (With bootstrap=True this model was an exact duplicate of the half-sample
# bagging one.)
half_pasting_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=30,
                                     max_samples=0.5, bootstrap=False)
half_pasting_clf.fit(cancer_X_training, cancer_y_training)

# Keep the fitted model and its (train accuracy, test accuracy) pair.
classlist.append(half_pasting_clf)
ll.append((f1_acc(half_pasting_clf, cancer_X_training, cancer_y_training),
           f1_acc(half_pasting_clf, cancer_X_test, cancer_y_test)))

In [18]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with 30 trees.
rnd_clf = RandomForestClassifier(n_estimators=30).fit(cancer_X_training, cancer_y_training)

classlist.append(rnd_clf)
rnd_scores = (
    f1_acc(rnd_clf, cancer_X_training, cancer_y_training),
    f1_acc(rnd_clf, cancer_X_test, cancer_y_test),
)
ll.append(rnd_scores)

In [19]:
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost with 30 boosting rounds.
ada_clf = AdaBoostClassifier(n_estimators=30).fit(cancer_X_training, cancer_y_training)

classlist.append(ada_clf)
ada_scores = (
    f1_acc(ada_clf, cancer_X_training, cancer_y_training),
    f1_acc(ada_clf, cancer_X_test, cancer_y_test),
)
ll.append(ada_scores)

In [20]:
from sklearn.ensemble import GradientBoostingClassifier
# Gradient boosting with 30 boosting stages.
gbrt = GradientBoostingClassifier(n_estimators=30)
gbrt.fit(cancer_X_training, cancer_y_training)

# Scored through the shared f1_acc helper like every other model in this
# notebook (a classifier's .score is the same accuracy, so values are unchanged).
classlist.append(gbrt)
ll.append((f1_acc(gbrt, cancer_X_training, cancer_y_training),
           f1_acc(gbrt, cancer_X_test, cancer_y_test)))

In [21]:
# Persist the bagging/boosting experiment: accuracy pairs first, then the models.
for bag_payload, bag_path in ((ll, 'acc_bag.pkl'), (classlist, 'bag.pkl')):
    saveToFile(bag_payload, bag_path)

In [22]:
# Feature-sampling ensemble built on BaggingClassifier's default base estimator:
#   rows     - max_samples=0.5, drawn without replacement (bootstrap=False)
#   features - max_features=2, drawn with replacement (bootstrap_features=True)
sampling_clf = BaggingClassifier(
    n_estimators=30,
    max_samples=0.5,
    bootstrap=False,
    max_features=2,
    bootstrap_features=True,
).fit(cancer_X_training, cancer_y_training)

# sampl_cls holds the fitted ensemble; sampl_ll holds [train accuracy, test accuracy].
sampl_cls = [sampling_clf]
sampl_ll = [
    f1_acc(sampling_clf, cancer_X_training, cancer_y_training),
    f1_acc(sampling_clf, cancer_X_test, cancer_y_test),
]

In [23]:
# Persist the feature-sampling experiment: accuracies first, then the model list.
for fea_payload, fea_path in ((sampl_ll, 'acc_fea.pkl'), (sampl_cls, 'fea.pkl')):
    saveToFile(fea_payload, fea_path)

In [24]:
import pandas as pd
# Rank the individual estimators of the feature-sampling ensemble.
#
# Each sub-estimator was trained only on the columns listed in the matching
# entry of `estimators_features_` (possibly repeated, since features are drawn
# with replacement). It therefore has to be scored on exactly those columns,
# in that order, rather than on the full feature frame. Plain NumPy arrays are
# used because the column selection is positional.
train_matrix = cancer_X_training.to_numpy()
test_matrix = cancer_X_test.to_numpy()

ss = [
    (
        f1_acc(estimator, train_matrix[:, feature_idx], cancer_y_training),
        f1_acc(estimator, test_matrix[:, feature_idx], cancer_y_test),
        # Importances are relative to the sampled columns of this estimator.
        # NOTE(review): the 'features' column stores importances, not feature
        # names - confirm this is the intended content of the ranking.
        estimator.feature_importances_,
    )
    for estimator, feature_idx in zip(sampling_clf.estimators_,
                                      sampling_clf.estimators_features_)
]

# Best test accuracy first; ties broken by train accuracy (both descending).
df = pd.DataFrame(ss, columns=['train_acc', 'test_acc', 'features'])
df = df.sort_values(by=['test_acc', 'train_acc'], ascending=False)

saveToFile(df, 'acc_fea_rank.pkl')