In [554]:
import sklearn.neighbors
from sklearn import datasets
data_breast_cancer = datasets.load_breast_cancer(as_frame=True)

In [555]:
X_cancer = data_breast_cancer['data'][['mean texture', 'mean symmetry']]
y_cancer = data_breast_cancer['target']

In [556]:
from sklearn.model_selection import train_test_split
X_cancer_train, X_cancer_test, y_cancer_train, y_cancer_test = train_test_split(X_cancer, y_cancer, test_size=0.2, random_state=21)

In [557]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.neighbors import KNeighborsClassifier

tree_clf = DecisionTreeClassifier()

log_clf = LogisticRegression()

knn_clf = sklearn.neighbors.KNeighborsClassifier()

voting_hard_clf = VotingClassifier(
    estimators=[('dt', tree_clf),
                ('lr', log_clf),
                ('knn', knn_clf)],
    voting='hard')

voting_soft_clf = VotingClassifier(
    estimators=[('dt', tree_clf),
                ('lr', log_clf),
                ('knn', knn_clf)],
    voting='soft')

voting_hard_clf.fit(X_cancer_train, y_cancer_train)
voting_soft_clf.fit(X_cancer_train, y_cancer_train)

VotingClassifier(estimators=[('dt', DecisionTreeClassifier()),
                             ('lr', LogisticRegression()),
                             ('knn', KNeighborsClassifier())],
                 voting='soft')

In [558]:
voting_hard_clf.predict(X_cancer_test)
voting_soft_clf.predict(X_cancer_test)

array([1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1,
       1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1,
       1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1])

In [559]:
from sklearn.metrics import accuracy_score

acc_list = list()

for clf in (tree_clf, log_clf, knn_clf, voting_hard_clf, voting_soft_clf):
    clf.fit(X_cancer_train, y_cancer_train)

    y_pred_train = clf.predict(X_cancer_train)
    acc_train = accuracy_score(y_cancer_train, y_pred_train)

    y_pred_test = clf.predict(X_cancer_test)
    acc_test = accuracy_score(y_cancer_test, y_pred_test)

    acc_list.append((acc_train, acc_test))

print(acc_list)

[(1.0, 0.631578947368421), (0.6835164835164835, 0.7719298245614035), (0.7736263736263737, 0.6228070175438597), (0.8549450549450549, 0.6929824561403509), (0.9692307692307692, 0.6666666666666666)]


In [560]:
clf_list = [tree_clf, log_clf, knn_clf, voting_hard_clf, voting_soft_clf]
print(clf_list)

[DecisionTreeClassifier(), LogisticRegression(), KNeighborsClassifier(), VotingClassifier(estimators=[('dt', DecisionTreeClassifier()),
                             ('lr', LogisticRegression()),
                             ('knn', KNeighborsClassifier())]), VotingClassifier(estimators=[('dt', DecisionTreeClassifier()),
                             ('lr', LogisticRegression()),
                             ('knn', KNeighborsClassifier())],
                 voting='soft')]


In [561]:
import pickle
with open('acc_vote.pkl', 'wb') as fp:
    pickle.dump(acc_list, fp)

with open('vote.pkl', 'wb') as fp:
    pickle.dump(clf_list, fp)

In [562]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

#bagging
bag_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=30, bootstrap=True, random_state=42)

#bagging 1/2
bag_half_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=30, max_samples=0.5, bootstrap=True, random_state=42)

#pasting
pas_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=30, bootstrap=False, random_state=42)

#pasting 1/2
pas_half_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=30, max_samples=0.5, bootstrap=False, random_state=42)

#random forest
rnd_clf = RandomForestClassifier(n_estimators=30, random_state=42)

#ada boost
ada_clf = AdaBoostClassifier(n_estimators=30, random_state=42)

#gradient boost
gdt_clf = GradientBoostingClassifier(n_estimators=30, random_state=42)

In [563]:
acc_bag = list()
for clf in (bag_clf, bag_half_clf, pas_clf, pas_half_clf, rnd_clf, ada_clf, gdt_clf):
    clf.fit(X_cancer_train, y_cancer_train)

    y_pred_train = clf.predict(X_cancer_train)
    acc_train = accuracy_score(y_cancer_train, y_pred_train)

    y_pred_test = clf.predict(X_cancer_test)
    acc_test = accuracy_score(y_cancer_test, y_pred_test)

    acc_bag.append((acc_train, acc_test))
print(acc_bag)

[(0.9956043956043956, 0.6578947368421053), (0.9252747252747253, 0.6403508771929824), (1.0, 0.6228070175438597), (0.967032967032967, 0.6578947368421053), (0.9934065934065934, 0.631578947368421), (0.8021978021978022, 0.7456140350877193), (0.8131868131868132, 0.7719298245614035)]


In [564]:
bag_list = [bag_clf, bag_half_clf, pas_clf, pas_half_clf, rnd_clf, ada_clf, gdt_clf]

In [565]:
with open('acc_bag.pkl', 'wb') as fp:
    pickle.dump(acc_bag, fp)

with open('bag.pkl', 'wb') as fp:
    pickle.dump(bag_list, fp)

In [566]:
X = data_breast_cancer['data']
y = data_breast_cancer['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21)


In [567]:
clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=30, max_samples=0.5, max_features=2, bootstrap_features=False, bootstrap=True, random_state=42)
clf.fit(X_train, y_train)

y_pred_train = clf.predict(X_train)
acc_train = accuracy_score(y_train, y_pred_train)

y_pred_test = clf.predict(X_test)
acc_test = accuracy_score(y_test, y_pred_test)

acc_fea = [acc_train, acc_test]
fea = [clf]
print(acc_fea)

[0.9978021978021978, 0.956140350877193]


In [568]:
with open('acc_fea.pkl', 'wb') as fp:
    pickle.dump(acc_fea, fp)

with open('fea.pkl', 'wb') as fp:
    pickle.dump(fea, fp)

In [569]:
res = list()

for estimator, features in zip(clf.estimators_, clf.estimators_features_):
    print(estimator, features)

    y_pred_train = estimator.predict(X_train.iloc[:, features].values)
    acc_train = accuracy_score(y_train, y_pred_train)

    y_pred_test = estimator.predict(X_test.iloc[:, features].values)
    acc_test = accuracy_score(y_test, y_pred_test)

    res.append([acc_train, acc_test, list(X.columns[features])])



DecisionTreeClassifier(random_state=1952926171) [25 17]
DecisionTreeClassifier(random_state=1761383086) [0 6]
DecisionTreeClassifier(random_state=1449071958) [16 19]
DecisionTreeClassifier(random_state=1910541088) [ 9 16]
DecisionTreeClassifier(random_state=1341730541) [23  9]
DecisionTreeClassifier(random_state=1286572245) [ 0 12]
DecisionTreeClassifier(random_state=1005142668) [22  4]
DecisionTreeClassifier(random_state=502852014) [17 13]
DecisionTreeClassifier(random_state=186414760) [ 1 25]
DecisionTreeClassifier(random_state=1956263048) [ 3 20]
DecisionTreeClassifier(random_state=15592051) [26 21]
DecisionTreeClassifier(random_state=1628376228) [20 13]
DecisionTreeClassifier(random_state=1638437331) [18 24]
DecisionTreeClassifier(random_state=116435712) [12 20]
DecisionTreeClassifier(random_state=588556688) [26 29]
DecisionTreeClassifier(random_state=358068376) [29  3]
DecisionTreeClassifier(random_state=67998415) [ 4 12]
DecisionTreeClassifier(random_state=825108120) [9 1]
Decisi

In [570]:
import pandas as pd
print(res)
res = pd.DataFrame(res, columns=['Train accuracy', 'Test accuracy', 'Features'])
res.sort_values(['Test accuracy', 'Train accuracy'], ascending=False, inplace=True)
res

[[0.8351648351648352, 0.7192982456140351, ['worst compactness', 'concave points error']], [0.9362637362637363, 0.8859649122807017, ['mean radius', 'mean concavity']], [0.7802197802197802, 0.6842105263157895, ['concavity error', 'fractal dimension error']], [0.8, 0.7017543859649122, ['mean fractal dimension', 'concavity error']], [0.9516483516483516, 0.9035087719298246, ['worst area', 'mean fractal dimension']], [0.9010989010989011, 0.8508771929824561, ['mean radius', 'perimeter error']], [0.9274725274725275, 0.868421052631579, ['worst perimeter', 'mean smoothness']], [0.9252747252747253, 0.7894736842105263, ['concave points error', 'area error']], [0.8637362637362638, 0.7982456140350878, ['mean texture', 'worst compactness']], [0.945054945054945, 0.868421052631579, ['mean area', 'worst radius']], [0.8835164835164835, 0.8859649122807017, ['worst concavity', 'worst texture']], [0.9472527472527472, 0.8596491228070176, ['worst radius', 'area error']], [0.7978021978021979, 0.649122807017543

Unnamed: 0,Train accuracy,Test accuracy,Features
4,0.951648,0.903509,"[worst area, mean fractal dimension]"
1,0.936264,0.885965,"[mean radius, mean concavity]"
19,0.903297,0.885965,"[worst concave points, concave points error]"
10,0.883516,0.885965,"[worst concavity, worst texture]"
15,0.953846,0.877193,"[worst fractal dimension, mean area]"
14,0.868132,0.877193,"[worst concavity, worst fractal dimension]"
9,0.945055,0.868421,"[mean area, worst radius]"
6,0.927473,0.868421,"[worst perimeter, mean smoothness]"
11,0.947253,0.859649,"[worst radius, area error]"
26,0.92967,0.850877,"[mean radius, mean compactness]"


In [571]:
with open('acc_fea_rank.pkl', 'wb') as fp:
    pickle.dump(res, fp)