Przygotowanie danych:

In [22]:
from sklearn import datasets
bc_data, bc_target = datasets.load_breast_cancer(return_X_y=True, as_frame=True)

from sklearn.model_selection import train_test_split
import pandas as pd

bc_x = pd.DataFrame({ "mean texture" : bc_data["mean texture"], "mean symmetry" : bc_data["mean symmetry"] })
bc_train_x, bc_test_x, bc_train_y, bc_test_y = train_test_split(bc_x, bc_target, test_size=0.2)

Ensemble:

In [23]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from sklearn.ensemble import VotingClassifier

tree = DecisionTreeClassifier(max_depth=2)
logistic = LogisticRegression(solver="liblinear")
neighbors = KNeighborsClassifier(n_neighbors=5)

ensemble_hard = VotingClassifier(estimators=[('tree', tree), ('logistic', logistic), ('neighbors', neighbors)], voting='hard')
ensemble_soft = VotingClassifier(estimators=[('tree', tree), ('logistic', logistic), ('neighbors', neighbors)], voting='soft')

ensemble_hard.fit(bc_train_x, bc_train_y)
ensemble_soft.fit(bc_train_x, bc_train_y)
tree.fit(bc_train_x, bc_train_y)
logistic.fit(bc_train_x, bc_train_y)
neighbors.fit(bc_train_x, bc_train_y)

In [24]:
accuracy_list = []
accuracy_list.append( (tree.score(bc_train_x, bc_train_y), tree.score(bc_test_x, bc_test_y)) )
accuracy_list.append( (logistic.score(bc_train_x, bc_train_y), logistic.score(bc_test_x, bc_test_y)) )
accuracy_list.append( (neighbors.score(bc_train_x, bc_train_y), neighbors.score(bc_test_x, bc_test_y)) )
accuracy_list.append( (ensemble_hard.score(bc_train_x, bc_train_y), ensemble_hard.score(bc_test_x, bc_test_y)) )
accuracy_list.append( (ensemble_soft.score(bc_train_x, bc_train_y), ensemble_soft.score(bc_test_x, bc_test_y)) )

print(accuracy_list)

import pickle
with open("acc_vote.pkl", "wb") as file:
    pickle.dump(accuracy_list, file)

[(0.7736263736263737, 0.7192982456140351), (0.6857142857142857, 0.7017543859649122), (0.7802197802197802, 0.7105263157894737), (0.789010989010989, 0.7368421052631579), (0.7978021978021979, 0.7543859649122807)]


In [25]:
clfs_list = [tree, logistic, neighbors, ensemble_hard, ensemble_soft]

with open("vote.pkl", "wb") as file:
    pickle.dump(clfs_list, file)

Bagging:

In [26]:
from sklearn.ensemble import BaggingClassifier

bag_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=30, bootstrap=True)
bag_clf_05 = BaggingClassifier(DecisionTreeClassifier(), n_estimators=30, bootstrap=True, max_samples=0.5)
pas_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=30, bootstrap=False)
pas_clf_05 = BaggingClassifier(DecisionTreeClassifier(), n_estimators=30, bootstrap=False, max_samples=0.5)

In [27]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

rand_tree = RandomForestClassifier(n_estimators=30)
ada_boost = AdaBoostClassifier(DecisionTreeClassifier(), n_estimators=30, algorithm="SAMME")
gradient_boost = GradientBoostingClassifier(n_estimators=30)

In [28]:
bag_clf.fit(bc_train_x, bc_train_y)
bag_clf_05.fit(bc_train_x, bc_train_y)
pas_clf.fit(bc_train_x, bc_train_y)
pas_clf_05.fit(bc_train_x, bc_train_y)

rand_tree.fit(bc_train_x, bc_train_y)
ada_boost.fit(bc_train_x, bc_train_y)
gradient_boost.fit(bc_train_x, bc_train_y)

In [None]:
acc_list = []

acc_list.append( (bag_clf.score(bc_train_x, bc_train_y), bag_clf.score(bc_test_x, bc_test_y)) )
acc_list.append( (bag_clf_05.score(bc_train_x, bc_train_y), bag_clf_05.score(bc_test_x, bc_test_y)) )
acc_list.append( (pas_clf.score(bc_train_x, bc_train_y), pas_clf.score(bc_test_x, bc_test_y)) )
acc_list.append( (pas_clf_05.score(bc_train_x, bc_train_y), pas_clf_05.score(bc_test_x, bc_test_y)) )
acc_list.append( (rand_tree.score(bc_train_x, bc_train_y), rand_tree.score(bc_test_x, bc_test_y)) )
acc_list.append( (ada_boost.score(bc_train_x, bc_train_y), ada_boost.score(bc_test_x, bc_test_y)) )
acc_list.append( (gradient_boost.score(bc_train_x, bc_train_y), gradient_boost.score(bc_test_x, bc_test_y)) )

with open("acc_bag.pkl", "wb") as file:
    pickle.dump(acc_list, file)

clfs_list_2 = [bag_clf, bag_clf_05, pas_clf, pas_clf_05, rand_tree, ada_boost, gradient_boost]

with open("bag.pkl", "wb") as file:
    pickle.dump(clfs_list_2, file)

(0.9978021978021978, 0.6578947368421053) 

(0.9384615384615385, 0.6929824561403509) 

(1.0, 0.6491228070175439) 

(0.9802197802197802, 0.7105263157894737) 

(0.9956043956043956, 0.6929824561403509) 

(1.0, 0.6403508771929824) 

(0.8373626373626374, 0.7105263157894737) 



Sampling:

In [30]:
bc_train_x, bc_test_x, bc_train_y, bc_test_y = train_test_split(bc_data, bc_target, test_size=0.2)

sampl = BaggingClassifier(DecisionTreeClassifier(), n_estimators=30, max_features=2, max_samples=0.5, bootstrap=True, bootstrap_features=False)
sampl.fit(bc_data, bc_target)

acc_3 = []
acc_3.append(sampl.score(bc_train_x, bc_train_y))
acc_3.append(sampl.score(bc_test_x, bc_test_y))

print(acc_3)

with open("acc_fea.pkl", "wb") as file:
    pickle.dump(acc_3, file)

clf_3 = [sampl]

with open("fea.pkl", "wb") as file:
    pickle.dump(clf_3, file)

[1.0, 0.9912280701754386]


Ranking estymatorów:

In [31]:
column_names = bc_data.columns

new_df = pd.DataFrame(columns=["dokładność dla zb. uczącego", "dokładnośc dla zb. testującego", "lista nazw cech"])

for i, estimator in enumerate(sampl.estimators_):
    names = [column_names[j] for j in sampl.estimators_features_[i]]

    acc_train = estimator.score(bc_train_x[names], bc_train_y)
    acc_test = estimator.score(bc_test_x[names], bc_test_y)

    new_df.loc[len(new_df)] = [acc_train, acc_test, names]

new_df = new_df.sort_values(by=["dokładnośc dla zb. testującego", "dokładność dla zb. uczącego"], ascending=False)

with open("acc_fea_rank.pkl", "wb") as f:
    pickle.dump(new_df, f)

