In [1]:
from sklearn import datasets
# Load the breast-cancer dataset; as_frame=True is requested so that the
# 'data' entry can be indexed by column label with .loc further down.
data_breast_cancer = datasets.load_breast_cancer(
    as_frame=True,
)

In [2]:
# Target labels, taken straight from the loaded dataset.
y = data_breast_cancer['target']

# Features: keep only the "mean texture" and "mean symmetry" columns.
X = data_breast_cancer['data'].loc[:, ["mean texture", "mean symmetry"]]


In [3]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split stable
# across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
#----------------part one----------------#

In [4]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# DECISION TREE
# Default hyper-parameters, except that random_state is pinned. scikit-learn
# permutes the candidate features at every split, so an unseeded tree is not
# guaranteed to be identical from one run to the next. 42 matches the seed
# already used for train_test_split.
tree_clf = DecisionTreeClassifier(random_state=42)
tree_clf.fit(X_train, y_train)

# Accuracy on the split the tree was fitted on vs. on the held-out split.
accTrain = accuracy_score(y_train, tree_clf.predict(X_train))
accTest = accuracy_score(y_test, tree_clf.predict(X_test))

# Stored as (train accuracy, test accuracy).
tree_clf_acc = (accTrain, accTest)
print(tree_clf_acc)

(1.0, 0.6140350877192983)


In [5]:
from sklearn.linear_model import LogisticRegression

# LOGISTIC REGRESSION
# Built with default settings and fitted on the training split.
log_clf = LogisticRegression().fit(X_train, y_train)

# (train accuracy, test accuracy)
log_clf_acc = (
    accuracy_score(y_train, log_clf.predict(X_train)),
    accuracy_score(y_test, log_clf.predict(X_test)),
)
accTrain, accTest = log_clf_acc
print(log_clf_acc)

(0.7230769230769231, 0.7017543859649122)


In [6]:
from sklearn.neighbors import KNeighborsClassifier

# K NEAREST NEIGHBOURS
# Built with default settings and fitted on the training split.
knn_clf = KNeighborsClassifier().fit(X_train, y_train)

# (train accuracy, test accuracy)
knn_clf_acc = (
    accuracy_score(y_train, knn_clf.predict(X_train)),
    accuracy_score(y_test, knn_clf.predict(X_test)),
)
accTrain, accTest = knn_clf_acc
print(knn_clf_acc)

(0.7714285714285715, 0.6403508771929824)


In [7]:
from sklearn.ensemble import VotingClassifier

# HARD VOTING
# Ensemble over the tree, logistic-regression and k-NN classifiers defined in
# the earlier cells, created with voting='hard'.
hard_voting_clf = VotingClassifier(
    estimators=[('tree', tree_clf), ('log', log_clf), ('knn', knn_clf)],
    voting='hard',
).fit(X_train, y_train)
# (train accuracy, test accuracy)
hard_voting_clf_acc = (
    accuracy_score(y_train, hard_voting_clf.predict(X_train)),
    accuracy_score(y_test, hard_voting_clf.predict(X_test)),
)
accTrain, accTest = hard_voting_clf_acc
print(hard_voting_clf_acc)

# SOFT VOTING
# Same three classifiers, this time created with voting='soft'.
soft_voting_clf = VotingClassifier(
    estimators=[('tree', tree_clf), ('log', log_clf), ('knn', knn_clf)],
    voting='soft',
).fit(X_train, y_train)
# (train accuracy, test accuracy)
soft_voting_clf_acc = (
    accuracy_score(y_train, soft_voting_clf.predict(X_train)),
    accuracy_score(y_test, soft_voting_clf.predict(X_test)),
)
accTrain, accTest = soft_voting_clf_acc
print(soft_voting_clf_acc)

(0.8351648351648352, 0.7017543859649122)
(0.9648351648351648, 0.6754385964912281)


In [11]:
# Persist the part-one results to pickle files.
import pickle as pkl

# (train accuracy, test accuracy) tuples, in the same classifier order as
# clf_list below.
acc_list = [tree_clf_acc, log_clf_acc, knn_clf_acc, hard_voting_clf_acc, soft_voting_clf_acc]

# A context manager closes the file even if pkl.dump raises, which the manual
# open()/close() pair did not guarantee.
with open("acc_vote.pkl", 'wb') as fileObject:
    pkl.dump(acc_list, fileObject)

# The fitted classifiers themselves.
clf_list = [tree_clf, log_clf, knn_clf, hard_voting_clf, soft_voting_clf]
with open("vote.pkl", 'wb') as fileObject:
    pkl.dump(clf_list, fileObject)

In [None]:
#----------------part two----------------#

In [12]:
from sklearn.ensemble import BaggingClassifier
def _fit_and_score(clf):
    """Fit ``clf`` on the training split and score it on both splits.

    Reads the notebook-level ``X_train`` / ``y_train`` / ``X_test`` /
    ``y_test`` variables.

    Returns:
        tuple: ``(train accuracy, test accuracy)``.
    """
    clf.fit(X_train, y_train)
    return (
        accuracy_score(y_train, clf.predict(X_train)),
        accuracy_score(y_test, clf.predict(X_test)),
    )


# Every ensemble below pins random_state=42 (the same seed as train_test_split)
# so that the drawn samples, and therefore the printed accuracies and the
# pickled models, are the same on every run.

# Bagging: 30 decision trees, bootstrap=True.
bag_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=30, bootstrap=True, random_state=42)
bag_clf_acc = _fit_and_score(bag_clf)
print(bag_clf_acc)

# Bagging using 50% of the instances per tree (max_samples=0.5).
half_bag_clf = BaggingClassifier(DecisionTreeClassifier(), max_samples=0.5, n_estimators=30, bootstrap=True, random_state=42)
half_bag_clf_acc = _fit_and_score(half_bag_clf)
print(half_bag_clf_acc)

# Pasting: same ensemble with bootstrap=False.
# NOTE(review): max_samples is not set here; per the scikit-learn docs it
# defaults to 1.0, which without replacement would hand every tree the full
# training set - confirm this is the intended configuration.
bagWithPasting_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=30, bootstrap=False, random_state=42)
bagWithPasting_clf_acc = _fit_and_score(bagWithPasting_clf)
print(bagWithPasting_clf_acc)

# Pasting using 50% of the instances per tree (max_samples=0.5).
half_bagWithPasting_clf = BaggingClassifier(DecisionTreeClassifier(), max_samples=0.5, n_estimators=30, bootstrap=False, random_state=42)
half_bagWithPasting_clf_acc = _fit_and_score(half_bagWithPasting_clf)
print(half_bagWithPasting_clf_acc)

# Keep the notebook-level scratch names bound as before: they end up holding
# the scores of the last classifier evaluated in this cell.
accTrain, accTest = half_bagWithPasting_clf_acc

(0.9956043956043956, 0.6666666666666666)
(0.9208791208791208, 0.6754385964912281)
(1.0, 0.6140350877192983)
(0.9648351648351648, 0.6403508771929824)


In [13]:
from sklearn.ensemble import RandomForestClassifier
# Random forest of 30 trees.
# random_state is pinned (same seed as train_test_split) so that the fitted
# forest and the accuracies derived from it are reproducible between runs.
random_clf = RandomForestClassifier(n_estimators=30, random_state=42)
random_clf.fit(X_train, y_train)

# Accuracy on the training split vs. on the held-out split.
accTrain = accuracy_score(y_train, random_clf.predict(X_train))
accTest = accuracy_score(y_test, random_clf.predict(X_test))

# Stored as (train accuracy, test accuracy).
random_clf_acc = (accTrain, accTest)
print(random_clf_acc)

(0.9934065934065934, 0.6754385964912281)


In [14]:
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost with 30 estimators.
# random_state is pinned to the same seed as train_test_split so the fitted
# ensemble does not depend on the global RNG state at run time.
ada_clf = AdaBoostClassifier(n_estimators=30, random_state=42)
ada_clf.fit(X_train, y_train)

# Accuracy on the training split vs. on the held-out split.
accTrain = accuracy_score(y_train, ada_clf.predict(X_train))
accTest = accuracy_score(y_test, ada_clf.predict(X_test))

# Stored as (train accuracy, test accuracy).
ada_clf_acc = (accTrain, accTest)
print(ada_clf_acc)

(0.8, 0.7368421052631579)


In [15]:
from sklearn.ensemble import GradientBoostingClassifier
# Gradient boosting with 30 estimators.
# random_state is pinned to the same seed as train_test_split so the fitted
# ensemble does not depend on the global RNG state at run time.
gradient_clf = GradientBoostingClassifier(n_estimators=30, random_state=42)
gradient_clf.fit(X_train, y_train)

# Accuracy on the training split vs. on the held-out split.
accTrain = accuracy_score(y_train, gradient_clf.predict(X_train))
accTest = accuracy_score(y_test, gradient_clf.predict(X_test))

# Stored as (train accuracy, test accuracy).
gradient_clf_acc = (accTrain, accTest)
print(gradient_clf_acc)

(0.8373626373626374, 0.7105263157894737)


In [16]:
# Persist the part-two results to pickle files (pkl is the pickle alias
# imported in an earlier cell).

# (train accuracy, test accuracy) tuples, in the same classifier order as
# clf_list2 below.
acc_list2 = [bag_clf_acc, half_bag_clf_acc, bagWithPasting_clf_acc, half_bagWithPasting_clf_acc, random_clf_acc, ada_clf_acc, gradient_clf_acc]

# A context manager closes the file even if pkl.dump raises, which the manual
# open()/close() pair did not guarantee.
with open("acc_bag.pkl", 'wb') as fileObject:
    pkl.dump(acc_list2, fileObject)

# The fitted classifiers themselves.
clf_list2 = [bag_clf, half_bag_clf, bagWithPasting_clf, half_bagWithPasting_clf, random_clf, ada_clf, gradient_clf]
with open("bag.pkl", 'wb') as fileObject:
    pkl.dump(clf_list2, fileObject)