In [1]:
from sklearn import datasets
data_breast_cancer = datasets.load_breast_cancer(as_frame=True)

In [2]:
X = data_breast_cancer['data']
y = data_breast_cancer['target']

In [3]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

fea_X_train = X_train
fea_X_test = X_test

# tylko dla cech mean texture, mean symmetry
X_train = X_train.loc[:,["mean texture","mean symmetry"]]
X_test = X_test.loc[:,["mean texture","mean symmetry"]]

In [4]:
#----------------część pierwsza----------------#

In [5]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

#DRZEWA DECYZYJNE
tree_clf = DecisionTreeClassifier() #wartości domyślne
tree_clf.fit(X_train, y_train)
accTrain = accuracy_score(y_train, tree_clf.predict(X_train))
accTest = accuracy_score(y_test, tree_clf.predict(X_test))
tree_clf_acc = (accTrain, accTest)
print(tree_clf_acc)

(1.0, 0.6140350877192983)


In [6]:
from sklearn.linear_model import LogisticRegression

#REGRESJA LOGISTYCZNA
log_clf = LogisticRegression() #wartości domyślne
log_clf.fit(X_train, y_train)
accTrain = accuracy_score(y_train, log_clf.predict(X_train))
accTest = accuracy_score(y_test, log_clf.predict(X_test))
log_clf_acc = (accTrain, accTest)
print(log_clf_acc)

(0.7230769230769231, 0.7017543859649122)


In [7]:
from sklearn.neighbors import KNeighborsClassifier

#K NAJBLIŻSZYCH SĄSIADÓW
knn_clf = KNeighborsClassifier() #wartości domyślne
knn_clf.fit(X_train, y_train)
accTrain = accuracy_score(y_train, knn_clf.predict(X_train))
accTest = accuracy_score(y_test, knn_clf.predict(X_test))
knn_clf_acc = (accTrain, accTest)
print(knn_clf_acc)

(0.7714285714285715, 0.6403508771929824)


In [8]:
from sklearn.ensemble import VotingClassifier

#GŁOSOWANIE HARD
hard_voting_clf = VotingClassifier(estimators=[('tree', tree_clf), ('log', log_clf), ('knn', knn_clf)], voting='hard')
hard_voting_clf.fit(X_train, y_train)
accTrain = accuracy_score(y_train, hard_voting_clf.predict(X_train))
accTest = accuracy_score(y_test, hard_voting_clf.predict(X_test))
hard_voting_clf_acc = (accTrain, accTest)
print(hard_voting_clf_acc)

#GŁOSOWANIE SOFT
soft_voting_clf = VotingClassifier(estimators=[('tree', tree_clf), ('log', log_clf), ('knn', knn_clf)], voting='soft')
soft_voting_clf.fit(X_train, y_train)
accTrain = accuracy_score(y_train, soft_voting_clf.predict(X_train))
accTest = accuracy_score(y_test, soft_voting_clf.predict(X_test))
soft_voting_clf_acc = (accTrain, accTest)
print(soft_voting_clf_acc)

(0.8351648351648352, 0.7017543859649122)
(0.9648351648351648, 0.6666666666666666)


In [9]:
# do pliku pickle
import pickle as pkl
acc_list = [tree_clf_acc, log_clf_acc, knn_clf_acc, hard_voting_clf_acc, soft_voting_clf_acc]

fileObject = open("acc_vote.pkl", 'wb')
pkl.dump(acc_list, fileObject)
fileObject.close()


clf_list = [tree_clf, log_clf, knn_clf, hard_voting_clf, soft_voting_clf]
fileObject = open("vote.pkl", 'wb')
pkl.dump(clf_list, fileObject)
fileObject.close()

In [10]:
#----------------część druga----------------#

In [11]:
from sklearn.ensemble import BaggingClassifier
#Bagging
bag_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=30, bootstrap=True)
bag_clf.fit(X_train, y_train)
accTrain = accuracy_score(y_train, bag_clf.predict(X_train))
accTest = accuracy_score(y_test, bag_clf.predict(X_test))
bag_clf_acc = (accTrain, accTest)
print(bag_clf_acc)

#Bagging z wykorzystaniem 50% instancji
half_bag_clf = BaggingClassifier(DecisionTreeClassifier(), max_samples=0.5, n_estimators=30, bootstrap=True)
half_bag_clf.fit(X_train, y_train)
accTrain = accuracy_score(y_train, half_bag_clf.predict(X_train))
accTest = accuracy_score(y_test, half_bag_clf.predict(X_test))
half_bag_clf_acc = (accTrain, accTest)
print(half_bag_clf_acc)

#Pasting
bagWithPasting_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=30, bootstrap=False)
bagWithPasting_clf.fit(X_train, y_train)
accTrain = accuracy_score(y_train, bagWithPasting_clf.predict(X_train))
accTest = accuracy_score(y_test, bagWithPasting_clf.predict(X_test))
bagWithPasting_clf_acc = (accTrain, accTest)
print(bagWithPasting_clf_acc)

#Pasting z wykorzystaniem 50% instancji
half_bagWithPasting_clf = BaggingClassifier(DecisionTreeClassifier(), max_samples=0.5, n_estimators=30, bootstrap=False)
half_bagWithPasting_clf.fit(X_train, y_train)
accTrain = accuracy_score(y_train, half_bagWithPasting_clf.predict(X_train))
accTest = accuracy_score(y_test, half_bagWithPasting_clf.predict(X_test))
half_bagWithPasting_clf_acc = (accTrain, accTest)
print(half_bagWithPasting_clf_acc)

(0.9978021978021978, 0.6140350877192983)
(0.9208791208791208, 0.6754385964912281)
(1.0, 0.6228070175438597)
(0.9626373626373627, 0.6403508771929824)


In [12]:
from sklearn.ensemble import RandomForestClassifier
#Random Forest
random_clf = RandomForestClassifier(n_estimators=30)
random_clf.fit(X_train, y_train)
accTrain = accuracy_score(y_train, random_clf.predict(X_train))
accTest = accuracy_score(y_test, random_clf.predict(X_test))
random_clf_acc = (accTrain, accTest)
print(random_clf_acc)

(0.9978021978021978, 0.6842105263157895)


In [13]:
from sklearn.ensemble import AdaBoostClassifier
#AdaBoost
ada_clf = AdaBoostClassifier(n_estimators=30)
ada_clf.fit(X_train, y_train)
accTrain = accuracy_score(y_train, ada_clf.predict(X_train))
accTest = accuracy_score(y_test, ada_clf.predict(X_test))
ada_clf_acc = (accTrain, accTest)
print(ada_clf_acc)

(0.8, 0.7368421052631579)


In [14]:
from sklearn.ensemble import GradientBoostingClassifier
#Gradient Boosting
gradient_clf = GradientBoostingClassifier(n_estimators=30)
gradient_clf.fit(X_train, y_train)
accTrain = accuracy_score(y_train, gradient_clf.predict(X_train))
accTest = accuracy_score(y_test, gradient_clf.predict(X_test))
gradient_clf_acc = (accTrain, accTest)
print(gradient_clf_acc)

(0.8373626373626374, 0.7105263157894737)


In [15]:
acc_list2 = [bag_clf_acc, half_bag_clf_acc, bagWithPasting_clf_acc, half_bagWithPasting_clf_acc, random_clf_acc, ada_clf_acc, gradient_clf_acc]

fileObject = open("acc_bag.pkl", 'wb')
pkl.dump(acc_list2, fileObject)
fileObject.close()


clf_list2 = [bag_clf, half_bag_clf, bagWithPasting_clf, half_bagWithPasting_clf, random_clf, ada_clf, gradient_clf]
fileObject = open("bag.pkl", 'wb')
pkl.dump(clf_list2, fileObject)
fileObject.close()

In [16]:
#----------------część trzecia----------------#

In [17]:
twoFeatures_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=30, max_samples=0.5, max_features=2, bootstrap_features=False, bootstrap=True)
twoFeatures_clf.fit(fea_X_train, y_train)
accTrain = accuracy_score(y_train, twoFeatures_clf.predict(fea_X_train))
accTest = accuracy_score(y_test, twoFeatures_clf.predict(fea_X_test))
twoFeatures_clf_acc = [accTrain, accTest]
print(twoFeatures_clf_acc)

[0.989010989010989, 0.9385964912280702]


In [18]:
fileObject = open("acc_fea.pkl", 'wb')
pkl.dump(twoFeatures_clf_acc, fileObject)
fileObject.close()

clf_list3 = [twoFeatures_clf]
fileObject = open("fea.pkl", 'wb')
pkl.dump(clf_list3, fileObject)
fileObject.close()

In [19]:
#----------------część czwarta----------------#

In [20]:
ranking = []

for clf, features in zip(twoFeatures_clf.estimators_, twoFeatures_clf.estimators_features_):
    accTrain = accuracy_score(y_train, clf.predict(fea_X_train.iloc[:, features]))
    accTest = accuracy_score(y_test, clf.predict(fea_X_test.iloc[:, features]))
    ranking.append([accTrain, accTest, list(X.columns[features])])



In [21]:
import pandas as pd
acc_fea_ranking = pd.DataFrame(ranking, columns=["accTrain", "accTest", "features"])

In [22]:
acc_fea_ranking.sort_values(by=["accTest", "accTrain"], ascending=False, inplace=True)

In [23]:
print(acc_fea_ranking)

    accTrain   accTest                                         features
24  0.942857  0.929825           [mean concave points, worst concavity]
23  0.931868  0.921053                      [worst radius, mean radius]
4   0.929670  0.921053                 [mean perimeter, mean concavity]
17  0.916484  0.912281               [worst perimeter, perimeter error]
7   0.909890  0.912281                   [worst concavity, mean radius]
12  0.883516  0.912281                          [mean area, area error]
20  0.931868  0.894737         [worst concave points, smoothness error]
0   0.940659  0.885965              [worst compactness, mean perimeter]
10  0.938462  0.868421      [mean concave points, concave points error]
8   0.879121  0.842105                [symmetry error, worst concavity]
16  0.868132  0.842105                [worst symmetry, perimeter error]
13  0.843956  0.842105                      [area error, texture error]
3   0.890110  0.824561               [mean smoothness, worst con

In [24]:
fileObject = open("acc_fea_rank.pkl", 'wb')
pkl.dump(acc_fea_ranking, fileObject)
fileObject.close()