In [1]:
from sklearn import datasets
import pandas as pd
data_breast_cancer = datasets.load_breast_cancer(as_frame=True)

In [2]:
X = data_breast_cancer.data[['mean texture', 'mean symmetry']].values
y = data_breast_cancer['target']


In [3]:
from sklearn.preprocessing import StandardScaler


In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
X_train

array([[13.12  ,  0.1337],
       [20.52  ,  0.2116],
       [18.29  ,  0.1874],
       [28.03  ,  0.1713],
       [22.14  ,  0.1765],
       [22.41  ,  0.1727],
       [15.79  ,  0.1405],
       [22.02  ,  0.1616],
       [14.95  ,  0.1571],
       [16.84  ,  0.1769],
       [18.3   ,  0.1638],
       [27.08  ,  0.1869],
       [19.67  ,  0.2556],
       [23.5   ,  0.1978],
       [24.54  ,  0.1587],
       [16.94  ,  0.1573],
       [16.62  ,  0.1511],
       [18.22  ,  0.1653],
       [16.84  ,  0.2036],
       [15.11  ,  0.1703],
       [19.07  ,  0.1533],
       [29.81  ,  0.1852],
       [14.98  ,  0.1925],
       [16.49  ,  0.159 ],
       [25.42  ,  0.184 ],
       [18.06  ,  0.1669],
       [24.49  ,  0.2275],
       [14.11  ,  0.1596],
       [17.91  ,  0.1473],
       [16.39  ,  0.1422],
       [13.98  ,  0.2086],
       [16.33  ,  0.1829],
       [23.23  ,  0.1664],
       [23.06  ,  0.1564],
       [15.51  ,  0.1881],
       [17.26  ,  0.1847],
       [13.98  ,  0.165 ],
 

In [5]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)  # Only transform the test set, do not fit
X_train

array([[-1.43851335e+00, -1.72936805e+00],
       [ 3.12640109e-01,  1.17876963e+00],
       [-2.15072353e-01,  2.75343030e-01],
       [ 2.08982423e+00, -3.25696978e-01],
       [ 6.96000731e-01, -1.31572255e-01],
       [ 7.59894168e-01, -2.73432630e-01],
       [-8.06678252e-01, -1.47551265e+00],
       [ 6.67603648e-01, -6.87814250e-01],
       [-1.00545783e+00, -8.55806799e-01],
       [-5.58203774e-01, -1.16639584e-01],
       [-2.12705929e-01, -6.05684560e-01],
       [ 1.86501399e+00,  2.56677191e-01],
       [ 1.11494103e-01,  2.82136344e+00],
       [ 1.01783434e+00,  6.63592477e-01],
       [ 1.26394239e+00, -7.96076115e-01],
       [-5.34539538e-01, -8.48340464e-01],
       [-6.10265093e-01, -1.07979686e+00],
       [-2.31637318e-01, -5.49687043e-01],
       [-5.58203774e-01,  8.80116206e-01],
       [-9.67595056e-01, -3.63028656e-01],
       [-3.04913125e-02, -9.97667174e-01],
       [ 2.51104763e+00,  1.93213340e-01],
       [-9.98358563e-01,  4.65734586e-01],
       [-6.

In [6]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

tree_clf = DecisionTreeClassifier()
knn_clf = KNeighborsClassifier()
log_reg = LogisticRegression()


In [7]:
from sklearn.ensemble import VotingClassifier
voting_clf_hard = VotingClassifier(
    estimators = [('tree_clf',tree_clf), ('knn_clf', knn_clf), ('log_reg',log_reg)],
    voting = 'hard'
)

voting_clf_soft = VotingClassifier(
    estimators = [('tree_clf',tree_clf), ('knn_clf', knn_clf), ('log_reg',log_reg)],
    voting = 'soft'
)

In [8]:
ex3 = []
from sklearn.metrics import accuracy_score

for clf in (tree_clf,log_reg,knn_clf,voting_clf_hard,voting_clf_soft):
    clf.fit(X_train,y_train)
    y_pred_train = clf.predict(X_train)
    y_pred_test = clf.predict(X_test)
    ex3.append([accuracy_score(y_train, y_pred_train), accuracy_score(y_test, y_pred_test)])
    print(clf.__class__.__name__, accuracy_score(y_train, y_pred_train), accuracy_score(y_test, y_pred_test))

ex3

DecisionTreeClassifier 1.0 0.6153846153846154
LogisticRegression 0.7488262910798122 0.7202797202797203
KNeighborsClassifier 0.8215962441314554 0.6573426573426573
VotingClassifier 0.863849765258216 0.6713286713286714
VotingClassifier 0.9741784037558685 0.6503496503496503


[[1.0, 0.6153846153846154],
 [0.7488262910798122, 0.7202797202797203],
 [0.8215962441314554, 0.6573426573426573],
 [0.863849765258216, 0.6713286713286714],
 [0.9741784037558685, 0.6503496503496503]]

In [9]:
import pickle
with open('acc_vote.pkl', 'wb') as f:
    pickle.dump(ex3, f)

In [10]:
ex4 =[tree_clf,log_reg,knn_clf,voting_clf_hard,voting_clf_soft]

with open('vote.pkl', 'wb') as f:
    pickle.dump(ex4, f)

In [11]:
#ex5
from sklearn.ensemble import BaggingClassifier
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators = 30,
    bootstrap = True,
    n_jobs = -1
)
bag_clf

In [12]:
bag_clf_50 = BaggingClassifier(
    DecisionTreeClassifier(),
    max_samples = 0.5,
    n_estimators = 30,
    bootstrap = True,
    n_jobs = -1
)
bag_clf_50

In [13]:
pasting_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators = 30,
    bootstrap = False,
    n_jobs = -1
)
pasting_clf

In [14]:
pasting_clf_50 = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators = 30,
    max_samples = 0.5,
    bootstrap = False,
    n_jobs = -1
)
pasting_clf

In [15]:
from sklearn.ensemble import RandomForestClassifier
rnd_clf = RandomForestClassifier(n_estimators = 30,
                                 n_jobs = -1,
                                )
rnd_clf

In [16]:
from sklearn.ensemble import AdaBoostClassifier
ada_clf = AdaBoostClassifier(n_estimators = 30)

In [17]:
from sklearn.ensemble import GradientBoostingClassifier
gb_clf = GradientBoostingClassifier(n_estimators=30)
gb_clf


In [18]:
acc_bag = []
bag = []
for clf in (bag_clf, bag_clf_50, pasting_clf, pasting_clf_50, rnd_clf, ada_clf, gb_clf):
    clf.fit(X_train, y_train)
    y_pred_train = clf.predict(X_train)
    y_pred_test = clf.predict(X_test)
    print(clf.__class__.__name__,accuracy_score(y_train,y_pred_train), accuracy_score(y_test,y_pred_test))
    acc_bag.append([accuracy_score(y_train,y_pred_train),accuracy_score(y_test,y_pred_test)])
    bag.append(clf)
    
    # print(accuracy_score(y_test,y_pred_test))

print(acc_bag)
bag

BaggingClassifier 0.9929577464788732 0.6573426573426573
BaggingClassifier 0.92018779342723 0.6643356643356644
BaggingClassifier 1.0 0.6293706293706294
BaggingClassifier 0.960093896713615 0.6643356643356644
RandomForestClassifier 0.9906103286384976 0.7062937062937062
AdaBoostClassifier 0.8028169014084507 0.6993006993006993
GradientBoostingClassifier 0.8309859154929577 0.7062937062937062
[[0.9929577464788732, 0.6573426573426573], [0.92018779342723, 0.6643356643356644], [1.0, 0.6293706293706294], [0.960093896713615, 0.6643356643356644], [0.9906103286384976, 0.7062937062937062], [0.8028169014084507, 0.6993006993006993], [0.8309859154929577, 0.7062937062937062]]


[BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=30,
                   n_jobs=-1),
 BaggingClassifier(estimator=DecisionTreeClassifier(), max_samples=0.5,
                   n_estimators=30, n_jobs=-1),
 BaggingClassifier(bootstrap=False, estimator=DecisionTreeClassifier(),
                   n_estimators=30, n_jobs=-1),
 BaggingClassifier(bootstrap=False, estimator=DecisionTreeClassifier(),
                   max_samples=0.5, n_estimators=30, n_jobs=-1),
 RandomForestClassifier(n_estimators=30, n_jobs=-1),
 AdaBoostClassifier(n_estimators=30),
 GradientBoostingClassifier(n_estimators=30)]

In [19]:
with open('acc_bag.pkl', 'wb') as f:
    pickle.dump(acc_bag,f)

with open('bag.pkl', 'wb') as f:
    pickle.dump(bag,f)

In [20]:
data_breast_cancer.feature_names

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error',
       'fractal dimension error', 'worst radius', 'worst texture',
       'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry', 'worst fractal dimension'], dtype='<U23')

In [21]:
for name, score in zip(['mean texture', 'mean symmetry'],rnd_clf.feature_importances_):
    print(name, score)

mean texture 0.505212703522568
mean symmetry 0.49478729647743197


In [22]:

bag_clf.estimators_

[DecisionTreeClassifier(random_state=2103738017),
 DecisionTreeClassifier(random_state=472179976),
 DecisionTreeClassifier(random_state=1228000375),
 DecisionTreeClassifier(random_state=1562685151),
 DecisionTreeClassifier(random_state=302906606),
 DecisionTreeClassifier(random_state=956187892),
 DecisionTreeClassifier(random_state=1848061504),
 DecisionTreeClassifier(random_state=1134683407),
 DecisionTreeClassifier(random_state=54882411),
 DecisionTreeClassifier(random_state=1538812740),
 DecisionTreeClassifier(random_state=2094470337),
 DecisionTreeClassifier(random_state=1316603791),
 DecisionTreeClassifier(random_state=1098152431),
 DecisionTreeClassifier(random_state=197862105),
 DecisionTreeClassifier(random_state=1353141898),
 DecisionTreeClassifier(random_state=277944837),
 DecisionTreeClassifier(random_state=641651533),
 DecisionTreeClassifier(random_state=1220288832),
 DecisionTreeClassifier(random_state=966125883),
 DecisionTreeClassifier(random_state=2147340732),
 Decision

In [23]:
bagging = BaggingClassifier(
    n_estimators = 30,
    max_features=2,
    max_samples = 0.5,
    bootstrap_features = False,
    bootstrap = True
)
bagging

In [33]:
X = data_breast_cancer.data.values
y = data_breast_cancer['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
from sklearn.metrics import mean_squared_error
bagging.fit(X_train, y_train)
y_pred_train = bagging.predict(X_train)
y_pred_test = bagging.predict(X_test)
acc_fea = [accuracy_score(y_train,y_pred_train), accuracy_score(y_test,y_pred_test)]
fea = [bagging]

print(accuracy_score(y_train,y_pred_train),accuracy_score(y_test,y_pred_test))
print(acc_fea)
print(fea)

0.9953051643192489 0.951048951048951
[0.9953051643192489, 0.951048951048951]
[BaggingClassifier(max_features=2, max_samples=0.5, n_estimators=30)]


In [25]:
with open('acc_fea.pkl', 'wb') as f:
    pickle.dump(acc_fea, f)

with open('fea.pkl', 'wb') as f:
    pickle.dump(fea, f)

In [26]:
# bagging2 = BaggingClassifier(
#     n_estimators = 500,
#     max_features=0.8,
#     max_samples = 0.5,
#     n_jobs = -1,
#     bootstrap_features = False,
#     bootstrap = True,
#     oob_score = True
# )
# bagging2.fit(X_train, y_train)
# y_pred_train = bagging2.predict(X_train)
# y_pred_test = bagging2.predict(X_test)
# print(accuracy_score(y_train,y_pred_train), accuracy_score(y_test,y_pred_test))
# bagging2.oob_score_

In [27]:
# bagging2.fit(X_train, y_train)
# y_pred_train = bagging2.predict(X_train)
# y_pred_test = bagging2.predict(X_test)
# print(accuracy_score(y_train,y_pred_train), accuracy_score(y_test,y_pred_test))


In [28]:
data = []
for estimator, features in zip(bagging.estimators_,bagging.estimators_features_):
    # print(estimator, features)
    y_train_pred = estimator.predict(X_train[:, features])
    y_test_pred = estimator.predict(X_test[:, features])
    second_list = ['mean texture', 'mean symmetry']
    result = [second_list[i] for i in range(len(features)) if features[i] == 1]
    
    row = [accuracy_score(y_train,y_train_pred), accuracy_score(y_test,y_test_pred), result]
    print(row)
    data.append(row)
    

[0.8051643192488263, 0.6083916083916084, ['mean symmetry']]
[0.8051643192488263, 0.6713286713286714, ['mean symmetry']]
[0.8028169014084507, 0.6503496503496503, ['mean symmetry']]
[0.8169014084507042, 0.6433566433566433, ['mean symmetry']]
[0.7934272300469484, 0.6013986013986014, ['mean symmetry']]
[0.8028169014084507, 0.5874125874125874, ['mean symmetry']]
[0.8145539906103286, 0.6783216783216783, ['mean symmetry']]
[0.8098591549295775, 0.6293706293706294, ['mean symmetry']]
[0.795774647887324, 0.6643356643356644, ['mean symmetry']]
[0.795774647887324, 0.6993006993006993, ['mean symmetry']]
[0.7887323943661971, 0.6503496503496503, ['mean symmetry']]
[0.8028169014084507, 0.6293706293706294, ['mean symmetry']]
[0.7863849765258216, 0.6923076923076923, ['mean symmetry']]
[0.7934272300469484, 0.6363636363636364, ['mean symmetry']]
[0.8075117370892019, 0.6083916083916084, ['mean symmetry']]
[0.8098591549295775, 0.5804195804195804, ['mean symmetry']]
[0.7981220657276995, 0.6433566433566433, [

In [29]:
df = pd.DataFrame(data, columns = ['train','test','features']).sort_values(by=['train','test'], ascending = False)
df.to_pickle('acc_fea_rank.pkl')
df

Unnamed: 0,train,test,features
26,0.838028,0.699301,[mean symmetry]
3,0.816901,0.643357,[mean symmetry]
17,0.816901,0.622378,[mean symmetry]
19,0.814554,0.692308,[mean symmetry]
6,0.814554,0.678322,[mean symmetry]
21,0.814554,0.671329,[mean symmetry]
18,0.814554,0.664336,[mean symmetry]
7,0.809859,0.629371,[mean symmetry]
15,0.809859,0.58042,[mean symmetry]
29,0.807512,0.671329,[mean symmetry]
