In [1]:
import numpy as np
import pandas as pd
import sys

from sklearn import datasets
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score

from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier

import pickle

In [2]:
data_breast_cancer = datasets.load_breast_cancer(as_frame=True)

In [3]:
cancer_data = pd.concat([data_breast_cancer["target"], data_breast_cancer["data"]['mean texture'], data_breast_cancer["data"]['mean symmetry']], axis=1)
train, test = train_test_split(cancer_data, test_size=.2)

X_train = train[['mean texture',  'mean symmetry']]
y_train = train['target']
X_test = test[['mean texture',  'mean symmetry']]
y_test = test['target']

In [4]:
print(X_train)
print(y_train)

     mean texture  mean symmetry
397         17.46         0.1574
503         19.83         0.1505
353         25.74         0.1647
507         17.12         0.1954
304         18.16         0.1411
..            ...            ...
327         17.93         0.1382
151         20.70         0.2222
483         17.64         0.1732
495         20.21         0.1487
188         17.39         0.1718

[455 rows x 2 columns]
397    1
503    0
353    0
507    1
304    1
      ..
327    1
151    1
483    1
495    1
188    1
Name: target, Length: 455, dtype: int32


In [5]:
print(X_test)
print(y_test)

     mean texture  mean symmetry
119         20.01         0.2129
192         18.22         0.1653
158         12.74         0.1590
460         27.15         0.1793
310         19.11         0.1936
..            ...            ...
295         13.27         0.1592
285         18.40         0.1697
38          25.20         0.1565
80          20.97         0.1842
470         18.49         0.2238

[114 rows x 2 columns]
119    0
192    1
158    1
460    0
310    1
      ..
295    1
285    1
38     0
80     1
470    1
Name: target, Length: 114, dtype: int32


In [6]:
log_clf = LogisticRegression()
tree_clf = DecisionTreeClassifier()
k_neigh_clf = KNeighborsClassifier()

voting_clf_h = VotingClassifier(
    estimators=[('lr', log_clf),
                ('tc', tree_clf), 
                ('knc', k_neigh_clf)], 
    voting='hard')

voting_clf_s = VotingClassifier(
    estimators=[('lr', log_clf),
                ('tc', tree_clf), 
                ('knc', k_neigh_clf)], 
    voting='soft')

In [7]:
acc_list = []
for i, clf in enumerate((log_clf, tree_clf, k_neigh_clf, voting_clf_h, voting_clf_s)):
    clf.fit(X_train, y_train)
    y_train_pred = clf.predict(X_train)
    y_test_pred = clf.predict(X_test)
    acc_list.append((
        accuracy_score(y_train, y_train_pred),
        accuracy_score(y_test, y_test_pred))
    )
    print(f"{clf.__class__.__name__}: ({accuracy_score(y_train, y_train_pred)}, {accuracy_score(y_test, y_test_pred)})")

LogisticRegression: (0.7098901098901099, 0.6578947368421053)
DecisionTreeClassifier: (1.0, 0.6403508771929824)
KNeighborsClassifier: (0.7692307692307693, 0.7280701754385965)
VotingClassifier: (0.8373626373626374, 0.7105263157894737)
VotingClassifier: (0.9714285714285714, 0.6666666666666666)


In [8]:
file_acc_name = "acc_vote.pkl"

open_file = open(file_acc_name, "wb")
pickle.dump(acc_list, open_file)
open_file.close()

open_file = open(file_acc_name, "rb")
loaded_list = pickle.load(open_file)
open_file.close()

print('---------------------------------------EXERCICE 4-----------------------------------')
print(loaded_list)

---------------------------------------EXERCICE 4-----------------------------------
[(0.7098901098901099, 0.6578947368421053), (1.0, 0.6403508771929824), (0.7692307692307693, 0.7280701754385965), (0.8373626373626374, 0.7105263157894737), (0.9714285714285714, 0.6666666666666666)]


In [9]:
clf_list = [log_clf, tree_clf, k_neigh_clf, voting_clf_h, voting_clf_s]
file_clf_name = "vote.pkl"

open_file = open(file_clf_name, "wb")
pickle.dump(clf_list, open_file)
open_file.close()

open_file = open(file_clf_name, "rb")
loaded_list = pickle.load(open_file)
open_file.close()

print('---------------------------------------EXERCICE 4-----------------------------------')
print(loaded_list)

---------------------------------------EXERCICE 4-----------------------------------
[LogisticRegression(), DecisionTreeClassifier(), KNeighborsClassifier(), VotingClassifier(estimators=[('lr', LogisticRegression()),
                             ('tc', DecisionTreeClassifier()),
                             ('knc', KNeighborsClassifier())]), VotingClassifier(estimators=[('lr', LogisticRegression()),
                             ('tc', DecisionTreeClassifier()),
                             ('knc', KNeighborsClassifier())],
                 voting='soft')]


In [10]:
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), 
    n_estimators = 30,
    bootstrap=True
)
bag_half_clf = BaggingClassifier(
    DecisionTreeClassifier(), 
    n_estimators = 30,
    max_samples = 0.5,
    bootstrap=True
)
past_clf = BaggingClassifier(
    DecisionTreeClassifier(), 
    n_estimators = 30,
    bootstrap=False
)
past_half_clf = BaggingClassifier(
    DecisionTreeClassifier(), 
    n_estimators = 30,
    max_samples = 0.5,
    bootstrap=False
)
rnd_clf = RandomForestClassifier(n_estimators=30)
ada_boost_clf = AdaBoostClassifier(n_estimators=30)
gbrt_clf = GradientBoostingClassifier(n_estimators=30)

In [11]:
acc_bag_list = []
for i, clf in enumerate((bag_clf, bag_half_clf, past_clf, past_half_clf, rnd_clf, ada_boost_clf, gbrt_clf)):
    clf.fit(X_train, y_train)
    y_train_pred = clf.predict(X_train)
    y_test_pred = clf.predict(X_test)
    acc_bag_list.append((
        accuracy_score(y_train, y_train_pred),
        accuracy_score(y_test, y_test_pred))
    )
    print(f"{clf.__class__.__name__}: ({accuracy_score(y_train, y_train_pred)}, {accuracy_score(y_test, y_test_pred)})")

BaggingClassifier: (0.9978021978021978, 0.6666666666666666)
BaggingClassifier: (0.9274725274725275, 0.7017543859649122)
BaggingClassifier: (1.0, 0.6403508771929824)
BaggingClassifier: (0.9714285714285714, 0.6842105263157895)
RandomForestClassifier: (1.0, 0.6578947368421053)
AdaBoostClassifier: (0.7802197802197802, 0.7456140350877193)
GradientBoostingClassifier: (0.8373626373626374, 0.7105263157894737)


In [12]:
file_acc_bag_name = "acc_bag.pkl"

open_file = open(file_acc_bag_name, "wb")
pickle.dump(acc_bag_list, open_file)
open_file.close()

open_file = open(file_acc_bag_name, "rb")
loaded_list = pickle.load(open_file)
open_file.close()

print('---------------------------------------EXERCICE 6-----------------------------------')
print(loaded_list)

---------------------------------------EXERCICE 6-----------------------------------
[(0.9978021978021978, 0.6666666666666666), (0.9274725274725275, 0.7017543859649122), (1.0, 0.6403508771929824), (0.9714285714285714, 0.6842105263157895), (1.0, 0.6578947368421053), (0.7802197802197802, 0.7456140350877193), (0.8373626373626374, 0.7105263157894737)]


In [13]:
clf_bag_list = [bag_clf, bag_half_clf, past_clf, past_half_clf, rnd_clf, ada_boost_clf, gbrt_clf]
file_clf_name = "bag.pkl"

open_file = open(file_clf_name, "wb")
pickle.dump(clf_bag_list, open_file)
open_file.close()

open_file = open(file_clf_name, "rb")
loaded_list = pickle.load(open_file)
open_file.close()

print('---------------------------------------EXERCICE 4-----------------------------------')
print(loaded_list)

---------------------------------------EXERCICE 4-----------------------------------
[BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=30), BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_samples=0.5,
                  n_estimators=30), BaggingClassifier(base_estimator=DecisionTreeClassifier(), bootstrap=False,
                  n_estimators=30), BaggingClassifier(base_estimator=DecisionTreeClassifier(), bootstrap=False,
                  max_samples=0.5, n_estimators=30), RandomForestClassifier(n_estimators=30), AdaBoostClassifier(n_estimators=30), GradientBoostingClassifier(n_estimators=30)]


In [14]:
print(train)

     target  mean texture  mean symmetry
397       1         17.46         0.1574
503       0         19.83         0.1505
353       0         25.74         0.1647
507       1         17.12         0.1954
304       1         18.16         0.1411
..      ...           ...            ...
327       1         17.93         0.1382
151       1         20.70         0.2222
483       1         17.64         0.1732
495       1         20.21         0.1487
188       1         17.39         0.1718

[455 rows x 3 columns]


In [15]:
df1 = pd.DataFrame(data_breast_cancer.data, columns=data_breast_cancer.feature_names)
df1['target'] = data_breast_cancer.target
X2 = df1.iloc[:, 0:30]
y = data_breast_cancer.frame.target
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y, test_size=0.2)

In [16]:
acc_fea_list = []

fea_clf = BaggingClassifier(
    DecisionTreeClassifier(), 
    n_estimators = 30,
    max_samples = 0.5,
    bootstrap=True,
    bootstrap_features=True, 
    max_features=2
)
fea_clf.fit(X2_train, y2_train)

y2_train_pred_fea = fea_clf.predict(X2_train)
y2_test_pred_fea = fea_clf.predict(X2_test)
acc_fea_list.append((
    accuracy_score(y2_train, y2_train_pred_fea),
    accuracy_score(y2_test, y2_test_pred_fea))
)

In [17]:
file_acc_fea_name = "acc_fea.pkl"

open_file = open(file_acc_fea_name, "wb")
pickle.dump(acc_fea_list, open_file)
open_file.close()

open_file = open(file_acc_fea_name, "rb")
loaded_list = pickle.load(open_file)
open_file.close()

print('---------------------------------------EXERCICE 8-----------------------------------')
print(loaded_list)

---------------------------------------EXERCICE 8-----------------------------------
[(0.9978021978021978, 0.9385964912280702)]


In [18]:
fea_clf_list = [fea_clf]
file_clf_name = "fea.pkl"

open_file = open(file_clf_name, "wb")
pickle.dump(fea_clf_list, open_file)
open_file.close()

open_file = open(file_clf_name, "rb")
loaded_list = pickle.load(open_file)
open_file.close()

print('---------------------------------------EXERCICE 8-----------------------------------')
print(loaded_list)

---------------------------------------EXERCICE 8-----------------------------------
[BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                  bootstrap_features=True, max_features=2, max_samples=0.5,
                  n_estimators=30)]


In [19]:


# y2_train_pred_0 = fea_clf.estimators_[0].predict(X2_train)
# y2_test_pred_0 = fea_clf.estimators_[0].predict(X2_test)


In [20]:
data_fea_list = []
for est_fea, est in zip(fea_clf.estimators_features_, fea_clf.estimators_):
    y_train_pred_temp = est.predict(X2_train.iloc[:, est_fea])
    y_test_pred_temp = est.predict(X2_test.iloc[:, est_fea])
    data_fea_list.append([accuracy_score(y2_train, y_train_pred_temp), accuracy_score(y2_test, y_test_pred_temp), X2_train.iloc[:, est_fea].columns.tolist()])
    print(f"{X2_train.iloc[:, est_fea].columns.tolist()}: ({accuracy_score(y2_train, y_train_pred_temp)}, {accuracy_score(y2_test, y_test_pred_temp)})")
                      
df_fea = pd.DataFrame(data_fea_list, columns = ['train_accuracy', 'test_accuracy', 'features_names'])             

['area error', 'fractal dimension error']: (0.8747252747252747, 0.8596491228070176)
['mean fractal dimension', 'smoothness error']: (0.7582417582417582, 0.5614035087719298)
['smoothness error', 'concavity error']: (0.7692307692307693, 0.6754385964912281)
['worst area', 'worst area']: (0.9252747252747253, 0.8771929824561403)
['smoothness error', 'worst symmetry']: (0.7428571428571429, 0.6666666666666666)
['worst smoothness', 'mean compactness']: (0.810989010989011, 0.6929824561403509)
['smoothness error', 'mean radius']: (0.9054945054945055, 0.8947368421052632)
['worst radius', 'worst compactness']: (0.9494505494505494, 0.9210526315789473)
['concave points error', 'worst radius']: (0.9362637362637363, 0.9210526315789473)
['mean compactness', 'mean smoothness']: (0.8307692307692308, 0.7456140350877193)
['radius error', 'mean perimeter']: (0.9120879120879121, 0.8859649122807017)
['mean perimeter', 'mean radius']: (0.9164835164835164, 0.868421052631579)
['concave points error', 'worst frac

In [21]:
df_fea_sorted = df_fea.sort_values(by=['train_accuracy', 'test_accuracy'], ascending = False)
df_fea_sorted

Unnamed: 0,train_accuracy,test_accuracy,features_names
7,0.949451,0.921053,"[worst radius, worst compactness]"
15,0.949451,0.912281,"[mean perimeter, worst smoothness]"
14,0.938462,0.894737,"[worst radius, concave points error]"
8,0.936264,0.921053,"[concave points error, worst radius]"
28,0.934066,0.868421,"[worst area, worst radius]"
13,0.931868,0.868421,"[area error, worst smoothness]"
21,0.92967,0.877193,"[worst radius, radius error]"
19,0.92967,0.868421,"[perimeter error, worst radius]"
23,0.92967,0.859649,"[worst radius, worst texture]"
3,0.925275,0.877193,"[worst area, worst area]"


In [22]:
file_acc_fea_rank_name = "acc_fea_rank.pkl"

open_file = open(file_acc_fea_rank_name, "wb")
pickle.dump(df_fea_sorted, open_file)
open_file.close()

open_file = open(file_acc_fea_rank_name, "rb")
loaded_list = pickle.load(open_file)
open_file.close()

print('---------------------------------------EXERCICE 9-----------------------------------')
print(loaded_list)

---------------------------------------EXERCICE 9-----------------------------------
    train_accuracy  test_accuracy  \
7         0.949451       0.921053   
15        0.949451       0.912281   
14        0.938462       0.894737   
8         0.936264       0.921053   
28        0.934066       0.868421   
13        0.931868       0.868421   
21        0.929670       0.877193   
19        0.929670       0.868421   
23        0.929670       0.859649   
3         0.925275       0.877193   
11        0.916484       0.868421   
18        0.916484       0.868421   
10        0.912088       0.885965   
17        0.909890       0.877193   
6         0.905495       0.894737   
16        0.887912       0.842105   
0         0.874725       0.859649   
9         0.830769       0.745614   
27        0.830769       0.719298   
26        0.813187       0.675439   
25        0.810989       0.771930   
5         0.810989       0.692982   
29        0.808791       0.675439   
20        0.795604       0.