In [4]:
import numpy as np
import pandas as pd
import sys

from sklearn import datasets
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score

from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier

import pickle

In [5]:
data_breast_cancer = datasets.load_breast_cancer(as_frame=True)

In [6]:
cancer_data = pd.concat([data_breast_cancer["target"], data_breast_cancer["data"]['mean texture'], data_breast_cancer["data"]['mean symmetry']], axis=1)
train, test = train_test_split(cancer_data, test_size=.2)

X_train = train[['mean texture',  'mean symmetry']]
y_train = train['target']
X_test = test[['mean texture',  'mean symmetry']]
y_test = test['target']

In [7]:
print(X_train)
print(y_train)

     mean texture  mean symmetry
44          21.81         0.1746
39          20.82         0.1720
464         18.22         0.1454
84          15.65         0.2079
95          23.03         0.2095
..            ...            ...
467         18.10         0.1680
495         20.21         0.1487
201         19.32         0.1506
144         14.97         0.1399
453         13.98         0.1650

[455 rows x 2 columns]
44     0
39     0
464    1
84     1
95     0
      ..
467    1
495    1
201    0
144    1
453    1
Name: target, Length: 455, dtype: int32


In [8]:
print(X_test)
print(y_test)

     mean texture  mean symmetry
65          23.94         0.1953
3           20.38         0.2597
292         16.02         0.1730
184         22.41         0.1727
74          16.52         0.1720
..            ...            ...
491         13.23         0.1220
118         22.91         0.2096
411         16.83         0.1714
196         22.29         0.1834
78          23.97         0.2906

[114 rows x 2 columns]
65     0
3      0
292    1
184    0
74     1
      ..
491    1
118    0
411    1
196    0
78     0
Name: target, Length: 114, dtype: int32


In [9]:
log_clf = LogisticRegression()
tree_clf = DecisionTreeClassifier()
k_neigh_clf = KNeighborsClassifier()

voting_clf_h = VotingClassifier(
    estimators=[('lr', log_clf),
                ('tc', tree_clf), 
                ('knc', k_neigh_clf)], 
    voting='hard')

voting_clf_s = VotingClassifier(
    estimators=[('lr', log_clf),
                ('tc', tree_clf), 
                ('knc', k_neigh_clf)], 
    voting='soft')

In [10]:
acc_list = []
for i, clf in enumerate((log_clf, tree_clf, k_neigh_clf, voting_clf_h, voting_clf_s)):
    clf.fit(X_train, y_train)
    y_train_pred = clf.predict(X_train)
    y_test_pred = clf.predict(X_test)
    acc_list.append((
        accuracy_score(y_train, y_train_pred),
        accuracy_score(y_test, y_test_pred))
    )
    print(f"{clf.__class__.__name__}: ({accuracy_score(y_train, y_train_pred)}, {accuracy_score(y_test, y_test_pred)})")

LogisticRegression: (0.7120879120879121, 0.7192982456140351)
DecisionTreeClassifier: (1.0, 0.6754385964912281)
KNeighborsClassifier: (0.7648351648351648, 0.6842105263157895)
VotingClassifier: (0.8483516483516483, 0.7192982456140351)
VotingClassifier: (0.9604395604395605, 0.6929824561403509)


In [11]:
file_acc_name = "acc_vote.pkl"

open_file = open(file_acc_name, "wb")
pickle.dump(acc_list, open_file)
open_file.close()

open_file = open(file_acc_name, "rb")
loaded_list = pickle.load(open_file)
open_file.close()

print('---------------------------------------EXERCICE 4-----------------------------------')
print(loaded_list)

---------------------------------------EXERCICE 4-----------------------------------
[(0.7120879120879121, 0.7192982456140351), (1.0, 0.6754385964912281), (0.7648351648351648, 0.6842105263157895), (0.8483516483516483, 0.7192982456140351), (0.9604395604395605, 0.6929824561403509)]


In [12]:
clf_list = [log_clf, tree_clf, k_neigh_clf, voting_clf_h, voting_clf_s]
file_clf_name = "vote.pkl"

open_file = open(file_clf_name, "wb")
pickle.dump(clf_list, open_file)
open_file.close()

open_file = open(file_clf_name, "rb")
loaded_list = pickle.load(open_file)
open_file.close()

print('---------------------------------------EXERCICE 4-----------------------------------')
print(loaded_list)

---------------------------------------EXERCICE 4-----------------------------------
[LogisticRegression(), DecisionTreeClassifier(), KNeighborsClassifier(), VotingClassifier(estimators=[('lr', LogisticRegression()),
                             ('tc', DecisionTreeClassifier()),
                             ('knc', KNeighborsClassifier())]), VotingClassifier(estimators=[('lr', LogisticRegression()),
                             ('tc', DecisionTreeClassifier()),
                             ('knc', KNeighborsClassifier())],
                 voting='soft')]


In [15]:
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), 
    n_estimators = 30,
    bootstrap=True
)
bag_half_clf = BaggingClassifier(
    DecisionTreeClassifier(), 
    n_estimators = 30,
    max_samples = 0.5,
    bootstrap=True
)
past_clf = BaggingClassifier(
    DecisionTreeClassifier(), 
    n_estimators = 30,
    bootstrap=False
)
past_half_clf = BaggingClassifier(
    DecisionTreeClassifier(), 
    n_estimators = 30,
    max_samples = 0.5,
    bootstrap=False
)
rnd_clf = RandomForestClassifier(n_estimators=30)
ada_boost_clf = AdaBoostClassifier(n_estimators=30)
gbrt_clf = GradientBoostingClassifier(n_estimators=30)

In [16]:
acc_bag_list = []
for i, clf in enumerate((bag_clf, bag_half_clf, past_clf, past_half_clf, rnd_clf, ada_boost_clf, gbrt_clf)):
    clf.fit(X_train, y_train)
    y_train_pred = clf.predict(X_train)
    y_test_pred = clf.predict(X_test)
    acc_bag_list.append((
        accuracy_score(y_train, y_train_pred),
        accuracy_score(y_test, y_test_pred))
    )
    print(f"{clf.__class__.__name__}: ({accuracy_score(y_train, y_train_pred)}, {accuracy_score(y_test, y_test_pred)})")

BaggingClassifier: (0.989010989010989, 0.7192982456140351)
BaggingClassifier: (0.9164835164835164, 0.7456140350877193)
BaggingClassifier: (1.0, 0.6842105263157895)
BaggingClassifier: (0.9582417582417583, 0.7456140350877193)
RandomForestClassifier: (1.0, 0.7543859649122807)
AdaBoostClassifier: (0.778021978021978, 0.7631578947368421)
GradientBoostingClassifier: (0.8197802197802198, 0.7807017543859649)


In [17]:
file_acc_bag_name = "acc_bag.pkl"

open_file = open(file_acc_bag_name, "wb")
pickle.dump(acc_bag_list, open_file)
open_file.close()

open_file = open(file_acc_bag_name, "rb")
loaded_list = pickle.load(open_file)
open_file.close()

print('---------------------------------------EXERCICE 6-----------------------------------')
print(loaded_list)

---------------------------------------EXERCICE 6-----------------------------------
[(0.989010989010989, 0.7192982456140351), (0.9164835164835164, 0.7456140350877193), (1.0, 0.6842105263157895), (0.9582417582417583, 0.7456140350877193), (1.0, 0.7543859649122807), (0.778021978021978, 0.7631578947368421), (0.8197802197802198, 0.7807017543859649)]


In [18]:
clf_bag_list = [bag_clf, bag_half_clf, past_clf, past_half_clf, rnd_clf, ada_boost_clf, gbrt_clf]
file_clf_name = "bag.pkl"

open_file = open(file_clf_name, "wb")
pickle.dump(clf_bag_list, open_file)
open_file.close()

open_file = open(file_clf_name, "rb")
loaded_list = pickle.load(open_file)
open_file.close()

print('---------------------------------------EXERCICE 4-----------------------------------')
print(loaded_list)

---------------------------------------EXERCICE 4-----------------------------------
[BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=30), BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_samples=0.5,
                  n_estimators=30), BaggingClassifier(base_estimator=DecisionTreeClassifier(), bootstrap=False,
                  n_estimators=30), BaggingClassifier(base_estimator=DecisionTreeClassifier(), bootstrap=False,
                  max_samples=0.5, n_estimators=30), RandomForestClassifier(n_estimators=30), AdaBoostClassifier(n_estimators=30), GradientBoostingClassifier(n_estimators=30)]


In [19]:
print(train)

     target  mean texture  mean symmetry
44        0         21.81         0.1746
39        0         20.82         0.1720
464       1         18.22         0.1454
84        1         15.65         0.2079
95        0         23.03         0.2095
..      ...           ...            ...
467       1         18.10         0.1680
495       1         20.21         0.1487
201       0         19.32         0.1506
144       1         14.97         0.1399
453       1         13.98         0.1650

[455 rows x 3 columns]


In [20]:
df1 = pd.DataFrame(data_breast_cancer.data, columns=data_breast_cancer.feature_names)
df1['target'] = data_breast_cancer.target
X2 = df1.iloc[:, 0:30]
y = data_breast_cancer.frame.target
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y, test_size=0.2)

In [21]:
acc_fea_list = []

fea_clf = BaggingClassifier(
    DecisionTreeClassifier(), 
    n_estimators = 30,
    max_samples = 0.5,
    bootstrap=True,
    bootstrap_features=True, 
    max_features=2
)
fea_clf.fit(X2_train, y2_train)

y2_train_pred_fea = fea_clf.predict(X2_train)
y2_test_pred_fea = fea_clf.predict(X2_test)
acc_fea_list.append((
    accuracy_score(y2_train, y2_train_pred_fea),
    accuracy_score(y2_test, y2_test_pred_fea))
)

In [22]:
file_acc_fea_name = "acc_fea.pkl"

open_file = open(file_acc_fea_name, "wb")
pickle.dump(acc_fea_list, open_file)
open_file.close()

open_file = open(file_acc_fea_name, "rb")
loaded_list = pickle.load(open_file)
open_file.close()

print('---------------------------------------EXERCICE 8-----------------------------------')
print(loaded_list)

---------------------------------------EXERCICE 8-----------------------------------
[(0.9934065934065934, 0.9385964912280702)]


In [23]:
fea_clf_list = [fea_clf]
file_clf_name = "fea.pkl"

open_file = open(file_clf_name, "wb")
pickle.dump(fea_clf_list, open_file)
open_file.close()

open_file = open(file_clf_name, "rb")
loaded_list = pickle.load(open_file)
open_file.close()

print('---------------------------------------EXERCICE 8-----------------------------------')
print(loaded_list)

---------------------------------------EXERCICE 8-----------------------------------
[BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                  bootstrap_features=True, max_features=2, max_samples=0.5,
                  n_estimators=30)]


In [24]:


# y2_train_pred_0 = fea_clf.estimators_[0].predict(X2_train)
# y2_test_pred_0 = fea_clf.estimators_[0].predict(X2_test)


In [25]:
data_fea_list = []
for est_fea, est in zip(fea_clf.estimators_features_, fea_clf.estimators_):
    y_train_pred_temp = est.predict(X2_train.iloc[:, est_fea])
    y_test_pred_temp = est.predict(X2_test.iloc[:, est_fea])
    data_fea_list.append([accuracy_score(y2_train, y_train_pred_temp), accuracy_score(y2_test, y_test_pred_temp), X2_train.iloc[:, est_fea].columns.tolist()])
    print(f"{X2_train.iloc[:, est_fea].columns.tolist()}: ({accuracy_score(y2_train, y_train_pred_temp)}, {accuracy_score(y2_test, y_test_pred_temp)})")
                      
df_fea = pd.DataFrame(data_fea_list, columns = ['train_accuracy', 'test_accuracy', 'features_names'])             

['worst symmetry', 'mean compactness']: (0.8087912087912088, 0.631578947368421)
['area error', 'mean fractal dimension']: (0.9010989010989011, 0.8070175438596491)
['symmetry error', 'worst symmetry']: (0.8, 0.6754385964912281)
['mean area', 'radius error']: (0.9208791208791208, 0.868421052631579)
['worst symmetry', 'smoothness error']: (0.7714285714285715, 0.5877192982456141)
['mean smoothness', 'texture error']: (0.7626373626373626, 0.5350877192982456)
['mean symmetry', 'mean concave points']: (0.9318681318681319, 0.8947368421052632)
['mean smoothness', 'perimeter error']: (0.8835164835164835, 0.7280701754385965)
['mean compactness', 'worst smoothness']: (0.8461538461538461, 0.6666666666666666)
['mean texture', 'mean smoothness']: (0.8065934065934066, 0.7192982456140351)
['smoothness error', 'compactness error']: (0.7604395604395604, 0.5877192982456141)
['mean symmetry', 'mean symmetry']: (0.7362637362637363, 0.6140350877192983)
['mean radius', 'worst area']: (0.9406593406593406, 0.91

In [21]:
df_fea_sorted = df_fea.sort_values(by=['train_accuracy', 'test_accuracy'], ascending = False)
df_fea_sorted

Unnamed: 0,train_accuracy,test_accuracy,features_names
3,0.96044,0.921053,"[worst perimeter, worst smoothness]"
20,0.953846,0.912281,"[worst radius, worst compactness]"
14,0.940659,0.833333,"[fractal dimension error, worst radius]"
26,0.938462,0.833333,"[concave points error, worst radius]"
22,0.938462,0.798246,"[area error, mean area]"
7,0.936264,0.842105,"[mean concavity, compactness error]"
9,0.931868,0.859649,"[mean concave points, worst concave points]"
11,0.931868,0.815789,"[mean radius, worst texture]"
13,0.92967,0.877193,"[worst smoothness, mean area]"
4,0.920879,0.850877,"[mean smoothness, worst area]"


In [22]:
file_acc_fea_rank_name = "acc_fea_rank.pkl"

open_file = open(file_acc_fea_rank_name, "wb")
pickle.dump(df_fea_sorted, open_file)
open_file.close()

open_file = open(file_acc_fea_rank_name, "rb")
loaded_list = pickle.load(open_file)
open_file.close()

print('---------------------------------------EXERCICE 9-----------------------------------')
print(loaded_list)

---------------------------------------EXERCICE 9-----------------------------------
    train_accuracy  test_accuracy  \
3         0.960440       0.921053   
20        0.953846       0.912281   
14        0.940659       0.833333   
26        0.938462       0.833333   
22        0.938462       0.798246   
7         0.936264       0.842105   
9         0.931868       0.859649   
11        0.931868       0.815789   
13        0.929670       0.877193   
4         0.920879       0.850877   
12        0.914286       0.798246   
27        0.914286       0.798246   
10        0.907692       0.780702   
25        0.907692       0.780702   
2         0.903297       0.833333   
15        0.903297       0.798246   
1         0.901099       0.850877   
24        0.898901       0.833333   
29        0.870330       0.815789   
28        0.870330       0.754386   
0         0.850549       0.771930   
19        0.832967       0.684211   
16        0.826374       0.701754   
18        0.797802       0.