# Test technique entretien Audensiel

Notebook réalisé par Marie Bétend dans le cadre d'un entretien.

Les données sont sous la forme : 
* Class: no-recurrence-events, recurrence-events
* age: 10-19, 20-29, 30-39, 40-49, 50-59, 60-69, 70-79, 80-89, 90-99.
* menopause: lt40, ge40, premeno.
* tumor-size: 0-4, 5-9, 10-14, 15-19, 20-24, 25-29, 30-34, 35-39, 40-44,45-49, 50-54, 55-59.
* inv-nodes: 0-2, 3-5, 6-8, 9-11, 12-14, 15-17, 18-20, 21-23, 24-26,27-29, 30-32, 33-35, 36-39.
* node-caps: yes, no.
* deg-malig: 1, 2, 3.
* breast: left, right.
* breast-quad: left-up, left-low, right-up,	right-low, central.
* irradiat:	yes, no.

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns


In [2]:
colnames = ['Class','age','menopause','tumor-size','inv-nodes','node-caps','deg-malig','breast','breast-quad','irradiat']
data = pd.read_csv('breast-cancer.data', delimiter = ',', names = colnames, na_values = '?')

In [3]:
data.head()

Unnamed: 0,Class,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no
2,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no
3,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no
4,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no


In [4]:
def to_category(df, labels):
    for e in labels:
        df[e] = df[e].astype('category')

to_category(data, data.columns)

In [5]:
data.dtypes

Class          category
age            category
menopause      category
tumor-size     category
inv-nodes      category
node-caps      category
deg-malig      category
breast         category
breast-quad    category
irradiat       category
dtype: object

In [6]:
data[data.isna().any(axis=1)]

Unnamed: 0,Class,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
145,no-recurrence-events,40-49,premeno,25-29,0-2,,2,left,right_low,yes
163,no-recurrence-events,60-69,ge40,25-29,3-5,,1,right,left_up,yes
164,no-recurrence-events,60-69,ge40,25-29,3-5,,1,right,left_low,yes
183,no-recurrence-events,50-59,ge40,30-34,9-11,,3,left,left_up,yes
184,no-recurrence-events,50-59,ge40,30-34,9-11,,3,left,left_low,yes
206,recurrence-events,50-59,ge40,30-34,0-2,no,3,left,,no
233,recurrence-events,70-79,ge40,15-19,9-11,,1,left,left_low,yes
263,recurrence-events,50-59,lt40,20-24,0-2,,1,left,left_up,no
264,recurrence-events,50-59,lt40,20-24,0-2,,1,left,left_low,no


In [7]:
data = data.dropna()

Comme les données sont "céatégoriques", on doit les encoder pour pouvoir utiliser un arbre de décision, nous allons tester différentes approches.

## Encodage
### Encodage numérique


In [8]:
data_numeric = data.copy()

In [9]:
from sklearn.preprocessing import LabelEncoder

def to_numeric (col, encoder):
    data_numeric[col] = encoder.fit_transform(data_numeric[col])

for col in data_numeric.columns :
    encoder = LabelEncoder()
    to_numeric(col, encoder)

In [10]:
data_numeric.head()

Unnamed: 0,Class,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,0,1,2,5,0,0,2,0,1,0
1,0,2,2,3,0,0,1,1,4,0
2,0,2,2,3,0,0,1,0,1,0
3,0,4,0,2,0,0,1,1,2,0
4,0,2,2,0,0,0,1,1,3,0


### Encodage one hot

In [11]:
data_oneHot = data.copy()

In [12]:
def to_oneHot (col) :
    global data_oneHot
    data_oneHot = pd.get_dummies(data_oneHot,columns = [col])



In [13]:
columns = data_oneHot.columns.copy()
for col in columns:
    if col == 'Class' :
        continue 
    to_oneHot(col)

In [14]:
data_oneHot.head()

Unnamed: 0,Class,age_20-29,age_30-39,age_40-49,age_50-59,age_60-69,age_70-79,menopause_ge40,menopause_lt40,menopause_premeno,...,deg-malig_3,breast_left,breast_right,breast-quad_central,breast-quad_left_low,breast-quad_left_up,breast-quad_right_low,breast-quad_right_up,irradiat_no,irradiat_yes
0,no-recurrence-events,0,1,0,0,0,0,0,0,1,...,1,1,0,0,1,0,0,0,1,0
1,no-recurrence-events,0,0,1,0,0,0,0,0,1,...,0,0,1,0,0,0,0,1,1,0
2,no-recurrence-events,0,0,1,0,0,0,0,0,1,...,0,1,0,0,1,0,0,0,1,0
3,no-recurrence-events,0,0,0,0,1,0,1,0,0,...,0,0,1,0,0,1,0,0,1,0
4,no-recurrence-events,0,0,1,0,0,0,0,0,1,...,0,0,1,0,0,0,1,0,1,0


## Arbre de décision 

In [15]:
X = data_numeric.drop(['Class'], axis = 1)
Y = data_numeric['Class']

In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20,random_state = 42, stratify = Y)

In [17]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion='entropy')
classifier.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy')

In [18]:
y_pred = classifier.predict(X_test)

In [19]:
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[30 10]
 [ 9  7]]
              precision    recall  f1-score   support

           0       0.77      0.75      0.76        40
           1       0.41      0.44      0.42        16

    accuracy                           0.66        56
   macro avg       0.59      0.59      0.59        56
weighted avg       0.67      0.66      0.66        56



In [20]:
print('Model test Score: %.3f, ' %classifier.score(X_test, y_test), 
      'Model training Score: %.3f' %classifier.score(X_train, y_train))

Model test Score: 0.661,  Model training Score: 0.977


In [21]:
X_oH  = data_oneHot.drop(['Class'], axis = 1)
Y_oH = data_oneHot['Class']
X_oHtrain, X_oHtest, y_oHtrain, y_oHtest = train_test_split(X_oH, Y_oH, test_size=0.20, random_state = 56, stratify = Y_oH)

In [22]:
classifieroH = DecisionTreeClassifier(criterion='entropy')
classifieroH.fit(X_oHtrain, y_oHtrain)

DecisionTreeClassifier(criterion='entropy')

In [23]:
print('Model test Score: %.3f, ' %classifieroH.score(X_oHtest, y_oHtest), 
      'Model training Score: %.3f' %classifieroH.score(X_oHtrain, y_oHtrain))

Model test Score: 0.696,  Model training Score: 0.977


In [24]:
y_oHpred = classifieroH.predict(X_oHtest)
print(confusion_matrix(y_oHtest, y_oHpred))
print(classification_report(y_oHtest, y_oHpred))

[[32  8]
 [ 9  7]]
                      precision    recall  f1-score   support

no-recurrence-events       0.78      0.80      0.79        40
   recurrence-events       0.47      0.44      0.45        16

            accuracy                           0.70        56
           macro avg       0.62      0.62      0.62        56
        weighted avg       0.69      0.70      0.69        56



## Piste d'amélioration

### Bagging

In [25]:
from sklearn.ensemble import BaggingClassifier

bgclassifier = BaggingClassifier(base_estimator=classifier)
bgclassifier.fit(X_train, y_train)

print('Model test Score: %.3f, ' %bgclassifier.score(X_test, y_test), 
      'Model training Score: %.3f' %bgclassifier.score(X_train, y_train))

Model test Score: 0.732,  Model training Score: 0.959


In [26]:
bgclassifieroH = BaggingClassifier(base_estimator=classifieroH)
bgclassifieroH.fit(X_oHtrain, y_oHtrain)

print('Model test Score: %.3f, ' %bgclassifieroH.score(X_oHtest, y_oHtest), 
      'Model training Score: %.3f' %bgclassifieroH.score(X_oHtrain, y_oHtrain))

Model test Score: 0.714,  Model training Score: 0.950


### Boosting

In [27]:
from sklearn.ensemble import AdaBoostClassifier

aBclassifier = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1), n_estimators=200 )
aBclassifier.fit(X_train, y_train)

print('Model test Score: %.3f, ' %aBclassifier.score(X_test, y_test), 
      'Model training Score: %.3f' %aBclassifier.score(X_train, y_train))

Model test Score: 0.679,  Model training Score: 0.774


In [28]:
aBclassifieroH = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=2), n_estimators=200 )
aBclassifieroH.fit(X_oHtrain, y_oHtrain)

print('Model test Score: %.3f, ' %aBclassifieroH.score(X_oHtest, y_oHtest), 
      'Model training Score: %.3f' %aBclassifieroH.score(X_oHtrain, y_oHtrain))

Model test Score: 0.661,  Model training Score: 0.977
