# DM2: "Connectionism: backpropagation algorithm"

_Eole Cervenka, Nov 13th 2017_

+ Python version: 3.6
+ libraries: sklearn, numpy, pandas
+ dependencies:

    + `Eole_Cervenka_DM2_preparation.ipynb`
    + `Eole_Cervenka_DM2_exploration.ipynb`
    + `Eole_Cervenka_DM2_MLP.ipynb`
        
+ Data:
    + `data.csv` (cf Preparation section)

-------------------------------------------------
## I - Breast cancer data

### Preparation


The input data in file `breast-cancer.arff` is converted to a CSV file, `data.csv`, formatted as follows:

```
'age','menopause','tumor-size','inv-nodes','node-caps','deg-malig','breast','breast-quad','irradiat','Class'
'40-49','premeno','15-19','0-2','yes','3','right','left_up','no','recurrence-events'
...
```

### Load helper functions

In [1]:
%run Eole_Cervenka_DM2_preparation.ipynb

In [2]:
%run Eole_Cervenka_DM2_exploration.ipynb

In [3]:
%run Eole_Cervenka_DM2_MLP.ipynb

### Load data

I use the `pandas` library to load and manipulate the dataset.

In [4]:
import pandas as pd

# Location of the CSV file produced from `breast-cancer.arff`.
fpath = "/home/eolus/Desktop/DAUPHINE/data_mining/DM/DM2/data.csv"

# Hyphenated attribute names and their underscore equivalents.
column_aliases = {
    'tumor-size': 'tumor_size',
    'inv-nodes': 'inv_nodes',
    'node-caps': 'node_caps',
    'deg-malig': 'deg_malig',
    'breast-quad': 'breast_quad',
}

# Every field in the file is wrapped in single quotes, hence quotechar="'".
df = pd.read_csv(fpath, quotechar="'")
df = df.rename(columns=column_aliases)

df.head()

Unnamed: 0,age,menopause,tumor_size,inv_nodes,node_caps,deg_malig,breast,breast_quad,irradiat,Class
0,40-49,premeno,15-19,0-2,yes,3,right,left_up,no,recurrence-events
1,50-59,ge40,15-19,0-2,no,1,right,central,no,no-recurrence-events
2,50-59,ge40,35-39,0-2,no,2,left,ft_low,,recurrence-events
3,40-49,premeno,35-39,0-2,yes,3,right,left_low,yes,no-recurrence-events
4,40-49,premeno,30-34,3-5,yes,2,left,right_up,no,recurrence-events


### Data exploration

In [5]:
# Count, for every attribute of the raw frame, how many records carry each value.
attr_dict = attr_val_freq(df)
# NOTE(review): `pprint` is not imported in this notebook and the output is
# JSON-styled, so it is presumably a helper loaded by one of the `%run`
# cells rather than the stdlib `pprint.pprint` — confirm.
pprint(attr_dict)

{
  "age": {
    "40-49": 90,
    "50-59": 96,
    "60-69": 57,
    "30-39": 36,
    "70-79": 6,
    "20-29": 1
  },
  "menopause": {
    "premeno": 150,
    "ge40": 129,
    "lt40": 7
  },
  "tumor_size": {
    "15-19": 30,
    "35-39": 19,
    "30-34": 60,
    "25-29": 54,
    "40-44": 22,
    "10-14": 28,
    "0-4": 8,
    "20-24": 50,
    "45-49": 3,
    "50-54": 8,
    "5-9": 4
  },
  "inv_nodes": {
    "0-2": 213,
    "3-5": 36,
    "15-17": 6,
    "6-8": 17,
    "9-11": 10,
    "24-26": 1,
    "12-14": 3
  },
  "node_caps": {
    "yes": 56,
    "no": 222,
    "?": 8
  },
  "deg_malig": {
    "3": 85,
    "1": 71,
    "2": 130
  },
  "breast": {
    "right": 134,
    "left": 152
  },
  "breast_quad": {
    "left_up": 97,
    "central": 21,
    "ft_low": 1,
    "left_low": 109,
    "right_up": 33,
    "right_low": 24,
    "?": 1
  },
  "irradiat": {
    "no": 217,
    "NaN": 1,
    "yes": 68
  },
  "Class": {
    "recurrence-events": 85,
    "no-recurrence-events": 201
  }
}


In [6]:
# Attribute description
# Frequency histogram of the possible values for each attribute
# TODO: not implemented yet. The commented-out snippet below plots a
# histogram of random data, not of the dataset.

# np.histogram...

# import matplotlib.pyplot as plt
# rng = np.random.RandomState(10)  # deterministic random data
# a = np.hstack((rng.normal(size=1000),
#                 rng.normal(loc=5, scale=2, size=1000)))

# plt.hist(a, bins='auto')  # arguments are passed to np.histogram
# plt.title("Histogram with 'auto' bins")
# plt.show()

### Data preparation

1. Deal with missing attributes
2. Categorical values encoding

#### Missing values

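Remove records with value `'?'` in attribute `node_caps` or `breast_quad`, or with value `'NaN'` in attribute `irradiat`.

**Note:** the outputs below suggest that only the record with the missing `irradiat` value was actually dropped: 285 of the 286 records remain, and `node_caps` and `breast_quad` each still contain an extra category with 8 and 1 occurrences respectively, which matches the original `'?'` counts.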

In [21]:
# Drop the records with missing values; `remove_missing_values` is a helper
# defined outside this cell.
df = remove_missing_values(df)
df.head()

# NOTE: the raw data holds a NaN in "irradiat" (row 2 of the first preview);
# that row no longer appears in the preview printed by this cell.

Unnamed: 0,age,menopause,tumor_size,inv_nodes,node_caps,deg_malig,breast,breast_quad,irradiat,Class
0,40-49,premeno,15-19,0-2,yes,3,right,left_up,no,recurrence-events
1,50-59,ge40,15-19,0-2,no,1,right,central,no,no-recurrence-events
3,40-49,premeno,35-39,0-2,yes,3,right,left_low,yes,no-recurrence-events
4,40-49,premeno,30-34,3-5,yes,2,left,right_up,no,recurrence-events
5,50-59,premeno,25-29,3-5,no,2,right,left_up,yes,no-recurrence-events


#### Categorical values encoding

In [22]:
# Replace every categorical value by an integer code (see the preview below).
# `encode_df` is a helper defined outside this cell; `label_encoder` is
# presumably what maps codes back to the original labels — confirm.
df_encoded, label_encoder = encode_df(df)
df_encoded.head()

Unnamed: 0,age,menopause,tumor_size,inv_nodes,node_caps,deg_malig,breast,breast_quad,irradiat,Class
0,2,2,2,0,2,2,1,3,0,1
1,3,0,2,0,1,0,1,1,0,0
3,2,2,6,0,2,2,1,2,1,0
4,2,2,5,4,2,1,0,5,0,1
5,3,2,4,4,1,1,1,3,1,0


### Attribute overview

In [23]:
# Collect, for each attribute, the values present in the encoded frame.
val_dict = attr_val_dict(df_encoded)

# Save the mapping to disk; `save_json` is a helper defined outside this
# cell (by its name it writes JSON — confirm).
fpath = "/tmp/DM2_attr_val_encoded.json"
save_json(val_dict, fpath)

# One line per attribute: its name followed by its values in ascending order.
for attr_name, attr_values in val_dict.items():
    print(attr_name, sorted(attr_values))

age [0, 1, 2, 3, 4, 5]
menopause [0, 1, 2]
tumor_size [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
inv_nodes [0, 1, 2, 3, 4, 5, 6]
node_caps [0, 1, 2]
deg_malig [0, 1, 2]
breast [0, 1]
breast_quad [0, 1, 2, 3, 4, 5]
irradiat [0, 1]
Class [0, 1]


### Attribute value-frequency

In [24]:
# Count, for every attribute of the encoded frame, how many records carry
# each integer code.
freq = attr_val_freq(df_encoded)

# Display the per-attribute counts.
pprint(freq)

{
  "age": {
    "2": 90,
    "3": 95,
    "4": 57,
    "1": 36,
    "5": 6,
    "0": 1
  },
  "menopause": {
    "2": 150,
    "0": 128,
    "1": 7
  },
  "tumor_size": {
    "2": 30,
    "6": 18,
    "5": 60,
    "4": 54,
    "7": 22,
    "1": 28,
    "0": 8,
    "3": 50,
    "8": 3,
    "10": 8,
    "9": 4
  },
  "inv_nodes": {
    "0": 212,
    "4": 36,
    "2": 6,
    "5": 17,
    "6": 10,
    "3": 1,
    "1": 3
  },
  "node_caps": {
    "2": 56,
    "1": 221,
    "0": 8
  },
  "deg_malig": {
    "2": 85,
    "0": 71,
    "1": 129
  },
  "breast": {
    "1": 134,
    "0": 151
  },
  "breast_quad": {
    "3": 97,
    "1": 21,
    "2": 109,
    "5": 33,
    "4": 24,
    "0": 1
  },
  "irradiat": {
    "0": 217,
    "1": 68
  },
  "Class": {
    "1": 84,
    "0": 201
  }
}


### Cross validation training

In [25]:
# Split the encoded frame into the network inputs X and the label vector y.
X, y = get_nn_inputs(df_encoded)

# Preview the first ten samples and their labels, separated by a blank line.
print(X[:10], y[:10], sep="\n\n")

[(2, 2, 2, 0, 2, 2, 1, 3, 0), (3, 0, 2, 0, 1, 0, 1, 1, 0), (2, 2, 6, 0, 2, 2, 1, 2, 1), (2, 2, 5, 4, 2, 1, 0, 5, 0), (3, 2, 4, 4, 1, 1, 1, 3, 1), (3, 0, 7, 0, 1, 2, 0, 3, 0), (2, 2, 1, 0, 1, 1, 0, 3, 0), (2, 2, 0, 0, 1, 1, 1, 4, 0), (2, 0, 7, 2, 2, 1, 1, 3, 1), (3, 2, 4, 0, 1, 1, 0, 2, 0)]

[1, 0, 0, 1, 0, 0, 0, 0, 0, 0]


In [26]:
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPClassifier

# Baseline MLP trained with the SGD solver; all other hyper-parameters are
# left at their scikit-learn defaults (max_iter=200 is itself the default).
clf = MLPClassifier(solver='sgd', max_iter=200)

# 10-fold cross-validation: one score per fold, printed one per line.
scores = cross_val_score(clf, X, y, cv=10)
for fold_score in scores:
    print(fold_score)

0.666666666667
0.689655172414
0.793103448276
0.689655172414
0.714285714286
0.714285714286
0.678571428571
0.785714285714
0.75
0.607142857143


### Hyper-parameter optimization

#### Define hyper-parameter space

In [27]:
from sklearn.model_selection import RandomizedSearchCV

# NOTE(review): `param_grid` is not defined in this cell — presumably it is
# provided by one of the helper notebooks loaded with `%run`; confirm.
search_options = dict(
    param_distributions=param_grid,
    n_iter=30,          # 30 randomly sampled candidates
    n_jobs=4,           # 4 parallel jobs
    refit=True,
    cv=10,              # 10-fold cross-validation
    verbose=0,
    random_state=None,  # unseeded: sampled candidates differ between runs
)

random_search = RandomizedSearchCV(clf, **search_options)
random_search.fit(X, y)

# Report the best candidate found and its mean cross-validated score.
print("best params:\n{}".format(random_search.best_params_))
print("best score :\n{}".format(random_search.best_score_))



best params:
{'learning_rate': 'adaptive', 'hidden_layer_sizes': (37, 44)}
best score :
0.7368421052631579




### Discussion on choosing K in K-fold

_TODO: discuss the choice of K for the K-fold cross-validation used above._