In [2]:
import pandas as pd
from pandas.tools.plotting import scatter_matrix as smplot
import numpy as np
import sklearn 
import matplotlib.pyplot as plt

from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.optimizers import Adam, SGD
from keras.regularizers import l2
from keras.wrappers.scikit_learn import KerasClassifier

from sklearn.metrics import r2_score, accuracy_score, precision_score, f1_score, zero_one_loss, classification_report
from sklearn.model_selection import train_test_split, cross_validate, KFold, StratifiedKFold
from sklearn.decomposition import PCA

% matplotlib inline

Using TensorFlow backend.


In [3]:
df = pd.read_csv('../data/HR_comma_sep.csv')
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [4]:
jobs = df.sales.unique()
# categorize jobs column
dfjob = pd.get_dummies(df['sales'], prefix='job')
df = df.drop('sales', axis=1)
df = pd.concat([df, dfjob], axis=1)
# categorize salary column
dfslry = pd.get_dummies(df['salary'], prefix='salary_level')
df = df.drop('salary', axis=1)
df = pd.concat([df, dfslry], axis=1)

In [5]:
dffeature = df.drop('left', axis=1)

from sklearn.preprocessing import StandardScaler
ss = StandardScaler()

dffeature.iloc[:,2:5] = ss.fit_transform(dffeature.iloc[:,2:5])

X = dffeature.values

In [6]:
stopping_variance = 0.95 # stopping criteria
for n_comps in range(X.shape[1], 0, -1):
    pca = PCA(n_components = n_comps)
    pca.fit(X)
    if sum(pca.explained_variance_ratio_) >= stopping_variance:
        pca_sv = pca
    
    print('======================== ', n_comps, ' components =========================')
    print('===== explained variance ratio: ===========================================')
    print(pca.explained_variance_ratio_)
    print('\n')
    print("total variance explained: {:0.3f}".format(sum(pca.explained_variance_ratio_)))
    print('\n')

[  3.27878050e-01   1.95000057e-01   1.24071661e-01   9.81812308e-02
   5.09566149e-02   3.53485973e-02   2.67422105e-02   2.55568736e-02
   2.31983493e-02   1.58645338e-02   1.30853374e-02   1.26442883e-02
   1.17932476e-02   1.10656420e-02   1.07162373e-02   8.67959679e-03
   4.96001542e-03   4.25745676e-03   3.77508942e-33   1.39687060e-33]


total variance explained: 1.000


[  3.27878050e-01   1.95000057e-01   1.24071661e-01   9.81812308e-02
   5.09566149e-02   3.53485973e-02   2.67422105e-02   2.55568736e-02
   2.31983493e-02   1.58645338e-02   1.30853374e-02   1.26442883e-02
   1.17932476e-02   1.10656420e-02   1.07162373e-02   8.67959679e-03
   4.96001542e-03   4.25745676e-03   3.77508942e-33]


total variance explained: 1.000


[ 0.32787805  0.19500006  0.12407166  0.09818123  0.05095661  0.0353486
  0.02674221  0.02555687  0.02319835  0.01586453  0.01308534  0.01264429
  0.01179325  0.01106564  0.01071624  0.0086796   0.00496002  0.00425746]


total variance explained: 1.000


In [7]:
y = df.left.values
X_1pca = pca_sv.transform(X)
A, C, B, D = train_test_split(X_1pca, y, test_size=0.1)

In [8]:
def CreateDNNModel(input_dim, epos, opt=Adam(lr=0.1), lossfunc = 'binary_crossentropy', metrics = None,
                   hidden_nodes=[1], activators = ['sigmoid'], reg=None, verbose=1):
    
    model = Sequential()
    NNlayout = zip(hidden_nodes, activators)
    for i_layer, layerparam in enumerate(NNlayout):
        if i_layer == 0:
            model.add(Dense(layerparam[0], input_dim=input_dim, kernel_regularizer = reg))
        else:
            model.add(Dense(layerparam[0], kernel_regularizer = reg))
        model.add(Activation(layerparam[1]))
    if verbose:
        print(model.summary())    
    model.compile(optimizer = opt, loss = lossfunc, metrics=metrics)

    return model

### use "CLOSURE" to pass parameters in the KerasClassifier fn

In [9]:
# use a closure
def CreateModel(input_dim, epos, opt=Adam(lr=0.1), lossfunc = 'binary_crossentropy',
                metrics = None, hidden_nodes=[1], activators = ['sigmoid'], reg=None, verbose=0):        
    def do():
        return CreateDNNModel(input_dim, epos, reg=reg, hidden_nodes=hidden_nodes, opt=opt, lossfunc=lossfunc,
                              activators=activators, verbose=verbose, metrics=metrics)
    return do

def KerasCrossVal(A, B, myepos, myreg, myhidden_nodes, myactivators, myopt=Adam(lr=0.1), mybatch_size=64): 
    model = KerasClassifier(build_fn=CreateModel(A.shape[1], myepos, reg=myreg, opt=myopt, hidden_nodes=myhidden_nodes, 
                                                 activators=myactivators, metrics=['accuracy']), 
                            verbose=0)
    mycv = StratifiedKFold(n_splits=5, shuffle=True)
    results = cross_validate(model, A, B, cv=mycv, scoring=['accuracy','precision','recall','f1'], fit_params={'batch_size': mybatch_size})

    return results

##### a little study of the scoring metrics

In [11]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
y_true = [0,0,0,1,1]
y_pred = [1,0,0,1,0]
print("accuracy = ", accuracy_score(y_true, y_pred)) # (tp+tn)/(tp+tn+fp+fn)
print("precision = ", precision_score(y_true, y_pred)) # tp/(tp+fp)
print("recall = ", recall_score(y_true, y_pred)) # tp/(tp+fn)  
print("f1 = ", f1_score(y_true, y_pred)) # 2(precision*recall)/(precision+recall)

accuracy =  0.6
precision =  0.5
recall =  0.5
f1 =  0.5


In [12]:
y_true = [1-i for i in y]
y_pred = np.ones(len(y))
print("accuracy = ", accuracy_score(y_true, y_pred)) # (tp+tn)/(tp+tn+fp+fn)
print("precision = ", precision_score(y_true, y_pred)) # tp/(tp+fp)
print("recall = ", recall_score(y_true, y_pred)) # tp/(tp+fn)  
print("f1 = ", f1_score(y_true, y_pred)) # 2(precision*recall)/(precision+recall)

accuracy =  0.761917461164
precision =  0.761917461164
recall =  1.0
f1 =  0.864873046505


### Prove of Concept: try with one DNN config

##### cross_validation in scikit learn v0.19.0
http://scikit-learn.org/stable/modules/cross_validation.html

scoring metrics:

http://scikit-learn.org/stable/modules/model_evaluation.html

In [13]:
myepos = 100
myreg = l2(0.01)
myhidden_nodes=[20, 1]
myactivators = ['relu','sigmoid']

scores = KerasCrossVal(A, B, myepos, myreg, myhidden_nodes, myactivators)
print('Fit Time = ', scores['fit_time'].mean(), " ~ ", scores['fit_time'].std())
print('Train Accuracy = ', scores['train_accuracy'].mean(), " ~ ", scores['train_accuracy'].std()) 
print('Test Accuracy = ', scores['test_accuracy'].mean(), " ~ ", scores['test_accuracy'].std()) 
print('Train Precision = ', scores['train_precision'].mean(), " ~ ", scores['train_precision'].std()) 
print('Test Precision = ', scores['test_precision'].mean(), " ~ ", scores['test_precision'].std()) 
print('Train Recall = ', scores['train_recall'].mean(), " ~ ", scores['train_recall'].std()) 
print('Test Recall = ', scores['test_recall'].mean(), " ~ ", scores['test_recall'].std()) 
print('Train F1 = ', scores['train_f1'].mean(), " ~ ", scores['train_f1'].std()) 
print('Test F1 = ', scores['test_f1'].mean(), " ~ ", scores['test_f1'].std()) 

Fit Time =  [ 4.12968898  3.97153687  4.19236994  3.69917607  3.85341311]
Train Accuracy =  0.88532536073  ~  0.0112667899373
Test Accuracy =  0.882508577715  ~  0.0105546116568
Train Precision =  0.860330922711  ~  0.0373554865657
Test Precision =  0.855503756531  ~  0.0397984360453
Train Recall =  0.624463414649  ~  0.03759309408
Test Recall =  0.615560421318  ~  0.0314560602616
Train F1 =  0.722678956136  ~  0.0292219809325
Test F1 =  0.715088184372  ~  0.0254974441931


## find the best parameter settings

### calculate all configurations

In [14]:
def ConfLayout(num_layers, num_nodes, output_nodes, activators):
    layoutconf = {}
    for n_layers in num_layers:
        
        last_layer_nodes = [output_nodes]
        output_activator = 'sigmoid' if output_nodes==1 else 'softmax'
        
        if n_layers == 0: # logistic regression
            layoutconf[n_layer] = np.array(last_layer_nodes.reverse())
            Activator = [output_activator]
            
        else: # at least one layer NN
            for i_layer in range(1,n_layers+1):
                if i_layer == 1:
                    last_layer_nodes = last_layer_nodes * len(num_nodes)
                    last_layer_nodes = [last_layer_nodes] + [num_nodes]
                else:
                    last_layer_nodes = [i*len(num_nodes) for i in last_layer_nodes]
                    curr_layer_nodes = [val for val in num_nodes for _ in range(len(num_nodes)**(i_layer-1))]
                    last_layer_nodes.append(curr_layer_nodes)
            
            Activator = [[i]*n_layers for i in activators]
        
        for i in range(len(Activator)):
            Activator[i].append(output_activator)
            
        last_layer_nodes.reverse()
        
        layoutconf[n_layers] = {'layerconf': np.array(last_layer_nodes),
                               'activatorconf': Activator}
        
    return layoutconf

#### write all config into csv

In [18]:
num_layer_opts = [1, 2, 3,]
num_node_opts = [5, 10, 20]
output_nodes = 1
activator_opts = ['relu']
batch_sizes = [32, 64, 256]
myepos = 200
myopts = ['SGD(lr=0.01)', 
          'Adam(lr=0.01)',
          'Adagrad(lr=0.01)',
          'RMSprop(lr=0.01)']
myregs = ['l2(0.1)',
          'l2(0.01)',
          'l2(0.001)']

layerconfigall = ConfLayout(num_layer_opts, num_node_opts, output_nodes, activator_opts)

In [27]:
D = []
for nlayer in num_layer_opts:
    for myactivators in layerconfigall[nlayer]['activatorconf']:
        for i in range(layerconfigall[nlayer]['layerconf'].shape[1]):
            myhidden_nodes = layerconfigall[nlayer]['layerconf'][:,i].tolist()
            for myreg in myregs:  
                for myopt in myopts:
                    for mybatchsize in batch_sizes:
                        
                        d = {'nlayers':nlayer,'layers':myhidden_nodes,'activators':myactivators,
                             'reg': myreg,
                             'opt': myopt,
                             'batch_size': mybatchsize}
                        D.append(d)
                        print(d)
               
pd.DataFrame(D, columns=['nlayers','layers','activators',
                         'reg','opt','batch_size']).to_csv('Ex2_cv_configs.csv')

{'layers': [5, 1], 'opt': 'SGD(lr=0.01)', 'activators': ['relu', 'sigmoid'], 'batch_size': 32, 'reg': 'l2(0.1)', 'nlayers': 1}
{'layers': [5, 1], 'opt': 'SGD(lr=0.01)', 'activators': ['relu', 'sigmoid'], 'batch_size': 64, 'reg': 'l2(0.1)', 'nlayers': 1}
{'layers': [5, 1], 'opt': 'SGD(lr=0.01)', 'activators': ['relu', 'sigmoid'], 'batch_size': 256, 'reg': 'l2(0.1)', 'nlayers': 1}
{'layers': [5, 1], 'opt': 'Adam(lr=0.01)', 'activators': ['relu', 'sigmoid'], 'batch_size': 32, 'reg': 'l2(0.1)', 'nlayers': 1}
{'layers': [5, 1], 'opt': 'Adam(lr=0.01)', 'activators': ['relu', 'sigmoid'], 'batch_size': 64, 'reg': 'l2(0.1)', 'nlayers': 1}
{'layers': [5, 1], 'opt': 'Adam(lr=0.01)', 'activators': ['relu', 'sigmoid'], 'batch_size': 256, 'reg': 'l2(0.1)', 'nlayers': 1}
{'layers': [5, 1], 'opt': 'Adagrad(lr=0.01)', 'activators': ['relu', 'sigmoid'], 'batch_size': 32, 'reg': 'l2(0.1)', 'nlayers': 1}
{'layers': [5, 1], 'opt': 'Adagrad(lr=0.01)', 'activators': ['relu', 'sigmoid'], 'batch_size': 64, 're

In [31]:
dfconf = pd.read_csv('Ex2_cv_configs.csv', index_col=0)
dfconf.head()

Unnamed: 0,nlayers,layers,activators,reg,opt,batch_size
0,1,"[5, 1]","['relu', 'sigmoid']",l2(0.1),SGD(lr=0.01),32
1,1,"[5, 1]","['relu', 'sigmoid']",l2(0.1),SGD(lr=0.01),64
2,1,"[5, 1]","['relu', 'sigmoid']",l2(0.1),SGD(lr=0.01),256
3,1,"[5, 1]","['relu', 'sigmoid']",l2(0.1),Adam(lr=0.01),32
4,1,"[5, 1]","['relu', 'sigmoid']",l2(0.1),Adam(lr=0.01),64


In [37]:
conf = dfconf.iloc[0].to_dict()

{'activators': "['relu', 'sigmoid']",
 'batch_size': 32,
 'layers': '[5, 1]',
 'nlayers': 1,
 'opt': 'SGD(lr=0.01)',
 'reg': 'l2(0.1)'}

## Iterate through all configs to find the best param setting

In [None]:
dscores = []
for i, conf in enumerate(dfconf):
    conf = conf.to_dict()
    scores = KerasCrossVal(A, B, myepos, conf['reg'], conf['layers'], 
                           conf['activators'], conf['opt'], conf['batch_size'])
    
    d['Fit Time'] = (scores['fit_time'].mean(), scores['fit_time'].std())
    d['Train Accuracy'] = (scores['train_accuracy'].mean(), scores['train_accuracy'].std()) 
    d['Test Accuracy'] = (scores['test_accuracy'].mean(), scores['test_accuracy'].std()) 
    d['Train Precision'] = (scores['train_precision'].mean(), scores['train_precision'].std()) 
    d['Test Precision'] = (scores['test_precision'].mean(), scores['test_precision'].std()) 
    d['Train Recall'] = (scores['train_recall'].mean(), scores['train_recall'].std()) 
    d['Test Recall'] = (scores['test_recall'].mean(), scores['test_recall'].std()) 
    d['Train F1'] = (scores['train_f1'].mean(), scores['train_f1'].std()) 
    d['Test F1'] = (scores['test_f1'].mean(), scores['test_f1'].std()) 
    dscores.append(d)

dfscore = pd.DataFrame(dscores)