In [1]:
import warnings

import numpy as np


import matplotlib
from pca import analyze_componenets
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier



from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, \
 f1_score, roc_auc_score, mean_absolute_error, make_scorer, brier_score_loss

from tensorflow import keras

from data_loader import load_data

In [2]:
def get_scoring():
    return dict(accuracy=make_scorer(accuracy_score),
                precision=make_scorer(precision_score),
                recall=make_scorer(recall_score),
                f1_score=make_scorer(f1_score),
                roc_auc_scorer=make_scorer(roc_auc_score),
                mean_absolute_error=make_scorer(mean_absolute_error),
                brier_score=make_scorer(brier_score_loss))

def create_model(optimizer='adam', activation='linear', init_mode='uniform'
                 , dropout_rate=0.1, first_layer=40, second_layer=20, dim = None):
    model = keras.Sequential()
    model.add(keras.layers.Dropout(dropout_rate, input_shape=(dim,)))
    model.add(keras.layers.Dense(first_layer, kernel_initializer=init_mode, activation=activation))
    model.add(keras.layers.Dense(second_layer, kernel_initializer=init_mode, activation=activation))
    model.add(keras.layers.Dense(5, kernel_initializer=init_mode, activation=activation))
    model.add(keras.layers.Dense(2, kernel_initializer=init_mode, activation='softmax'))

    model.compile(optimizer=optimizer,
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    return model


def simpleGrid(top_pca_features = None):
    print('Importing data')

    if top_pca_features == None:
        data_x, data_y, features, number_of_features = load_data(
            effective_non_effective=True, coverage=False,
            grano_test=True, grano_production=True, my_test=True,
            my_production=True, scale=True)
    else:
        data_x, data_y, features, number_of_features = load_data(
            effective_non_effective=True, scale=True, include=analyze_componenets(top_pca_features))
    print('Import: DONE')

    batch_size = [100]
    activation = ['relu']
    optimizer = ['Adam']
    dropout_rate = [0.15]
    first_layer = [100]
    second_layer = [20]

    param_grid = dict(
        batch_size=batch_size, optimizer=optimizer, activation=activation,
        dropout_rate=dropout_rate, first_layer=first_layer,
        second_layer=second_layer, dim = [number_of_features])

    inner_cv = StratifiedKFold(n_splits=10, shuffle=True)
    model = KerasClassifier(build_fn=create_model,verbose=0, epochs=2000, batch_size=50)

    early_stopping_monitor = keras.callbacks.EarlyStopping(
        monitor='accuracy', min_delta=0.0003, patience=10,
        verbose=0, mode='max', restore_best_weights=True)


    results = GridSearchCV(
        estimator=model, cv=inner_cv, param_grid=param_grid,
        scoring=get_scoring(), refit='roc_auc_scorer',verbose=20, n_jobs=-1)

    results.fit(data_x, data_y, callbacks=[early_stopping_monitor])


    print("-----------------------------")
    print(results)
    print(results.cv_results_.get('mean_test_accuracy'))
    print(max(results.cv_results_.get('mean_test_accuracy')))
    print('The best configuration is {}'.format(results.best_params_))
    config_index = np.argmax(results.cv_results_.get('mean_test_accuracy'))
    print(config_index)
    print("-----------------------------")
    accuracy = results.cv_results_.get('mean_test_accuracy')[config_index]
    precision = results.cv_results_.get('mean_test_precision')[config_index]
    recall = results.cv_results_.get('mean_test_recall')[config_index] 
    f1_score = results.cv_results_.get('mean_test_f1_score')[config_index] 
    roc_auc = results.cv_results_.get('mean_test_roc_auc_scorer')[config_index] 
    mae = results.cv_results_.get('mean_test_mean_absolute_error')[config_index]
    brier = results.cv_results_.get('mean_test_brier_score')[config_index]

    print("---------------------------------")
    print('Performances:\n'
          'Accuracy\t {:.3f}\n'
          'Precision\t {:.3f}\n'
          'Recall\t {:.3f}\n'
          'F1 Score\t {:.3f}\n'
          'ROC AUC\t {:.3f}\n'
          'MAE\t {:.3f}\n'
          'Brier Score\t {:.3f}\n'
          .format(accuracy, precision, recall, f1_score, roc_auc, mae, brier))
    print("---------------------------------")

    means = results.cv_results_.get('mean_test_accuracy')
    params = results.cv_results_.get('params')
    for mean, param in zip(means, params):
        print("%f with: %r" % (mean, param))
    return ['{:.3f}'.format(accuracy),
            '{:.3f}'.format(precision),
            '{:.3f}'.format(recall),
            '{:.3f}'.format(f1_score),
            '{:.3f}'.format(roc_auc),
            '{:.3f}'.format(mae),
            '{:.3f}'.format(brier)]


def main():
    simpleGrid(top_pca_features = 36)

if __name__ == '__main__':
    main()

Importing data


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  frame_low['mutation'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  frame_high['mutation'] = 1


(0) category: grano,production,code-smell - implication: 9.639312970427174e-18
§§§§§§§§§§§§§§§§ csm_FD ±±±±±±±±±±±±±±±
(1) category: grano,test,test-smell - implication: 2.207349574830871e-17
§§§§§§§§§§§§§§§§ isLazyTest ±±±±±±±±±±±±±±±
(2) category: mine,production,ast-shape - implication: 0.039421250014853
§§§§§§§§§§§§§§§§ DegPerm_production ±±±±±±±±±±±±±±±
(3) category: mine,test,mccabe-style - implication: 0.03960143262361034
§§§§§§§§§§§§§§§§ No. Continue ±±±±±±±±±±±±±±±
(4) category: mine,test,ast-shape - implication: 0.04277777844328933
§§§§§§§§§§§§§§§§ DegPerm ±±±±±±±±±±±±±±±
(5) category: mine,production,ast-shape - implication: 0.04588887021409247
§§§§§§§§§§§§§§§§ Avg Depth^2_production ±±±±±±±±±±±±±±±
(6) category: mine,production,ast-shape - implication: 0.046081459531892965
§§§§§§§§§§§§§§§§ Avg Depth^(-1)_production ±±±±±±±±±±±±±±±
(7) category: mine,production,ast-shape - implication: 0.04608433726459722
§§§§§§§§§§§§§§§§ Avg Depth_production ±±±±±±±±±±±±±±±
(8) category: mi

Import: DONE
Fitting 10 folds for each of 1 candidates, totalling 10 fits
[CV] dim=36, dropout_rate=0.15, batch_size=100, optimizer=Adam, activation=relu, first_layer=100, second_layer=20 [CV] dim=36, dropout_rate=0.15, batch_size=100, optimizer=Adam, activation=relu, first_layer=100, second_layer=20 

[CV] dim=36, dropout_rate=0.15, batch_size=100, optimizer=Adam, activation=relu, first_layer=100, second_layer=20 [CV] dim=36, dropout_rate=0.15, batch_size=100, optimizer=Adam, activation=relu, first_layer=100, second_layer=20 
[CV] dim=36, dropout_rate=0.15, batch_size=100, optimizer=Adam, activation=relu, first_layer=100, second_layer=20 
[CV] dim=36, dropout_rate=0.15, batch_size=100, optimizer=Adam, activation=relu, first_layer=100, second_layer=20 

[CV] dim=36, dropout_rate=0.15, batch_size=100, optimizer=Adam, activation=relu, first_layer=100, second_layer=20 [CV] dim=36, dropout_rate=0.15, batch_size=100, optimizer=Adam, activation=relu, first_layer=100, second_layer=20 

[CV] d

[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done   2 out of  10 | elapsed:    3.1s remaining:   12.3s


[CV]  dim=36, dropout_rate=0.15, batch_size=100, optimizer=Adam, activation=relu, first_layer=100, second_layer=20, brier_score=0.21052631578947367, f1_score=0.7910447761194029, roc_auc_scorer=0.7895748530076888, precision=0.7794117647058824, accuracy=0.7894736842105263, mean_absolute_error=0.21052631578947367, recall=0.803030303030303, total=   2.8s
[CV]  dim=36, dropout_rate=0.15, batch_size=100, optimizer=Adam, activation=relu, first_layer=100, second_layer=20, brier_score=0.18045112781954886, f1_score=0.8208955223880597, roc_auc_scorer=0.8196517412935324, precision=0.8088235294117647, accuracy=0.8195488721804511, mean_absolute_error=0.18045112781954886, recall=0.8333333333333334, total=   2.8s


[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:    3.1s remaining:    7.3s
[Parallel(n_jobs=-1)]: Done   4 out of  10 | elapsed:    3.2s remaining:    4.7s


[CV]  dim=36, dropout_rate=0.15, batch_size=100, optimizer=Adam, activation=relu, first_layer=100, second_layer=20, brier_score=0.20454545454545456, f1_score=0.7874015748031495, roc_auc_scorer=0.7950631458094145, precision=0.8064516129032258, accuracy=0.7954545454545454, mean_absolute_error=0.20454545454545456, recall=0.7692307692307693, total=   3.3s
[CV]  dim=36, dropout_rate=0.15, batch_size=100, optimizer=Adam, activation=relu, first_layer=100, second_layer=20, brier_score=0.2196969696969697, f1_score=0.7716535433070866, roc_auc_scorer=0.7799081515499426, precision=0.7903225806451613, accuracy=0.7803030303030303, mean_absolute_error=0.2196969696969697, recall=0.7538461538461538, total=   3.4s
[CV]  dim=36, dropout_rate=0.15, batch_size=100, optimizer=Adam, activation=relu, first_layer=100, second_layer=20, brier_score=0.21804511278195488, f1_score=0.7716535433070867, roc_auc_scorer=0.7816598824061511, precision=0.8032786885245902, accuracy=0.7819548872180451, mean_absolute_error=0.

[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    3.6s remaining:    3.6s
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    3.7s remaining:    2.5s
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    3.8s remaining:    1.6s


[CV]  dim=36, dropout_rate=0.15, batch_size=100, optimizer=Adam, activation=relu, first_layer=100, second_layer=20, brier_score=0.20300751879699247, f1_score=0.8085106382978724, roc_auc_scorer=0.7974898236092266, precision=0.76, accuracy=0.7969924812030075, mean_absolute_error=0.20300751879699247, recall=0.8636363636363636, total=   3.5s
[CV]  dim=36, dropout_rate=0.15, batch_size=100, optimizer=Adam, activation=relu, first_layer=100, second_layer=20, brier_score=0.14285714285714285, f1_score=0.8549618320610687, roc_auc_scorer=0.8570782451379467, precision=0.8615384615384616, accuracy=0.8571428571428571, mean_absolute_error=0.14285714285714285, recall=0.8484848484848485, total=   3.6s


[Parallel(n_jobs=-1)]: Done   8 out of  10 | elapsed:    3.9s remaining:    1.0s


[CV]  dim=36, dropout_rate=0.15, batch_size=100, optimizer=Adam, activation=relu, first_layer=100, second_layer=20, brier_score=0.18181818181818182, f1_score=0.8260869565217391, roc_auc_scorer=0.8190585533869116, precision=0.7808219178082192, accuracy=0.8181818181818182, mean_absolute_error=0.18181818181818182, recall=0.8769230769230769, total=   3.9s


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    4.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    4.2s finished


-----------------------------
GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=True),
       error_score='raise',
       estimator=<tensorflow.python.keras.wrappers.scikit_learn.KerasClassifier object at 0x7fdf1f8547b8>,
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'dim': [36], 'dropout_rate': [0.15], 'first_layer': [100], 'batch_size': [100], 'optimizer': ['Adam'], 'activation': ['relu'], 'second_layer': [20]},
       pre_dispatch='2*n_jobs', refit='roc_auc_scorer',
       return_train_score='warn',
       scoring={'brier_score': make_scorer(brier_score_loss), 'f1_score': make_scorer(f1_score), 'roc_auc_scorer': make_scorer(roc_auc_score), 'precision': make_scorer(precision_score), 'accuracy': make_scorer(accuracy_score), 'mean_absolute_error': make_scorer(mean_absolute_error), 'recall': make_scorer(recall_score)},
       verbose=20)
[0.79728711]
0.7972871137905049
The best configuration is {'dim': 36, 'dropout_rate': 0.15, 'batch_size': 100, 