In [3]:
import numpy, pandas, json

from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, confusion_matrix, roc_curve

from xgboost import XGBClassifier

from skops.io import dump, load

In [4]:
random_seed = 144
n_folds=10    
optimisation_score='recall'
scoring_metrics=['recall', 'roc_auc', 'precision', ]
mapping_dict = {'S':0, 'R':1}
folds = StratifiedKFold(n_splits=n_folds, random_state=random_seed, shuffle=True)
line=[]
very_major_errors = {}
major_errors = {}

In [5]:
X={}
Y={}
Z={}

for i in ['train', 'test', 'validation', 'mic']:
    X[i]={}
    Y[i]={}
    Z[i]={}
    with open('data/ds-'+i+'.npy', 'rb') as f:
        Y[i]['input'] = numpy.load(f)
        X[i]['input'] = numpy.load(f)
        Z[i]['input'] = numpy.load(f, allow_pickle=True)

suspectpza={}
for i in ['train', 'test', 'validation', 'mic']:
    suspectpza[i]={}
    with open('data/suspectpza-'+i+'.npy', 'rb') as f:
        suspectpza[i]['input'] = numpy.load(f)
        suspectpza[i]['predicted'] = numpy.load(f)


In [6]:
def construct_line(model_name , dataset, scores, y, best_parameters):
  
    row=[]
    row.append(model_name)
    row.append(dataset)

    if scores is not None:
        for i in scores:
            row.append(i)
    else:        
        if 'predicted' in y.keys():
            row.append(100*recall_score(y['input'],y['predicted'],pos_label=1))   
            row.append(None)     
            row.append(100*recall_score(y['input'],y['predicted'],pos_label=0))        
            row.append(None)
        else:
            row.append(None)
            row.append(None)
            row.append(None)
            row.append(None)
        if 'scores' in y.keys():
            row.append(100*roc_auc_score(y['input'],y['scores']))
        else:
            row.append(None)
        row.append(None)    
            
    table = confusion_matrix(y['input'], y['predicted'])
    row.append(table[0][0])
    row.append(table[0][1])
    row.append(table[1][0])
    row.append(table[1][1])
    row.append(json.dumps(best_parameters))
    return row

In [7]:
def tune_model(line, model, model_name, X, Y, param_grid, folds, optimisation_score):

    # hyperparameter tuning
    grid_search = GridSearchCV( model, 
                                param_grid, 
                                cv=folds, 
                                n_jobs=-1, 
                                return_train_score=True,
                                scoring=scoring_metrics,
                                refit=optimisation_score )

    grid_search.fit(X['train']['input'], Y['train']['input'])

    cv_results = pandas.DataFrame(grid_search.cv_results_)

    # get the best model
    best_model = grid_search.best_estimator_

    for dataset in ['train', 'test', 'validation', 'mic']: #, 'orphan']:
        
        if dataset=='train':
            scores = []
            for score in scoring_metrics:
                mean_score = cv_results[cv_results['rank_test_'+score]==1]['mean_test_'+score].values[0]
                std_score = cv_results[cv_results['rank_test_'+score]==1]['std_test_'+score].values[0]
                scores.append(100*mean_score)
                scores.append(100*std_score)
        else:
            scores=None
        Y[dataset]['predicted'] = best_model.predict(X[dataset]['input'])
        Y[dataset]['scores'] = best_model.predict_proba(X[dataset]['input'])[:,1]

        row = construct_line(model_name , dataset, scores, Y[dataset], grid_search.best_params_)
        line.append(row)

    return(line, best_model)

In [8]:
model_logistic_regression = LogisticRegression(random_state=random_seed, class_weight='balanced')

param_grid = [ 
                {   'penalty': ['l1', 'l2'],
                    'solver'  : ['newton-cg', 'lbfgs', 'liblinear'], 
                    'C': numpy.logspace(-3,3,7)
                } 
            ]

line, best_LR = tune_model(line, model_logistic_regression, "LR", X, Y, param_grid, folds, optimisation_score)

dump(best_LR, 'models/lr.skops')


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

In [10]:
# model_random_forest = RandomForestClassifier(random_state=random_seed, class_weight='balanced')

# param_grid = [ 
#                 { 'n_estimators': [2, 5, 10, 50, 100, 200, 500],                        
#                 'max_features': [1, 2, 4, 8, 16, 20],
#                 'max_depth':[1,2,3,4,5,6],
#                 'bootstrap':[True,False] } 
#             ]

# line = tune_model(line, model_random_forest, 'RF', X, Y, param_grid, folds, optimisation_score)


In [11]:
model_nn = MLPClassifier(random_state=random_seed)

param_grid = [ 
        {   'solver': ['lbfgs'],
            'max_iter': [400, 800],
            'alpha':10.0 ** -numpy.arange(1, 7),
            'hidden_layer_sizes':[(100,), (20,10,5), (10,5), (20,10), (100,50,10)] ,
            'activation':['relu','logistic','tanh']} 
    ]

line, best_NN = tune_model(line, model_nn, "NN", X, Y, param_grid, folds, optimisation_score)

dump(best_NN, 'models/nn.skops')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

In [13]:
model_xgb = XGBClassifier()

param_grid = [ 
        {   'n_estimators': [25, 50, 100, 150, 200],
            'subsample': [0.6, 0.7, 0.8, 0.9, 1],
            'max_depth':[2,4,5,6,8],
            'min_child_weight': [0, 0.1, 1, 10],
            'learning_rate':[0.005, 0.01, 0.05, 0.075, 0.1, 0.125, 0.1] } 
    ]

line, best_XB = tune_model(line, model_xgb, "XB", X, Y, param_grid, folds, optimisation_score)

dump(best_XB, 'models/xb.skops')


In [27]:
for dataset in ['train', 'test', 'validation', 'mic']:
    line.append(construct_line('SP', dataset, None, suspectpza[dataset], None))

In [14]:
results = pandas.DataFrame(line, columns=['model', 'dataset', 'sensitivity_mean', 'sensitivity_std', 'specificity_mean', 'specificity_std' ,'roc_auc_mean', 'roc_auc_std','TN','FP','FN','TP', 'model_parameters'])
results

Unnamed: 0,model,dataset,sensitivity_mean,sensitivity_std,specificity_mean,specificity_std,roc_auc_mean,roc_auc_std,TN,FP,FN,TP,model_parameters
0,LR,train,78.8,6.200717,84.357576,5.1144,82.48406,7.276335,180,38,49,197,"{""C"": 1.0, ""penalty"": ""l1"", ""solver"": ""libline..."
1,LR,test,78.640777,,70.103093,,82.684416,,68,29,22,81,"{""C"": 1.0, ""penalty"": ""l1"", ""solver"": ""libline..."
2,LR,validation,97.557471,,43.845535,,80.006802,,545,698,68,2716,"{""C"": 1.0, ""penalty"": ""l1"", ""solver"": ""libline..."
3,LR,mic,100.0,,14.285714,,68.0,,1,6,0,50,"{""C"": 1.0, ""penalty"": ""l1"", ""solver"": ""libline..."
4,NN,train,78.916667,5.14795,83.682792,5.282488,81.760028,7.781499,218,0,0,246,"{""activation"": ""logistic"", ""alpha"": 0.01, ""hid..."
5,NN,test,76.699029,,67.010309,,79.861876,,65,32,24,79,"{""activation"": ""logistic"", ""alpha"": 0.01, ""hid..."
6,NN,validation,94.755747,,48.833467,,77.079895,,607,636,146,2638,"{""activation"": ""logistic"", ""alpha"": 0.01, ""hid..."
7,NN,mic,96.0,,28.571429,,69.714286,,2,5,2,48,"{""activation"": ""logistic"", ""alpha"": 0.01, ""hid..."
8,XB,train,79.216667,5.468115,85.781926,4.776355,83.502833,7.88022,192,26,43,203,"{""learning_rate"": 0.05, ""max_depth"": 4, ""min_c..."
9,XB,test,77.669903,,75.257732,,82.764488,,73,24,23,80,"{""learning_rate"": 0.05, ""max_depth"": 4, ""min_c..."


In [15]:
results['diagnostic_odds_ratio'] = (results['TN']*results['TP'])/(results['FN']*results['FP'])
results.to_csv('results.csv')
results[:3]

Unnamed: 0,model,dataset,sensitivity_mean,sensitivity_std,specificity_mean,specificity_std,roc_auc_mean,roc_auc_std,TN,FP,FN,TP,model_parameters,diagnostic_odds_ratio
0,LR,train,78.8,6.200717,84.357576,5.1144,82.48406,7.276335,180,38,49,197,"{""C"": 1.0, ""penalty"": ""l1"", ""solver"": ""libline...",19.044039
1,LR,test,78.640777,,70.103093,,82.684416,,68,29,22,81,"{""C"": 1.0, ""penalty"": ""l1"", ""solver"": ""libline...",8.633229
2,LR,validation,97.557471,,43.845535,,80.006802,,545,698,68,2716,"{""C"": 1.0, ""penalty"": ""l1"", ""solver"": ""libline...",31.186162


In [30]:
results

Unnamed: 0,model,dataset,sensitivity_mean,sensitivity_std,specificity_mean,specificity_std,roc_auc_mean,roc_auc_std,TN,FP,FN,TP,model_parameters,diagnostic_odds_ratio
0,LR,train,78.8,6.200717,84.357576,5.1144,82.48406,7.276335,180,38,49,197,"{""C"": 1.0, ""penalty"": ""l1"", ""solver"": ""libline...",19.044039
1,LR,test,78.640777,,70.103093,,82.684416,,68,29,22,81,"{""C"": 1.0, ""penalty"": ""l1"", ""solver"": ""libline...",8.633229
2,LR,validation,97.557471,,43.845535,,80.006802,,545,698,68,2716,"{""C"": 1.0, ""penalty"": ""l1"", ""solver"": ""libline...",31.186162
3,LR,mic,100.0,,14.285714,,68.0,,1,6,0,50,"{""C"": 1.0, ""penalty"": ""l1"", ""solver"": ""libline...",inf
4,NN,train,78.916667,5.14795,83.682792,5.282488,81.760028,7.781499,218,0,0,246,"{""activation"": ""logistic"", ""alpha"": 0.01, ""hid...",inf
5,NN,test,76.699029,,67.010309,,79.861876,,65,32,24,79,"{""activation"": ""logistic"", ""alpha"": 0.01, ""hid...",6.686198
6,NN,validation,94.755747,,48.833467,,77.079895,,607,636,146,2638,"{""activation"": ""logistic"", ""alpha"": 0.01, ""hid...",17.244615
7,NN,mic,96.0,,28.571429,,69.714286,,2,5,2,48,"{""activation"": ""logistic"", ""alpha"": 0.01, ""hid...",9.6
8,XB,train,79.216667,5.468115,85.781926,4.776355,83.502833,7.88022,192,26,43,203,"{""learning_rate"": 0.05, ""max_depth"": 4, ""min_c...",34.862254
9,XB,test,77.669903,,75.257732,,82.764488,,73,24,23,80,"{""learning_rate"": 0.05, ""max_depth"": 4, ""min_c...",10.57971
