In [18]:
import numpy, pandas

from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

from xgboost import XGBClassifier

from skops.io import dump

from misc import construct_line

### Model training

Now we can perform hyperparameter tuning using crossfold validation on the `training` set, select the best model and record its performance, which as it has been measured independently on each fold, naturally has a variance.

Note that 
* the `random_seed` is fixed to ensure reproducibility
* since the `1` state is resistance we are predicting resistance
* we are optimising using `recall` to ensure the very major error (VME) rate is minimised


In [19]:
random_seed = 144
n_folds=10    
optimisation_score='recall'
scoring_metrics=['recall', 'roc_auc', 'precision', ]
mapping_dict = {'S':0, 'R':1}
folds = StratifiedKFold(n_splits=n_folds, random_state=random_seed, shuffle=True)
line=[]
very_major_errors = {}
major_errors = {}

Let's load the `training` arrays and also the result of SuspectPZA applied to our `training` dataset

In [20]:
X={}
Y={}
Z={}

X['train']={}
Y['train']={}
Z['train']={}

with open('data/ds-train.npy', 'rb') as f:
    Y['train']['input'] = numpy.load(f)
    X['train']['input'] = numpy.load(f)
    Z['train']['input'] = numpy.load(f, allow_pickle=True)

suspectpza={}
suspectpza['train']={}
with open('data/suspectpza-train.npy', 'rb') as f:
    suspectpza['train']['input'] = numpy.load(f)
    suspectpza['train']['predicted'] = numpy.load(f)


The below function performs the hyperparameter tuning on a provided model and records several performance metrics

In [21]:
def tune_model(line, model, model_name, X, Y, param_grid, folds, optimisation_score):

    # hyperparameter tuning
    grid_search = GridSearchCV( model, 
                                param_grid, 
                                cv=folds, 
                                n_jobs=-1, 
                                return_train_score=True,
                                scoring=scoring_metrics,
                                refit=optimisation_score )

    grid_search.fit(X['train']['input'], Y['train']['input'])

    cv_results = pandas.DataFrame(grid_search.cv_results_)

    # get the best model
    best_model = grid_search.best_estimator_
        
    scores = []
    for score in scoring_metrics:
        mean_score = cv_results[cv_results['rank_test_'+score]==1]['mean_test_'+score].values[0]
        std_score = cv_results[cv_results['rank_test_'+score]==1]['std_test_'+score].values[0]
        scores.append(100*mean_score)
        scores.append(100*std_score)

    Y['train']['predicted'] = best_model.predict(X['train']['input'])
    Y['train']['scores'] = best_model.predict_proba(X['train']['input'])[:,1]

    row = construct_line(model_name , 'train', scores, Y['train'], grid_search.best_params_)
    line.append(row)

    return(line, best_model)

### 1. Logistic regression

In [22]:
model_logistic_regression = LogisticRegression(random_state=random_seed, class_weight='balanced')

param_grid = [ 
                {   'penalty': ['l1', 'l2'],
                    'solver'  : ['newton-cg', 'lbfgs', 'liblinear'], 
                    'C': numpy.logspace(-3,3,7)
                } 
            ]

line, best_LR = tune_model(line, model_logistic_regression, "LR", X, Y, param_grid, folds, optimisation_score)

dump(best_LR, 'models/lr.skops')


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

### 2. Multilayer perception classifier

In [24]:
model_nn = MLPClassifier(random_state=random_seed)

param_grid = [ 
        {   'solver': ['lbfgs'],
            'max_iter': [400, 800],
            'alpha':10.0 ** -numpy.arange(1, 7),
            'hidden_layer_sizes':[(100,), (20,10,5), (10,5), (20,10), (100,50,10)] ,
            'activation':['relu','logistic','tanh']} 
    ]

line, best_NN = tune_model(line, model_nn, "NN", X, Y, param_grid, folds, optimisation_score)

dump(best_NN, 'models/nn.skops')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

### 3. XGBoost Classifier

In [25]:
model_xgb = XGBClassifier()

param_grid = [ 
        {   'n_estimators': [25, 50, 100, 150, 200],
            'subsample': [0.6, 0.7, 0.8, 0.9, 1],
            'max_depth':[2,4,5,6,8],
            'min_child_weight': [0, 0.1, 1, 10],
            'learning_rate':[0.005, 0.01, 0.05, 0.075, 0.1, 0.125, 0.1] } 
    ]

line, best_XB = tune_model(line, model_xgb, "XB", X, Y, param_grid, folds, optimisation_score)

dump(best_XB, 'models/xb.skops')


### 4. SuspectPZA

Now let's add the results for the SuspectPZA model

In [27]:
line.append(construct_line('SP', 'train', None, suspectpza['train'], None))

Finally let's store everything in a Pandas dataframe to make it easy to query and draw figures etc

In [28]:
training_results = pandas.DataFrame(line, columns=['model', 'dataset', 'sensitivity_mean', 'sensitivity_std', 'specificity_mean', 'specificity_std' ,'roc_auc_mean', 'roc_auc_std','TN','FP','FN','TP', 'model_parameters'])

# calculate the diagnostic odds ration
training_results['diagnostic_odds_ratio'] = (training_results['TN']*training_results['TP'])/(training_results['FN']*training_results['FP'])

# save to disc as a CSV
training_results.to_csv('results-training.csv', index=False)

training_results

Unnamed: 0,model,dataset,sensitivity_mean,sensitivity_std,specificity_mean,specificity_std,roc_auc_mean,roc_auc_std,TN,FP,FN,TP,model_parameters,diagnostic_odds_ratio
0,LR,train,78.8,6.200717,84.357576,5.1144,82.48406,7.276335,180,38,49,197,"{""C"": 1.0, ""penalty"": ""l1"", ""solver"": ""libline...",19.044039
1,NN,train,78.916667,5.14795,83.682792,5.282488,81.760028,7.781499,218,0,0,246,"{""activation"": ""logistic"", ""alpha"": 0.01, ""hid...",inf
2,XB,train,79.216667,5.468115,85.781926,4.776355,83.502833,7.88022,192,26,43,203,"{""learning_rate"": 0.05, ""max_depth"": 4, ""min_c...",34.862254
3,SP,train,97.96748,,95.412844,,,,208,10,5,241,,1002.56
4,SP,train,97.96748,,95.412844,,,,208,10,5,241,,1002.56
