In [None]:
import pandas as pd
import numpy as np
import geopandas as gp
import geoplot
import seaborn

In [None]:
import matplotlib.pyplot as plt
import time
import shapely
import rtree

In [None]:
import sklearn
from sklearn.ensemble import HistGradientBoostingClassifier as GBT
from sklearn.metrics import roc_curve, auc, roc_auc_score, make_scorer
from sklearn.inspection import permutation_importance
from sklearn.model_selection import RandomizedSearchCV, train_test_split, cross_val_score, GridSearchCV
from scipy.stats import randint,uniform,loguniform

In [None]:
from joblib import dump, load

# Load data

In [None]:
## Load data
# Both tables are read with identical options: 'dateOfTest' is parsed as a
# date and float is requested as the dtype for the columns.
_read_options = {'parse_dates': ['dateOfTest'], 'dtype': float}
data_raw = pd.read_csv('/Data/TB_Diagnostics/inputVars.csv', **_read_options)
data_vet = pd.read_csv('/Data/TB_Diagnostics/inputVars_VetOnly.csv', **_read_options)

In [None]:
## Choose a random subset, same size as vet data
# No random_state is passed, so each run draws a different subset.
n_vet_rows = len(data_vet)
data = data_raw.sample(n=n_vet_rows)

In [None]:
# Number of rows in the sampled table
data.shape[0]

In [None]:
# Earliest and latest test date in the sampled rows.
# The pandas reductions are used instead of the builtin min()/max(): they are
# vectorised and skip missing dates (NaT), whereas the builtins compare
# element by element and give order-dependent answers when NaT is present.
data.dateOfTest.min(), data.dateOfTest.max()

In [None]:
# Target: the 'confirmedBreakdown' column as a boolean NumPy array
# NOTE(review): a NaN in this column would cast to True - confirm none occur.
data_y = (
    data['confirmedBreakdown']
    .to_numpy()
    .astype(bool)
)

In [None]:
# Observed features: every column except the target
data_X = data.drop('confirmedBreakdown', axis='columns')

In [None]:
# Convert dates to float
# The cast goes through an explicit 64-bit integer: plain `int` can resolve to
# a 32-bit type on some platforms, which cannot hold datetime64 values.
# NOTE(review): assumes read_csv parsed dateOfTest to datetime64 and that no
# dates are missing (NaT would become a large negative number) - confirm.
data_X['dateOfTest'] = data_X['dateOfTest'].astype('int64').astype(float)
# Add Random features
# One uniform random value in [0, 1) per row; unseeded, so it differs per run.
data_X['rand'] = np.random.random_sample(len(data_X))

In [None]:
# Build one boolean flag per column of data_X (in column order):
# True when the column is explicitly named below, or when it holds at most
# 3 distinct values (a missing value counts as one of them).
named_cat_features = ['vetPractice','batchBovine','batchAvian']
cat_features = [
    (col in named_cat_features) or (data_X[col].nunique(dropna=False) <= 3)
    for col in data_X.columns
]

# NB: this is fine for boolean features (inc. missing values)
#      but needs a proper encoding for true categorical features.

In [None]:
# Convert all to float matrix
# data_X is rebound from a DataFrame to a NumPy array here, so column names
# are no longer attached and this cell cannot be re-run without re-running
# the cells that build the DataFrame first.
data_X = data_X.to_numpy()

# Training and testing sets

In [None]:
# Hold out 20% of the rows as a final test set.
# The split is random with no random_state and no stratification.
X_train, X_test, y_train, y_test = train_test_split(
    data_X,
    data_y,
    test_size=0.20,
)

# Model scoring functions

In [None]:
## Function: sensitivity(prediction,target)
# returns sensitivity of prediction vs. target
# Se = TP / (TP + FN)
def sensitivity(p,t):
    """Return the sensitivity (true-positive rate) of predictions `p` against targets `t`.

    Parameters
    ----------
    p : array-like
        Predicted classes; any truthy value counts as a positive prediction.
    t : array-like
        True classes, same length as `p`; any truthy value counts as positive.

    Returns
    -------
    float
        TP / (TP + FN), or NaN when `t` contains no positives.
    """
    # Coerce to boolean arrays so that `~` and `&` are logical operations
    # whatever the input type (lists, 0/1 integers, floats, booleans).
    p = np.asarray(p, dtype=bool)
    t = np.asarray(t, dtype=bool)
    TP = np.count_nonzero(p & t)
    FN = np.count_nonzero(~p & t)
    positives = TP + FN
    if positives == 0:
        # Undefined without positive targets; NaN is returned explicitly
        # rather than through a 0/0 division warning.
        return float('nan')
    return TP / positives

## Function: specificity(prediction,target)
# returns specificity of prediction vs. target
# Sp = TN / (TN + FP)
def specificity(p,t):
    """Return the specificity (true-negative rate) of predictions `p` against targets `t`.

    Parameters
    ----------
    p : array-like
        Predicted classes; any truthy value counts as a positive prediction.
    t : array-like
        True classes, same length as `p`; any truthy value counts as positive.

    Returns
    -------
    float
        TN / (TN + FP), or NaN when `t` contains no negatives.
    """
    # Coerce to boolean arrays: on integer input `~` is a bitwise NOT
    # (~0 == -1, ~1 == -2), which corrupts the counts below.
    p = np.asarray(p, dtype=bool)
    t = np.asarray(t, dtype=bool)
    TN = np.count_nonzero(~p & ~t)
    FP = np.count_nonzero(p & ~t)
    negatives = TN + FP
    if negatives == 0:
        # Undefined without negative targets; NaN is returned explicitly
        # rather than through a 0/0 division warning.
        return float('nan')
    return TN / negatives

### SICCT Test performance

In [None]:
# Column 1 of the test features, cast to boolean, is used as the SICCT-only prediction.
# NOTE(review): presumably column 1 holds the SICCT result - confirm against
# the CSV column order; a NaN in this column would cast to True.
sicct = np.asarray(X_test[:, 1], dtype=bool)

In [None]:
## Sensitivity
# Sensitivity of the SICCT-only prediction on the held-out test set;
# reused later in the ROC and threshold plots.
Se_sicct = sensitivity(sicct,y_test)
Se_sicct

In [None]:
## Specificity
# Specificity of the SICCT-only prediction on the held-out test set;
# reused later as the specificity floor for the model.
Sp_sicct = specificity(sicct,y_test)
Sp_sicct

In [None]:
## Accuracy
# Fraction of test rows where the SICCT-only prediction equals the target
np.mean(sicct == y_test)

### Custom model scoring function

In [None]:
# Set specificity threshold to level for SICCT-only prediction
# (read as a global by sensspec_score at call time)
specificity_threshold = Sp_sicct

In [None]:
# define a custom score function:
#    score keeps specificity above SICCT, maximises sensitivity
def sensspec_score(t,p): #input: true (t) and predicted (p) classes
    """Score predictions by sensitivity, subject to a specificity floor.

    Returns 0 when the specificity of `p` against `t` is below the global
    `specificity_threshold`; otherwise returns the sensitivity.
    """
    below_floor = specificity(p,t) < specificity_threshold
    return 0 if below_floor else sensitivity(p,t)

# Wrapped as a scikit-learn scorer object.
custom_score = make_scorer(sensspec_score)

# Hyperparameter tuning / cross-validation

In [None]:
# Create model
# Histogram gradient-boosting classifier with balanced class weights; the
# categorical flags computed earlier mark which columns are categorical.
gbt = GBT(
    class_weight='balanced',
    categorical_features=cat_features,
)

In [None]:
# define parameter spaces
# Discrete grid of candidate values
param_grid = dict(
    learning_rate=[1.0,0.1,0.01,0.001,0.0005,0.0001,0.00005,0.00001],
    max_leaf_nodes=[2,5,10,20,30,50,100,500,1000],
)

# Wider search space tried previously:
#param_dists = {'learning_rate':loguniform(0.00001,1.0),
#               'max_leaf_nodes':randint(2,10000)}

# Distributions sampled by the random search
param_dists = dict(
    learning_rate=loguniform(0.01,1.0),
    max_leaf_nodes=randint(2,2000),
)

In [None]:
# perform grid search
#model = GridSearchCV(gbt, param_grid, n_jobs=-1, cv=5)#, scoring='recall') #5-fold cross-validation #n_jobs -1 for all procs
#start_time = time.time()
#model.fit(X_train, y_train)
#print("%0.2f seconds" % (time.time() - start_time))

In [None]:
# perform random search
# 100 sampled parameter settings, 10-fold cross-validation, scored by ROC AUC,
# run on all processors (n_jobs=-1). The wall-clock fit time is printed.
model = RandomizedSearchCV(
    estimator=gbt,
    param_distributions=param_dists,
    n_iter=100,
    scoring='roc_auc',
    cv=10,
    n_jobs=-1,
    verbose=1,
)
start_time = time.time()
model.fit(X_train, y_train)
elapsed_seconds = time.time() - start_time
print("%0.2f seconds" % elapsed_seconds)

In [None]:
# Parameter setting selected by the random search
model.best_params_

In [None]:
# Score of the fitted search on the held-out test set
# (the search was configured with scoring='roc_auc')
model.score(X_test,y_test)

In [None]:
# Cross-validation results of the search as a table, used by the plots below
tuning_results = pd.DataFrame(model.cv_results_)

In [None]:
# Mean CV test score against learning rate (log axis), coloured by max_leaf_nodes
seaborn.relplot(
    data=tuning_results,
    x='param_learning_rate',
    y='mean_test_score',
    hue='param_max_leaf_nodes',
)
plt.xscale('log')
plt.grid()

In [None]:
# Mean CV test score against max_leaf_nodes (linear axis), coloured by learning rate
seaborn.relplot(
    data=tuning_results,
    x='param_max_leaf_nodes',
    y='mean_test_score',
    hue='param_learning_rate',
)
#plt.xscale('log')
plt.grid()

In [None]:
# Sampled parameter settings: colour = mean fit time, marker size = mean CV test score
seaborn.relplot(
    data=tuning_results,
    x='param_learning_rate',
    y='param_max_leaf_nodes',
    hue='mean_fit_time',
    size='mean_test_score',
    sizes=(1,200),
)
#plt.axhline(model.best_params_['max_leaf_nodes'],ls='--', label='Best fit')
#plt.axvline(model.best_params_['learning_rate'],ls='--')
# Mark the selected setting with a large blue '+'
best_learning_rate = model.best_params_['learning_rate']
best_max_leaf_nodes = model.best_params_['max_leaf_nodes']
plt.scatter(best_learning_rate, best_max_leaf_nodes, marker='+', c='b', s=300)
plt.xscale('log')

# Evaluate performance

In [None]:
## Model score on testing set: (score is metric set at training time)
# Same call as the one made directly after the search; repeated here to open
# the evaluation section.
model.score(X_test,y_test)

In [None]:
## Get test predictions for more detailed evaluation:
# y_test_result: predicted classes, used for sensitivity/specificity/accuracy
# y_score: decision-function values, used for the ROC curve
y_test_result = model.predict(X_test)
y_score = model.decision_function(X_test)

In [None]:
## Sensitivity
# Sensitivity of the model's predicted classes on the held-out test set
Se = sensitivity(y_test_result,y_test)
Se

In [None]:
## Specificity
# Specificity of the model's predicted classes on the held-out test set
Sp = specificity(y_test_result,y_test)
Sp

In [None]:
## Accuracy
# Fraction of test rows where the model's predicted class equals the target
np.mean(y_test_result == y_test)

---
# ROC Curves

In [None]:
# ROC curve of the model on the test set from its decision-function scores;
# the third return value of roc_curve is discarded.
fpr, tpr, _ = roc_curve(y_test,y_score)
# Area under that curve
roc_auc = auc(fpr,tpr)

In [None]:
# Display the area under the ROC curve
roc_auc

In [None]:
# ROC curve of the model, with the SICCT-only operating point and the
# chance diagonal for reference.
plt.figure()
lw = 2
plt.plot(
    fpr,
    tpr,
    lw=lw,
    label="ROC curve, model (area = %0.2f)" % roc_auc,
)
# Marker size is given as a number (it was previously the string '15').
plt.plot(1-Sp_sicct,Se_sicct,'+', label="SICCT only", ms=15)
plt.plot([0, 1], [0, 1], lw=lw, linestyle="--")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel("(1 - Specificity)")
plt.ylabel("Sensitivity")
plt.title("Receiver operating characteristic")
plt.legend(loc="lower right")
plt.show()

---
# Decision threshold choice

In [None]:
# function to apply decision threshold
def predict_with_threshold(X, model, decision_threshold):
    """Classify rows of `X` as positive when the model's probability reaches a threshold.

    Takes column 1 of `model.predict_proba(X)` and returns a boolean array that
    is True where that probability is >= `decision_threshold`.
    """
    positive_proba = model.predict_proba(X)[:, 1]
    return positive_proba >= decision_threshold

In [None]:
# try different thresholds
# Sweep 101 evenly spaced decision thresholds in [0, 1] and record the
# sensitivity and specificity of the thresholded predictions on the test set.
thresholds = np.linspace(0.0,1.0,101)
sens = np.zeros(len(thresholds)) #sensitivity at threshold
spec = np.zeros(len(thresholds)) #specificity at threshold
# The probabilities do not depend on the threshold, so score the test set
# once instead of once per threshold.
test_proba = model.predict_proba(X_test)[:,1]
for x, threshold in enumerate(thresholds):
    y_th = test_proba >= threshold
    sens[x] = sensitivity(y_th,y_test)
    spec[x] = specificity(y_th,y_test)

# Among thresholds whose specificity is at least that of SICCT alone, take the
# one with the highest sensitivity (the lowest such threshold on ties), so that
# best_sens and best_thresh always describe the same operating point.
# NOTE(review): the threshold is chosen on the test set, so best_sens is likely
# an optimistic estimate - consider selecting it on held-out training data.
feasible = np.flatnonzero(spec >= Sp_sicct)
if feasible.size == 0:
    raise ValueError('No decision threshold reaches the SICCT specificity.')
best_idx = feasible[np.argmax(sens[feasible])]
best_sens = sens[best_idx] #sensitivity s.t. specificity >= SICCT
best_thresh = thresholds[best_idx] #threshold with max sensitivity s.t. specificity >= SICCT

In [None]:
# plot thresholds
# Legend texts: values shown as percentages rounded to one decimal place,
# the threshold rounded to three decimal places.
best_sens_pct = round(best_sens*100,1)
sicct_sens_pct = round(Se_sicct*100,1)
sicct_spec_pct = round(Sp_sicct*100,1)
best_sens_label = 'Best sensitivity = '+str(best_sens_pct)+'%'
sicct_sens_label = 'SICCT sensitivity = '+str(sicct_sens_pct)+'%'
sicct_spec_label = 'SICCT specificity = '+str(sicct_spec_pct)+'%'
best_thresh_label = 'Best threshold = '+str(round(best_thresh,3))

# Model curves over the threshold sweep
plt.plot(thresholds,sens,label='Model sensitivity')
plt.plot(thresholds,spec,label='Model specificity')
# Reference lines: chosen threshold, its sensitivity, and the SICCT-only levels
plt.axvline(best_thresh,c='k',ls='-.',label=best_thresh_label)
plt.axhline(best_sens,c='k',ls='--',label=best_sens_label)
plt.axhline(Se_sicct,c='tab:blue',ls=':',label=sicct_sens_label)
plt.axhline(Sp_sicct,c='tab:orange',ls=':',label=sicct_spec_label)
plt.xlabel('Decision Threshold')
plt.legend(bbox_to_anchor=(1.0, 0.7))

In [None]:
# Increase in sensitivity
# Relative gain of the thresholded model over SICCT alone, shown as a percentage
relative_gain = (best_sens-Se_sicct)/Se_sicct
str(round(relative_gain * 100,1))+'% increase in sensitivity over SICCT alone.'

---
# Save model

In [None]:
# Save training / testing datasets to disk
# The four arrays are stored together as one tuple, in this order.
final_split = (X_train, X_test, y_train, y_test)
dump(final_split, '/Data/TB_Diagnostics/final_data_split_VetOnly_Control_5.data')
# Save model to disk
# `model` is the fitted search object.
dump(model, '/Data/TB_Diagnostics/final_model_VetOnly_Control_5.model')