# Mounting GDrive

In [None]:
# Upgrade xlrd; presumably required by pandas.read_excel for the .xls files read later — confirm.
!pip install --upgrade xlrd

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; force_remount=True remounts even when it is already mounted.
drive.mount("/content/drive", force_remount=True)

# Imports

In [None]:
!pip install shap
!pip install xgboost
!pip install

In [None]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.impute import KNNImputer 
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, GridSearchCV, RepeatedStratifiedKFold
import time
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn import svm, tree
from sklearn.ensemble import RandomForestClassifier
import shap
import ipywidgets as widgets

# Utility functions

The following utility function performs the train-test split and standard scaling.
Note: scaling changes the nature of the data, so be sure to remove it if you need to visualize the data or the models.

In [None]:
def train_test_split_standard_scaler(X, Y, trainSize, randomState):
    """Split the data into train/test parts and standardise the features.

    The StandardScaler is fitted on the training features only and then applied
    to both the training and the test features.

    Parameters
    ----------
    X : feature matrix passed to ``train_test_split``.
    Y : labels passed to ``train_test_split``.
    trainSize : forwarded as ``train_size``.
    randomState : forwarded as ``random_state``.

    Returns
    -------
    tuple
        ``(X_train, X_test, Y_train, Y_test, scaler)`` where the two feature
        parts are the scaler's transformed output and ``scaler`` is the fitted
        StandardScaler (needed to undo the scaling later).
    """
    parts = train_test_split(X, Y, train_size=trainSize, random_state=randomState)
    raw_train, raw_test, Y_train, Y_test = parts
    # Fit on the training part only so no test statistics leak into the scaling.
    scaler = StandardScaler().fit(raw_train)
    return (scaler.transform(raw_train), scaler.transform(raw_test), Y_train, Y_test, scaler)

Compressed training and tuning

In [None]:
def trainModel(model, params_to_optmise, X_train, X_test, Y_train, Y_test):
    """Tune ``model`` with a grid search and evaluate the best estimator on the test split.

    The grid search uses repeated stratified k-fold cross-validation (scikit-learn's
    default split/repeat counts, ``random_state=42``) and selects on balanced accuracy.
    The confusion matrix of the test predictions is plotted and the best parameters
    are printed.

    Parameters
    ----------
    model : estimator to tune.
    params_to_optmise : parameter grid handed to ``GridSearchCV``.
    X_train, Y_train : data used for the cross-validated search.
    X_test, Y_test : held-out data used for the reported metrics.

    Returns
    -------
    tuple
        ``(best_estimator, sensitivity, specificity, balanced_accuracy)``; the
        second entry of ``classes_`` is treated as the positive class.

    Raises
    ------
    ValueError
        If the fitted classifier is not binary (the metrics assume a 2x2 matrix).
    """
    rskf = RepeatedStratifiedKFold(random_state=42)
    grid_search_cv = GridSearchCV(model, params_to_optmise, n_jobs=-1, cv=rskf, scoring='balanced_accuracy', verbose=10)
    grid_search_cv.fit(X_train, Y_train)
    Y_pred = grid_search_cv.predict(X_test)

    classes = grid_search_cv.classes_
    if len(classes) != 2:
        raise ValueError('trainModel expects a binary problem, got classes {}'.format(list(classes)))
    # Passing labels keeps the matrix 2x2 and aligned with display_labels even when
    # one class happens to be absent from both Y_test and Y_pred.
    cm = confusion_matrix(Y_test, Y_pred, labels=classes)
    tn, fp, fn, tp = cm.ravel()
    sensitivity = tp/(tp+fn) #true positive rate -> to maximise
    specificity = tn/(tn+fp) #true negative rate -> to maximise
    balanced_accuracy = (sensitivity + specificity) / 2 #-> to maximise
    print(grid_search_cv.best_params_)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=classes)
    disp.plot()
    return grid_search_cv.best_estimator_, sensitivity, specificity, balanced_accuracy

Utility function for a non-built-in shap plot

In [None]:
def ABS_SHAP(df_shap,df):
    """Plot the mean absolute SHAP value of every feature as a horizontal bar chart.

    A bar is red when the SHAP values of the feature correlate positively with
    the feature values, and blue otherwise (non-positive or undefined correlation).

    Parameters
    ----------
    df_shap : array-like
        SHAP values with one column per column of ``df`` (same order) and one
        row per row of ``df``.
    df : pandas.DataFrame
        Feature values the SHAP values refer to; its column names label the bars.
    """
    feature_list = df.columns
    shap_v = pd.DataFrame(df_shap)
    shap_v.columns = feature_list
    # drop=True discards the old index directly, so this also works when the index
    # is named or when the frame already contains a column called 'index'.
    df_v = df.reset_index(drop=True)

    # Correlation between SHAP values and feature values decides the bar colour.
    corr_list = [np.corrcoef(shap_v[i], df_v[i])[1][0] for i in feature_list]
    # Column 1 is the feature, column 2 its correlation coefficient
    # (NaN, e.g. for a constant column, is treated as 0 -> blue).
    corr_df = pd.concat([pd.Series(feature_list),pd.Series(corr_list)],axis=1).fillna(0)
    corr_df.columns  = ['Variable','Corr']
    corr_df['Sign'] = np.where(corr_df['Corr']>0,'red','blue')

    # Mean |SHAP| per feature, sorted so the most important bar ends up on top.
    shap_abs = np.abs(shap_v)
    k = pd.DataFrame(shap_abs.mean()).reset_index()
    k.columns = ['Variable','SHAP_abs']
    k2 = k.merge(corr_df,left_on = 'Variable',right_on='Variable',how='inner')
    k2 = k2.sort_values(by='SHAP_abs',ascending = True)
    colorlist = k2['Sign']
    ax = k2.plot.barh(x='Variable',y='SHAP_abs',color = colorlist, figsize=(5,6),legend=False)
    ax.set_xlabel("SHAP Value (Red = Positive Impact)")

# Test 1: drop columns that are empty in test set

## Reading, dropping and encoding

Importing dataset

In [None]:
# Folder on the mounted shared drive that holds the spreadsheets.
base_path = "/content/drive/Shareddrives/IDA covidcxr-hackaton/"
# Training clinical data (including the Brixia score columns).
df = pd.read_excel(base_path+'trainClinData_brixiascore.xls')

Dropping some columns, in particular Ox_percentage and Position

In [None]:
# Identifier columns, the Death outcome, the columns named in the text above
# and the per-zone Brixia probabilities are not used as features.
columnsToDrop = [
    'Unnamed: 0', 'Row_number', 'ImageFile', 'Death', 'Ox_percentage', 'Position', 'Fibrinogen',
    'Brixia_score_prob_0', 'Brixia_score_prob_1', 'Brixia_score_prob_2',
    'Brixia_score_prob_3', 'Brixia_score_prob_4', 'Brixia_score_prob_5',
]
df = df.drop(columns=columnsToDrop)

Replace single brixia scores with their sum

In [None]:
# Collapse the six per-zone Brixia scores into a single global score.
brixiaZoneColumns = ['Brixia_score_{}'.format(zone) for zone in range(6)]
sumBsScore = df[brixiaZoneColumns[0]]
for zoneColumn in brixiaZoneColumns[1:]:
    sumBsScore = sumBsScore + df[zoneColumn]
df['Brixia_score'] = sumBsScore
df = df.drop(columns=brixiaZoneColumns)

Number encoding of Hospital

In [None]:
# Map the hospital letters A..F to the integers 1..6.
hospitalCodes = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6}
df.Hospital = df.Hospital.replace(hospitalCodes)

## Filling NaNs

Get features coming from bsnet aside.
Get classification label aside





In [None]:
# Columns produced by BSNet; they are left out of the KNN imputation below.
paramsFromBsNet =  ['Brixia_score_correctness', 'Brixia_score_confidence', 'Brixia_score']
# Classification target.
classParam = ['Prognosis']

Impute columns from beginning to CardiovascularDisease excluded

In [None]:
imputer = KNNImputer(n_neighbors=20, weights="distance")
# Columns that are NOT KNN-imputed; they are filled separately further down.
excludedColumns = [
    'CardiovascularDisease', 'IschemicHeartDisease', 'AtrialFibrillation', 'HeartFailure',
    'Ictus', 'HighBloodPressure', 'Diabetes', 'Dementia', 'BPCO', 'Cancer',
    'ChronicKidneyDisease', 'RespiratoryFailure', 'Obesity',
]
# Boolean mask over the columns: everything except the excluded, BSNet and label columns.
columnsToImpute = ~df.columns.isin(excludedColumns+paramsFromBsNet+classParam)
df.loc[:, columnsToImpute] = imputer.fit_transform(df.loc[:, columnsToImpute])

Resetting bool columns to 0 or 1:




In [None]:
# KNN imputation can produce fractional values; round them so the columns hold whole numbers again.
for boolColumn in ['PositivityAtAdmission', 'Cough', 'DifficultyInBreathing']:
    df[boolColumn] = df[boolColumn].round()

Filling NaNs in excluded columns using 0.5 as value

In [None]:
# Missing values in the excluded columns are replaced by 0.5.
df[excludedColumns] = df[excludedColumns].replace(np.nan, .5)

## Model-based Feature Selection

Removing records with Brixia_score_correctness = 0

In [None]:
# Keep only the rows whose Brixia_score_correctness is not 0.
df = df[df.Brixia_score_correctness != 0]

Since the *df* dataframe is now completely filled, we can estimate feature importances and carry out model-based feature selection.
The process below tells us which features to keep.

In [None]:
# Candidate features: every column except Brixia_score_correctness and the label.
featureColumns = df.columns[~df.columns.isin(['Brixia_score_correctness']+classParam)]
clf = ExtraTreesClassifier(n_jobs=-1, criterion='gini', n_estimators=1000, random_state=42,
                           max_samples=.5, bootstrap=True, class_weight='balanced_subsample')
clf = clf.fit(df.loc[:, featureColumns], df.Prognosis)
# threshold=-inf disables the importance cut-off, so max_features alone decides how many
# features are kept (17 also worked very well).
model = SelectFromModel(clf, prefit=True, threshold=-np.inf, max_features=19)
X_new = model.transform(df.loc[:, featureColumns])

Discover which are the selected features

In [None]:
print(X_new.shape)
# Apply the selector's support mask to the same candidate columns it was fitted on.
selectedFeatures = df.columns[ ~df.columns.isin(['Brixia_score_correctness']+classParam)][model.get_support()].tolist() #features selected by extra trees
print(selectedFeatures)

## Training some models

In [None]:
X = df.loc[:, selectedFeatures]
Y = df.Prognosis
# Keep the fitted scaler: it is needed later to inverse-transform X_train for the SHAP plots.
X_train, X_test, Y_train, Y_test, scaler = train_test_split_standard_scaler(X, Y, 0.75, 42)

### Let's start with a simple tree using selectedFeatures

In [None]:
# Hyper-parameter grid explored by trainModel for the decision tree.
params_to_optmise = {'max_depth' : [3, 4, 5], 'criterion' : ['gini', 'entropy'], 'min_samples_split' : [20, 30, 40, 50], 'min_samples_leaf' : [20, 30, 40, 50], 'splitter' : ['best', 'random']}
decisionTree = DecisionTreeClassifier(class_weight='balanced', random_state=42)
# Wall-clock time of the whole search + evaluation.
start = time.time()
# NOTE: `model` is rebound here; it no longer refers to the SelectFromModel selector.
model, se, sp, ba = trainModel(decisionTree, params_to_optmise, X_train, X_test, Y_train, Y_test)
end = time.time()
print('Sensitivity : {} \nSpecificity : {} \nBalanced Accuracy : {}\nTraining time : {}\nModel : {}'.format(se,sp,ba, end-start, model))

In [None]:
# Importance of every selected feature according to the tuned decision tree.
feat_importances = pd.Series(model.feature_importances_, index=selectedFeatures)
fig, ax = plt.subplots(figsize=(8, 6))
feat_importances.plot.barh(title='Decision Tree Feature Importances', ax=ax)

In [None]:
# Disabled cell: the tree-plotting code is wrapped in a string literal so it is not executed.
'''fig = plt.figure(figsize=(25,20))
_ = tree.plot_tree(model, feature_names=selectedFeatures, class_names=model.classes_, filled=True)
fig.savefig('Decision Tree.png')'''

### Let's continue with an SVM

In [None]:
#higher C, weaker the regularisation: fewer training misclassifications are tolerated (lower C allows more)
#higher gamma, higher the curvature of decision surface and the risk of overfitting

param_grid = {'gamma' : [1e-3, 1e-2, 1e-1, 1, 10], 'C' : [1e-3, 1e-2, 1e-1, 1, 10]}
supportVectorMachine = svm.SVC(class_weight='balanced',random_state=42)
start = time.time()
model, se, sp, ba = trainModel(supportVectorMachine, param_grid, X_train, X_test, Y_train, Y_test)
end = time.time()
print('Sensitivity : {} \nSpecificity : {} \nBalanced Accuracy : {}\n Training time : {}\n Model : {}'.format(se,sp,ba, end-start, model))

### Let's continue with a random forest

In [None]:
#max_samples = .6, min_samples_split = 30, n_estimators=500
# Single-point grid: the search only cross-validates this one configuration.
param_grid = {
    'min_samples_split' : [30],
    'n_estimators' : [500], 
    'max_samples' : [.6]
}
randomForest = RandomForestClassifier(n_jobs=-1, class_weight='balanced_subsample',
                                      random_state=42,criterion = 'gini',  bootstrap = True)
start = time.time()
model, se, sp, ba = trainModel(randomForest, param_grid, X_train, X_test, Y_train, Y_test)
end = time.time()
print('Sensitivity : {} \nSpecificity : {} \nBalanced Accuracy : {}\n Training time : {}\n Model : {}'.format(se,sp,ba, end-start, model))

In [None]:
# Importance of every selected feature according to the tuned random forest.
feat_importances = pd.Series(model.feature_importances_, index=selectedFeatures)
fig, ax = plt.subplots(figsize=(8, 6))
feat_importances.plot.barh(title='Random Forest Feature Importances', ax=ax)

In [None]:
# SHAP explainer for the estimator currently bound to `model` (the random forest when cells run in order).
explainer = shap.TreeExplainer(model)

In [None]:
# Undo the standardisation so the plots show feature values in their original units.
# `scaler` must be the StandardScaler fitted on this X_train split
# (the fifth value returned by train_test_split_standard_scaler).
X_train_df = scaler.inverse_transform(X_train)
X_train_df = pd.DataFrame(X_train_df, columns=selectedFeatures)

To understand the following shap plot please see the report

In [None]:
i=20
shap.initjs()
# The forest was fitted on the standardised X_train, so the SHAP values are computed on
# the scaled sample; the inverse-transformed row only labels the plot with original units.
# (The previous code referenced X3_train_df, which is not defined until the final section.)
shap_value = explainer.shap_values(X_train[i])
shap.force_plot(explainer.expected_value[1], shap_value[1], X_train_df.iloc[i]) #for severe class considering only 1 sample

### Let's try with extra tree classifier

In [None]:
# Hyper-parameter grid explored by trainModel for the extra-trees classifier.
param_grid = {
    'min_samples_split' : [40, 50], 
    'max_samples' : [.5, .6],
    'n_estimators' : [500, 1000, 1500]
}
xtc = ExtraTreesClassifier(n_jobs=-1, criterion='gini', random_state=42, class_weight='balanced_subsample', bootstrap=True)
start = time.time()
model, se, sp, ba = trainModel(xtc, param_grid, X_train, X_test, Y_train, Y_test)
end = time.time()
print('Sensitivity : {} \nSpecificity : {} \nBalanced Accuracy : {}\n Training time : {}\n Model : {}'.format(se,sp,ba, end-start, model))

In [None]:
# Importance of every selected feature according to the tuned extra-trees classifier.
feat_importances = pd.Series(model.feature_importances_, index=selectedFeatures)
fig, ax = plt.subplots(figsize=(8, 6))
feat_importances.plot.barh(title='Extra Trees Feature Importances', ax=ax)

### Let's try XGBoost




In [None]:
# Hyper-parameter grid explored by trainModel for XGBoost.
param_grid = {
     'reg_alpha':[1e-2, 1e-1],
     'gamma' : [.1, .2, .3],
     'subsample' : [.7, .8, .9],
     'colsample_bytree' : [.7, .8, .9],
     'n_estimators' : [100, 120, 150, 300, 500]
}

# NOTE(review): both `seed` and `random_state` are passed with different values; which one
# takes effect presumably depends on the installed xgboost version — confirm and keep only one.
xgbc = XGBClassifier(objective= 'binary:logistic', seed=27, random_state=42) #BEST
start = time.time()
model, se, sp, ba = trainModel(xgbc, param_grid, X_train, X_test, Y_train, Y_test)
end = time.time()
print('Sensitivity : {} \nSpecificity : {} \nBalanced Accuracy : {}\n Training time : {}\n Model : {}'.format(se,sp,ba, end-start, model))

In [None]:
# Importance of every selected feature according to the tuned XGBoost model.
feat_importances = pd.Series(model.feature_importances_, index=selectedFeatures)
fig, ax = plt.subplots(figsize=(8, 6))
feat_importances.plot.barh(title='XGB Feature Importances', ax=ax)

#  Final steps: re-tuning on F records and re-training over 99% of training set

Select the F records from the dataset. They will be used to tune a new random forest focused on the records coming from hospital F.

In [None]:
# Hospital 'F' was encoded as 6 above.
F_records = df[df.Hospital == 6] #records for tuning hyperparameter - dataframe

In [None]:
# Cross-validated search restricted to the hospital-F records (unscaled features).
cv2 = RepeatedStratifiedKFold(random_state=42)
# Single-point grid: the search only cross-validates this one configuration.
param_grid = {
    'min_samples_split' : [40],
    'n_estimators' : [500], 
    'max_samples' : [ .6]
}
rf =  RandomForestClassifier(n_jobs=-1, class_weight='balanced_subsample',random_state=42,  bootstrap = True)
gridSearchCV = GridSearchCV(rf, param_grid,scoring= 'balanced_accuracy', n_jobs=-1, cv=cv2, verbose=10)
gridSearchCV.fit(F_records[selectedFeatures], F_records.Prognosis)
print('Tuned model on F records: \n')
print(gridSearchCV.best_params_)

Re-training of the tuned model over the entire dataset

In [None]:
# Re-fit the tuned estimator on 99% of the data (standardised); the remaining 1% is a
# small sanity-check split. `scaler` is kept for the SHAP plots and the final predictions.
trainedModel_F  = gridSearchCV.best_estimator_
X3_train, X3_test, Y3_train, Y3_test, scaler = train_test_split_standard_scaler(df[selectedFeatures], df.Prognosis, 0.99, 42)
trainedModel_F = trainedModel_F.fit(X3_train, Y3_train)

Y3_pred = trainedModel_F.predict(X3_test)
# With such a small test split one class may be missing; passing labels keeps the matrix
# 2x2 (so the unpacking below works) and aligned with display_labels.
cm = confusion_matrix(Y3_test, Y3_pred, labels=trainedModel_F.classes_)
tn, fp, fn, tp = cm.ravel()
sensitivity = tp/(tp+fn) #true positive rate -> to maximise
specificity = tn/(tn+fp) #true negative rate -> to maximise
balanced_accuracy = (sensitivity + specificity) / 2 #-> to maximise
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=trainedModel_F.classes_)
disp.plot()
print('Sensitivity : {}'.format(sensitivity))
print('Specificity : {}'.format(specificity))
print('Balanced accuracy : {}'.format(balanced_accuracy))


Feature importances

In [None]:
importances = trainedModel_F.feature_importances_
# Ascending order, so the most important feature ends up as the top bar.
sorted_indices = np.argsort(importances)

# Importances expressed as percentages, indexed by feature name.
feat_importances = pd.Series(importances[sorted_indices]*100, index=np.array(selectedFeatures)[sorted_indices])
plt.figure(figsize=(8, 6))
plt.xlabel('Feature importance [%]')
feat_importances.plot(kind='barh')

In [None]:
# SHAP explainer for the re-trained random forest (rebinds `explainer`).
explainer = shap.TreeExplainer(trainedModel_F)

In [None]:
# Undo the standardisation so the plots show feature values in their original units.
X3_train_df = scaler.inverse_transform(X3_train)
X3_train_df = pd.DataFrame(X3_train_df, columns=selectedFeatures)

In [None]:
shap.initjs()
# trainedModel_F was fitted on the standardised X3_train, so SHAP values are computed on
# the scaled matrix. The unscaled frame is only used for feature names and for the sign
# of the correlation, which the (positive, affine) scaling does not change.
shap_values = explainer.shap_values(X3_train)
ABS_SHAP(shap_values[1],X3_train_df) #for severe class

In [None]:
# Y3_train keeps the original (shuffled, filtered) row labels, so use positional access
# to get the label of the sample at position 20, i.e. the row X3_train_df.iloc[20].
print(Y3_train.iloc[20])

In [None]:
i=20
shap.initjs()
# The forest was fitted on the standardised X3_train, so the SHAP values are computed on
# the scaled sample; the inverse-transformed row only labels the plot with original units.
shap_value = explainer.shap_values(X3_train[i])
shap.force_plot(explainer.expected_value[1], shap_value[1], X3_train_df.iloc[i]) #for severe class considering only 1 sample

# Predictions on test set records

In [None]:
# Same shared-drive folder as for the training data.
base_path = "/content/drive/Shareddrives/IDA covidcxr-hackaton/"
# Test-set clinical data (with Brixia scores).
test_set = pd.read_excel(base_path+'testClinData_brixiascore.xls')

In [None]:
# Fresh copy of the raw training data; it is preprocessed together with the test set below.
df5 = pd.read_excel(base_path+'trainClinData_brixiascore.xls')

In [None]:
# Same columns as dropped from the training frame earlier, except ImageFile, which is
# kept here so predictions can be reported per image.
df5ColumnsToDrop = [
    'Unnamed: 0', 'Row_number', 'Death', 'Ox_percentage', 'Position', 'Fibrinogen',
    'Brixia_score_prob_0', 'Brixia_score_prob_1', 'Brixia_score_prob_2',
    'Brixia_score_prob_3', 'Brixia_score_prob_4', 'Brixia_score_prob_5',
]
df5 = df5.drop(columns=df5ColumnsToDrop)

In [None]:
# Drop the same columns from the test set (ImageFile is kept to label the predictions).
testColumnsToDrop = [
    'Unnamed: 0', 'Row_number', 'Death', 'Ox_percentage', 'Position', 'Fibrinogen',
    'Brixia_score_prob_0', 'Brixia_score_prob_1', 'Brixia_score_prob_2',
    'Brixia_score_prob_3', 'Brixia_score_prob_4', 'Brixia_score_prob_5',
]
test_set = test_set.drop(columns=testColumnsToDrop)

In [None]:
# Collapse the six per-zone Brixia scores of df5 into a single global score.
df5ZoneColumns = ['Brixia_score_{}'.format(zone) for zone in range(6)]
sumBsScore = df5[df5ZoneColumns[0]]
for zoneColumn in df5ZoneColumns[1:]:
    sumBsScore = sumBsScore + df5[zoneColumn]
df5['Brixia_score'] = sumBsScore
df5 = df5.drop(columns=df5ZoneColumns)

In [None]:
# Collapse the six per-zone Brixia scores of the test set into a single global score.
testZoneColumns = ['Brixia_score_{}'.format(zone) for zone in range(6)]
sumBsScore = test_set[testZoneColumns[0]]
for zoneColumn in testZoneColumns[1:]:
    sumBsScore = sumBsScore + test_set[zoneColumn]
test_set['Brixia_score'] = sumBsScore
test_set = test_set.drop(columns=testZoneColumns)

In [None]:
# Same column groups as in the training section (re-declared with identical values).
paramsFromBsNet =  ['Brixia_score_correctness', 'Brixia_score_confidence', 'Brixia_score']
classParam = ['Prognosis']

In [None]:
# Map the hospital letters A..F to the integers 1..6.
df5HospitalCodes = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6}
df5.Hospital = df5.Hospital.replace(df5HospitalCodes)

In [None]:
# Map the hospital letters A..F to the integers 1..6.
testHospitalCodes = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6}
test_set.Hospital = test_set.Hospital.replace(testHospitalCodes)

In [None]:
# Sanity check before concatenating: both frames have the same number of columns
# (only the count is compared, not the column names).
test_set.shape[1] == df5.shape[1]

In [None]:
# Training rows first, test rows last. The original row labels are kept, so the index of
# df2 may contain duplicate labels; later cells rely on row position only.
df2 = pd.concat([df5, test_set])

In [None]:
# Display the column names of the combined frame.
df2.columns.values

In [None]:
# The imputer is fitted on training and test rows together.
imputer = KNNImputer(n_neighbors=20, weights="distance")
# Columns that are NOT KNN-imputed (ImageFile is kept only to label the predictions).
excludedColumns = [
    'ImageFile', 'CardiovascularDisease', 'IschemicHeartDisease', 'AtrialFibrillation',
    'HeartFailure', 'Ictus', 'HighBloodPressure', 'Diabetes', 'Dementia', 'BPCO', 'Cancer',
    'ChronicKidneyDisease', 'RespiratoryFailure', 'Obesity',
]
# Boolean mask over the columns: everything except the excluded, BSNet and label columns.
combinedColumnsToImpute = ~df2.columns.isin(excludedColumns+paramsFromBsNet+classParam)
df2.loc[:, combinedColumnsToImpute] = imputer.fit_transform(df2.loc[:, combinedColumnsToImpute])

In [None]:
# KNN imputation can produce fractional values; round them so the columns hold whole numbers again.
for boolColumn in ['PositivityAtAdmission', 'Cough', 'DifficultyInBreathing']:
    df2[boolColumn] = df2[boolColumn].round()

In [None]:
# Missing values in the excluded columns are replaced by 0.5.
# NOTE(review): excludedColumns now includes ImageFile, so a missing file name would also become 0.5 — confirm none are missing.
df2[excludedColumns] = df2[excludedColumns].replace(np.nan, .5)

In [None]:
# The test rows were appended last in df2; take exactly as many rows as the test set has
# instead of relying on a hard-coded row count.
test_set2 = df2.tail(len(test_set))

In [None]:
# Keep the model's features (in training order) plus ImageFile as the last column.
test_set2 = test_set2[selectedFeatures+['ImageFile']]

In [None]:
import warnings
# Silences every warning from here on (including those raised by the prediction cell below).
warnings.filterwarnings("ignore")

In [None]:
# Scale and classify all test records in one batch instead of calling predict once per
# row (each call on an n_jobs=-1 forest has a large fixed overhead).
# Dropping ImageFile leaves the features in the same order the scaler/model were fitted on.
testFeatures = test_set2.drop(columns=['ImageFile']).astype(float)
predictions = trainedModel_F.predict(scaler.transform(testFeatures.to_numpy()))
# One "<image file>, <predicted prognosis>" line per test record.
for imagefile, prediction in zip(test_set2.ImageFile, predictions):
    print(imagefile+', '+prediction)
