In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb

In [2]:
brset_labels = pd.read_csv('labels.csv') # From Original BRSET Dataset
brset_embed = pd.read_csv('embeddings.csv') # From Embeddings archive
brset_labels['split'] = brset_embed['split']
brset_labels['DR_2'] = brset_embed['DR_2']

### Preprocessing

Taken from `brset_preprocessing` from `src/get_data.py`:

```python
    df['text'] = df.apply(lambda row: (
        f"An image from the {convert_eye(row['exam_eye'])} eye of a {convert_sex(row['patient_sex'])} patient, "
        f"aged {'no age reported' if pd.isnull(row['patient_age']) else str(float(str(row['patient_age']).replace('O', '0').replace(',', '.')))} years, "
        f"{'with no comorbidities reported' if pd.isnull(row['comorbidities']) else 'with comorbidities: ' + row['comorbidities']}, "
        f"{'with no diabetes duration reported' if pd.isnull(row['diabetes_time_y']) or row['diabetes_time_y'] == 'Não' else 'diabetes diagnosed for ' + str(float(str(row['diabetes_time_y']).replace('O', '0').replace(',', '.'))) + ' years'}, "
        f"{'not using insulin' if row['insuline'] == 'no' else 'using insulin'}. "
        f"The optic disc is {convert_presence(row['optic_disc'])}, vessels are {convert_presence(row['vessels'])}, "
        f"and the macula is {convert_presence(row['macula'])}. "
        f"Conditions include macular edema: {convert_presence(row['macular_edema'])}, scar: {convert_presence(row['scar'])}, "
        f"nevus: {convert_presence(row['nevus'])}, amd: {convert_presence(row['amd'])}, vascular occlusion: {convert_presence(row['vascular_occlusion'])}, "
        f"drusens: {convert_presence(row['drusens'])}, hemorrhage: {convert_presence(row['hemorrhage'])}, "
        f"retinal detachment: {convert_presence(row['retinal_detachment'])}, myopic fundus: {convert_presence(row['myopic_fundus'])}, "
        f"increased cup disc ratio: {convert_presence(row['increased_cup_disc'])}, and other conditions: {convert_presence(row['other'])}."
    ), axis=1)
```

In [3]:
brset_labels.loc[brset_labels['diabetes_time_y']=='Não', 'diabetes_time_y'] = np.NaN
brset_labels['diabetes_time_y'] = brset_labels['diabetes_time_y'].str.replace('O', '0').str.replace(',','.').astype(float)

In [4]:
brset_labels['optic_disc'] = (brset_labels['optic_disc'] == "1").astype(int)


In [5]:
brset_labels['insuline'] = brset_labels['insuline'].map({'no': 0, 'yes': 1})


In [6]:
exam_columns = ['optic_disc', 'macula', 'macular_edema', 'scar', 'nevus', 'amd', 'vascular_occlusion', 'drusens', 'hemorrhage', 
                'retinal_detachment', 'myopic_fundus', 'increased_cup_disc', 'other']
def convert_presence(df, columns):
    new_df = df.copy()
    for column in columns:
        new_df[column] = (df[column] == 1).astype(int)
    return df

brset_labels = convert_presence(brset_labels, exam_columns)

In [7]:
# Convert brset_labels to have similar columns as would have been available in the text note.
note_columns = ['exam_eye', 'patient_sex', 'patient_age', 'diabetes_time_y', 'insuline'] + exam_columns
comorbidities = brset_labels['comorbidities'].str.get_dummies(sep=', ')
X = pd.concat([brset_labels[note_columns], comorbidities], axis=1)
y = brset_labels['DR_2']

In [8]:
X_train = X[brset_labels['split'] == 'train']
X_test = X[brset_labels['split'] == 'test']
y_train = y[brset_labels['split'] == 'train']
y_test = y[brset_labels['split'] == 'test']

### All Columns available in the simulated note

In [9]:
# Use cross validation to determine how long to train xgb model
f1_eval = lambda y_pred, dtrain: ('f1', f1_score(dtrain.get_label(), y_pred > 0.5))

xgb_params = {'objective': 'binary:logistic', 'nthread': 8, 'seed': 42}
cv_results = xgb.cv(params=xgb_params, 
                    dtrain=xgb.DMatrix(X_train, y_train),
                    num_boost_round=200,
                    nfold=10,
                    custom_metric=f1_eval,
                    maximize=True,
                    early_stopping_rounds=10,
                    seed=42)

best_round = np.argmax(cv_results['test-f1-mean'])

xgb_model = xgb.train(xgb_params, xgb.DMatrix(X_train, y_train), num_boost_round=best_round)

In [10]:
# Calculate auc, f1, and accuracy
y_pred = xgb_model.predict(xgb.DMatrix(X_test))
print(f'AUC: {roc_auc_score(y_test, y_pred)}')
print(f'F1: {f1_score(y_test, y_pred > 0.5)}')
print(f'Accuracy: {accuracy_score(y_test, y_pred > 0.5)}')

AUC: 0.9831467761453916
F1: 0.8712871287128713
Accuracy: 0.984019668100799


### Only columns for opthalmologist exam

In [11]:
cv_results = xgb.cv(params=xgb_params, 
                    dtrain=xgb.DMatrix(X_train[exam_columns], y_train),
                    num_boost_round=200,
                    nfold=10,
                    custom_metric=f1_eval,
                    maximize=True,
                    early_stopping_rounds=10,
                    seed=42)

best_round = np.argmax(cv_results['test-f1-mean'])

xgb_model = xgb.train(xgb_params, xgb.DMatrix(X_train[exam_columns], y_train), num_boost_round=best_round)

In [12]:
y_pred = xgb_model.predict(xgb.DMatrix(X_test[exam_columns]))
print(f'AUC: {roc_auc_score(y_test, y_pred)}')
print(f'F1: {f1_score(y_test, y_pred > 0.5)}')
print(f'Accuracy: {accuracy_score(y_test, y_pred > 0.5)}')

AUC: 0.9217947180460788
F1: 0.8585858585858586
Accuracy: 0.9827904118008605


### Only Patient History columns

In [13]:
history_columns = list(set(X.columns) - set(exam_columns))
cv_results = xgb.cv(params=xgb_params, 
                    dtrain=xgb.DMatrix(X_train[history_columns], y_train),
                    num_boost_round=200,
                    nfold=10,
                    custom_metric=f1_eval,
                    maximize=True,
                    early_stopping_rounds=10,
                    seed=42)

best_round = np.argmax(cv_results['test-f1-mean'])

xgb_model = xgb.train(xgb_params, xgb.DMatrix(X_train[history_columns], y_train), num_boost_round=best_round)

In [14]:
y_pred = xgb_model.predict(xgb.DMatrix(X_test[history_columns]))

print(f'AUC: {roc_auc_score(y_test, y_pred)}')
print(f'F1: {f1_score(y_test, y_pred > 0.5)}')
print(f'Accuracy: {accuracy_score(y_test, y_pred > 0.5)}')


AUC: 0.8812276797186612
F1: 0.5157232704402516
Accuracy: 0.9526736324523664
