In [28]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.metrics import multilabel_confusion_matrix
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from tqdm import tqdm, trange
from transformers import AdamW
from torch.nn import BCEWithLogitsLoss, BCELoss
from sklearn.metrics import classification_report, confusion_matrix, multilabel_confusion_matrix, f1_score, accuracy_score

# Data

## Reading the data

In [2]:
df = pd.read_csv('data/OpenI/OpenI_cheXpertLabels.csv')
print("The total No. of rows:", len(df))
df.head()

The total No. of rows: 2452


Unnamed: 0,fileNo,COMPARISON,INDICATION,FINDINGS,IMPRESSION,expert_labels,No Finding,Cardiomegaly,Lung Opacity,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Fracture,SupportDevices
0,881,None available.,XXXX-year-old XXXX with dyspnea.,The lungs are without focal air space opacity....,No acute cardiopulmonary abnormality.,['No Finding'],1,0,0,0,0,0,0,0,0,0,0
1,1734,,Back pain,Heart size and mediastinal contour are normal....,No acute cardiopulmonary process.,['No Finding'],1,0,0,0,0,0,0,0,0,0,0
2,306,,,The lungs are clear. Heart size is normal. No ...,Clear lungs. No acute cardiopulmonary abnormal...,['No Finding'],1,0,0,0,0,0,0,0,0,0,0
3,1951,,,Cardiomediastinal silhouette is normal. Pulmon...,No acute cardiopulmonary disease.,['No Finding'],1,0,0,0,0,0,0,0,0,0,0
4,1005,,Pruritic.,Cardiac and mediastinal contours are within no...,No acute findings.,['No Finding'],1,0,0,0,0,0,0,0,0,0,0


In [3]:
cols = df.columns
labels = list(cols[6:])
print('Count of 1 per label: \n', df[labels].sum(), '\n') # Label counts, may need to downsample or upsample

Count of 1 per label: 
 No Finding          1391
Cardiomegaly         375
Lung Opacity         406
Edema                 46
Consolidation         30
Pneumonia             42
Atelectasis          332
Pneumothorax          23
Pleural Effusion     161
Fracture              84
SupportDevices       106
dtype: int64 



## preprocessing the data

1. removing the rows without any Impression and Finding

In [4]:
print('No. of rows with Finding:', len(df[df['FINDINGS'].notnull()]))
print('No. of rows with Impression:', len(df[df['IMPRESSION'].notnull()]))
print('No. of rows with Impression or Finding:', 
      len(df[df['IMPRESSION'].notnull() | df['FINDINGS'].notnull()]))
print('No. of rows without Impression and Finding:', 
      len(df[df['IMPRESSION'].isna() & df['FINDINGS'].isna()]))

No. of rows with Finding: 2106
No. of rows with Impression: 2420
No. of rows with Impression or Finding: 2424
No. of rows without Impression and Finding: 28


In [5]:
idx = df[df['IMPRESSION'].isna() & df['FINDINGS'].isna()].index
df = df.drop(idx)
print('No. of rows without Impression and Finding:', 
      len(df[df['IMPRESSION'].isna() & df['FINDINGS'].isna()]))

No. of rows without Impression and Finding: 0


2. Converting the labels to a single list

In [31]:
labels = ['No Finding', 'Cardiomegaly','Lung Opacity','Edema','Consolidation','Pneumonia','Atelectasis','Pneumothorax','Pleural Effusion','Fracture','SupportDevices']

df_cls = pd.DataFrame(columns = ['text', 'labels'])

def concat_cols(impression, findings):
    if impression is np.nan:
        return findings
    elif findings is np.nan:
        return impression
    else:
        return findings+impression

# create the text column:
df_cls['text'] = df.apply(lambda row: concat_cols(row['IMPRESSION'], row['FINDINGS']), axis=1)
df_cls['labels'] = df.apply(lambda row: row[labels].values, axis=1) #.to_list()

df_cls.head(10)

Unnamed: 0,text,labels
0,The lungs are without focal air space opacity....,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
1,Heart size and mediastinal contour are normal....,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
2,The lungs are clear. Heart size is normal. No ...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
3,Cardiomediastinal silhouette is normal. Pulmon...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,Cardiac and mediastinal contours are within no...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
5,Mediastinal contours are normal. Lungs are cle...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
6,Heart size and mediastinal contours are unrema...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
7,"The heart is again enlarged, stable. The left ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
8,Normal heart size. No focal air space consolid...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
9,Heart size within normal limits. No focal airs...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


3. Removing the duplicate reports

In [7]:
print('Unique texts: ', df_cls.text.nunique() == df_cls.shape[0])

Unique texts:  False


In [8]:
print("Length of whole dataframe:", len(df_cls))
print("No. of unique reports:", df_cls.text.nunique())

Length of whole dataframe: 2424
No. of unique reports: 1748


In [9]:
#let's take a look at what are the duplicated texts looks like:
df_cls.text.value_counts()

The heart is normal in size. The mediastinum is unremarkable. The lungs are clear.No acute disease.                                                                                                                                                                                                                                                                                 49
Heart size normal. Lungs are clear. XXXX are normal. No pneumonia, effusions, edema, pneumothorax, adenopathy, nodules or masses.Normal chest                                                                                                                                                                                                                                       35
The lungs are clear bilaterally. Specifically, no evidence of focal consolidation, pneumothorax, or pleural effusion.. Cardio mediastinal silhouette is unremarkable. Visualized osseous structures of the thorax are without acute abnormality.No acute c

In [10]:
# remove the duplicates
df_cls.drop_duplicates('text', inplace=True)

In [11]:
print('Unique texts: ', df_cls.text.nunique() == df_cls.shape[0])

Unique texts:  True


## Splitting the data

In [12]:
# Using stratify
# trainX, valX, trainY, valY = train_test_split(df_cls['text'].values.tolist(), 
#                                               df_cls['labels'].values.tolist(), 
#                                               test_size=0.2,
#                                               stratify = df_cls['labels'].values.tolist())

# ValueError: The least populated class in y has only 1 member, which is too few. 
# The minimum number of groups for any class cannot be less than 2.

# This because of the nature of stratification. The stratify parameter set it 
# to split data in a way to allocate test_size amount of data to each class. 
# In this case, you don't have sufficient class labels of one of your classes 
# to keep the data splitting ratio equal to test_size

train_val_df, test_df = train_test_split(df_cls, test_size=0.2)
train_df, val_df = train_test_split(train_val_df, test_size=0.2)

print('Train: ', len(train_df))
print('Test: ', len(test_df))
print('Val: ', len(val_df))


Train:  1118
Test:  350
Val:  280


# simpletransformers implementation

* MultiLabelClassificationModel has an additional `threshold` parameter with default value 0.5
* MultiLabelClassificationModel takes in an additional optional argument pos_weight. This should be a list with the same length as the number of labels. This enables using different weights for each label when calculating loss during training and evaluation.

In [13]:
from simpletransformers.classification import MultiLabelClassificationModel
import pandas as pd
import logging


logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# Create a MultiLabelClassificationModel
model = MultiLabelClassificationModel('roberta', 'roberta-base', num_labels=len(labels), args={'reprocess_input_data': True, 'overwrite_output_dir': True, 'num_train_epochs': 5})
# You can set class weights by using the optional weight argument
print(train_df.head())

# Train the model
model.train_model(train_df)

# Evaluate the model
result, model_outputs, wrong_predictions = model.eval_model(test_df)
# print(result)
# print(model_outputs)

predictions, raw_outputs = model.predict(test_df['text'])
# print(predictions)
# print(raw_outputs)


- This IS expected if you are initializing RobertaForMultiLabelSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing RobertaForMultiLabelSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


                                                   text  \
1328  Normal heart size and mediastinal contours. Lo...   
911            Heart size is normal the lungs are clear   
1280  Borderline enlarged heart size. Atherosclerosi...   
1525  Single view of chest was obtained in AP projec...   
281   The cardiomediastinal silhouette is within nor...   

                                 labels  
1328  [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]  
911   [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]  
1280  [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]  
1525  [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]  
281   [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]  


KeyboardInterrupt: 

In [None]:
y_pred = predictions
y_true = test_df['labels'].to_list()

In [None]:
cms = multilabel_confusion_matrix(y_true, y_pred)
def plot_hm(label, conMat):

    df_cm = pd.DataFrame(conMat, range(2), range(2))
    # plt.figure(figsize=(10,7))
    sn.set(font_scale=1.2) # for label size
    sn.heatmap(df_cm, annot=True, annot_kws={"size": 12}, cmap="YlGnBu", fmt="d") # font size
    plt.show()

for cm,label in zip(cms, labels):
    print(label)
#     print(cm)
    plot_hm(label, cm)

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_true, y_pred))

# Hugging Face implementation

part of the code is from : 

post: https://towardsdatascience.com/transformers-for-multilabel-classification-71a1a0daf5e1

code: https://colab.research.google.com/github/rap12391/transformers_multilabel_toxic/blob/master/toxic_multilabel.ipynb#scrollTo=uorMX_zrnISM

In [14]:
df_cls.head()

Unnamed: 0,text,labels
0,The lungs are without focal air space opacity....,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
1,Heart size and mediastinal contour are normal....,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
2,The lungs are clear. Heart size is normal. No ...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
3,Cardiomediastinal silhouette is normal. Pulmon...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,Cardiac and mediastinal contours are within no...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


In [15]:
print('average report length: ', round(df_cls.text.str.split().str.len().mean(),2))
print('stdev report length: ', round(df_cls.text.str.split().str.len().std(),2))

average report length:  40.89
stdev report length:  22.79


In [16]:
cols = df.columns
label_cols = list(cols[6:])
num_labels = len(label_cols)
print('Label columns: ', label_cols)

Label columns:  ['No Finding', 'Cardiomegaly', 'Lung Opacity', 'Edema', 'Consolidation', 'Pneumonia', 'Atelectasis', 'Pneumothorax', 'Pleural Effusion', 'Fracture', 'SupportDevices']


In [17]:
label_counts = np.array(df_cls.labels.to_list()).sum(axis=0)
for col, cnt in zip(label_cols, label_counts):
    print(f'{col}: {cnt}')

No Finding: 689
Cardiomegaly: 375
Lung Opacity: 406
Edema: 46
Consolidation: 30
Pneumonia: 42
Atelectasis: 332
Pneumothorax: 23
Pleural Effusion: 160
Fracture: 84
SupportDevices: 105


In [18]:
print('Train: ', len(train_df))
print('Test: ', len(test_df))
print('Val: ', len(val_df))

Train:  1118
Test:  350
Val:  280


## tokenization

* The input_ids are the indices corresponding to each token in our sentence.
* We can now see what the attention_mask is all about: it points out which tokens the model should pay attention to and which ones it should not (because they represent padding in this case).
* token_type_ids are for: they indicate to the model which part of the inputs correspond to the first sentence and which part corresponds to the second sentence. Note that token_type_ids are not required or handled by all models.

In [19]:
max_length = 128
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

reports_train = train_df.text.to_list()
reports_test = test_df.text.to_list()
reports_val   = val_df.text.to_list()

train = tokenizer(reports_train, padding='max_length', truncation=True, max_length=max_length, return_tensors="pt")
test = tokenizer(reports_test, padding='max_length', truncation=True, max_length=max_length, return_tensors="pt")
val = tokenizer(reports_val, padding='max_length', truncation=True, max_length=max_length, return_tensors="pt")
# labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1

train_labels = torch.from_numpy(np.array(train_df.labels.to_list()))
test_labels = torch.from_numpy(np.array(test_df.labels.to_list()))
val_labels = torch.from_numpy(np.array(val_df.labels.to_list()))

In [20]:
train.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [32]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
batch_size = 16
use_data_loader = True
if use_data_loader: # if the dataset is huge in size
    # Create an iterator of our data with torch DataLoader. This helps save on memory during training because, 
    # unlike a for loop, with an iterator the entire dataset does not need to be loaded into memory
    train_data = TensorDataset(train.input_ids, train.attention_mask, train_labels, train.token_type_ids)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

    validation_data = TensorDataset(val.input_ids, val.attention_mask, val_labels, val.token_type_ids)
    validation_sampler = SequentialSampler(validation_data)
    validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)
    
    test_data = TensorDataset(test.input_ids, test.attention_mask, test_labels, test.token_type_ids)
    test_sampler = SequentialSampler(test_data)
    test_dataloader = DataLoader(test_data, sampler=validation_sampler, batch_size=batch_size)
    
else: #if the dataset is small in size
    pass

In [22]:
# Load model, the pretrained model will include a single linear classification layer on top for classification. 
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)
model.cuda()

- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [23]:
# setting custom optimization parameters. You may implement a scheduler here as well.
# param_optimizer = list(model.named_parameters())
# no_decay = ['bias', 'gamma', 'beta']
# optimizer_grouped_parameters = [
#     {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
#      'weight_decay_rate': 0.01},
#     {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
#      'weight_decay_rate': 0.0}
# ]
# optimizer = AdamW(optimizer_grouped_parameters,lr=2e-5,correct_bias=True)
optimizer = AdamW(model.parameters(),lr=2e-5)  # Default optimization

## Train model

In [24]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)

'GeForce RTX 2070 SUPER'

In [25]:
n_gpu

2

In [None]:
torch.cuda.empty_cache()

In [None]:
torch.cuda.ipc_collect()

In [29]:
# Store our loss and accuracy for plotting
train_loss_set = []

# Number of training epochs (authors recommend between 2 and 4)
epochs = 3

# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):

  # Training
  
  # Set our model to training mode (as opposed to evaluation mode)
  model.train()

  # Tracking variables
  tr_loss = 0 #running loss
  nb_tr_examples, nb_tr_steps = 0, 0
  
  # Train the data for one epoch
  for step, batch in enumerate(train_dataloader):
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels, b_token_types = batch
    # Clear out the gradients (by default they accumulate)
    optimizer.zero_grad()

    # # Forward pass for multiclass classification
    # outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
    # loss = outputs[0]
    # logits = outputs[1]

    # Forward pass for multilabel classification
    outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    logits = outputs[0]
    loss_func = BCEWithLogitsLoss() 
    loss = loss_func(logits.view(-1,num_labels),b_labels.type_as(logits).view(-1,num_labels)) #convert labels to float for calculation
    # loss_func = BCELoss() 
    # loss = loss_func(torch.sigmoid(logits.view(-1,num_labels)),b_labels.type_as(logits).view(-1,num_labels)) #convert labels to float for calculation
    train_loss_set.append(loss.item())    

    # Backward pass
    loss.backward()
    # Update parameters and take a step using the computed gradient
    optimizer.step()
    # scheduler.step()
    # Update tracking variables
    tr_loss += loss.item()
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1

  print("Train loss: {}".format(tr_loss/nb_tr_steps))

###############################################################################

  # Validation

  # Put model in evaluation mode to evaluate loss on the validation set
  model.eval()

  # Variables to gather full output
  logit_preds,true_labels,pred_labels,tokenized_texts = [],[],[],[]

  # Predict
  for i, batch in enumerate(validation_dataloader):
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels, b_token_types = batch
    with torch.no_grad():
      # Forward pass
      outs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
      b_logit_pred = outs[0]
      pred_label = torch.sigmoid(b_logit_pred)

      b_logit_pred = b_logit_pred.detach().cpu().numpy()
      pred_label = pred_label.to('cpu').numpy()
      b_labels = b_labels.to('cpu').numpy()

    tokenized_texts.append(b_input_ids)
    logit_preds.append(b_logit_pred)
    true_labels.append(b_labels)
    pred_labels.append(pred_label)

  # Flatten outputs
  pred_labels = [item for sublist in pred_labels for item in sublist]
  true_labels = [item for sublist in true_labels for item in sublist]

  # Calculate Accuracy
  threshold = 0.50
  pred_bools = [pl>threshold for pl in pred_labels]
  true_bools = [tl==1 for tl in true_labels]
  val_f1_accuracy = f1_score(true_bools,pred_bools,average='micro')*100
  val_flat_accuracy = accuracy_score(true_bools, pred_bools)*100

  print('F1 Validation Accuracy: ', val_f1_accuracy)
  print('Flat Validation Accuracy: ', val_flat_accuracy)


Epoch:   0%|          | 0/3 [00:00<?, ?it/s][A

Train loss: 0.12058082616754941



Epoch:  33%|███▎      | 1/3 [00:14<00:29, 14.93s/it][A

F1 Validation Accuracy:  88.30409356725146
Flat Validation Accuracy:  76.42857142857142
Train loss: 0.09757612320993628



Epoch:  67%|██████▋   | 2/3 [00:29<00:14, 14.92s/it][A

F1 Validation Accuracy:  89.42857142857143
Flat Validation Accuracy:  80.0
Train loss: 0.08204593727631228



Epoch: 100%|██████████| 3/3 [00:44<00:00, 14.91s/it][A

F1 Validation Accuracy:  91.03840682788051
Flat Validation Accuracy:  82.14285714285714





## Test

In [34]:
# Put model in evaluation mode to evaluate loss on the validation set
model.eval()

#track variables
logit_preds,true_labels,pred_labels,tokenized_texts = [],[],[],[]

# Predict
for i, batch in enumerate(test_dataloader):
  batch = tuple(t.to(device) for t in batch)
  # Unpack the inputs from our dataloader
  b_input_ids, b_input_mask, b_labels, b_token_types = batch
  with torch.no_grad():
    # Forward pass
    outs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    b_logit_pred = outs[0]
    pred_label = torch.sigmoid(b_logit_pred)

    b_logit_pred = b_logit_pred.detach().cpu().numpy()
    pred_label = pred_label.to('cpu').numpy()
    b_labels = b_labels.to('cpu').numpy()

  tokenized_texts.append(b_input_ids)
  logit_preds.append(b_logit_pred)
  true_labels.append(b_labels)
  pred_labels.append(pred_label)

# Flatten outputs
tokenized_texts = [item for sublist in tokenized_texts for item in sublist]
pred_labels = [item for sublist in pred_labels for item in sublist]
true_labels = [item for sublist in true_labels for item in sublist]
# Converting flattened binary values to boolean values
true_bools = [tl==1 for tl in true_labels]

We need to threshold our sigmoid function outputs which range from [0, 1]. Below I use 0.50 as a threshold.

In [40]:
test_label_cols

Index(['fileNo', 'COMPARISON', 'INDICATION', 'FINDINGS', 'IMPRESSION',
       'expert_labels', 'No Finding', 'Cardiomegaly', 'Lung Opacity', 'Edema',
       'Consolidation', 'Pneumonia', 'Atelectasis', 'Pneumothorax',
       'Pleural Effusion', 'Fracture', 'SupportDevices'],
      dtype='object')

In [41]:
test_label_cols = label_cols
pred_bools = [pl>0.50 for pl in pred_labels] #boolean output after thresholding

# Print and save classification report
print('Test F1 Accuracy: ', f1_score(true_bools, pred_bools,average='micro'))
print('Test Flat Accuracy: ', accuracy_score(true_bools, pred_bools),'\n')
clf_report = classification_report(true_bools,pred_bools,target_names=test_label_cols)
# pickle.dump(clf_report, open('classification_report.txt','wb')) #save report
print(clf_report)

Test F1 Accuracy:  0.9215116279069766
Test Flat Accuracy:  0.8571428571428571 

                  precision    recall  f1-score   support

      No Finding       0.98      0.99      0.99       107
    Cardiomegaly       0.95      0.95      0.95        57
    Lung Opacity       0.96      0.98      0.97        54
           Edema       0.00      0.00      0.00         5
   Consolidation       0.00      0.00      0.00         5
       Pneumonia       0.00      0.00      0.00         4
     Atelectasis       0.94      0.88      0.91        58
    Pneumothorax       0.00      0.00      0.00         5
Pleural Effusion       0.89      0.86      0.87        28
        Fracture       1.00      0.67      0.80        15
  SupportDevices       0.95      1.00      0.97        19

       micro avg       0.96      0.89      0.92       357
       macro avg       0.61      0.57      0.59       357
    weighted avg       0.91      0.89      0.90       357
     samples avg       0.93      0.91      0.92 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Optimizing threshold value for micro F1 score

In [43]:
# Calculate Accuracy - maximize F1 accuracy by tuning threshold values. First with 'macro_thresholds' on the order of e^-1 then with 'micro_thresholds' on the order of e^-2

macro_thresholds = np.array(range(1,10))/10

f1_results, flat_acc_results = [], []
for th in macro_thresholds:
  pred_bools = [pl>th for pl in pred_labels]
  test_f1_accuracy = f1_score(true_bools,pred_bools,average='micro')
  test_flat_accuracy = accuracy_score(true_bools, pred_bools)
  f1_results.append(test_f1_accuracy)
  flat_acc_results.append(test_flat_accuracy)

best_macro_th = macro_thresholds[np.argmax(f1_results)] #best macro threshold value

micro_thresholds = (np.array(range(10))/100)+best_macro_th #calculating micro threshold values

f1_results, flat_acc_results = [], []
for th in micro_thresholds:
  pred_bools = [pl>th for pl in pred_labels]
  test_f1_accuracy = f1_score(true_bools,pred_bools,average='micro')
  test_flat_accuracy = accuracy_score(true_bools, pred_bools)
  f1_results.append(test_f1_accuracy)
  flat_acc_results.append(test_flat_accuracy)

best_f1_idx = np.argmax(f1_results) #best threshold value

# Printing and saving classification report
print('Best Threshold: ', micro_thresholds[best_f1_idx])
print('Test F1 Accuracy: ', f1_results[best_f1_idx])
print('Test Flat Accuracy: ', flat_acc_results[best_f1_idx], '\n')

best_pred_bools = [pl>micro_thresholds[best_f1_idx] for pl in pred_labels]
clf_report_optimized = classification_report(true_bools,best_pred_bools, target_names=label_cols)
# pickle.dump(clf_report_optimized, open('classification_report_optimized.txt','wb'))
print(clf_report_optimized)

Best Threshold:  0.4
Test F1 Accuracy:  0.9285714285714286
Test Flat Accuracy:  0.8785714285714286 

                  precision    recall  f1-score   support

      No Finding       0.98      0.99      0.99       107
    Cardiomegaly       0.95      0.96      0.96        57
    Lung Opacity       0.96      0.98      0.97        54
           Edema       1.00      0.20      0.33         5
   Consolidation       0.00      0.00      0.00         5
       Pneumonia       1.00      0.75      0.86         4
     Atelectasis       0.95      0.91      0.93        58
    Pneumothorax       0.00      0.00      0.00         5
Pleural Effusion       0.83      0.86      0.84        28
        Fracture       0.85      0.73      0.79        15
  SupportDevices       0.95      1.00      0.97        19

       micro avg       0.95      0.91      0.93       357
       macro avg       0.77      0.67      0.69       357
    weighted avg       0.92      0.91      0.91       357
     samples avg       0.94

In [None]:
# # From Transformers: https://huggingface.co/transformers/_modules/transformers/modeling_bert.html#BertForSequenceClassification
# from torch.nn import BCEWithLogitsLoss
# class BertForMultiLabelClassification(BertPreTrainedModel):
#     def __init__(self, config):
#         super().__init__(config)
#         self.num_labels = config.num_labels

#         self.bert = BertModel(config)
#         self.dropout = nn.Dropout(config.hidden_dropout_prob)
#         self.classifier = nn.Linear(config.hidden_size, config.num_labels)

#         self.init_weights()

# [docs]
#     @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
#     def forward(
#         self,
#         input_ids=None,
#         attention_mask=None,
#         token_type_ids=None,
#         position_ids=None,
#         head_mask=None,
#         inputs_embeds=None,
#         labels=None,
#     ):


#         outputs = self.bert(
#             input_ids,
#             attention_mask=attention_mask,
#             token_type_ids=token_type_ids,
#             position_ids=position_ids,
#             head_mask=head_mask,
#             inputs_embeds=inputs_embeds,
#         )

#         pooled_output = outputs[1]

#         pooled_output = self.dropout(pooled_output)
#         logits = self.classifier(pooled_output)

#         outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here

#         if labels is not None:
#             if self.num_labels == 1:
#                 #  We are doing regression
#                 loss_fct = MSELoss()
#                 loss = loss_fct(logits.view(-1), labels.view(-1))
#             else:
# #                 loss_fct = CrossEntropyLoss()
#                 loss_fct = BCEWithLogitsLoss()
#                 loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
#             outputs = (loss,) + outputs

#         return outputs  # (loss), logits, (hidden_states), (attentions)