In [1]:
!pip install transformers -q

In [2]:
from transformers import AutoTokenizer, AutoModel, AutoConfig
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
sentence = "no acute cardiopulmonary findings."
tokenise_text = tokenizer.tokenize(sentence)
tokenise_text

['no',
 'acute',
 'card',
 '##io',
 '##pu',
 '##lm',
 '##ona',
 '##ry',
 'findings',
 '.']

In [4]:
numerical_tokens = tokenizer.convert_tokens_to_ids(tokenise_text)
numerical_tokens

[1185, 12104, 3621, 2660, 16091, 13505, 7637, 1616, 9505, 119]

In [5]:
tokenizer.decode(numerical_tokens)


'no acute cardiopulmonary findings.'

In [6]:
#encoding
tokenizer.encode(sentence)

[101, 1185, 12104, 3621, 2660, 16091, 13505, 7637, 1616, 9505, 119, 102]

In [7]:
# tokenizer.get_vocab()

In [8]:
def create_semmentation_embedding(token_embeddings, sep_token_id):
  segmentation_embeddings = []
  sentence_length = 0

  for token in token_embeddings:
    segmentation_embeddings.append(sentence_length%2)
    if token == sep_token_id:
      sentence_length+=1
  return segmentation_embeddings


In [9]:
sentence2 = "tortuous aorta otherwise unremarkable examination."

In [10]:
encoded_text = tokenizer.encode(sentence, sentence2)
print(encoded_text)

[101, 1185, 12104, 3621, 2660, 16091, 13505, 7637, 1616, 9505, 119, 102, 1106, 3740, 8163, 170, 12148, 1161, 4303, 8362, 16996, 23822, 1895, 8179, 119, 102]


In [11]:
#create segmentation embedding
create_semmentation_embedding(encoded_text, 102)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [12]:
sentence3 = "no pulmonary disease"

In [13]:
# tokenizer.encode([sentence, sentence2, sentence3])

In [14]:
# model?


Steps to perform:
1. Input document
2. Token Embeddings
3. Segment Embeddings
4. Position embeddings

pass that to transoforrmer encoder layer -> generates contextual embeddings->pass that to generative decoder layer for sentence summarization.




In [15]:
def encode_text(text, tokenizer, symbols, is_summary=False):
    if is_summary:
        #don't need [CLS] and [SEP] token while inferencing.
        encoded = [tokenizer.encode(s)[1:-1] for s in text]
    else:
        encoded = [tokenizer.encode(s) for s in text]
        
    flattened = [item for sublist in encoded for item in sublist]
    
    if is_summary:
        return [symbols['BOS']] + flattened + [symbols['EOS']] 
    
    return flattened

In [16]:
def create_segmentation_embedding(token_embeddings, sep_token_id):
  segmentation_embeddings = []
  sentence_length = 0

  for token in token_embeddings:
    segmentation_embeddings.append(sentence_length%2)
    if token == sep_token_id:
      sentence_length+=1
  return segmentation_embeddings

In [17]:
def pad(encoded_text, seq_length, tokenizer, symbols, is_summary=False):
    if len(encoded_text) > seq_length:
        if is_summary:
            encoded_text = encoded_text[:seq_length]
        else:
            sent_sep_idxs = [idx for idx, t in enumerate(encoded_text) if t == tokenizer.sep_token_id and idx < seq_length]
            last_sent_sep_idx = min(max(sent_sep_idxs)+1 if (len(sent_sep_idxs) > 0) else seq_length, seq_length)
            encoded_text = encoded_text[:last_sent_sep_idx]
    
    if len(encoded_text) < seq_length:
        encoded_text.extend([tokenizer.pad_token_id] * (seq_length - len(encoded_text)))
    
    
    if is_summary:
        encoded_text += [tokenizer.pad_token_id]

    return encoded_text

In [18]:
def create_mask(text_tensor):
    """create attention mask"""
    mask = torch.zeros_like(text_tensor)
    mask[text_tensor != tokenizer.pad_token_id] = 1 
    
    return mask

In [19]:
def collate_function(data, tokenizer, symbols, block_size, training):
    encoded_stories = [encode_text(story, tokenizer, symbols) for _, story, summary in data]
    encoded_summaries = [encode_text(summary, tokenizer, symbols, True) for _, story, summary in data]
    story_segembs = [create_segmentation_embedding(s, tokenizer) for s in encoded_stories]
        
    padded_stories = torch.tensor([pad(s, block_size, tokenizer, symbols) for s in encoded_stories]).long()
    padded_summaries = torch.tensor([pad(s, block_size, tokenizer, symbols, True) for s in encoded_summaries]).long()
    padded_segembs = torch.tensor([pad(s, block_size, tokenizer, symbols) for s in story_segembs]).long()
    
    stories_mask = create_mask(padded_stories)
    summaries_mask = create_mask(padded_summaries)
    
    if training:
        return [padded_stories, padded_summaries, padded_segembs, stories_mask, summaries_mask], padded_summaries[:,1:]
    else:
        Batch = namedtuple("Batch", ["document_names", "batch_size", "src", "segs", "mask_src", "tgt_str"])
        names = [name for name, _, _ in data]
        summaries = [" ".join(summary_list) for _, _, summary_list in data]
        batch = Batch(
            document_names=names,
            batch_size=len(encoded_stories),
            src=padded_stories.to(args.device),
            segs=padded_segembs.to(args.device),
            mask_src=stories_mask.to(args.device),
            tgt_str=summaries,
        )
        
        return batch

In [20]:
# clone the repo
!git clone https://github.com/chandan5362/Abstractive-Text-Summarisation.git

fatal: destination path 'Abstractive-Text-Summarisation' already exists and is not an empty directory.


In [21]:
import torch
import pandas as pd
import torchvision as tv
from torch.utils.data import Dataset, DataLoader, SequentialSampler
from sklearn.model_selection import train_test_split

In [22]:
# laod the dataset
df = pd.read_csv("/content/Abstractive-Text-Summarisation/data/indiana/preprocessed-indiana-cxr-reports.csv")
df.head()

Unnamed: 0,COMPARISON,INDICATION,FINDINGS,IMPRESSION,indication_count,findings_count,impression_count
0,none.,no indication,heart size normal. lungs are clear. are normal...,normal chest,2,17,2
1,no comparison,slipped back on right side,the heart size and pulmonary vascularity appea...,no evidence of active disease.,5,36,5
2,no comparison,bone marrow transplant evaluation. aml.,the heart size and pulmonary vascularity appea...,no evidence of active disease.,5,38,5
3,none.,chest pain and .,the heart is normal in size and contour. the l...,no acute cardiopulmonary disease.,4,41,4
4,none.,mid to lower back pain since .,the heart is normal in size and contour. the l...,no acute cardiopulmonary disease.,7,22,4


In [24]:
import numpy as np

In [25]:
#setrting up hyperparameters
hparams = {
    "batch_size": 5,
    "encoder_max_seq_length" : 512,
    "decoder_max_seq_length" : 512,

}

In [26]:
class SummarisationDataset(Dataset):
    def __init__(self, path, subset=None):
        if path.endswith('.csv'):
            self.dataset = pd.read_csv(path)
        
        if subset:
            self.dataset = self.dataset.iloc[:subset]
        
    def __len__(self):
         return self.dataset.shape[0]
        
    def __getitem__(self, idx):
        title = ""
        article = self.dataset.iloc[idx]['FINDINGS']
        article = [s.strip()+'.' for s in article.split('.')]
        article = [self.add_missing_period(line) for line in article if len(line) > 0]
        article = [s for s in article if s not in ['..', '.']]
        
        summary = self.dataset.iloc[idx]['IMPRESSION']
        summary = [s.strip()+'.' for s in summary.split('.')]
        summary = [self.add_missing_period(line) for line in summary if len(line) > 0]
        summary = [s for s in summary if s not in ['..', '.']]
        unflat_story = [tokenizer.encode(s) for s in article]
        unflat_summary = [tokenizer.encode(s) for s in summary]
        flat_article = [item for sublist in unflat_story for item in sublist]
        flat_summary = [item for sublist in unflat_summary for item in sublist]
        padded_article = flat_article + [tokenizer.pad_token_id] * (hparams["encoder_max_seq_length"] - len(flat_article))
        padded_summary = flat_summary + [tokenizer.pad_token_id] * (hparams["decoder_max_seq_length"] - len(flat_summary))

        return np.asarray(padded_article), np.asarray(padded_summary)
    
    def add_missing_period(self, line):
        END_TOKENS = [".", "!", "?", "...", "'", "`", '"', u"\u2019", u"\u2019", ")"]
        if line.startswith("@highlight"):
            return line
        if line[-1] in END_TOKENS and len(line):
            return line
        return line + "."

In [27]:
data = SummarisationDataset("/content/Abstractive-Text-Summarisation/data/indiana/preprocessed-indiana-cxr-reports.csv", subset=1000)

train_ds, test_ds = train_test_split(data, test_size=0.2)
valid_ds, test_ds = train_test_split(test_ds, test_size=0.3)

In [28]:
# art, sum = next(iter(train_ds))
# sum.shape

In [29]:
# DataLoader
train_loader = DataLoader(train_ds, batch_size=hparams["batch_size"], shuffle = True)
val_loader = DataLoader(valid_ds, batch_size=4)
test_loader = DataLoader(test_ds, batch_size=4)

## Encoder Decoder Model

In [30]:
from transformers import EncoderDecoderModel

In [53]:
model = EncoderDecoderModel.from_encoder_decoder_pretrained("emilyalsentzer/Bio_ClinicalBERT", "emilyalsentzer/Bio_ClinicalBERT")
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.config.vocab_size = model.config.encoder.vocab_size

Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertLMHeadModel: ['cls.seq_relationsh

In [32]:
#import rouge score
!pip install rouge_score



In [33]:
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
scores = scorer.score('The quick brown fox jumps over the lazy dog',
                      'The quick brown dog jumps on the log.')

In [34]:
scores

{'rouge1': Score(precision=0.75, recall=0.6666666666666666, fmeasure=0.7058823529411765),
 'rougeL': Score(precision=0.625, recall=0.5555555555555556, fmeasure=0.5882352941176471)}

In [35]:
import time
import warnings
warnings.filterwarnings("ignore")

In [54]:

class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val   = 0
        self.avg   = 0
        self.sum   = 0
        self.count = 0

    def update(self, val, n=1):
        self.val   = val
        self.sum   += val * n
        self.count += n
        self.avg   = self.sum / self.count

In [55]:
# # set special tokens
# model.config.decoder_start_token_id = tokenizer.bos_token_id                                             
# roberta_shared.config.eos_token_id = tokenizer.eos_token_id
# roberta_shared.config.pad_token_id = tokenizer.pad_token_id

# sensible parameters for beam search
# set decoding params                               
model.config.max_length = 30
model.config.early_stopping = True
model.config.no_repeat_ngram_size = 1
model.config.length_penalty = 2.0
model.config.repetition_penalty = 3.0
model.config.num_beams = 10
# model.config.vocab_size = roberta_shared.config.encoder.vocab_size

In [56]:
for param in model.encoder.parameters():
    param.requires_grad = False

In [57]:
optimizer = torch.optim.Adam(params = model.parameters(), lr =2e-3)

In [None]:
for epoch in range(10):
  batch_time = AverageMeter()

  losses  = AverageMeter()
  end = time.time()
  for i, (article, summary) in enumerate(train_loader):

    op = model(input_ids = article, labels = summary)
    loss, out = op.loss, op.logits
    losses.update(loss.item(), out.size(0))

    
    optimizer.zero_grad()
    op.loss.backward()

    if i % 5 == 0:
      print('Epoch: [{0}][{1}/{2}]\t'
            'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
            'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
            .format(epoch+1, i, len(train_loader),  batch_time=batch_time,
             loss=losses))
      model.save_pretrained("bert2bert")
    optimizer.step()
    batch_time.update(time.time() - end)
    end = time.time()



In [41]:
model = EncoderDecoderModel.from_pretrained("bert2bert")

In [None]:
sentence = df["FINDINGS"].values[0]


In [None]:
findings, sum = next(iter(test_loader))

In [43]:
input_ids = tokenizer("heart size normal. lungs are clear. are normal. no pneumonia effusions edema pneumothorax adenopathy nodules or masses.", return_tensors="pt").input_ids

In [None]:
input_ids

In [None]:
model.generate(input_ids)

In [48]:
input_ids = tokenizer("heart size normal. lungs are clear. are normal. no pneumonia effusions edema pneumothorax adenopathy nodules or masses", return_tensors="pt").input_ids
labels = tokenizer("no pneumonia effusion", return_tensors="pt").input_ids
outputs = model(input_ids=input_ids, labels=labels)
loss, logits = outputs.loss, outputs.logits

In [49]:
loss

tensor(10.0887, grad_fn=<NllLossBackward0>)

In [51]:
generated = model.generate(labels)
generated

tensor([[101,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0]])

In [52]:
generated.shape

torch.Size([1, 20])