In [None]:
# Install the third-party packages used by later cells: transformers (T5 tokenizer/model)
# and wandb (experiment logging). sentencepiece is presumably the backend T5Tokenizer
# needs -- it is never imported directly in this notebook.
!pip install sentencepiece
!pip install transformers
!pip install wandb

# Code for TPU packages install
# !curl -q https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
# !python pytorch-xla-env-setup.py --apt-packages libomp5 libopenblas-dev

Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/e5/2d/6d4ca4bef9a67070fa1cac508606328329152b1df10bdf31fb6e4e727894/sentencepiece-0.1.94-cp36-cp36m-manylinux2014_x86_64.whl (1.1MB)
[K     |▎                               | 10kB 21.0MB/s eta 0:00:01[K     |▋                               | 20kB 18.5MB/s eta 0:00:01[K     |▉                               | 30kB 16.2MB/s eta 0:00:01[K     |█▏                              | 40kB 15.0MB/s eta 0:00:01[K     |█▌                              | 51kB 12.4MB/s eta 0:00:01[K     |█▊                              | 61kB 12.4MB/s eta 0:00:01[K     |██                              | 71kB 13.7MB/s eta 0:00:01[K     |██▍                             | 81kB 13.1MB/s eta 0:00:01[K     |██▋                             | 92kB 13.7MB/s eta 0:00:01[K     |███                             | 102kB 13.4MB/s eta 0:00:01[K     |███▎                            | 112kB 13.4MB/s eta 0:00:01[K     |███▌        

In [None]:
# Importing stock libraries
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

# Importing the T5 modules from huggingface/transformers
from transformers import T5Tokenizer, T5ForConditionalGeneration

# WandB – Import the wandb library
import wandb

In [None]:
# Mount Google Drive at /content/gdrive (Colab only).
# NOTE: later cells read and write 'gdrive/MyDrive/t5/...' as *relative* paths, so they
# only resolve when the working directory is /content.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Checking which GPU we have access to. The output below is from the Google Colab version.
!nvidia-smi

Sat Dec 12 17:39:20 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.45.01    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   48C    P8    10W /  70W |     10MiB / 15079MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Setting up the device: use the GPU when CUDA is available, otherwise fall back to the CPU.
# `device` is a module-level global that main() reads when moving the model and batches.
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

# Preparing for TPU usage
# import torch_xla
# import torch_xla.core.xla_model as xm
# device = xm.xla_device()

In [None]:
# Login to wandb to log the model run and all the parameters.
# This is interactive: it prompts for an API key and stores it in the netrc file.
!wandb login

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [None]:
# Creating a custom dataset for reading the dataframe and loading it into the dataloader to pass it to the neural network at a later stage for finetuning the model and to prepare it for predictions

class CustomDataset(Dataset):
    """Tokenized (document, summary) pairs read from a dataframe.

    Args:
        dataframe: frame with a ``ctext`` column (source document) and a
            ``text`` column (reference summary).
        tokenizer: object exposing ``batch_encode_plus`` (e.g. ``T5Tokenizer``).
        source_len: fixed token length every source is padded/truncated to.
        summ_len: fixed token length every summary is padded/truncated to.

    Each item is a dict of 1-D ``torch.long`` tensors with the keys
    ``source_ids``, ``source_mask``, ``target_ids`` and ``target_ids_y``
    (the last one is a duplicate of ``target_ids`` kept for compatibility).
    """

    def __init__(self, dataframe, tokenizer, source_len, summ_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = summ_len
        self.text = self.data.text
        self.ctext = self.data.ctext

    def __len__(self):
        return len(self.text)

    def _encode(self, raw_text, max_length):
        """Collapse whitespace runs in ``raw_text`` and tokenize to exactly ``max_length`` ids."""
        cleaned = ' '.join(str(raw_text).split())
        # Explicit padding/truncation replaces the deprecated `pad_to_max_length=True`,
        # which only truncated implicitly (and emitted a warning while doing so).
        return self.tokenizer.batch_encode_plus(
            [cleaned],
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

    def __getitem__(self, index):
        # Positional lookup: a DataLoader hands out positions 0..len-1, which only
        # coincide with the index *labels* for a default RangeIndex. `.iloc` keeps
        # filtered, shuffled or otherwise re-indexed frames working.
        source = self._encode(self.ctext.iloc[index], self.source_len)
        target = self._encode(self.text.iloc[index], self.summ_len)

        # Drop only the leading batch axis added by batch_encode_plus.
        source_ids = source['input_ids'].squeeze(0)
        source_mask = source['attention_mask'].squeeze(0)
        target_ids = target['input_ids'].squeeze(0)

        return {
            'source_ids': source_ids.to(dtype=torch.long),
            'source_mask': source_mask.to(dtype=torch.long),
            'target_ids': target_ids.to(dtype=torch.long),
            'target_ids_y': target_ids.to(dtype=torch.long)
        }

In [None]:
# Creating the training function. This will be called in the main function. It is run depending on the epoch value.
# The model is put into train mode, then we enumerate over the training loader and pass each batch to the network.

def train(epoch, tokenizer, model, device, loader, val_loader, optimizer):
    """Run one training epoch over ``loader``, then one validation pass over ``val_loader``.

    Args:
        epoch: epoch number, used only in the progress messages.
        tokenizer: provides ``pad_token_id`` for masking padded label positions.
        model: seq2seq model called with ``input_ids``, ``attention_mask`` and
            ``labels``; element 0 of its output is used as the loss.
        device: device the batch tensors are moved to.
        loader: iterable of training batches (dicts with ``source_ids``,
            ``source_mask`` and ``target_ids``).
        val_loader: iterable of validation batches with the same keys.
        optimizer: optimizer stepped once per training batch.

    Side effects: updates the model weights, logs the loss to wandb every
    10th batch, prints it every 500th batch, and leaves the model in eval mode.
    """
    model.train()
    for step, batch in enumerate(loader):
        loss = _seq2seq_loss(batch, tokenizer, model, device)

        if step % 10 == 0:
            wandb.log({"Training Loss": loss.item()})

        if step % 500 == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # TPU variant: xm.optimizer_step(optimizer); xm.mark_step()

    model.eval()
    with torch.no_grad():
        for step, batch in enumerate(val_loader):
            loss = _seq2seq_loss(batch, tokenizer, model, device)

            if step % 10 == 0:
                wandb.log({"Validation Loss": loss.item()})

            if step % 500 == 0:
                print(f'Epoch: {epoch}, Loss:  {loss.item()}')


def _seq2seq_loss(batch, tokenizer, model, device):
    """Forward one batch through ``model`` and return the loss (element 0 of its output)."""
    target_ids = batch['target_ids'].to(device, dtype=torch.long)

    # Supervise every target token, the first one included. The previous manual shift
    # (decoder_input_ids = y[:, :-1], labels = y[:, 1:]) never used the first target
    # token as a label and never fed the decoder the start token that generate() uses.
    # Per the Transformers docs, T5ForConditionalGeneration derives decoder_input_ids
    # by shifting `labels` right when only `labels` is supplied.
    labels = target_ids.clone()
    labels[target_ids == tokenizer.pad_token_id] = -100  # -100: ignored by the loss

    ids = batch['source_ids'].to(device, dtype=torch.long)
    mask = batch['source_mask'].to(device, dtype=torch.long)

    outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
    return outputs[0]

In [None]:
def test(epoch, tokenizer, model, device, loader):
    model.eval()
    predictions = []
    actuals = []
    with torch.no_grad():
        for _, data in enumerate(loader, 0):
            y = data['target_ids'].to(device, dtype = torch.long)
            ids = data['source_ids'].to(device, dtype = torch.long)
            mask = data['source_mask'].to(device, dtype = torch.long)

            generated_ids = model.generate(
                input_ids = ids,
                attention_mask = mask, 
                max_length=150, 
                num_beams=2,
                repetition_penalty=2.5, 
                length_penalty=1.0, 
                early_stopping=True
                )
            preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]
            target = [tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True)for t in y]
            if _%100==0:
                print(f'Completed {_}')

            predictions.extend(preds)
            actuals.extend(target)
    return predictions, actuals

In [None]:
def main():
    """Fine-tune ``t5-small`` for summarization and save test-set predictions.

    Steps: start a wandb run and record the hyperparameters, seed the RNGs,
    load the train/val/test CSVs from the mounted Drive folder, wrap them in
    ``CustomDataset``/``DataLoader``, fine-tune with Adam (``train`` also runs a
    validation pass each epoch), then generate summaries for the test split and
    write them to ``predictions_6.csv``. Reads the module-level ``device``.
    """
    # WandB: start a new run; its config object stores the hyperparameters.
    wandb.init(project="transformers_summarization")
    config = wandb.config
    config.TRAIN_BATCH_SIZE = 64   # batch size for training
    config.VALID_BATCH_SIZE = 64   # batch size for validation
    config.TEST_BATCH_SIZE = 64    # batch size for test-set generation
    config.TRAIN_EPOCHS = 10       # number of fine-tuning epochs
    config.VAL_EPOCHS = 1          # number of generation passes over the test set
    config.LEARNING_RATE = 1e-4    # Adam learning rate
    config.SEED = 42               # random seed
    config.MAX_LEN = 200           # token length of each source document
    config.SUMMARY_LEN = 54        # token length of each reference summary

    # Seed torch and numpy and request deterministic cuDNN kernels.
    torch.manual_seed(config.SEED)
    np.random.seed(config.SEED)
    torch.backends.cudnn.deterministic = True

    # Tokenizer matching the checkpoint loaded further down.
    tokenizer = T5Tokenizer.from_pretrained("t5-small")

    # Load the three splits (train, val, test), name their columns, and prepend
    # the T5 task prefix to every source document.
    csv_paths = (
        'gdrive/MyDrive/t5/df_train.csv',
        'gdrive/MyDrive/t5/df_val.csv',
        'gdrive/MyDrive/t5/df_test.csv',
    )
    frames = [pd.read_csv(path) for path in csv_paths]
    for frame in frames:
        frame.columns = ['text', 'ctext']
    for frame in frames:
        frame.ctext = 'summarize: ' + frame.ctext

    # One dataset per split, all sharing the same source/summary lengths.
    training_set, val_set, test_set = (
        CustomDataset(frame, tokenizer, config.MAX_LEN, config.SUMMARY_LEN)
        for frame in frames
    )

    # Only the training loader shuffles; validation and test keep file order.
    training_loader = DataLoader(
        training_set, batch_size=config.TRAIN_BATCH_SIZE, shuffle=True, num_workers=0
    )
    val_loader = DataLoader(
        val_set, batch_size=config.VALID_BATCH_SIZE, shuffle=False, num_workers=0
    )
    test_loader = DataLoader(
        test_set, batch_size=config.TEST_BATCH_SIZE, shuffle=False, num_workers=0
    )

    # Load the pretrained t5-small checkpoint and move it to the selected device.
    model = T5ForConditionalGeneration.from_pretrained("t5-small").to(device)

    # Optimizer used to update the network weights during fine-tuning.
    optimizer = torch.optim.Adam(model.parameters(), lr=config.LEARNING_RATE)

    # Let wandb track the model ("all" = gradients and parameters).
    wandb.watch(model, log="all")

    print('Initiating Fine-Tuning for the model on our dataset')
    for epoch in range(config.TRAIN_EPOCHS):
        train(epoch, tokenizer, model, device, training_loader, val_loader, optimizer)

    # Generate summaries for the test split and save predictions next to the references.
    print('Now generating summaries on our fine tuned model for the test dataset and saving it in a dataframe')
    for epoch in range(config.VAL_EPOCHS):
        predictions, actuals = test(epoch, tokenizer, model, device, test_loader)
        final_df = pd.DataFrame({'Generated Text': predictions, 'Actual Text': actuals})
        final_df.to_csv('gdrive/MyDrive/t5/predictions_6.csv')
        print('Output Files generated for review')

# Run the whole fine-tuning and test-set generation pipeline when this cell is executed.
if __name__ == '__main__':
    main()

[34m[1mwandb[0m: Currently logged in as: [33mdujiaying[0m (use `wandb login --relogin` to force relogin)


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=791656.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1197.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=242065649.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at t5-small were not used when initializing T5ForConditionalGeneration: ['decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight']
- This IS expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by pro

Initiating Fine-Tuning for the model on our dataset




Epoch: 0, Loss:  6.144913196563721
Epoch: 0, Loss:  2.513988971710205
Epoch: 0, Loss:  2.2246041297912598
Epoch: 0, Loss:  1.8138779401779175
Epoch: 1, Loss:  2.1713664531707764
Epoch: 1, Loss:  2.3400115966796875
Epoch: 1, Loss:  2.1547951698303223
Epoch: 1, Loss:  1.622109055519104
Epoch: 2, Loss:  2.205078363418579
Epoch: 2, Loss:  2.0317330360412598
Epoch: 2, Loss:  1.9158087968826294
Epoch: 2, Loss:  1.5234683752059937
Epoch: 3, Loss:  1.692735195159912
Epoch: 3, Loss:  1.6070899963378906
Epoch: 3, Loss:  1.977486491203308
Epoch: 3, Loss:  1.447749376296997
Epoch: 4, Loss:  1.648157000541687
Epoch: 4, Loss:  1.9099600315093994
Epoch: 4, Loss:  1.6578574180603027
Epoch: 4, Loss:  1.3842755556106567
Epoch: 5, Loss:  1.434805154800415
Epoch: 5, Loss:  1.7262396812438965
Epoch: 5, Loss:  1.7005250453948975
Epoch: 5, Loss:  1.340887188911438
Epoch: 6, Loss:  1.569461703300476
Epoch: 6, Loss:  1.435007095336914
Epoch: 6, Loss:  1.6889193058013916
Epoch: 6, Loss:  1.3012644052505493
Epoc

In [None]:
# Reload the predictions written by main(). index_col=0 consumes the row-index column
# that to_csv wrote, leaving the 'Generated Text' and 'Actual Text' columns.
pred = pd.read_csv('gdrive/MyDrive/t5/predictions_6.csv', index_col=0)
pred.head()

Unnamed: 0,Generated Text,Actual Text
0,There is diffuse restriction present involving...,Subacute infarction involving the right poster...
1,Interval progression of moderate effacement of...,"1.Over the interval, previously demonstrated m..."
2,No evidence of pulmonary embolism.2. Mild cent...,1.No evidence of pulmonary embolism.2.Centrilo...
3,Enteric tube seen curled on itself with tip ex...,No change in appearance of enteric tube as abo...
4,No evidence of intracranial hemorrhage or mass...,1. No evidence of intracranial hemorrhage or m...


In [None]:
from nltk.translate.bleu_score import sentence_bleu

def sentence_bleu_n(ref, hyp, weights, smoothing_function=None):
  """Sentence-level BLEU of ``hyp`` against the single reference ``ref``.

  Both texts are tokenized on whitespace.

  Args:
    ref: reference text.
    hyp: hypothesis (generated) text.
    weights: n-gram weights passed to ``sentence_bleu``
      (e.g. ``[1, 0, 0, 0]`` for BLEU-1).
    smoothing_function: optional NLTK smoothing callable, forwarded unchanged;
      the default ``None`` keeps the original unsmoothed behaviour.

  Returns:
    The score returned by ``sentence_bleu``.
  """
  def _tokens(text):
    # An empty CSV cell is read back by pandas as NaN (a float), which has no
    # .split(); treat missing text as an empty token list instead of crashing.
    if text is None or (isinstance(text, float) and text != text):
      return []
    return str(text).split()

  return sentence_bleu(references = [_tokens(ref)],
                       hypothesis = _tokens(hyp),
                       weights = weights,
                       smoothing_function = smoothing_function)

# Score each generated summary against its reference. Columns are addressed by
# label: integer keys such as x[0] / x[1] on a string-labelled row only work through
# pandas' deprecated positional fallback.

# bleu1: unigram precision only
pred['bleu1'] = pred.apply(
    lambda row: sentence_bleu_n(row['Actual Text'], row['Generated Text'], weights = [1,0,0,0]),
    axis=1)

# bleu2: unigram and bigram precision, equally weighted
pred['bleu2'] = pred.apply(
    lambda row: sentence_bleu_n(row['Actual Text'], row['Generated Text'], weights = [.5,.5,0,0]),
    axis=1)

print('bleu1: {}'.format(pred['bleu1'].mean()))
print('bleu2: {}'.format(pred['bleu2'].mean()))

Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


bleu1: 0.23101375794663268
bleu2: 0.22685058114066464


In [None]:
!pip install rouge
from rouge import Rouge 

# Score every (generated, reference) pair once with a single Rouge instance, then pull
# both F-measures from the same result. Columns are addressed by label rather than the
# deprecated positional x[0] / x[1] lookup.
rouge_scorer = Rouge()
rouge_results = [
    rouge_scorer.get_scores(generated, reference)[0]
    for generated, reference in zip(pred['Generated Text'], pred['Actual Text'])
]

# rouge1
pred['rouge1'] = [result['rouge-1']['f'] for result in rouge_results]

# rouge2
pred['rouge2'] = [result['rouge-2']['f'] for result in rouge_results]

print('rouge1: {}'.format(pred['rouge1'].mean()))
print('rouge2: {}'.format(pred['rouge2'].mean()))

Collecting rouge
  Downloading https://files.pythonhosted.org/packages/43/cc/e18e33be20971ff73a056ebdb023476b5a545e744e3fc22acd8c758f1e0d/rouge-1.0.0-py3-none-any.whl
Installing collected packages: rouge
Successfully installed rouge-1.0.0
rouge1: 0.32601084529658725
rouge2: 0.20070894004695408
