In [1]:
!pip install transformers[torch]
!pip install SentencePiece

Collecting transformers[torch]
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m60.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers[torch])
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m39.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers[torch])
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m105.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers[torch])
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m95.2 MB/s

In [2]:
# Importing required libraries
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

# Importing the T5 modules from huggingface/transformers
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [3]:
# # Setting up the device for GPU usage
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [4]:
# Creating a custom dataset for reading the dataframe and loading it into the dataloader to pass it to the neural network at a later stage for finetuning the model and to prepare it for predictions

class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, source_len, summ_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = summ_len
        self.text = self.data.seqB
        self.ctext = self.data.seqA

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        ctext = str(self.ctext[index])
        ctext = ' '.join(ctext.split())

        text = str(self.text[index])
        text = ' '.join(text.split())

        source = self.tokenizer.batch_encode_plus([ctext], max_length= self.source_len, pad_to_max_length=True,return_tensors='pt')
        target = self.tokenizer.batch_encode_plus([text], max_length= self.summ_len, pad_to_max_length=True,return_tensors='pt')

        source_ids = source['input_ids'].squeeze()
        source_mask = source['attention_mask'].squeeze()
        target_ids = target['input_ids'].squeeze()
        target_mask = target['attention_mask'].squeeze()

        return {
            'source_ids': source_ids.to(dtype=torch.long),
            'source_mask': source_mask.to(dtype=torch.long),
            'target_ids': target_ids.to(dtype=torch.long),
            'target_ids_y': target_ids.to(dtype=torch.long)
        }

In [5]:
# Creating the training function. This will be called in the main process. It is run depending on the epoch value.
# The model is put into train mode and then we enumerate over the training loader and passed to the defined network

def train(epoch, tokenizer, model, device, loader, optimizer):
    model.train()
    for _,data in enumerate(loader, 0):
        y = data['target_ids'].to(device, dtype = torch.long)
        y_ids = y[:, :-1].contiguous()
        lm_labels = y[:, 1:].clone().detach()
        lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100
        ids = data['source_ids'].to(device, dtype = torch.long)
        mask = data['source_mask'].to(device, dtype = torch.long)

        outputs = model(input_ids = ids, attention_mask = mask, decoder_input_ids=y_ids, labels=lm_labels)
        loss = outputs[0]

        if _%500==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [6]:
def validate(epoch, tokenizer, model, device, loader):
    model.eval()
    inputs = []
    predictions = []
    actuals = []
    with torch.no_grad():
        for _, data in enumerate(loader, 0):
            y = data['target_ids'].to(device, dtype = torch.long)
            ids = data['source_ids'].to(device, dtype = torch.long)
            mask = data['source_mask'].to(device, dtype = torch.long)

            generated_ids = model.generate(
                input_ids = ids,
                attention_mask = mask,
                max_length=150,
                num_beams=2,
                repetition_penalty=2.5,
                length_penalty=1.0,
                early_stopping=True
                )
            input = [tokenizer.decode(i, skip_special_tokens=True, clean_up_tokenization_spaces=True) for i in ids]
            preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]
            target = [tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True)for t in y]
            if _%100==0:
                print(f'Completed {_}')

            inputs.extend(input)
            predictions.extend(preds)
            actuals.extend(target)

    return predictions, actuals, inputs

In [7]:
# Defining some key variables that will be used later on in the training
TRAIN_BATCH_SIZE = 4    # input batch size for training (default: 64)
VALID_BATCH_SIZE = 4    # input batch size for testing (default: 1000)
TRAIN_EPOCHS = 2        # number of epochs to train (default: 10)
VAL_EPOCHS = 1
LEARNING_RATE = 1e-4    # learning rate (default: 0.01)
SEED = 42               # random seed (default: 42)
MAX_LEN = 512
SUMMARY_LEN = 150

In [8]:
# Set random seeds and deterministic pytorch for reproducibility
torch.manual_seed(SEED) # pytorch random seed
np.random.seed(SEED) # numpy random seed
torch.backends.cudnn.deterministic = True

# tokenzier for encoding the text
tokenizer = T5Tokenizer.from_pretrained("t5-base")

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
df = pd.read_csv('./pair_match_df.csv', encoding='latin-1')
# 'ctext' is the original text (source), and 'text' is the summary (target).
df = df[['seqA','seqB']]
df.seqA = 'summarize: ' + df.seqA # add prefix "summarize: " to input indicating the task
print(df.head())

                                                seqA  \
0  summarize: Muscle cramps are a common problem ...   
1  summarize: These true cramps, which originate ...   
2  summarize: Medical history, physical examinati...   
3  summarize: Despite the "benign" nature of cram...   
4  summarize: Treatment options are guided both b...   

                                                seqB  
0  Muscle cramps are a common problem represented...  
1  These true cramps, coming from nerves outside ...  
2  Medical history, physical check-up, and lab sc...  
3  Despite their harmless nature, cramps are unco...  
4  Experience and limited medical studies guide t...  


In [10]:
# Creation of Dataset and Dataloader
# Defining the train size. So 80% of the data will be used for training and the rest will be used for validation.
train_size = 0.8
train_dataset=df.sample(frac=train_size,random_state = SEED)
val_dataset=df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(val_dataset.shape))

FULL Dataset: (9296, 2)
TRAIN Dataset: (7437, 2)
TEST Dataset: (1859, 2)


In [11]:
# Creating the Training and Validation dataset for further creation of Dataloader
training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN, SUMMARY_LEN)
val_set = CustomDataset(val_dataset, tokenizer, MAX_LEN, SUMMARY_LEN)

In [12]:
# Creation of Dataloaders for testing and validation. This will be used down for training and validation stage for the model.
training_loader = DataLoader(training_set, batch_size=TRAIN_BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_set, batch_size=VALID_BATCH_SIZE, shuffle=False)

In [13]:
# Defining the model. We are using t5-base model and added a Language model layer on top for generation of Summary.
# Further this model is sent to device (GPU/TPU) for using the hardware.
model = T5ForConditionalGeneration.from_pretrained("t5-base")
model = model.to(device)

# Defining the optimizer that will be used to tune the weights of the network in the training session.
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

Downloading model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [14]:
# Training loop (taking around 22 mins)
print('Initiating Fine-Tuning for the model on our dataset')

for epoch in range(TRAIN_EPOCHS):
    train(epoch, tokenizer, model, device, training_loader, optimizer)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Initiating Fine-Tuning for the model on our dataset




Epoch: 0, Loss:  6.8360395431518555
Epoch: 0, Loss:  2.426135540008545
Epoch: 0, Loss:  2.3795647621154785
Epoch: 0, Loss:  1.8136235475540161
Epoch: 1, Loss:  1.6601029634475708
Epoch: 1, Loss:  1.3831067085266113
Epoch: 1, Loss:  1.1387933492660522
Epoch: 1, Loss:  1.9038736820220947


In [15]:
# Validation loop and saving the resulting file with predictions and acutals in a dataframe.
# Saving the dataframe as predictions.csv
print('Now generating summaries on our fine tuned model for the validation dataset and saving it in a dataframe')
for epoch in range(VAL_EPOCHS):
    predictions, actuals, inputs = validate(epoch, tokenizer, model, device, val_loader)
    final_df = pd.DataFrame({'input': inputs, 'Generated Text':predictions,'Actual Text':actuals, 'Input Text': inputs})
    final_df.to_csv('./predictions.csv')
    print('Output Files generated for review')

Now generating summaries on our fine tuned model for the validation dataset and saving it in a dataframe




Completed 0
Completed 100
Completed 200
Completed 300
Completed 400
Output Files generated for review


In [22]:
full_text = pd.read_csv('abstract_adaptation_pair_df.csv')
full_text.rename(columns={'abstract': 'seqA', 'adaptation': 'seqB'}, inplace=True)
full_text = full_text[['seqA','seqB']]
full_text.seqA = 'summarize: ' + df.seqA # add prefix "summarize: " to input indicating the task
print(full_text.head())


                                                seqA  \
0  summarize: summarize: summarize: Muscle cramps...   
1  summarize: summarize: summarize: These true cr...   
2  summarize: summarize: summarize: Medical histo...   
3  summarize: summarize: summarize: Despite the "...   
4  summarize: summarize: summarize: Treatment opt...   

                                                seqB  
0  Muscle cramps are a common problem represented...  
1  Dystonias are disorders with a lot of uncontro...  
2  Muscle cramps cause constant and unintended co...  
3  Exercise-Associated Muscle Cramps (EAMC) are a...  
4  Muscle cramps are common in healthy people, es...  


Unnamed: 0,abstract,adaptation,question,category,question_type
0,Muscle cramps are a common problem characteriz...,Muscle cramps are a common problem represented...,What causes muscle spasm?,1,C
1,The dystonias are a group of disorders charact...,Dystonias are disorders with a lot of uncontro...,What causes muscle spasm?,1,C
2,"Muscle cramps result in continuous, involuntar...",Muscle cramps cause constant and unintended co...,What causes muscle spasm?,1,C
3,Exercise-Associated Muscle Cramps (EAMC) are a...,Exercise-Associated Muscle Cramps (EAMC) are a...,What causes muscle spasm?,1,C
4,Muscular cramp is a common symptom in healthy ...,"Muscle cramps are common in healthy people, es...",What causes muscle spasm?,1,C


In [26]:
full_length_set = CustomDataset(full_text, tokenizer, MAX_LEN, SUMMARY_LEN)
full_length_loader = DataLoader(full_length_set, batch_size=VALID_BATCH_SIZE, shuffle=False)

In [27]:
for epoch in range(VAL_EPOCHS):
    predictions, actuals, inputs = validate(epoch, tokenizer, model, device, full_length_loader)
    final_df = pd.DataFrame({'input': inputs, 'Generated Text':predictions,'Actual Text':actuals, 'Input Text': inputs})
    final_df.to_csv('./predictions_full_text.csv')
    print('Output Files generated for review')



Completed 0
Completed 100
Completed 200
Output Files generated for review


In [None]:
model.save_pretrained('t5-model')

In [None]:
predictions_df = pd.read_csv('./predictions.csv', index_col=None)

In [None]:
predictions_df.head()

Unnamed: 0.1,Unnamed: 0,input,Generated Text,Actual Text
0,0,"summarize: These true cramps, which originate ...","These true cramps, which originate from periph...","These true cramps, coming from nerves outside ..."
1,1,"summarize: Medical history, physical examinati...","medical history, physical examination, and a l...","Medical history, physical check-up, and lab sc..."
2,2,summarize: Treatment options are guided both b...,Treatment options are guided both by experienc...,Experience and limited medical studies guide t...
3,3,summarize: Quinine sulfate is an effective med...,"quinoine sulfate is an effective medication, b...","Quinine sulfate (an antimalarial drug) helps, ..."
4,4,"summarize: If a cause is identified, specific ...","If a cause is identified, specific causes-base...","If a cause is found, specific cause-based trea..."


In [None]:
#to evaluate the generated text using metrics like "bleu" and "rouge"
!pip install evaluate
import evaluate
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill (from evaluate)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19 (from evaluate)
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Installing collected packages: dill, responses, mu

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
metric = evaluate.load("bleu")
references = [ [a] for a in actuals ]
results = metric.compute(predictions=predictions, references=references, tokenizer=word_tokenize)
results

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

{'bleu': 0.3214336223163686,
 'precisions': [0.605516356638871,
  0.4009044576843012,
  0.30094656182701457,
  0.23443204202232437],
 'brevity_penalty': 0.8885318955756106,
 'length_ratio': 0.8943066112146852,
 'translation_length': 43652,
 'reference_length': 48811}

In [None]:
!pip install rouge_score
metric = evaluate.load('rouge')
results = metric.compute(predictions=predictions, references=references, tokenizer=word_tokenize)
results

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24932 sha256=cff6e3b7a21f626abc843bf8c2956fe3168da9f4945fd27221ee0f91028b9855
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

{'rouge1': 0.5652449532783603,
 'rouge2': 0.374805535190738,
 'rougeL': 0.5372100441980754,
 'rougeLsum': 0.5371118050871535}

In [None]:
metric = evaluate.load("bleu")
references = [ [a.replace('summarize: ', '')] for a in inputs ]
results = metric.compute(predictions=predictions, references=references, tokenizer=word_tokenize)
results

{'bleu': 0.7385291593381821,
 'precisions': [0.9183084394758545,
  0.8519369272366186,
  0.802223669053939,
  0.7575574523965857],
 'brevity_penalty': 0.8893872125918113,
 'length_ratio': 0.8950767905841825,
 'translation_length': 43652,
 'reference_length': 48769}

In [None]:
metric = evaluate.load('rouge')
results = metric.compute(predictions=predictions, references=references, tokenizer=word_tokenize)
results

{'rouge1': 0.8763111659902039,
 'rouge2': 0.8110150462278611,
 'rougeL': 0.8703657183773394,
 'rougeLsum': 0.8703317137922639}

In [None]:
!pip install textstat

Collecting textstat
  Downloading textstat-0.7.3-py3-none-any.whl (105 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.1/105.1 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyphen (from textstat)
  Downloading pyphen-0.14.0-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyphen, textstat
Successfully installed pyphen-0.14.0 textstat-0.7.3


In [None]:
from textstat import flesch_kincaid_grade, flesch_reading_ease

In [None]:
test_input = predictions_df['input'][0].replace('summarize', '')
test_gen = predictions_df['Generated Text'][0]
test_actual = predictions_df['Actual Text'][0]

In [None]:
input_fkgl_scores = []
gen_fkgl_scores = []
actual_fkgl_scores = []

input_fkre_scores = []
gen_fkre_scores = []
actual_fkre_scores = []


In [None]:
for row in predictions_df.itertuples():
  input = row[2].replace('summarize: ', '')
  gen = row[3]
  actual = row[4]
  input_fkgl_scores.append(flesch_kincaid_grade(input))
  gen_fkgl_scores.append(flesch_kincaid_grade(gen))
  actual_fkgl_scores.append(flesch_kincaid_grade(str(actual)))

  input_fkre_scores.append(flesch_reading_ease(input))
  gen_fkre_scores.append(flesch_reading_ease(gen))
  actual_fkre_scores.append(flesch_reading_ease(str(actual)))

In [None]:
import numpy as np
from tabulate import tabulate

input_fkgl_mean = np.mean(input_fkgl_scores)
input_fkgl_median = np.median(input_fkgl_scores)

gen_fkgl_mean = np.mean(gen_fkgl_scores)
gen_fkgl_median = np.median(gen_fkgl_scores)

actual_fkgl_mean = np.mean(actual_fkgl_scores)
actual_fkgl_median = np.median(actual_fkgl_scores)

input_fkre_mean = np.mean(input_fkre_scores)
input_fkre_median = np.median(input_fkre_scores)

gen_fkre_mean = np.mean(gen_fkre_scores)
gen_fkre_median = np.median(gen_fkre_scores)

actual_fkre_mean = np.mean(actual_fkre_scores)
actual_fkre_median = np.median(actual_fkre_scores)

In [None]:
data = [
    ["Input", input_fkgl_mean, input_fkgl_median, input_fkre_mean, input_fkre_median],
    ["Generated", gen_fkgl_mean, gen_fkgl_median, gen_fkre_mean, gen_fkre_median],
    ["Actual", actual_fkgl_mean, actual_fkgl_median, actual_fkre_mean, actual_fkre_median]
]

headers = ["Type", "FKGL Mean", "FKGL Median", "FKRE Mean", "FKRE Median"]
table = tabulate(data, headers, tablefmt="grid")
print(table)

+-----------+-------------+---------------+-------------+---------------+
| Type      |   FKGL Mean |   FKGL Median |   FKRE Mean |   FKRE Median |
| Input     |     14.0372 |          13.9 |     31.0798 |         30.87 |
+-----------+-------------+---------------+-------------+---------------+
| Generated |     12.8388 |          12.7 |     36.7612 |         37.3  |
+-----------+-------------+---------------+-------------+---------------+
| Actual    |     11.2952 |          11.1 |     49.1382 |         49.15 |
+-----------+-------------+---------------+-------------+---------------+
