In [1]:
!pip install rouge-score -q
!pip install wandb -q

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Importing stock libraries

import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

# Importing the T5 modules from huggingface/transformers
from transformers import T5Tokenizer, T5ForConditionalGeneration

# WandB – Import 
import wandb
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
import re
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

[nltk_data] Downloading package punkt to
[nltk_data]     /home/charmichokshi4444/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
wandb.login()

In [4]:
# Creating a custom dataset for reading the dataframe and loading it into the dataloader to pass it to the neural network at a later stage for finetuning the model and to prepare it for predictions

class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, source_len, summ_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = summ_len
        self.text = self.data.text
        self.ctext = self.data.ctext

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        ctext = str(self.ctext[index])
        ctext = ' '.join(ctext.split())

        text = str(self.text[index])
        text = ' '.join(text.split())

        source = self.tokenizer.batch_encode_plus([ctext], max_length= self.source_len, pad_to_max_length=True,return_tensors='pt')
        target = self.tokenizer.batch_encode_plus([text], max_length= self.summ_len, pad_to_max_length=True,return_tensors='pt')

        source_ids = source['input_ids'].squeeze()
        source_mask = source['attention_mask'].squeeze()
        target_ids = target['input_ids'].squeeze()
        target_mask = target['attention_mask'].squeeze()

        return {
            'source_ids': source_ids.to(dtype=torch.long), 
            'source_mask': source_mask.to(dtype=torch.long), 
            'target_ids': target_ids.to(dtype=torch.long),
            'target_ids_y': target_ids.to(dtype=torch.long)
        }

In [5]:
#Train function to train for one epoch
def train(epoch, tokenizer, model, device, loader, optimizer):
    model.train()
    for _,data in enumerate(loader, 0):
        y = data['target_ids'].to(device, dtype = torch.long)
        y_ids = y[:, :-1].contiguous()
        lm_labels = y[:, 1:].clone().detach()
        lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100
        ids = data['source_ids'].to(device, dtype = torch.long)
        mask = data['source_mask'].to(device, dtype = torch.long)

        outputs = model(input_ids = ids, attention_mask = mask, decoder_input_ids=y_ids, labels=lm_labels)
        loss = outputs[0]
        
        if _%10 == 0:
            wandb.log({"Training Loss": loss.item()})

        if _%500==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [6]:
#Validation function to validate the generation, returns both predicted and actual value
def validate(epoch, tokenizer, model, device, loader):
    model.eval()
    predictions = []
    actuals = []
    with torch.no_grad():
        for _, data in enumerate(loader, 0):
            y = data['target_ids'].to(device, dtype = torch.long)
            ids = data['source_ids'].to(device, dtype = torch.long)
            mask = data['source_mask'].to(device, dtype = torch.long)

            generated_ids = model.generate(
                input_ids = ids,
                attention_mask = mask, 
                max_length=150, 
                num_beams=2,
                repetition_penalty=2.5, 
                length_penalty=1.0, 
                early_stopping=True
                )
            preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]
            target = [tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True)for t in y]
            if _%100==0:
                print(f'Completed {_}')

            predictions.extend(preds)
            actuals.extend(target)
    return predictions, actuals

In [7]:
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
#https://catriscode.com/2021/05/01/tweets-cleaning-with-python/
def clean_data(tweet):
    if type(tweet) == np.float:
        return ""
    temp = tweet.lower()
    temp = re.sub("'", "", temp) # to avoid removing contractions in english
    temp = re.sub("@[A-Za-z0-9_]+","", temp)
    temp = re.sub("#[A-Za-z0-9_]+","", temp)
    temp = re.sub(r'http\S+', '', temp)
    temp = re.sub('[()!?]', ' ', temp)
    temp = re.sub('\[.*?\]',' ', temp)
    temp = re.sub("[^a-z0-9]"," ", temp)
    temp = temp.split()
    temp = [w for w in temp if not w in stop_words]
    temp = " ".join(word for word in temp)
    return temp

#https://stackoverflow.com/questions/33404752/removing-emojis-from-a-string-in-python
def deEmojify(text):
#     print(text)
    regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags = re.UNICODE)
    return regrex_pattern.sub(r'',text)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/charmichokshi4444/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [14]:
torch.cuda.empty_cache()
# # Setting up the device for GPU usage
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
# WandB – Initialize a new run
wandb.init(project="transformers_tutorials_summarization")


# Wandb config 
config = wandb.config          # Initialize config
config.TRAIN_BATCH_SIZE = 8    
config.VALID_BATCH_SIZE = 8    
config.TRAIN_EPOCHS = 10        
config.VAL_EPOCHS = 1 
config.LEARNING_RATE = 1e-4   
config.SEED = 42              
config.MAX_LEN = 512
config.SUMMARY_LEN = 150 

# Set random seeds and deterministic pytorch for reproducibility
torch.manual_seed(config.SEED) # pytorch random seed
np.random.seed(config.SEED) # numpy random seed
torch.backends.cudnn.deterministic = True

# tokenzier for encoding the text
tokenizer = T5Tokenizer.from_pretrained("Michau/t5-base-en-generate-headline")


df = pd.read_csv('./TitleGenData/no_clickbait_train_data.csv')
df = df.rename(columns={'postText':'text', 'targetParagraphs': 'ctext'})
df = df[['text','ctext']]
df = df.dropna()
df['ctext'] = df['ctext'].apply(deEmojify)
df['ctext'] = df['ctext'].apply(clean_data)
df.ctext = 'summarize: ' + df.ctext
print(df.head())


# Dataset creation and 80% is train remaining is test(val) 
train_size = 0.8
train_dataset=df.sample(frac=train_size,random_state = config.SEED)
val_dataset=df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(val_dataset.shape))



training_set = CustomDataset(train_dataset, tokenizer, config.MAX_LEN, config.SUMMARY_LEN)
val_set = CustomDataset(val_dataset, tokenizer, config.MAX_LEN, config.SUMMARY_LEN)

# parameters for dataloaders
train_params = {
  'batch_size': config.TRAIN_BATCH_SIZE,
  'shuffle': True,
  'num_workers': 0
  }

val_params = {
  'batch_size': config.VALID_BATCH_SIZE,
  'shuffle': False,
  'num_workers': 0
  }

# Dataloaders for train and validation.
training_loader = DataLoader(training_set, **train_params)
val_loader = DataLoader(val_set, **val_params)




model = T5ForConditionalGeneration.from_pretrained("Michau/t5-base-en-generate-headline")
model = model.to(device)

# Defining the optimizer that will be used to tune the weights of the network in the training session. 
optimizer = torch.optim.AdamW(params =  model.parameters(), lr=config.LEARNING_RATE)

# Log metrics with wandb
wandb.watch(model, log="all")
# Training loop
print('Initiating Fine-Tuning for the model on our dataset')

for epoch in range(config.TRAIN_EPOCHS):
    train(epoch, tokenizer, model, device, training_loader, optimizer)
    torch.save(model.state_dict(), './finetuneT5/final-t5-finetuned-epoch-{}.pt'.format(epoch))
    # Validation and saving the resulting file in a dataframe.
    print('Now generating summaries on our fine tuned model for the validation dataset and saving it in a dataframe')
    for vepoch in range(config.VAL_EPOCHS):
        predictions, actuals = validate(vepoch, tokenizer, model, device, val_loader)
        final_df = pd.DataFrame({'Generated Text':predictions,'Actual Text':actuals})
        final_df.to_csv('./finetuneT5/fine_tuned_predictions-epoch-{}.csv'.format(epoch))
        print('Output Files generated for review')
    

[34m[1mwandb[0m: Currently logged in as: [33msandstorm11[0m. Use [1m`wandb login --relogin`[0m to force relogin


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  


                                                text  \
0  UK’s response to modern slavery leaving victim...   
1  The "forgotten" Trump roast: Relive his brutal...   
2  Tokyo's subway is shut down amid fears over an...   
3             Ban lifted on Madrid doping laboratory   
4  Despite the ‘Yuck Factor,’ Leeches Are Big in ...   

                                               ctext  
0  summarize: thousands modern slavery victims co...  
1  summarize: white house correspondents dinner e...  
2  summarize: one tokyos major subways systems sa...  
3  summarize: share madrids anti doping laborator...  
4  summarize: moscow small physician assistants g...  
FULL Dataset: (16390, 2)
TRAIN Dataset: (13112, 2)
TEST Dataset: (3278, 2)


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Initiating Fine-Tuning for the model on our dataset




Epoch: 0, Loss:  8.9929838180542
Epoch: 0, Loss:  3.008601427078247
Epoch: 0, Loss:  2.458158254623413
Epoch: 0, Loss:  2.575425863265991
Now generating summaries on our fine tuned model for the validation dataset and saving it in a dataframe
Completed 0
Completed 100
Completed 200
Completed 300
Completed 400
Output Files generated for review
Epoch: 1, Loss:  2.1646344661712646
Epoch: 1, Loss:  2.1130573749542236
Epoch: 1, Loss:  2.6266937255859375
Epoch: 1, Loss:  2.509052038192749
Now generating summaries on our fine tuned model for the validation dataset and saving it in a dataframe
Completed 0
Completed 100
Completed 200
Completed 300
Completed 400
Output Files generated for review
Epoch: 2, Loss:  2.89286732673645
Epoch: 2, Loss:  2.7665982246398926
Epoch: 2, Loss:  1.8510767221450806
Epoch: 2, Loss:  2.1461596488952637
Now generating summaries on our fine tuned model for the validation dataset and saving it in a dataframe
Completed 0
Completed 100
Completed 200
Completed 300
Comp

In [15]:
for e in range(config.TRAIN_EPOCHS):
    final_df = pd.read_csv('./finetuneT5/fine_tuned_predictions-epoch-{}.csv'.format(e))
    final_df['Actual Text'] = final_df['Actual Text'].apply(deEmojify)
    final_df['Actual Text'] = final_df['Actual Text'].apply(clean_data)
    ac_text = final_df['Actual Text'].tolist()
    gen_text = final_df['Generated Text'].tolist()
    results = {'precision': [], 'recall': [], 'fmeasure': []}
    for (h, r) in zip(gen_text, ac_text):
        # computing the ROUGE
        score = scorer.score(h, r)
        # separating the measurements
        precision, recall, fmeasure = score['rougeL']
        # add them to the proper list in the dictionary
        results['precision'].append(precision)
        results['recall'].append(recall)
        results['fmeasure'].append(fmeasure)
    print("Epoch {} Rouge score".format(e))
    print(sum(results['precision'])/len(results['precision']), sum(results['recall'])/len(results['recall']), sum(results['fmeasure'])/len(results['fmeasure']))

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  


Epoch 0 Rouge score
0.31175009777793916 0.2305779375785791 0.25683693955235326
Epoch 1 Rouge score
0.3080772083605138 0.22362719545498225 0.25157800707569883
Epoch 2 Rouge score
0.3139571128943229 0.23197930591981605 0.25908909252048695
Epoch 3 Rouge score
0.3053456100828505 0.23474932878875643 0.25736181158217875
Epoch 4 Rouge score
0.30878704399637136 0.23450181337142384 0.25870959977967356
Epoch 5 Rouge score
0.30583623616072525 0.23465087813834584 0.25792616427696813
Epoch 6 Rouge score
0.2976730910047489 0.2253216486152646 0.24893878897029406
Epoch 7 Rouge score
0.2994694282685974 0.23282905068760676 0.2539848251877404
Epoch 8 Rouge score
0.29991925740857966 0.22418793898921713 0.24926194586726905
Epoch 9 Rouge score
0.30272454317446773 0.23017937996402799 0.253891552134198


In [19]:
#Calculated by running validation only on pretrained model
print("Default Model RougeL")
print(sum(results['precision'])/len(results['precision']), sum(results['recall'])/len(results['recall']), sum(results['fmeasure'])/len(results['fmeasure']))

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  


Default Model RougeL
0.2657697767913373 0.25738135100269566 0.24482966585823734
