<a href="https://colab.research.google.com/github/cicattzo/nlp_project/blob/main/5_20_21_NLP_Fine_Tuned_Summarization_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/gdrive so files stored on Drive can be read
# and written from this Colab runtime.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
# Run configuration: define the model you want to run here.

# wanted_data sets the dataset to train on. The options are:
#   cnn         - CNN/DailyMail dataset loaded online (first 5% of the train split)
#   cnn_sample  - cnn sample dataset from the dropbox
#   bc3         - bc3 dataset from the dropbox
#   merged_data - merged dataset from the dropbox
wanted_data = 'merged_data'

# pretrained_model_name determines the pretrained model to load prior to
# training. The options are:
#   bert                  - bert-base-uncased encoder with a bert-base-uncased decoder
#   bert-gpt2             - bert-base-uncased encoder with a gpt2 decoder
#   gpt2                  - gpt2 encoder with a gpt2 decoder
#   pretrained_summarizer - pretrained summarization model on financial reports
pretrained_model_name = 'bert-gpt2'

# model_type determines the architecture of the model to train. The options are:
#   original   - fine tuned model with only a single linear layer
#   bottleneck - bottleneck fine tuning with a linear layer scaling it down,
#                dropout, then scaling it back up
#   vanilla    - no added layer
model_type = 'original'

# Train the model (True) or just load saved weights and evaluate (False).
train_model = True

In [None]:
%%bash
# Install the Python packages used by this notebook. Lines that are commented
# out are not installed by this cell.
pip -q install torch
pip -q install transformers
# pip -q install datasets
pip -q install tqdm
# pip -q install rouge_score
# pip -q install sacrebleu
# pip install datasets==1.0.2
# pip install sumy
pip install SentencePiece
# pip install transformers==4.0.1

In [None]:
# Make the project folder on Drive the kernel's working directory so that the
# relative CSV paths used by later cells resolve against it. A `cd` inside a
# `%%bash` cell only changes directory in that cell's subprocess and has no
# effect on the Python kernel, so the change is done from Python instead.
import os

os.chdir("/content/gdrive/MyDrive/6864_project/")

In [None]:
MODEL_FOLDER = "/content/gdrive/MyDrive/6864_project/"
import pandas as pd
import torch
import numpy as np
# import datasets
import transformers
import random
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
# from sumy.parsers.plaintext import PlaintextParser
# from sumy.nlp.tokenizers import Tokenizer
# from sumy.summarizers.lsa import LsaSummarizer
# from sumy.summarizers.text_rank import TextRankSummarizer
# from sumy.nlp.stemmers import Stemmer
# from sumy.utils import get_stop_words
from IPython.display import display

In [None]:
# Decide which dataset we want to train on. Each option sets train_data (a
# list of dict records), full_text_key and label_key; every option is then
# evaluated on the BC3 test split, which is loaded once at the end.
def _load_csv_records(csv_path):
    """Read a CSV file and return its rows as a list of dicts."""
    return pd.read_csv(csv_path).to_dict('records')


if wanted_data == 'cnn':
    # The file-level `datasets` import is commented out, so import it where
    # it is actually needed.
    import datasets

    train_data = list(datasets.load_dataset("cnn_dailymail", "3.0.0", split="train[:5%]"))
    full_text_key = 'article'
    label_key = 'highlights'
elif wanted_data == 'bc3':
    import nltk
    # The file-level sumy imports are commented out; this branch is the only
    # user of these names, so bring them into scope here.
    from sumy.nlp.stemmers import Stemmer
    from sumy.nlp.tokenizers import Tokenizer
    from sumy.parsers.plaintext import PlaintextParser
    from sumy.summarizers.text_rank import TextRankSummarizer
    from sumy.utils import get_stop_words

    nltk.download('punkt')

    # Setting up to preprocess bc3 data. The sumy objects built here
    # (tokenizer, stemmer, summarizer, parser) are not used further in this cell.
    LANGUAGE = "english"
    SENTENCES_COUNT = 10
    tokenizer = Tokenizer(LANGUAGE)
    stemmer = Stemmer(LANGUAGE)

    # change this line to any other summarizer
    summarizer = TextRankSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    # Reading in and processing data
    bc3_df = pd.read_csv("/content/gdrive/My Drive/6864_project/bc3_processed.csv")
    txt = bc3_df.iloc[0]['body']
    summary = bc3_df.iloc[0]['summary']
    parser = PlaintextParser.from_string(txt, tokenizer)
    # keeping the subject and body separate, but they can be merged
    bc3_df['unique_key'] = bc3_df['listno'] + "-" + bc3_df['email_num'].astype(str)
    # NOTE(review): this aggregates over the whole frame (the grouped-by-key
    # variant was disabled) - confirm the resulting records are what training expects.
    train_data = bc3_df.agg({'subject':lambda x: x.iloc[0], 'body':lambda x: x.iloc[0], 'summary':lambda x: x}).to_dict('records')

    full_text_key = 'body'
    label_key = 'summary'
elif wanted_data in ('cnn_sample', 'merged_data'):
    train_csv = "cnn_train_data_5.csv" if wanted_data == 'cnn_sample' else "train_combined.csv"
    # The training file is only read when we are actually going to train.
    train_data = _load_csv_records(train_csv) if train_model else []

    full_text_key = 'article'
    label_key = 'highlights'
else:
    # Fail here with a clear message instead of a NameError in a later cell.
    raise ValueError(
        "Unknown wanted_data {!r}; expected one of 'cnn', 'cnn_sample', 'bc3', 'merged_data'".format(wanted_data))

# All options evaluate on the BC3 test split (the 'cnn' option previously left
# test_data undefined).
test_data = _load_csv_records("bc3_test.csv")

In [None]:
# print(train_data[0])

In [None]:
# Initializing the tokenizers for the chosen pretrained model:
#   tokenizer  - encodes the source text for the encoder
#   tokenizer2 - encodes the summaries (see vectorize_batch) and decodes predictions
from transformers import PegasusTokenizer, PegasusForConditionalGeneration, TFPegasusForConditionalGeneration

if pretrained_model_name == 'bert':
    tokenizer = transformers.AutoTokenizer.from_pretrained("bert-base-uncased")
    tokenizer2 = tokenizer
elif pretrained_model_name == 'bert-gpt2':
    tokenizer = transformers.AutoTokenizer.from_pretrained("bert-base-uncased")
    tokenizer2 = transformers.AutoTokenizer.from_pretrained("gpt2")
    # The gpt2 tokenizer is given a pad token by reusing its EOS token so
    # that batches can be padded.
    tokenizer2.pad_token = tokenizer2.eos_token
elif pretrained_model_name == 'gpt2':
    tokenizer = transformers.AutoTokenizer.from_pretrained("gpt2")
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer2 = tokenizer
elif pretrained_model_name == 'pretrained_summarizer':
    # model_name is reused later when the Pegasus model itself is loaded.
    model_name = "human-centered-summarization/financial-summarization-pegasus"
    tokenizer = PegasusTokenizer.from_pretrained(model_name)
    tokenizer2 = tokenizer
else:
    # Fail here with a clear message instead of a NameError in a later cell.
    raise ValueError(
        "Unknown pretrained_model_name {!r}; expected one of 'bert', 'bert-gpt2', "
        "'gpt2', 'pretrained_summarizer'".format(pretrained_model_name))

In [None]:
# Shared settings and the output container for our text summarization model.
import torch
import torch.nn as nn

# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook also runs on a runtime without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
encoder_max_length = 512  # max_length passed to the tokenizer for source texts
decoder_max_length = 128  # max_length passed to the tokenizer for summaries


class ModelOutputs:
    """Plain container for the results of a model forward pass.

    logits: the scores produced by the model (None if not provided)
    loss:   the loss computed for the batch (None if not provided)
    """

    def __init__(self, logits=None, loss=None):
        self.logits = logits
        self.loss = loss

def _decoder_dims(lm):
    """Return (hidden_size, vocab_size) of the decoder inside `lm`.

    Encoder-decoder wrappers keep the decoder settings under
    `lm.config.decoder`; a single-config seq2seq model (the Pegasus
    summarizer) exposes them on `lm.config` directly.
    """
    decoder_config = getattr(lm.config, 'decoder', lm.config)
    return decoder_config.hidden_size, decoder_config.vocab_size


def _token_loss(logits, labels):
    """Mean token-level cross-entropy of `logits` against `labels`.

    logits: tensor, (batch_size x seq_len x vocab_size), unnormalised scores
    labels: tensor, (batch_size x seq_len) of target token ids, or None

    Returns 0 when no labels are given.
    """
    if labels is None:
        return 0
    # CrossEntropyLoss applies log-softmax itself, so it is valid on raw
    # scores (NLLLoss expects log-probabilities). It wants the class
    # dimension second, hence the permute to (batch, vocab, seq).
    return nn.CrossEntropyLoss(reduction="mean")(logits.permute(0, 2, 1), labels)


class _SummarizerBase(nn.Module):
    """Shared wiring of all variants: run the pretrained model, turn its
    outputs into vocabulary scores, and compute the loss."""

    def __init__(self, lm=None, model_name=None):
        '''
        lm:         a pretrained transformer language model
        model_name: the pretrained_model_name option `lm` was loaded for
        '''
        super().__init__()
        self.pretrained_model = lm
        self.model_name = model_name

    def _vocab_logits(self, outputs):
        """Map the pretrained model's outputs to (batch x seq x vocab) scores."""
        raise NotImplementedError

    def forward(self, input_ids=None, labels=None):
        '''
        input_ids: encoder token ids, (batch_size x src_len)
        labels:    summary token ids, (batch_size x tgt_len)

        Returns ModelOutputs with the vocabulary scores and the loss
        (0 when labels is None).
        '''
        # NOTE(review): `labels` is fed in as the decoder input and is also
        # used, unshifted, as the loss target - confirm this is intended;
        # teacher forcing normally shifts the targets by one position.
        outputs = self.pretrained_model(
            input_ids=input_ids,
            decoder_input_ids=labels,
            output_hidden_states=True,
        )
        logits = self._vocab_logits(outputs)
        return ModelOutputs(logits=logits, loss=_token_loss(logits, labels))


if model_type == 'original':
    class TextSummarizationModel(_SummarizerBase):
        """Pretrained model plus a single linear layer over decoder states."""

        def __init__(self, lm=None, model_name=None):
            super().__init__(lm, model_name)
            # The head must emit one score per *decoder* vocabulary entry,
            # because the labels are token ids from the decoder's tokenizer.
            hidden_size, vocab_size = _decoder_dims(lm)
            self.linear_layer = nn.Linear(hidden_size, vocab_size)

        def _vocab_logits(self, outputs):
            # Index 0 of decoder_hidden_states is the embedding output, so
            # index 1 is the output of the first decoder layer.
            # NOTE(review): confirm the last layer ([-1]) was not intended.
            return self.linear_layer(outputs.decoder_hidden_states[1])

elif model_type == 'bottleneck':
    class TextSummarizationModel(_SummarizerBase):
        """Pretrained model plus a bottleneck head: a linear layer scaling
        the decoder state down to half its size, dropout, then a linear
        layer scaling up to the vocabulary."""

        def __init__(self, lm=None, model_name=None):
            super().__init__(lm, model_name)
            self.dropout_layer = nn.Dropout(p=0.2)
            hidden_size, vocab_size = _decoder_dims(lm)
            bottleneck_size = int(hidden_size * 0.5)
            self.bottleneck = nn.Linear(hidden_size, bottleneck_size)
            self.upscale = nn.Linear(bottleneck_size, vocab_size)

        def _vocab_logits(self, outputs):
            # NOTE(review): same first-layer hidden state as the 'original'
            # variant - confirm the last layer ([-1]) was not intended.
            squeezed = self.bottleneck(outputs.decoder_hidden_states[1])
            squeezed = self.dropout_layer(squeezed)
            return self.upscale(squeezed)

elif model_type == 'vanilla':
    class TextSummarizationModel(_SummarizerBase):
        """Pretrained model with no added layer; uses its own vocabulary scores."""

        def _vocab_logits(self, outputs):
            return outputs.logits

else:
    # Fail here with a clear message instead of a NameError when the model is built.
    raise ValueError(
        "Unknown model_type {!r}; expected 'original', 'bottleneck' or 'vanilla'".format(model_type))

In [None]:
# Initializing the pretrained encoder-decoder model selected by
# pretrained_model_name.
from transformers import EncoderDecoderModel

# (encoder checkpoint, decoder checkpoint) for each encoder-decoder option.
_ENCODER_DECODER_CHECKPOINTS = {
    'bert': ('bert-base-uncased', 'bert-base-uncased'),
    'bert-gpt2': ('bert-base-uncased', 'gpt2'),
    'gpt2': ('gpt2', 'gpt2'),
}

if pretrained_model_name in _ENCODER_DECODER_CHECKPOINTS:
    enc_dec_pretrained = EncoderDecoderModel.from_encoder_decoder_pretrained(
        *_ENCODER_DECODER_CHECKPOINTS[pretrained_model_name])
elif pretrained_model_name == 'pretrained_summarizer':
    # model_name is the Pegasus checkpoint name set when its tokenizer was loaded.
    enc_dec_pretrained = PegasusForConditionalGeneration.from_pretrained(model_name)
else:
    # Fail here with a clear message instead of a NameError in the next cell.
    raise ValueError(
        "Unknown pretrained_model_name {!r}; expected one of 'bert', 'bert-gpt2', "
        "'gpt2', 'pretrained_summarizer'".format(pretrained_model_name))

In [None]:
# Wrap the pretrained model in our summarization class and move it to `device`.
model = TextSummarizationModel(enc_dec_pretrained, pretrained_model_name).to(device)

In [None]:
import torch

# Hyper-parameters: you could try playing with different settings
num_epochs = 1
learning_rate = 3e-5
weight_decay = 1e-5
eps = 1e-6
batch_size = 4 #was 32
warmup_rate = 0.05

# Calculating the number of optimization and warmup steps. The training loop
# takes one step per batch and the last batch of an epoch may be partial, so
# the per-epoch count is a ceiling division. Keep at least one step so the
# scheduler is still well defined when train_data is empty (evaluation only).
num_training_cases = len(train_data)
steps_per_epoch = (num_training_cases + batch_size - 1) // batch_size
t_total = max(1, steps_per_epoch * num_epochs)
ext_warmup_steps = int(warmup_rate * t_total)

# Initializing an AdamW optimizer
ext_optim = torch.optim.AdamW(model.parameters(), lr=learning_rate,
                              eps=eps, weight_decay=weight_decay)

# Initializing the learning rate scheduler [details are in the BERT paper]:
# linear warmup for ext_warmup_steps steps, then linear decay until t_total.
ext_sche = transformers.get_linear_schedule_with_warmup(
    ext_optim, num_warmup_steps=ext_warmup_steps, num_training_steps=t_total
)

print("***** Training Info *****")
print("  Num examples = %d" % num_training_cases)
print("  Num Epochs = %d" % num_epochs)
print("  Batch size = %d" % batch_size)
print("  Total optimization steps = %d" % t_total)

In [None]:
def gather_batch(batch, full_text_key, label_key, max_chars=None):
    """Split a batch of records into parallel lists of source texts and labels.

    batch:         list of dict-like records
    full_text_key: key of the source text in each record
    label_key:     key of the reference summary in each record
    max_chars:     maximum number of characters kept from each source text;
                   defaults to the module-level encoder_max_length

    Returns (input_batch, label_batch), two lists in the order of `batch`.
    """
    if max_chars is None:
        # NOTE(review): encoder_max_length is also the tokenizer's token
        # limit; cutting by characters keeps far less text than that limit
        # allows - confirm this pre-truncation is intended.
        max_chars = encoder_max_length

    # Slicing leaves texts that are already short enough unchanged, so no
    # separate length check is needed.
    input_batch = [x[full_text_key][:max_chars] for x in batch]
    label_batch = [x[label_key] for x in batch]

    return input_batch, label_batch

In [None]:
def vectorize_batch(batch, tokenizer, full_text_key, label_key):
    """Encode a batch of records into input ids and label ids on `device`.

    batch:         list of dict-like records
    tokenizer:     tokenizer used for the source texts
    full_text_key: key of the source text in each record
    label_key:     key of the reference summary in each record

    The summaries are encoded with the module-level tokenizer2 and padded to
    decoder_max_length; the source texts are padded to the longest one in the
    batch. Returns (input_ids, label_ids).
    """
    texts, summaries = gather_batch(batch, full_text_key, label_key)

    # Options common to both encodings. Attention masks are requested, but
    # only the input ids are returned to the caller.
    common_options = dict(
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    # Encode the main body
    encoded_texts = tokenizer.batch_encode_plus(
        texts,
        max_length=encoder_max_length,
        padding='longest',
        **common_options
    )

    # Encode the summary
    encoded_summaries = tokenizer2.batch_encode_plus(
        summaries,
        max_length=decoder_max_length,
        padding='max_length',
        **common_options
    )

    return encoded_texts['input_ids'].to(device), encoded_summaries['input_ids'].to(device)

In [None]:
# Train the model and save its weights, or load previously saved weights.
# loss_lst collects the loss (as plain floats) roughly every 5% of the steps.
loss_lst = []

if train_model:

    model.train()
    max_grad_norm = 1

    print("Number of Epochs", num_epochs)
    # One step per batch (the last batch of an epoch may be partial), over all epochs.
    tot_steps = ((num_training_cases + batch_size - 1) // batch_size) * num_epochs
    print("Total Training Steps", tot_steps)
    step_id = 0
    perc_steps = tot_steps * 0.05  # report the loss every 5% of the steps
    cur_step_displayed = 0
    for _ in range(num_epochs):

        random.shuffle(train_data)

        for i in tqdm(range(0, num_training_cases, batch_size), position=0, leave=True):
            batch = train_data[i: i + batch_size]
            input_ids, label_ids = vectorize_batch(batch, tokenizer, full_text_key, label_key)

            model.zero_grad()

            outputs = model(input_ids=input_ids, labels=label_ids)

            # Back-propagate the loss signal and clip the gradients
            loss = outputs.loss.mean()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

            # Update neural network parameters and the learning rate
            ext_optim.step()
            ext_sche.step() # Update learning rate for better convergence

            model.zero_grad()

            if step_id >= cur_step_displayed + perc_steps:
                # Store a plain float: keeping the tensor would hold on to
                # its autograd graph and device memory, and tensors on the
                # GPU cannot be plotted directly.
                loss_value = loss.item()
                loss_lst.append(loss_value)
                print(f'\tAt step {step_id}, the extraction loss = {loss_value}')
                cur_step_displayed += perc_steps

            step_id += 1

    torch.save(model.state_dict(), MODEL_FOLDER+"/" + "text_summarization_model_{}_{}_{}.pt".format(wanted_data, pretrained_model_name, model_type))
    print('Finished Training')

else:
    # NOTE(review): this loads a fixed checkpoint name rather than the one
    # derived from wanted_data / pretrained_model_name / model_type - confirm
    # that is intended.
    # map_location lets a checkpoint saved on the GPU load on a CPU runtime.
    model.load_state_dict(torch.load(
        MODEL_FOLDER + "text_summarization_model_merged_data_bert-gpt2_original.pt",
        map_location=device))
    model.to(device)

In [None]:
if train_model:
  import numpy as np
  import matplotlib.pyplot as plt
  %matplotlib inline
  f = plt.figure(figsize=(10,5))
  plt.title("Extraction Loss at Training Step")
  plt.xlabel("Step")
  plt.ylabel("Loss")
  plt.plot(loss_lst)
  plt.xticks(ticks=np.arange(0,len(loss_lst),10), labels=np.arange(0, len(loss_lst), 10)*100, rotation=30)
  f.show()

## Evaluating our model using ROUGE and BLEU

In [None]:
%%bash
# Install the packages used below to compute BLEU and ROUGE.
pip install sacrebleu
pip install rouge_score

In [None]:
# NOTE(review): these fixed ids are not read from any tokenizer in this
# notebook; EOS_INDEX is only used as a last-resort fallback below.
PAD_INDEX = 0
UNK_INDEX = 1
SOS_INDEX = 2
EOS_INDEX = 3


def greedy_decode(batch_logits, decode_tokenizer=None, eos_index=None):
    '''
    decodes the logits in a greedy way, picks the most probable word till EOS token is found
    batch_logits:     tensor, (batch_size x seq_len x vocab_size)
    decode_tokenizer: tokenizer used to turn ids into text; defaults to the
                      module-level tokenizer2
    eos_index:        id that ends a sequence; defaults to the tokenizer's
                      eos_token_id, then its sep_token_id, then EOS_INDEX

    returns a list with one decoded string per sequence in the batch
    '''
    if decode_tokenizer is None:
        decode_tokenizer = tokenizer2
    if eos_index is None:
        # Take the end marker from the tokenizer that produced the label ids
        # rather than from a fixed constant.
        eos_index = getattr(decode_tokenizer, 'eos_token_id', None)
        if eos_index is None:
            eos_index = getattr(decode_tokenizer, 'sep_token_id', None)
        if eos_index is None:
            eos_index = EOS_INDEX

    batch_out_ids = torch.argmax(batch_logits, dim=2).tolist()
    batch_predicted = []
    for out_ids in batch_out_ids:
        if eos_index in out_ids:
            # Keep everything before the first end marker; this may be empty.
            out_ids = out_ids[:out_ids.index(eos_index)]
        # Decode the whole sequence at once so sub-word pieces are merged
        # back into words instead of being joined with spaces.
        batch_predicted.append(decode_tokenizer.decode(out_ids, skip_special_tokens=True))
    return batch_predicted

In [None]:
import sacrebleu
from rouge_score import rouge_scorer

# Scorer for ROUGE-1 and ROUGE-2 with stemming enabled.
rscorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2'], use_stemmer=True)

In [None]:
# Evaluate the model on the test set, collecting predictions, ROUGE and BLEU.
model.eval()

#redefining the keys to the bc3 dataset in case that wasnt trained on
full_text_key = 'body'
label_key = 'summary'

rouge_scores_list = []
bleu_score_list = []
predictions = []
num_test_cases = len(test_data)

for i in tqdm(range(0, num_test_cases, batch_size), position=0, leave=True):
    batch = test_data[i: i + batch_size]
    label_batch = [x[label_key] for x in batch]

    # Encode exactly as in training: source texts with `tokenizer`, summaries
    # with the decoder-side tokenizer2 (inside vectorize_batch).
    input_ids, label_ids = vectorize_batch(batch, tokenizer, full_text_key, label_key)

    # NOTE(review): the reference summaries are passed to the model, whose
    # forward feeds them to the decoder, so these predictions are conditioned
    # on the references - confirm this is the intended evaluation.
    with torch.no_grad():
        outputs = model(input_ids=input_ids, labels=label_ids)

    pred_batch = greedy_decode(outputs.logits)
    predictions.extend(pred_batch)
    # RougeScorer.score takes (target, prediction), in that order.
    rouge_scores_list.extend(rscorer.score(targ, pred) for targ, pred in zip(label_batch, pred_batch))
    bleu_score_list.extend(sacrebleu.raw_corpus_bleu([pred], [[targ]], .01).score for pred, targ in zip(pred_batch, label_batch))

In [None]:
# Print, in order: mean ROUGE-2 F-measure, mean ROUGE-1 F-measure, mean BLEU.
for rouge_name in ('rouge2', 'rouge1'):
    f_measures = [scores[rouge_name].fmeasure for scores in rouge_scores_list]
    print(np.mean(f_measures))
print(np.mean(bleu_score_list))

In [None]:
# Write the predicted summaries to a CSV in the model folder, named after the
# dataset, pretrained model and model type of this run.
predictions_filename = "predictions_{}_{}_{}.csv".format(wanted_data, pretrained_model_name, model_type)
predictions = pd.DataFrame({'predicted_summary': predictions})
predictions.to_csv(MODEL_FOLDER + "/" + predictions_filename)