In [None]:
# !pip install -r requirements.txt -q

In [1]:
import torch
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

  from .autonotebook import tqdm as notebook_tqdm


## **MemSum + Pegasus Encoder**

In [None]:
# Launch MemSum training with the Pegasus encoder switched on (`-pegasus_mode True`)
# on the custom dataset. Per the flags below, checkpoints are written under the
# `run2/` model folder and logs under the `run2/` log folder.
# NOTE(review): the Evaluation section loads a checkpoint from `run3/`, not `run2/`,
# and uses max_doc_len=500 while training uses `-max_doc_len 150` -- confirm both are intended.
!python src/MemSum/src/MemSum_Full/train.py -pegasus_mode True -training_corpus_file_name src/MemSum/data/custom_data/train_CUSTOM_labelled.jsonl -validation_corpus_file_name src/MemSum/data/custom_data/val_CUSTOM_raw.jsonl -model_folder src/MemSum/model/MemSum_Full/custom_data/200dim/run2/ -log_folder src/MemSum/log/MemSum_Full/custom_data/200dim/run2/ -vocabulary_file_name src/MemSum/model/glove/vocabulary_200dim.pkl -pretrained_unigram_embeddings_file_name src/MemSum/model/glove/unigram_embeddings_200dim.pkl -max_seq_len 100 -max_doc_len 150 -num_of_epochs 10 -save_every 1000 -n_device 1 -batch_size_per_device 1 -max_extracted_sentences_per_document 7 -moving_average_decay 0.999 -p_stop_thres 0.6

### The model used is [PEGASUS-X base](https://huggingface.co/google/pegasus-x-base)
* Load the model and the tokenizer

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Single source of truth for the Hugging Face checkpoint id, so the tokenizer
# and the seq2seq model are always loaded from the same checkpoint.
PEGASUS_CHECKPOINT = "google/pegasus-x-base"

tokenizer = AutoTokenizer.from_pretrained(PEGASUS_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(PEGASUS_CHECKPOINT)

* This is how generation with the full encoder-decoder model works (the decoder produces the output tokens)

In [None]:
input_text = "Studies have shown that owning a dog has numerous benefits."

# Tokenize the input text (returns PyTorch tensors, including the attention mask)
input_tokens = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True)

# Run the full encoder-decoder model. Unpacking `input_tokens` forwards the
# attention mask together with the input ids, so any padding is ignored.
output_tokens = model.generate(
    **input_tokens,
    decoder_start_token_id=model.config.pad_token_id,  # decoding starts from the pad token
    max_length=50,  # Upper bound on the length of the generated output
    num_beams=1,  # A single beam, i.e. greedy decoding (no beam search)
    # `early_stopping` is intentionally not set: it only affects beam search
    # and has no effect (other than a warning) when num_beams=1.
)

# Decode the generated output tokens of the first (and only) sequence
decoded_output = tokenizer.decode(output_tokens[0], skip_special_tokens=True)

print(decoded_output)

* This is how the encoder works

In [None]:

# Example input text: a batch of two sentences of different lengths
input_text = ["Studies have shown that owning a dog has numerous benefits.",
              "hi how are you?"]

# Tokenize the input text; every sequence is truncated/padded to exactly
# `max_length` tokens so the batch forms one rectangular tensor.
input_tokens = tokenizer(input_text, return_tensors="pt",
                         truncation=True, max_length=100,
                         padding='max_length'
                         )
print(input_tokens['input_ids'].shape)

# Pass the input through the encoder only. This is an inference-only demo, so
# gradient tracking is disabled to avoid building an autograd graph.
with torch.no_grad():
    encoder_outputs = model.model.encoder(**input_tokens)

# Access the encoder outputs: one hidden vector per input token
encoder_last_hidden_state = encoder_outputs.last_hidden_state
print(encoder_last_hidden_state.shape)

In [None]:
# Smoke test: can the encoder process one very long sequence on the GPU?
# `model.to('cuda')` moves the model in place, so later cells must feed it CUDA tensors.
model.to('cuda')

# Random token ids in [0, 100) for a single sequence of 100*100 = 10,000 tokens
# (presumably 100 sentences x 100 tokens -- TODO confirm the intended layout).
t = torch.randint(0, 100, size = (1, 100*100), device = 'cuda')

# Attend to every token. The previous `torch.randint(0, 1, ...)` always produced
# zeros because `high` is exclusive, which masked out the entire input.
t2 = torch.ones_like(t)

input_tok = {}
input_tok['input_ids'] = t
input_tok['attention_mask'] = t2

# Inference only: skip autograd bookkeeping for this large forward pass.
with torch.no_grad():
    long_encoder_outputs = model.model.encoder(**input_tok)

# Show the output shape rather than dumping the full hidden-state tensor.
long_encoder_outputs.last_hidden_state.shape

In [18]:
# Display the decoder sub-module of `model`.
# Requires the cell that defines `model` to have been run in the current kernel
# session; on a fresh kernel this raises NameError.
model.model.decoder

NameError: name 'model' is not defined

## **Evaluation**

In [19]:
from src.MemSum.summarizers import MemSum
from tqdm import tqdm
from rouge_score import rouge_scorer
import json
import numpy as np

In [20]:
# Scorer for ROUGE-1, ROUGE-2 and ROUGE-Lsum, with stemming enabled.
rouge_cal = rouge_scorer.RougeScorer(
    ['rouge1', 'rouge2', 'rougeLsum'],
    use_stemmer=True,
)

# Extractive summarizer built from a saved checkpoint and the 200-dim vocabulary
# file, with Pegasus mode enabled.
# NOTE(review): this loads from `run3/`, while the training command in this notebook
# writes to `run2/`; training also uses `-max_doc_len 150` vs. 500 here -- confirm.
# NOTE(review): in the upstream MemSum README `gpu` is a device index (e.g. `gpu=0`);
# confirm that `True` is what this fork expects.
memsum_custom_data = MemSum(
    "src/MemSum/model/MemSum_Full/custom_data/200dim/run3/model_batch_1000_pegs.pt",
    "src/MemSum/model/glove/vocabulary_200dim.pkl",
    gpu=True,
    max_doc_len=500,
    pegasus_mode=True,
    embed_dim=768,
)

pad_index is:  1


In [21]:
# Load the test split: one JSON document per line (JSONL).
# The `with` block closes the file handle deterministically (the bare `open(...)`
# inside a comprehension never closed it), the encoding is explicit so the result
# does not depend on the platform default, and blank lines (e.g. a trailing
# newline at end of file) are skipped instead of crashing `json.loads`.
with open("src/MemSum/data/custom_data/test_CUSTOM_raw.jsonl", encoding="utf-8") as corpus_file:
    test_corpus_custom_data = [json.loads(line) for line in corpus_file if line.strip()]

In [22]:
def evaluate( model, corpus, p_stop, max_extracted_sentences, rouge_cal ):
    scores = []
    for data in tqdm(corpus):
        gold_summary = data["summary"]
        extracted_summary = model.extract( [data["text"]], p_stop_thres = p_stop, max_extracted_sentences_per_document = max_extracted_sentences )[0]

        score = rouge_cal.score( "\n".join( gold_summary ), "\n".join(extracted_summary)  )
        scores.append( [score["rouge1"].fmeasure, score["rouge2"].fmeasure, score["rougeLsum"].fmeasure ] )

    return np.asarray(scores).mean(axis = 0)

In [23]:
# Score the summarizer on the test corpus; the stop threshold (0.6) and the
# sentence cap (7) match the values passed to the training command.
evaluate(
    memsum_custom_data,
    test_corpus_custom_data,
    p_stop=0.6,
    max_extracted_sentences=7,
    rouge_cal=rouge_cal,
)

100%|██████████| 100/100 [01:37<00:00,  1.02it/s]


array([0.43501696, 0.17507162, 0.39479994])