## PEGASUS Local

In [1]:
import sys
sys.path.append('..')

import torch
from parser.parser import Parser

from scorer import Score

from transformers import BigBirdPegasusForConditionalGeneration, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


### Getting the token and downloading the model

In [2]:
tokenizer = AutoTokenizer.from_pretrained("google/bigbird-pegasus-large-arxiv")

# by default encoder-attention is `block_sparse` with num_random_blocks=3, block_size=64
model = BigBirdPegasusForConditionalGeneration.from_pretrained("google/bigbird-pegasus-large-arxiv")

# # decoder attention type can't be changed & will be "original_full"
# # you can change `attention_type` (encoder only) to full attention like this:
# model = BigBirdPegasusForConditionalGeneration.from_pretrained("google/bigbird-pegasus-large-arxiv", attention_type="original_full")

# # you can change `block_size` & `num_random_blocks` like this:
# model = BigBirdPegasusForConditionalGeneration.from_pretrained("google/bigbird-pegasus-large-arxiv", block_size=16, num_random_blocks=2)

### Getting response

In [3]:
def get_response(input_text):
  inputs = tokenizer(input_text, return_tensors='pt', max_length=4096, truncation=True)
  prediction = model.generate(**inputs)
  prediction = tokenizer.batch_decode(prediction)
  return prediction

### Getting the Summary

In [4]:
import sys
sys.path.append('..')
from Datasets.DataLoader import DataLoader

# loading the datasets
datasetLoader = DataLoader(datasetName='arxiv')

arxiv_test = datasetLoader.getData('../Datasets/', split='test')
datasetLoader.datasetName = 'pubmed'
pubmed_test = datasetLoader.getData('../Datasets/', split='test')

In [5]:
content = '.'.join(arxiv_test.loc[0, 'article_text'])

pegasusSummary = get_response(content)[0]

print(pegasusSummary)

<s> the problem of the existence of the periodicity of about 155 days in sunspot data from cycle 16 is considered.<n> the daily sunspot areas, the mean sunspot areas per carrington rotation, the monthly sunspot numbers and their fluctuations, which are obtained after removing the 11-year cycle, are analysed.<n> the power spectrum method is used.<n> the numerical results of the new method of the diagnosis of an echo - effect for sunspot area data are discussed.<n> the numerical results of the new method of the diagnosis of an echo - effect for sunspot area data are discussed.<n> the numerical results of the new method of the diagnosis of an echo - effect for sunspot area data are presented.<n> it is shown that the existence of the periodicity of about 155 days in sunspot data from cycle 16 is statistically significant during all solar cycles from 16 to 21.<n> the existence of this periodicity was confirmed by @xcite, @xcite, @xcite, @xcite, @xcite, @xcite, @xcite, @xcite, @xcite, @xcite

In [11]:
goldenSummary = '.'.join(arxiv_test.loc[0, 'abstract_text'])
score = Score(trueSummary = goldenSummary, predSummary = pegasusSummary)

In [12]:
for key, value in score.rougeScore().items():
    print(f"Criteria:{key}, Score:{value}")

Criteria:rouge1, Score:Score(precision=0.5433526011560693, recall=0.44976076555023925, fmeasure=0.4921465968586387)
Criteria:rouge2, Score:Score(precision=0.2558139534883721, recall=0.21153846153846154, fmeasure=0.23157894736842105)
Criteria:rougeL, Score:Score(precision=0.2947976878612717, recall=0.24401913875598086, fmeasure=0.26701570680628267)


In [13]:
import time
import numpy as np

# pick only the first 1000 rows from the dataframes
arxiv_test = arxiv_test[:100]
pubmed_test = pubmed_test[:100]

# creating 'Gold Summary' column
def mapping(row):
    row['Gold Summary'] = ''.join(row['abstract_text'])
    return row

arxiv_test = arxiv_test.apply(mapping, axis=1)
pubmed_test = pubmed_test.apply(mapping, axis=1)

# generating summaries
def generateSummary(row):
    article = ''.join(row['article_text'])
    summary = get_response(article)[0]
    row['Generated Summary'] = summary
    print(f"Generated summary for {row['article_id']}.")
    return row

start_time = time.process_time()
arxiv_test = arxiv_test.apply(generateSummary, axis=1)
print('Time taken for arxiv: ', time.process_time() - start_time)

start_time = time.process_time()
pubmed_test = pubmed_test.apply(generateSummary, axis=1)
print('Time taken for pubmed: ', time.process_time() - start_time)

# evaluating the summaries
from Evaluation.evaluation import rougeScores
arxiv_test, rougeScoresArxiv = rougeScores(arxiv_test)
pubmed_test, rougeScoresPubmed = rougeScores(pubmed_test)

# printing the results
print('arxiv')
print('rouge1: ', np.mean([ score.fmeasure for score in rougeScoresArxiv['rouge1'] ]))
print('rouge2: ', np.mean([ score.fmeasure for score in rougeScoresArxiv['rouge2'] ]))
print('rougeL: ', np.mean([ score.fmeasure for score in rougeScoresArxiv['rougeL'] ]))
print('pubmed')
print('rouge1: ', np.mean([ score.fmeasure for score in rougeScoresPubmed['rouge1'] ]))
print('rouge2: ', np.mean([ score.fmeasure for score in rougeScoresPubmed['rouge2'] ]))
print('rougeL: ', np.mean([ score.fmeasure for score in rougeScoresPubmed['rougeL'] ]))

# saving the results
arxiv_test.to_csv('arxiv_test_bigbird.csv')
pubmed_test.to_csv('pubmed_test_bigbird.csv')

Generated summary for 1009.3123.
Generated summary for 1512.09139.
Generated summary for 0909.1602.
Generated summary for 1512.03812.
Generated summary for 1512.09024.
Generated summary for 0807.5065.
Generated summary for 0908.1812.
Generated summary for hep-ph0701277.
Generated summary for 1311.0649.
Generated summary for nlin0001046.
Generated summary for quant-ph0307206.
Generated summary for 1412.2508.
Generated summary for 1512.07656.
Generated summary for 1004.5347.
Generated summary for 1001.0199.
Generated summary for hep-lat0105026.
Generated summary for quant-ph0305125.
Generated summary for 0809.0691.
Generated summary for hep-ph9602267.
Generated summary for 1307.2735.
Generated summary for astro-ph0205340.
Generated summary for 1111.4135.
Generated summary for 1602.03055.
Generated summary for hep-ex0307059.
Generated summary for 0801.1913.
Generated summary for astro-ph0011128.
Generated summary for 0907.5423.
Generated summary for 1601.05253.
Generated summary for astro