## PEGASUS Local

In [4]:
import sys
sys.path.append('..')

import torch
from parser.parser import Parser

from scorer import Score

from transformers import PegasusForConditionalGeneration, PegasusTokenizer

model_name = 'google/pegasus-arxiv'
# torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
torch_device = 'cpu'

### Getting the token and downloading the model

In [5]:
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)


[A
[A
[A
[A
[A
[A
[A
[A

KeyboardInterrupt: 

### Getting response

In [None]:
def get_response(input_text):
  batch = tokenizer([input_text],truncation=True,padding='longest',max_length=1024, return_tensors="pt").to(torch_device)
  gen_out = model.generate(**batch,max_length=128,num_beams=5, num_return_sequences=1, do_sample = True, temperature=1.5)
  output_text = tokenizer.batch_decode(gen_out, skip_special_tokens=True)
  return output_text


### Getting the Summary

In [None]:
import sys
sys.path.append('..')
from Datasets.DataLoader import DataLoader

# loading the datasets
datasetLoader = DataLoader(datasetName='arxiv')

arxiv_test = datasetLoader.getData('../Datasets/', split='test')
datasetLoader.datasetName = 'pubmed'
pubmed_test = datasetLoader.getData('../Datasets/', split='test')

content = '.'.join(arxiv_test.loc[0, 'article_text'])

pegasusSummary = get_response(content)

print(pegasusSummary)

IndexError: index out of range in self

In [None]:
goldenSummary = '.'.join(arxiv_test.loc[0, 'abstract_text'])
score = Score(trueSummary = goldenSummary, predSummary = pegasusSummary)

In [None]:
for key, value in score.rougeScore().items():
    print(f"Criteria:{key}, Score:{value}" )

Criteria:rouge1, Score:Score(precision=0.0, recall=0.0, fmeasure=0.0)
Criteria:rouge2, Score:Score(precision=0.0, recall=0.0, fmeasure=0.0)
Criteria:rougeL, Score:Score(precision=0.0, recall=0.0, fmeasure=0.0)


In [None]:
import time
import numpy as np

# pick only the first 1000 rows from the dataframes
arxiv_test = arxiv_test[:10]
pubmed_test = pubmed_test[:10]

# creating 'Gold Summary' column
def mapping(row):
    row['Gold Summary'] = ''.join(row['abstract_text'])
    return row

arxiv_test = arxiv_test.apply(mapping, axis=1)
pubmed_test = pubmed_test.apply(mapping, axis=1)

# generating summaries
def generateSummary(row):
    article = ''.join(row['article_text'])
    summary = get_response(article)[0]
    row['Generated Summary'] = summary
    print(f"Generated summary for {row['article_id']}.")
    return row

start_time = time.process_time()
arxiv_test = arxiv_test.apply(generateSummary, axis=1)
print('Time taken for arxiv: ', time.process_time() - start_time)

start_time = time.process_time()
pubmed_test = pubmed_test.apply(generateSummary, axis=1)
print('Time taken for pubmed: ', time.process_time() - start_time)

# evaluating the summaries
from Evaluation.evaluation import rougeScores
arxiv_test, rougeScoresArxiv = rougeScores(arxiv_test)
pubmed_test, rougeScoresPubmed = rougeScores(pubmed_test)

# printing the results
print('arxiv')
print('rouge1: ', np.mean([ score.fmeasure for score in rougeScoresArxiv['rouge1'] ]))
print('rouge2: ', np.mean([ score.fmeasure for score in rougeScoresArxiv['rouge2'] ]))
print('rougeL: ', np.mean([ score.fmeasure for score in rougeScoresArxiv['rougeL'] ]))
print('pubmed')
print('rouge1: ', np.mean([ score.fmeasure for score in rougeScoresPubmed['rouge1'] ]))
print('rouge2: ', np.mean([ score.fmeasure for score in rougeScoresPubmed['rouge2'] ]))
print('rougeL: ', np.mean([ score.fmeasure for score in rougeScoresPubmed['rougeL'] ]))

# saving the results
arxiv_test.to_csv('arxiv_test_pegasus.csv')
pubmed_test.to_csv('pubmed_test_pegasus.csv')


Generated summary for 1009.3123.
Generated summary for 1512.09139.
Generated summary for 0909.1602.
Generated summary for 1512.03812.
Generated summary for 1512.09024.
Generated summary for 0807.5065.
Generated summary for 0908.1812.
Generated summary for hep-ph0701277.
Generated summary for 1311.0649.
Generated summary for nlin0001046.
Time taken for arxiv:  2318.8101469320004
Generated summary for PMC5075302.
Generated summary for PMC3309138.
Generated summary for PMC4086000.
Generated summary for PMC3603086.
Generated summary for PMC4414990.
Generated summary for PMC5094872.
Generated summary for PMC3702150.
Generated summary for PMC4262794.
Generated summary for PMC3320503.
Generated summary for PMC3679767.
Time taken for pubmed:  2320.0221513430006
arxiv
rouge1:  0.0
rouge2:  0.0
rougeL:  0.0
pubmed
rouge1:  0.0
rouge2:  0.0
rougeL:  0.0
