In [1]:
import time
import numpy as np
from MatchSummarizer import MatchSummarizer
import sys
sys.path.append('..')
from Datasets.DataLoader import DataLoader

# loading the datasets
datasetLoader = DataLoader(datasetName='arxiv')

arxiv_test = datasetLoader.getData('../Datasets/', split='test')
# The same loader instance is reused for PubMed by switching its dataset name.
datasetLoader.datasetName = 'pubmed'
pubmed_test = datasetLoader.getData('../Datasets/', split='test')

# Keep only the first N_SAMPLES rows of each test split.
# The row count lives in one constant so the comment and both slices
# cannot drift apart.
N_SAMPLES = 100
arxiv_test = arxiv_test[:N_SAMPLES]
pubmed_test = pubmed_test[:N_SAMPLES]

# creating 'Gold Summary' column
def mapping(row):
    """Add a 'Gold Summary' entry to ``row`` and return the same row.

    The value is every element of ``row['abstract_text']`` concatenated
    back to back, with no separator inserted between the elements.
    The row is modified in place.
    """
    gold_summary = ''.join(row['abstract_text'])
    row['Gold Summary'] = gold_summary
    return row

# Run `mapping` over every row (axis=1) of both test splits so each
# gains a 'Gold Summary' column.
arxiv_test = arxiv_test.apply(func=mapping, axis=1)
pubmed_test = pubmed_test.apply(func=mapping, axis=1)

In [2]:
# loading the models
from sentence_transformers import SentenceTransformer

# Encoder that generateSummary passes to MatchSummarizer.
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
# Second encoder; no cell visible in this notebook section references it.
model2 = SentenceTransformer('ArtifactAI/arxiv-distilbert-base-v3-GenQ')

# Summarizer settings, read as globals by generateSummary.
n_gram_size, summary_size = 3, 10

# generating summaries
def generateSummary(row):
    """Summarize one row's article and store the result on the row.

    The elements of ``row['article_text']`` are concatenated with no
    separator, and a ``MatchSummarizer`` is built from that text together
    with the notebook-level ``n_gram_size`` and ``model``. The result of its
    ``generateSummary(summary_size)`` call is written to
    ``row['Generated Summary']``. A progress line naming
    ``row['article_id']`` is printed, and the same (mutated) row is returned.
    """
    full_text = ''.join(row['article_text'])
    # NOTE(review): summary_size is presumably a sentence count - confirm
    # against MatchSummarizer.generateSummary.
    row['Generated Summary'] = MatchSummarizer(full_text, n_gram_size, model).generateSummary(summary_size)
    print(f"Generated summary for {row['article_id']}.")
    return row

# Time the arxiv run with a wall-clock timer. time.process_time() only
# counts CPU time of this process (summed over its threads, excluding time
# spent sleeping or waiting), so it does not reflect how long the cell
# actually took; time.perf_counter() measures elapsed time.
start_time = time.perf_counter()
arxiv_test = arxiv_test.apply(generateSummary, axis=1)
print('Time taken for arxiv: ', (time.perf_counter() - start_time)/60, 'minutes.')

  from .autonotebook import tqdm as notebook_tqdm


Generated summary for 1009.3123.
Generated summary for 1512.09139.
Generated summary for 0909.1602.
Generated summary for 1512.03812.
Generated summary for 1512.09024.
Generated summary for 0807.5065.
Generated summary for 0908.1812.
Generated summary for hep-ph0701277.
Generated summary for 1311.0649.
Generated summary for nlin0001046.
Generated summary for quant-ph0307206.
Generated summary for 1412.2508.
Generated summary for 1512.07656.
Generated summary for 1004.5347.
Generated summary for 1001.0199.
Generated summary for hep-lat0105026.
Generated summary for quant-ph0305125.
Generated summary for 0809.0691.
Generated summary for hep-ph9602267.
Generated summary for 1307.2735.
Generated summary for astro-ph0205340.
Generated summary for 1111.4135.
Generated summary for 1602.03055.
Generated summary for hep-ex0307059.
Generated summary for 0801.1913.
Generated summary for astro-ph0011128.
Generated summary for 0907.5423.
Generated summary for 1601.05253.
Generated summary for astro

In [4]:
# evaluating the summaries
import Evaluation.evaluation as evaluation
import importlib
# Re-import the module so edits made to it on disk are picked up.
importlib.reload(evaluation)

rougeScores = evaluation.rougeScores
arxiv_test, rougeScoresArxiv = rougeScores(arxiv_test)

# printing the results: mean F-measure per ROUGE variant
print('arxiv')
for metric in ('rouge1', 'rouge2', 'rougeL'):
    f_measures = [score.fmeasure for score in rougeScoresArxiv[metric]]
    print(f'{metric}: ', np.mean(f_measures))

# saving the results
arxiv_test.to_csv('arxiv_test_matchSum.csv')

arxiv
rouge1:  0.36929161032140134
rouge2:  0.12271812670596084
rougeL:  0.18628748467665954


In [5]:
# Time the pubmed run with a wall-clock timer. time.process_time() only
# counts CPU time of this process (summed over its threads, excluding time
# spent sleeping or waiting), so it does not reflect how long the cell
# actually took; time.perf_counter() measures elapsed time.
start_time = time.perf_counter()
pubmed_test = pubmed_test.apply(generateSummary, axis=1)
print('Time taken for pubmed: ', (time.perf_counter() - start_time)/60, 'minutes.')

Generated summary for PMC5075302.
Generated summary for PMC3309138.
Generated summary for PMC4086000.
Generated summary for PMC3603086.
Generated summary for PMC4414990.
Generated summary for PMC5094872.
Generated summary for PMC3702150.
Generated summary for PMC4262794.
Generated summary for PMC3320503.
Generated summary for PMC3679767.
Generated summary for PMC3830274.
Generated summary for PMC3270611.
Generated summary for PMC4841868.
Generated summary for PMC3135278.
Generated summary for PMC4727599.
Generated summary for PMC2656958.
Generated summary for PMC3461795.
Generated summary for PMC5002941.
Generated summary for PMC3580606.
Generated summary for PMC5353405.
Generated summary for PMC4745564.
Generated summary for PMC5216144.
Generated summary for PMC5189720.
Generated summary for PMC3881782.
Generated summary for PMC4573445.
Generated summary for PMC3459524.
Generated summary for PMC5423572.
Generated summary for PMC4749793.
Generated summary for PMC4815391.
Generated summ

In [6]:
# Score the pubmed summaries and report the mean F-measure per ROUGE variant.
pubmed_test, rougeScoresPubmed = rougeScores(pubmed_test)
print('pubmed')
for metric in ('rouge1', 'rouge2', 'rougeL'):
    f_measures = [score.fmeasure for score in rougeScoresPubmed[metric]]
    print(f'{metric}: ', np.mean(f_measures))
# Persist the scored frame next to the notebook.
pubmed_test.to_csv('pubmed_test_matchSum.csv')

pubmed
rouge1:  0.401717566558167
rouge2:  0.14963074250494834
rougeL:  0.20940120337682622
