# Loading the models

In [1]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('intfloat/e5-large-v2')
model1 = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

  from .autonotebook import tqdm as notebook_tqdm


We will be using the second model to generate embeddings for the document and the summary.
# Loading the Data

In [2]:
input_texts = []

with open('../data/paper1', 'r') as f:
    input_texts.append("passage: " + f.read())
with open('../data/abstract1', 'r') as f:
    input_texts.append("passage: " + f.read())    
with open('../data/paper2', 'r') as f:
    input_texts.append("passage: " + f.read())
with open('../data/abstract2', 'r') as f:
    input_texts.append("passage: " + f.read())    

# Generating Embeddings
We will generate the embeddings and see the semantic similarity score for abstract and the papers

In [3]:
import numpy as np

embeddings = model.encode(input_texts, normalize_embeddings=True)
embeddings = np.array(embeddings)
print(f"For Paper 1:\n\tAbstract 1 Score: {np.dot(embeddings[0], embeddings[1])}, \
Abstract 2 Score: {np.dot(embeddings[0], embeddings[3])}.\n\
For Paper 2:\n\tAbstract 1 Score: {np.dot(embeddings[2], embeddings[1])}, \
Abstract 2 Score: {np.dot(embeddings[2], embeddings[3])}.")

embeddings1 = model1.encode(input_texts)
embeddings1 = np.array(embeddings1)
print(f"For Paper 1:\n\tAbstract 1 Score: {np.dot(embeddings1[0], embeddings1[1]) / np.linalg.norm(embeddings1[0]) / np.linalg.norm(embeddings1[1])}, \
Abstract 2 Score: {np.dot(embeddings1[0], embeddings1[3]) / np.linalg.norm(embeddings1[0]) / np.linalg.norm(embeddings1[3])}.\n\
For Paper 2:\n\tAbstract 1 Score: {np.dot(embeddings1[2], embeddings1[1]) / np.linalg.norm(embeddings1[2]) / np.linalg.norm(embeddings1[1])}, \
Abstract 2 Score: {np.dot(embeddings1[2], embeddings1[3]) / np.linalg.norm(embeddings1[2]) / np.linalg.norm(embeddings1[3])}.")

For Paper 1:
	Abstract 1 Score: 0.9030022621154785, Abstract 2 Score: 0.7938213348388672.
For Paper 2:
	Abstract 1 Score: 0.7524808645248413, Abstract 2 Score: 0.8678621053695679.
For Paper 1:
	Abstract 1 Score: 0.8340383172035217, Abstract 2 Score: 0.13522927463054657.
For Paper 2:
	Abstract 1 Score: 0.048615314066410065, Abstract 2 Score: 0.7643997669219971.


As we can see, model1 provides a much more stark difference between the similarity of abstract with its own paper and the similarity of abstract with another paper, which is the property which we desire: good summaries will be more similar to the source documents.

# Generating n-grams

In [11]:
# returns an array n-grams where n-grams[i] = (ith n-gram, i)
def generate_n_grams(n, sentences):
    return [('.'.join(sentences[i:i+n]), i) for i in range(len(sentences)-n+1)]

# get the sentences
paper1 = input_texts[0]
sentences = paper1.split(".") # getting the sentences

# generate n-grams
n = 2
n_grams = generate_n_grams(n, sentences)

# calculate similarity of each n-gram with the paper
sims = []
for gram in n_grams:
    embeddings1 = model1.encode([gram[0], paper1], normalize_embeddings=True)
    sims.append((np.dot(embeddings1[0], embeddings1[1]), gram[1]))

# select top k sentences
k = 10
r = int(np.ceil(k/n))
topr_n_grams = sorted(sims, key=lambda x: x[0], reverse=True)[:r]
topr_n_grams = sorted(topr_n_grams, key=lambda x: x[1]) # get the sentences back in order


TypeError: 'normalize_embeddings' is an invalid keyword argument for encode()

$i^{th}$ row of the list `ngrams` (1-indexed) contains i-grams of the sentences from the source document. 

In [5]:

# ngrams = []
# k = 4
# for i in range(1, k+1):
#     ngrams.append(n_grams(i))

# sims = []
# for grams in ngrams:
#     sims.append([])
#     for gram in grams:
#         embeddings1 = model1.encode([gram, paper1], normalize_embeddings=True)
#         sims[-1].append(np.dot(embeddings1[0], embeddings1[1]))

# for i in range(len(sims)):
#     ind = np.argsort(sims[i])[::-1]
#     sims[i] = np.array(sims[i])[ind]
#     ngrams[i] = np.array(ngrams[i])[ind]

In [6]:
# print the generated sumamry
print("Summary:")
print(".\n".join([n_grams[gram[1]][0] for gram in topr_n_grams]))

Summary:
 it is well - known that good estimators in additive models are in general less prone to the curse of high dimensionality than good estimators in fully nonparametric models . many examples of such estimators belong to the large class of regularized kernel based methods over a reproducing kernel hilbert space @xmath0 , see e.
 @xcite . in the last years many interesting results on learning rates of regularized kernel based models for additive models have been published when the focus is on sparsity and when the classical least squares loss function is used , see e.
 in the last years many interesting results on learning rates of regularized kernel based models for additive models have been published when the focus is on sparsity and when the classical least squares loss function is used , see e.g.
 @xcite for the general case and @xcite for additive models . therefore , we will here consider the case of regularized kernel based methods based on a general convex and lipschitz co

Writing a function to generate summary.

In [7]:
class MatchSummarizer:
    def __init__(self, sentences, n_gram_size, model=model1):
        '''
        paper: the research paper to summarize as a string.
        n_gram_size: the size of the n-grams to use for summarization.
        model: the model that will return the embedding for a given text.
               should have a method `encode` that takes a list of strings 
               and returns a list of embeddings.
        '''
        # get the sentences
        self.sentences = sentences
        self.model = model

        # generate n-grams
        n = n_gram_size
        self.n_gram_size = n_gram_size
        n_grams = self.generate_n_grams(n, sentences)
        self.n_grams = n_grams

        # calculate similarity of each n-gram with the paper
        sims = []
        for gram in n_grams:
            embeddings1 = self.model.encode([gram[0], paper], normalize_embeddings=True)
            sims.append((np.dot(embeddings1[0], embeddings1[1]), gram[1]))

        self.sims = sims

    def generate_n_grams(self, n, sentences): 
        '''
        returns an array n-grams where n-grams[i] = (ith n-gram, i)
        '''
        return [('.'.join(sentences[i:i+n]), i) for i in range(len(sentences)-n+1)]
    
    def generateSummary(self, summary_size):
        '''
        Generates a summary of approximatly `summary_size` sentences
        '''
        # select top k sentences
        k = summary_size
        sorted_n_grams = sorted(self.sims, key=lambda x: x[0], reverse=True)
        
        n_gram_ids = []
        done_set = set()

        # select top k sentences
        for sim, index in sorted_n_grams:
            if index in done_set:
                continue
            for x in range(index-self.n_gram_size+1, index+self.n_gram_size):
                done_set.add(x)

            n_gram_ids.append(index)
            k -= self.n_gram_size
            if k <= 0 or k < self.n_gram_size/2:
                break

        # generate summary
        return ".\n".join([self.n_grams[id][0] for id in n_gram_ids])


# Calculating Rouge Score

In [None]:
import sys
sys.path.append('../..')
from Scorer import Score

golden_summary = """additive models play an important role in semiparametric statistics . 
this paper gives learning rates for regularized kernel based methods for additive models . 
these learning rates compare favourably in particular in high dimensions to recent results on 
optimal learning rates for purely nonparametric regularized kernel based quantile regression 
using the gaussian radial basis function kernel , provided the assumption of an additive model 
is valid . additionally , a concrete example is presented to show that a gaussian function 
depending only on one variable lies in a reproducing kernel hilbert space generated by an 
additive gaussian kernel , but does not belong to the reproducing kernel hilbert space 
generated by the multivariate gaussian kernel of the same variance . * key words and phrases . 
* additive model , kernel , quantile regression , semiparametric , rate of convergence , 
support vector machine."""

our_summary = MatchSummarizer(paper1, 2).generateSummary(20)

score = Score(trueSummary=golden_summary, predSummary=our_summary)

In [None]:
for sc in score.rougeScore():
    print(f'{sc}: ', score.rougeScore()[sc].fmeasure)

rouge1:  0.29032258064516125
rouge2:  0.1235294117647059
rougeL:  0.1495601173020528


## Trying out different values of n

In [22]:
n_vals = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
n_sentences = [8, 12, 16]
rouge_scores = []
models = ['all-mpnet-base-v2']
for n in n_vals:
    for model in models:
        if model == 'all-mpnet-base-v2':
            sim_model = model1
        else:
            sim_model = model
        summarizer = MatchSummarizer(input_texts[0], n, model=sim_model)
        for n_sentence in n_sentences:
            our_summary = summarizer.generateSummary(n_sentence)
            score = Score(trueSummary=golden_summary, predSummary=our_summary)
            rouge_scores.append([model,
                                 n,
                                 n_sentence,
                                 score.rougeScore()['rouge1'].fmeasure, 
                                 score.rougeScore()['rouge2'].fmeasure, 
                                 score.rougeScore()['rougeL'].fmeasure])

In [23]:
from matplotlib import pyplot as plt
import pandas as pd

n_vals = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
n_sentences = [8, 12, 16]
rouge_scores = pd.DataFrame(rouge_scores, columns=['model_name', 'n_gram_size', 'n_sentences', 'rouge1', 'rouge2', 'rougeL'])

# plot rouge1, rouge2, rouge3 vs n_sentences for each n_gram_size
# for n in n_vals:
#     plt.plot(n_sentences, rouge_scores[rouge_scores['n_gram_size'] == n]['rouge1'], label=f'rouge1, n={n}')
#     # plt.plot(n_sentences, rouge_scores[rouge_scores['n_gram_size'] == n]['rouge2'], label=f'rouge2, n={n}')
#     # plt.plot(n_sentences, rouge_scores[rouge_scores['n_gram_size'] == n]['rouge3'], label=f'rouge3, n={n}')    
# plt.legend()
# plt.show()

# best rouge1 scores for which parameters
for n in n_vals:
    print(rouge_scores[rouge_scores['n_gram_size'] == n].sort_values(by='rouge1', ascending=False).head(1))

          model_name  n_gram_size  n_sentences    rouge1    rouge2    rougeL
0  all-mpnet-base-v2            1            8  0.367615  0.149451  0.196937
          model_name  n_gram_size  n_sentences    rouge1    rouge2    rougeL
3  all-mpnet-base-v2            2            8  0.445055  0.171271  0.203297
          model_name  n_gram_size  n_sentences    rouge1    rouge2    rougeL
7  all-mpnet-base-v2            3           12  0.429708  0.165333  0.196286
          model_name  n_gram_size  n_sentences   rouge1  rouge2    rougeL
9  all-mpnet-base-v2            4            8  0.42236   0.125  0.192547
           model_name  n_gram_size  n_sentences    rouge1    rouge2    rougeL
12  all-mpnet-base-v2            5            8  0.391185  0.110803  0.176309
           model_name  n_gram_size  n_sentences    rouge1    rouge2    rougeL
16  all-mpnet-base-v2            6           12  0.389041  0.110193  0.175342
           model_name  n_gram_size  n_sentences    rouge1    rouge2    rougeL


In [27]:
n_vals = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
n_sentences = [8, 12, 16]
rouge_scores = []

models = ['all-mpnet-base-v2']
for n in n_vals:
    for model in models:
        if model == 'all-mpnet-base-v2':
            sim_model = model1
        else:
            sim_model = model
        summarizer = MatchSummarizer(input_texts[0], n, model=sim_model)
        for n_sentence in n_sentences:
            our_summary = summarizer.generateSummary(n_sentence)
            score = Score(trueSummary=golden_summary, predSummary=our_summary)
            rouge_scores.append([model, 
                                 n,
                                 n_sentence,
                                 score.rougeScore()['rouge1'].fmeasure, 
                                 score.rougeScore()['rouge2'].fmeasure, 
                                 score.rougeScore()['rougeL'].fmeasure])

In [29]:
from matplotlib import pyplot as plt
import pandas as pd

n_vals = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
n_sentences = [8, 12, 16]
rouge_scores = pd.DataFrame(rouge_scores, columns=['model_name', 'n_gram_size', 'n_sentences', 'rouge1', 'rouge2', 'rouge3'])

# best rouge1 scores for which parameters
for n in n_vals:
    print(rouge_scores[rouge_scores['n_gram_size'] == n].sort_values(by='rouge1', ascending=False).head(1))

          model_name  n_gram_size  n_sentences    rouge1    rouge2    rouge3
0  all-mpnet-base-v2            1            8  0.367615  0.149451  0.196937
          model_name  n_gram_size  n_sentences    rouge1    rouge2    rouge3
3  all-mpnet-base-v2            2            8  0.445055  0.171271  0.203297
          model_name  n_gram_size  n_sentences    rouge1    rouge2    rouge3
7  all-mpnet-base-v2            3           12  0.429708  0.165333  0.196286
          model_name  n_gram_size  n_sentences   rouge1  rouge2    rouge3
9  all-mpnet-base-v2            4            8  0.42236   0.125  0.192547
           model_name  n_gram_size  n_sentences    rouge1    rouge2    rouge3
12  all-mpnet-base-v2            5            8  0.391185  0.110803  0.176309
           model_name  n_gram_size  n_sentences    rouge1    rouge2    rouge3
16  all-mpnet-base-v2            6           12  0.389041  0.110193  0.175342
           model_name  n_gram_size  n_sentences    rouge1    rouge2    rouge3
