# Loading the models

In [34]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('intfloat/e5-large-v2')
model1 = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

We will be using the second model to generate embeddings for the document and the summary.
# Loading the Data

In [35]:
input_texts = []

with open('../data/paper1', 'r') as f:
    input_texts.append("passage: " + f.read())
with open('../data/abstract1', 'r') as f:
    input_texts.append("passage: " + f.read())    
with open('../data/paper2', 'r') as f:
    input_texts.append("passage: " + f.read())
with open('../data/abstract2', 'r') as f:
    input_texts.append("passage: " + f.read())    

# Generating Embeddings
We will generate the embeddings and see the semantic similarity score for abstract and the papers

In [36]:
import numpy as np

embeddings = model.encode(input_texts, normalize_embeddings=True)
embeddings = np.array(embeddings)
print(f"For Paper 1:\n\tAbstract 1 Score: {np.dot(embeddings[0], embeddings[1])}, \
Abstract 2 Score: {np.dot(embeddings[0], embeddings[3])}.\n\
For Paper 2:\n\tAbstract 1 Score: {np.dot(embeddings[2], embeddings[1])}, \
Abstract 2 Score: {np.dot(embeddings[2], embeddings[3])}.")

embeddings1 = model1.encode(input_texts)
embeddings1 = np.array(embeddings1)
print(f"For Paper 1:\n\tAbstract 1 Score: {np.dot(embeddings1[0], embeddings1[1]) / np.linalg.norm(embeddings1[0]) / np.linalg.norm(embeddings1[1])}, \
Abstract 2 Score: {np.dot(embeddings1[0], embeddings1[3]) / np.linalg.norm(embeddings1[0]) / np.linalg.norm(embeddings1[3])}.\n\
For Paper 2:\n\tAbstract 1 Score: {np.dot(embeddings1[2], embeddings1[1]) / np.linalg.norm(embeddings1[2]) / np.linalg.norm(embeddings1[1])}, \
Abstract 2 Score: {np.dot(embeddings1[2], embeddings1[3]) / np.linalg.norm(embeddings1[2]) / np.linalg.norm(embeddings1[3])}.")

For Paper 1:
	Abstract 1 Score: 0.9030022621154785, Abstract 2 Score: 0.7938213348388672.
For Paper 2:
	Abstract 1 Score: 0.7524808645248413, Abstract 2 Score: 0.8678621053695679.
For Paper 1:
	Abstract 1 Score: 0.8340383172035217, Abstract 2 Score: 0.13522927463054657.
For Paper 2:
	Abstract 1 Score: 0.048615314066410065, Abstract 2 Score: 0.7643997669219971.


As we can see, model1 provides a much more stark difference between the similarity of abstract with its own paper and the similarity of abstract with another paper, which is the property which we desire: good summaries will be more similar to the source documents.

# Generating n-grams

In [37]:
# returns an array n-grams where n-grams[i] = (ith n-gram, i)
def generate_n_grams(n, sentences):
    return [('.'.join(sentences[i:i+n]), i) for i in range(len(sentences)-n+1)]

# get the sentences
paper1 = input_texts[0]
sentences = paper1.split(".") # getting the sentences

# generate n-grams
n = 2
n_grams = generate_n_grams(n, sentences)

# calculate similarity of each n-gram with the paper
sims = []
for gram in n_grams:
    embeddings1 = model1.encode([gram[0], paper1], normalize_embeddings=True)
    sims.append((np.dot(embeddings1[0], embeddings1[1]), gram[1]))

# select top k sentences
k = 10
r = int(np.ceil(k/n))
topr_n_grams = sorted(sims, key=lambda x: x[0], reverse=True)[:r]
topr_n_grams = sorted(topr_n_grams, key=lambda x: x[1]) # get the sentences back in order


$i^{th}$ row of the list `ngrams` (1-indexed) contains i-grams of the sentences from the source document. 

In [38]:

# ngrams = []
# k = 4
# for i in range(1, k+1):
#     ngrams.append(n_grams(i))

# sims = []
# for grams in ngrams:
#     sims.append([])
#     for gram in grams:
#         embeddings1 = model1.encode([gram, paper1], normalize_embeddings=True)
#         sims[-1].append(np.dot(embeddings1[0], embeddings1[1]))

# for i in range(len(sims)):
#     ind = np.argsort(sims[i])[::-1]
#     sims[i] = np.array(sims[i])[ind]
#     ngrams[i] = np.array(ngrams[i])[ind]

In [39]:
# print the generated sumamry
print("Summary:")
print(".\n".join([n_grams[gram[1]][0] for gram in topr_n_grams]))

Summary:
 it is well - known that good estimators in additive models are in general less prone to the curse of high dimensionality than good estimators in fully nonparametric models . many examples of such estimators belong to the large class of regularized kernel based methods over a reproducing kernel hilbert space @xmath0 , see e.
 @xcite . in the last years many interesting results on learning rates of regularized kernel based models for additive models have been published when the focus is on sparsity and when the classical least squares loss function is used , see e.
 in the last years many interesting results on learning rates of regularized kernel based models for additive models have been published when the focus is on sparsity and when the classical least squares loss function is used , see e.g.
 @xcite for the general case and @xcite for additive models . therefore , we will here consider the case of regularized kernel based methods based on a general convex and lipschitz co

Writing a function to generate summary.

In [91]:
class MatchSummarizer:
    def __init__(self, paper, n_gram_size, model=model1):
        # get the sentences
        self.paper = paper
        sentences = paper.split(".")
        self.sentences = sentences
        self.model = model

        # generate n-grams
        n = n_gram_size
        self.n_gram_size = n_gram_size
        n_grams = self.generate_n_grams(n, sentences)
        self.n_grams = n_grams

        # calculate similarity of each n-gram with the paper
        sims = []
        for gram in n_grams:
            embeddings1 = self.model.encode([gram[0], paper], normalize_embeddings=True)
            sims.append((np.dot(embeddings1[0], embeddings1[1]), gram[1]))

        self.sims = sims

    def generate_n_grams(self, n, sentences): 
        '''
        returns an array n-grams where n-grams[i] = (ith n-gram, i)
        '''
        return [('.'.join(sentences[i:i+n]), i) for i in range(len(sentences)-n+1)]
    
    def generateSummary(self, summary_size):
        '''
        Generates a summary of approximatly summary_size sentences
        '''
        # select top k sentences
        k = summary_size
        r = int(np.ceil(k/n))
        topr_n_grams = sorted(self.sims, key=lambda x: x[0], reverse=True)[:r]
        topr_n_grams = sorted(topr_n_grams, key=lambda x: x[1]) # get the sentences back in order

        # generate summary
        return ".\n".join([n_grams[gram[1]][0] for gram in topr_n_grams])


# Calculating Rouge Score

In [58]:
import sys
sys.path.append('../..')
from scorer import Score

golden_summary = """additive models play an important role in semiparametric statistics . 
this paper gives learning rates for regularized kernel based methods for additive models . 
these learning rates compare favourably in particular in high dimensions to recent results on 
optimal learning rates for purely nonparametric regularized kernel based quantile regression 
using the gaussian radial basis function kernel , provided the assumption of an additive model 
is valid . additionally , a concrete example is presented to show that a gaussian function 
depending only on one variable lies in a reproducing kernel hilbert space generated by an 
additive gaussian kernel , but does not belong to the reproducing kernel hilbert space 
generated by the multivariate gaussian kernel of the same variance . * key words and phrases . 
* additive model , kernel , quantile regression , semiparametric , rate of convergence , 
support vector machine ."""

our_summary = MatchSummarizer(paper1, 2).generateSummary(10)

score = Score(trueSummary=golden_summary, predSummary=our_summary)

In [54]:
for sc in score.rougeScore():
    print(f'{sc}: ', score.rougeScore()[sc].fmeasure)

rouge1:  0.3779904306220096
rouge2:  0.11538461538461539
rouge3:  0.04830917874396135


## Trying out different values of n

In [126]:
n_vals = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
n_sentences = [4, 8, 12, 16, 20, 24, 28, 32]
rouge_scores = []
for n in n_vals:
    summarizer = MatchSummarizer(input_texts[2], n, model=model)
    for n_sentence in n_sentences:
        our_summary = summarizer.generateSummary(n_sentence)
        score = Score(trueSummary=golden_summary, predSummary=our_summary)
        rouge_scores.append([n,
                             n_sentence,
                             score.rougeScore()['rouge1'].fmeasure, 
                             score.rougeScore()['rouge2'].fmeasure, 
                             score.rougeScore()['rouge3'].fmeasure])

In [133]:
from matplotlib import pyplot as plt
import pandas as pd

n_vals = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
n_sentences = [4, 8, 12, 16, 20, 24]
rouge_scores = pd.DataFrame(rouge_scores, columns=['n_gram_size', 'n_sentences', 'rouge1', 'rouge2', 'rouge3'])

# plot rouge1, rouge2, rouge3 vs n_sentences for each n_gram_size
# for n in n_vals:
#     plt.plot(n_sentences, rouge_scores[rouge_scores['n_gram_size'] == n]['rouge1'], label=f'rouge1, n={n}')
#     # plt.plot(n_sentences, rouge_scores[rouge_scores['n_gram_size'] == n]['rouge2'], label=f'rouge2, n={n}')
#     # plt.plot(n_sentences, rouge_scores[rouge_scores['n_gram_size'] == n]['rouge3'], label=f'rouge3, n={n}')    
# plt.legend()
# plt.show()

# best rouge1 scores for which parameters
for n in n_vals:
    print(rouge_scores[rouge_scores['n_gram_size'] == n].sort_values(by='rouge1', ascending=False).head(1))

   n_gram_size  n_sentences    rouge1    rouge2    rouge3
0            1            4  0.315152  0.073171  0.042945
   n_gram_size  n_sentences    rouge1    rouge2    rouge3
8            2            4  0.378641  0.147059  0.089109
    n_gram_size  n_sentences    rouge1    rouge2    rouge3
17            3            8  0.386364  0.129771  0.069231
    n_gram_size  n_sentences   rouge1    rouge2    rouge3
25            4            8  0.39823  0.116071  0.054054
    n_gram_size  n_sentences    rouge1    rouge2    rouge3
37            5           24  0.395415  0.132565  0.057971
    n_gram_size  n_sentences    rouge1    rouge2    rouge3
47            6           32  0.393162  0.131805  0.057637
    n_gram_size  n_sentences   rouge1    rouge2    rouge3
53            7           24  0.38835  0.123779  0.059016
    n_gram_size  n_sentences    rouge1    rouge2    rouge3
60            8           20  0.430279  0.144578  0.072874
    n_gram_size  n_sentences   rouge1    rouge2    rouge3
70    