# Loading the models

In [1]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('intfloat/e5-large-v2')
model1 = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

  from .autonotebook import tqdm as notebook_tqdm


We will be using the second model to generate embeddings for the document and the summary.
# Loading the Data

In [2]:
input_texts = []

with open('../data/paper1', 'r') as f:
    input_texts.append("passage: " + f.read())
with open('../data/abstract1', 'r') as f:
    input_texts.append("passage: " + f.read())    
with open('../data/paper2', 'r') as f:
    input_texts.append("passage: " + f.read())
with open('../data/abstract2', 'r') as f:
    input_texts.append("passage: " + f.read())    

# Generating Embeddings
We will generate the embeddings and see the semantic similarity score for abstract and the papers

In [3]:
import numpy as np

embeddings = model.encode(input_texts, normalize_embeddings=True)
embeddings = np.array(embeddings)
print(f"For Paper 1:\n\tAbstract 1 Score: {np.dot(embeddings[0], embeddings[1])}, \
Abstract 2 Score: {np.dot(embeddings[0], embeddings[3])}.\n\
For Paper 2:\n\tAbstract 1 Score: {np.dot(embeddings[2], embeddings[1])}, \
Abstract 2 Score: {np.dot(embeddings[2], embeddings[3])}.")

embeddings1 = model1.encode(input_texts)
embeddings1 = np.array(embeddings1)
print(f"For Paper 1:\n\tAbstract 1 Score: {np.dot(embeddings1[0], embeddings1[1]) / np.linalg.norm(embeddings1[0]) / np.linalg.norm(embeddings1[1])}, \
Abstract 2 Score: {np.dot(embeddings1[0], embeddings1[3]) / np.linalg.norm(embeddings1[0]) / np.linalg.norm(embeddings1[3])}.\n\
For Paper 2:\n\tAbstract 1 Score: {np.dot(embeddings1[2], embeddings1[1]) / np.linalg.norm(embeddings1[2]) / np.linalg.norm(embeddings1[1])}, \
Abstract 2 Score: {np.dot(embeddings1[2], embeddings1[3]) / np.linalg.norm(embeddings1[2]) / np.linalg.norm(embeddings1[3])}.")

For Paper 1:
	Abstract 1 Score: 0.9030022621154785, Abstract 2 Score: 0.7938213348388672.
For Paper 2:
	Abstract 1 Score: 0.7524808645248413, Abstract 2 Score: 0.8678621053695679.
For Paper 1:
	Abstract 1 Score: 0.8340383172035217, Abstract 2 Score: 0.13522927463054657.
For Paper 2:
	Abstract 1 Score: 0.048615314066410065, Abstract 2 Score: 0.7643997669219971.


As we can see, model1 provides a much more stark difference between the similarity of abstract with its own paper and the similarity of abstract with another paper, which is the property which we desire: good summaries will be more similar to the source documents.

# Generating n-grams

In [16]:
# returns an array n-grams where n-grams[i] = (ith n-gram, i)
def generate_n_grams(n, sentences):
    return [('.'.join(sentences[i:i+n]), i) for i in range(len(sentences)-n+1)]

# get the sentences
paper1 = input_texts[0]
sentences = paper1.split(".") # getting the sentences


# generate n-grams
n = 2
n_grams = generate_n_grams(n, sentences)

# calculate similarity of each n-gram with the paper
sims = []
for gram in n_grams:
    embeddings1 = model1.encode([gram[0], paper1], normalize_embeddings=True)
    sims.append((np.dot(embeddings1[0], embeddings1[1]), gram[1]))

# select top k sentences
k = 10
r = int(np.ceil(k/n))
topr_n_grams = sorted(sims, key=lambda x: x[0], reverse=True)[:r]
topr_n_grams = sorted(topr_n_grams, key=lambda x: x[1]) # get the sentences back in order


$i^{th}$ row of the list `ngrams` (1-indexed) contains i-grams of the sentences from the source document. 

In [13]:

# ngrams = []
# k = 4
# for i in range(1, k+1):
#     ngrams.append(n_grams(i))

# sims = []
# for grams in ngrams:
#     sims.append([])
#     for gram in grams:
#         embeddings1 = model1.encode([gram, paper1], normalize_embeddings=True)
#         sims[-1].append(np.dot(embeddings1[0], embeddings1[1]))

# for i in range(len(sims)):
#     ind = np.argsort(sims[i])[::-1]
#     sims[i] = np.array(sims[i])[ind]
#     ngrams[i] = np.array(ngrams[i])[ind]

In [18]:
# print the generated sumamry
print("Summary:")
print(".\n".join([n_grams[gram[1]][0] for gram in topr_n_grams]))

Summary:
 it is well - known that good estimators in additive models are in general less prone to the curse of high dimensionality than good estimators in fully nonparametric models . many examples of such estimators belong to the large class of regularized kernel based methods over a reproducing kernel hilbert space @xmath0 , see e.
 @xcite . in the last years many interesting results on learning rates of regularized kernel based models for additive models have been published when the focus is on sparsity and when the classical least squares loss function is used , see e.
 in the last years many interesting results on learning rates of regularized kernel based models for additive models have been published when the focus is on sparsity and when the classical least squares loss function is used , see e.g.
 @xcite for the general case and @xcite for additive models . therefore , we will here consider the case of regularized kernel based methods based on a general convex and lipschitz co