# Generate Embeddings using SentenceTransformer



- Make sure to have data/collection.tsv in the same folder as this file
- The embeddings are generated in the folder embeddings/ with the name corpus_embeddings_{k} where k is an integer
- We have split the data into chunks of 200k passages each for faster processing

In [2]:
# Load necessary libraries
from sentence_transformers import SentenceTransformer
import pickle
import csv
from itertools import zip_longest
embedder = SentenceTransformer('all-MiniLM-L6-v2')


In [4]:
# Create a file for the entire corpus of passages
corpus = ['']
with open("data/collection.tsv") as fd:
    passageData = csv.reader(fd, delimiter="\t")
    for line in passageData:
        corpus.append(line[1])

In [5]:
# Chunk them so that they can be processed efficiently
len(corpus)
chunk_size = 200000
corpus_list = [list(filter(None, group)) for group in zip_longest(*[iter(corpus)] * chunk_size)]

In [7]:
len(corpus_list)

45

In [None]:
# Incase you have already created the corpus uncomment and use the below line
# with open('./embeddings/corpus', 'rb') as f:
#     corpus = pickle.load(f)

In [14]:
for i in range(45):
        corpus_embeddings = embedder.encode(corpus_list[i], convert_to_tensor=True)
        with open('./embeddings/corpus_embeddings_'+str(i), 'wb') as file:
                pickle.dump(corpus_embeddings, file)

In [5]:
del corpus_embeddings
del corpus
print(f"Array saved")

Array saved


In [1]:
import pickle
import torch
combined_corpus_embeddings = None
for k in range(45):
    file_name = f'./embeddings/corpus_embeddings_{k}'
    with open(file_name, 'rb') as file:
        corpus_embeddings = pickle.load(file)
        corpus_embeddings = torch.tensor(corpus_embeddings)
        if combined_corpus_embeddings is None:
            combined_corpus_embeddings = corpus_embeddings
        else:
            combined_corpus_embeddings = torch.cat((combined_corpus_embeddings, corpus_embeddings), dim=0)

output_file = './embeddings/combined_corpus_embeddings'
with open(output_file, 'wb') as file:
    pickle.dump(combined_corpus_embeddings, file)

print(f"Combined corpus embeddings saved to {output_file}")

  corpus_embeddings = torch.tensor(corpus_embeddings)


Combined corpus embeddings saved to ./embeddings/combined_corpus_embeddings
