In [3]:
import os

directory_path = 'documents/'
contents = {}  # Maps each file name to the full text read from that file

# Visit entries in sorted order: os.listdir() returns names in arbitrary order,
# so without sorting the order of `texts` could differ between runs or machines.
for filename in sorted(os.listdir(directory_path)):
    file_path = os.path.join(directory_path, filename)
    # Skip anything that is not a file (e.g. subdirectories)
    if os.path.isfile(file_path):
        # Decode explicitly as UTF-8 instead of relying on the platform's
        # locale-dependent default encoding.
        with open(file_path, 'r', encoding='utf-8') as file:
            contents[filename] = file.read()

# Document texts, in the same (sorted-by-name) order as the keys of `contents`
texts = list(contents.values())



In [4]:
from transformers import AutoTokenizer, AutoModel
import torch

# Checkpoint on the HuggingFace Hub that provides both the tokenizer and the model
MODEL_NAME = 'BAAI/bge-large-en-v1.5'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
model.eval()  # switch the module to evaluation mode before inference

# Tokenize all texts as a single padded batch of PyTorch tensors.
# NOTE(review): truncation=True cuts every text at the tokenizer's maximum
# length, so long documents are only partially embedded -- confirm this is intended.
encoded_input = tokenizer(
    texts,
    padding=True,
    truncation=True,
    return_tensors='pt',
)

# Run the forward pass with gradient tracking disabled
with torch.no_grad():
    model_output = model(**encoded_input)

# CLS pooling: from the first element of the model output, keep position 0
# of every sequence.
token_embeddings = model_output[0]
cls_vectors = token_embeddings[:, 0]

# L2-normalize each row
sentence_embeddings = torch.nn.functional.normalize(cls_vectors, p=2, dim=1)
print("Sentence embeddings:", sentence_embeddings)



Sentence embeddings: tensor([[ 0.0167,  0.0076,  0.0121,  ...,  0.0299, -0.0065, -0.0162],
        [-0.0075, -0.0031, -0.0314,  ..., -0.0181, -0.0515, -0.0038],
        [ 0.0047, -0.0240, -0.0088,  ..., -0.0313, -0.0133, -0.0198],
        ...,
        [-0.0170, -0.0106, -0.0202,  ...,  0.0073,  0.0111, -0.0304],
        [ 0.0060,  0.0052,  0.0010,  ..., -0.0232, -0.0033,  0.0208],
        [ 0.0230,  0.0092, -0.0152,  ...,  0.0374, -0.0125, -0.0185]])


In [None]:
# torch.save does not create missing parent directories, so make sure the
# output folder exists before writing the embeddings tensor to disk.
os.makedirs("embeddings", exist_ok=True)
torch.save(sentence_embeddings, "embeddings/document_embeddings.pt")
