In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Switch the working directory to the Drive root; later cells open and save
# files using paths relative to it.
%cd "/content/drive/My Drive"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive


In [None]:
p!pip install --progress-bar off  qdrant-client pandas transformers accelerate

In [2]:
import json
import re
from time import time

import torch
import torch.nn.functional as F
#from sentence_transformers import SentenceTransformer
from torch import Tensor
from transformers import AutoModel, AutoTokenizer
import numpy as np


In [29]:

def average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    """Mean-pool hidden states along dim 1, ignoring masked-out positions.

    Positions where ``attention_mask`` is zero are replaced by 0.0 before the
    sum, and the sum is divided by the per-row total of ``attention_mask``.

    Args:
        last_hidden_states: Tensor indexed as (row, position, feature).
        attention_mask: Tensor indexed as (row, position); non-zero entries
            mark the positions that contribute to the average.

    Returns:
        Tensor indexed as (row, feature) holding the masked mean.
    """
    # Boolean mask of the positions to discard, broadcastable over features.
    ignored = ~attention_mask.unsqueeze(-1).bool()
    kept_states = last_hidden_states.masked_fill(ignored, 0.0)

    # Number of contributing positions per row, shaped for broadcasting.
    kept_counts = attention_mask.sum(dim=1).unsqueeze(-1)
    return kept_states.sum(dim=1) / kept_counts

def embed(tokenizer, model, texts, batch_size=1):
    """Encode ``texts`` into L2-normalised, mean-pooled embeddings (E5 style).

    Args:
        tokenizer: Callable invoked on a list of strings with ``max_length=512``,
            ``padding=True``, ``truncation=True`` and ``return_tensors="pt"``.
            Its result must support ``.to(device)``, ``**`` unpacking and
            lookup of ``"attention_mask"``.
        model: Torch module called as ``model(**batch)``; the output must expose
            ``last_hidden_state``.
        texts: Sequence of strings to embed.
        batch_size: Number of texts sent through the model per forward pass.

    Returns:
        ``numpy.ndarray`` with one row per input text, rows in input order.

    Raises:
        ValueError: From ``np.vstack`` when ``texts`` is empty, because there
            is nothing to stack.
    """
    # Send inputs to wherever the model's weights live instead of assuming "cuda",
    # so the function keeps working if the model is loaded on another device.
    device = next(model.parameters()).device

    embeddings = []
    # Inference only: without no_grad every batch would record an autograd graph,
    # which costs GPU memory and time for no benefit.
    with torch.no_grad():
        for i in range(0, len(texts), batch_size):
            batch_dict = tokenizer(
                texts[i : i + batch_size],
                max_length=512,
                padding=True,
                truncation=True,
                return_tensors="pt",
            ).to(device)
            outputs = model(**batch_dict)

            vectors = average_pool(outputs.last_hidden_state, batch_dict["attention_mask"])
            vectors = F.normalize(vectors, p=2, dim=1)
            # Move each batch to host memory right away so results do not
            # accumulate on the accelerator.
            embeddings.append(vectors.cpu().numpy())

    return np.vstack(embeddings)

# Load the multilingual E5 tokenizer and model, and report how long it took.
now = time()

# Checkpoint to load; "intfloat/multilingual-e5-small" is a lighter alternative.
model_name = "intfloat/multilingual-e5-base"
device_map = "cuda:0"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, device_map=device_map)

loading_time = time()
print(f"Loading time: {loading_time - now:.3f}")

Loading time: 2.007


In [30]:

# Read the exported documents. The encoding is given explicitly because JSON is
# UTF-8 and the default used by open() depends on the platform locale.
with open("_data/export-expa-c-riences.json", encoding="utf-8") as f:
    documents = json.load(f)

def add_space_after_punctuation(text):
    """Insert a space after punctuation that is glued to the next character.

    A space is added after any of ``. , ; : ! ?`` when the following character
    is neither whitespace nor a digit, so "3.14" and "12:30" are left untouched.

    Args:
        text: String to normalise.

    Returns:
        The string with the missing spaces inserted.
    """
    glued_punctuation = r'([.,;:!?])([^\s\d])'
    return re.sub(glued_punctuation, r'\1 \2', text)

# Normalise spacing after punctuation in every description, in place.
# This calls the helper defined above instead of repeating a slightly different
# inline regex: the inline version also split numbers ("3.14" -> "3. 14") and
# did not handle ':'.
for d in documents:
    d["description"] = add_space_after_punctuation(d["description"])


In [31]:

# Embed every document description, report the elapsed time, and save the matrix.
now = time()
texts = [doc["description"] for doc in documents]
embeddings = embed(tokenizer, model, texts, batch_size=4)
print(f"Inference time: {time() - now:.3f}")

np.save('embeddings_e5_experiences.npy', embeddings)

Inference time: 633.767
