In [2]:
from google.colab import drive
# Mount Google Drive into the Colab VM at /content/drive.
drive.mount('/content/drive')

# Work from the Drive root so that the relative paths used by later cells
# ("_data/..." inputs and the saved .npy files) resolve inside Drive.
%cd "/content/drive/My Drive"

Mounted at /content/drive
/content/drive/My Drive


In [5]:
!pip install -qq --progress-bar off  pandas transformers accelerate

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-metadata 1.14.0 requires protobuf<4.21,>=3.20.3, but you have protobuf 4.24.3 which is incompatible.[0m[31m
[0m

In [7]:
import json
import re
from time import time

import torch
import torch.nn.functional as F
#from sentence_transformers import SentenceTransformer
from torch import Tensor
from transformers import AutoModel, AutoTokenizer
import numpy as np


In [9]:
# Assumes GPU

def average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    """Mean-pool token vectors along dim 1, ignoring masked-out positions.

    Args:
        last_hidden_states: token-level hidden states; expected to be
            (batch, seq_len, hidden), as passed by ``embed``.
        attention_mask: mask with one fewer dimension than
            ``last_hidden_states``; non-zero marks a position to keep.

    Returns:
        The per-row sum of the kept token vectors divided by the per-row
        sum of the mask.
    """
    # Add a trailing axis so the mask broadcasts across the hidden dimension.
    is_padding = attention_mask.unsqueeze(-1).bool().logical_not()
    kept_states = last_hidden_states.masked_fill(is_padding, 0.0)

    summed = kept_states.sum(dim=1)
    token_counts = attention_mask.sum(dim=1).unsqueeze(-1)
    return summed / token_counts

def embed(tokenizer, model, texts, batch_size=1, prefix=""):
    """Embed ``texts`` in batches and return one L2-normalised vector per text.

    Each batch is tokenised (padded, truncated to 512 tokens), run through
    ``model``, mean-pooled over non-padding tokens with ``average_pool`` and
    L2-normalised. Results are moved to host memory batch by batch so that
    only one batch of activations lives on the device at a time.

    Args:
        tokenizer: tokenizer callable (e.g. from ``AutoTokenizer``) whose
            result supports ``.to(device)`` and contains ``"attention_mask"``.
        model: encoder whose output exposes ``last_hidden_state``.
        texts: sized, sliceable sequence of input strings.
        batch_size: number of texts per forward pass.
        prefix: optional string prepended to every text. E5 checkpoints are
            documented to expect ``"query: "`` or ``"passage: "`` here; the
            default ``""`` leaves the texts untouched.

    Returns:
        ``numpy.ndarray`` of shape ``(len(texts), hidden_size)``, rows in the
        same order as ``texts``. For empty ``texts`` an empty float32 array
        of shape ``(0, hidden_size)`` is returned, with ``hidden_size`` taken
        from ``model.config`` when available and 0 otherwise.
    """
    if len(texts) == 0:
        # np.vstack([]) raises; hand back an empty 2-D array instead.
        hidden_size = getattr(getattr(model, "config", None), "hidden_size", 0)
        return np.empty((0, hidden_size), dtype=np.float32)

    # Send inputs to wherever the model lives rather than hard-coding "cuda";
    # fall back to "cuda" for model objects that expose no `.device`.
    device = getattr(model, "device", "cuda")

    embeddings = []
    # Inference only: without no_grad() every forward pass records an autograd
    # graph, which wastes device memory for nothing.
    with torch.no_grad():
        for i in range(0, len(texts), batch_size):
            if i % 100 == 0:
                # `i` is the offset of the batch's first text, not a batch count.
                print("batch:", i)

            batch_texts = texts[i : i + batch_size]
            if prefix:
                batch_texts = [prefix + text for text in batch_texts]

            batch_dict = tokenizer(
                batch_texts,
                max_length=512,
                padding=True,
                truncation=True,
                return_tensors="pt",
            ).to(device)
            outputs = model(**batch_dict)

            vectors = average_pool(outputs.last_hidden_state, batch_dict["attention_mask"])
            vectors = F.normalize(vectors, p=2, dim=1)
            # Accumulate on the host as numpy; keeping tensors on the device
            # and concatenating at the end would hold every batch in GPU memory.
            embeddings.append(vectors.cpu().numpy())

    return np.vstack(embeddings)

# Time the checkpoint load (tokenizer + model weights).
now = time()

# Encoder checkpoint; "intfloat/multilingual-e5-small" is a lighter alternative.
model_name = "intfloat/multilingual-e5-base"
# Put the whole model on the first CUDA device.
device_map = "cuda:0"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(
    model_name,
    device_map=device_map,
)

loading_time = time()
print("Loading time: %.3f" % (loading_time - now))

Loading time: 2.683


In [None]:
#
# EXPERIENCES
#

# Load the exported "experiences" records (dicts with a "description" field).
# JSON is UTF-8 by specification; say so explicitly instead of relying on the
# platform's locale default, which would garble accented text elsewhere.
with open("_data/export-expa-c-riences.json", encoding="utf-8") as f:
    documents = json.load(f)

def add_space_after_punctuation(text):
    """Insert a space after ``. , ; : ! ?`` when text follows immediately.

    A punctuation mark is left alone when the next character is whitespace
    or a digit, so numbers such as ``3.14`` or ``12:30`` stay intact.
    """
    glued_punctuation = re.compile(r'([.,;:!?])([^\s\d])')
    return glued_punctuation.sub(r'\1 \2', text)

# Normalise punctuation spacing in every description before embedding.
# This goes through add_space_after_punctuation rather than a second inline
# regex: the inline copy had drifted from the helper (it ignored ':' and split
# decimals such as "3.14" into "3. 14"), and the helper was never called.
for d in documents:
    d["description"] = add_space_after_punctuation(d["description"])


In [None]:
# Wall-clock timer covering text extraction and embedding (not the save below).
now = time()

# One input string per record: its description.
texts = [record["description"] for record in documents]
# NOTE(review): the E5 model card asks for a "query: " / "passage: " prefix on
# every input; these texts are embedded without one. Confirm this is intended.
embeddings = embed(
    tokenizer,
    model,
    texts,
    batch_size=4,
)
print("Inference time: %.3f" % (time() - now))

# Persist the embedding matrix (one row per record, in `documents` order),
# relative to the current working directory.
np.save('embeddings_e5_experiences.npy', embeddings)

Inference time: 633.767


In [10]:
#
# CHUNKS
#

# Load the chunk records. JSON is UTF-8 by specification; pass the encoding
# explicitly instead of relying on the platform's locale default.
with open("_data/xmlfiles_as_chunks.json", encoding="utf-8") as f:
    documents = json.load(f)

# Collapse each chunk's "context" entries into a single " > "-separated string.
# Chunks without a "context" key are left untouched.
for doc in documents:
    if "context" not in doc:
        continue
    doc["context"] = " > ".join(doc["context"])


In [11]:
# Wall-clock timer covering text assembly and embedding (not the save below).
now = time()

# One input string per chunk: title, introduction, text and (when present) the
# joined context, separated by single spaces. A chunk without "context"
# contributes an empty last field.
texts = [
    " ".join([chunk["title"], chunk["introduction"], chunk["text"], chunk.get("context", "")])
    for chunk in documents
]
# NOTE(review): the E5 model card asks for a "query: " / "passage: " prefix on
# every input; these texts are embedded without one. Confirm this is intended.
embeddings = embed(
    tokenizer,
    model,
    texts,
    batch_size=4,
)
print("Inference time: %.3f" % (time() - now))

# Persist the embedding matrix (one row per chunk, in `documents` order),
# relative to the current working directory.
np.save('embeddings_e5_chunks.npy', embeddings)

batch: 0
batch: 100
batch: 200
batch: 300
batch: 400
batch: 500
batch: 600
batch: 700
batch: 800
batch: 900
batch: 1000
batch: 1100
batch: 1200
batch: 1300
batch: 1400
batch: 1500
batch: 1600
batch: 1700
batch: 1800
batch: 1900
batch: 2000
batch: 2100
batch: 2200
batch: 2300
batch: 2400
batch: 2500
batch: 2600
batch: 2700
batch: 2800
batch: 2900
batch: 3000
batch: 3100
batch: 3200
batch: 3300
batch: 3400
batch: 3500
batch: 3600
batch: 3700
batch: 3800
batch: 3900
batch: 4000
batch: 4100
batch: 4200
batch: 4300
batch: 4400
batch: 4500
batch: 4600
batch: 4700
batch: 4800
batch: 4900
batch: 5000
batch: 5100
batch: 5200
batch: 5300
batch: 5400
batch: 5500
batch: 5600
batch: 5700
batch: 5800
batch: 5900
batch: 6000
batch: 6100
batch: 6200
batch: 6300
batch: 6400
batch: 6500
batch: 6600
batch: 6700
batch: 6800
batch: 6900
batch: 7000
batch: 7100
batch: 7200
batch: 7300
batch: 7400
batch: 7500
batch: 7600
batch: 7700
batch: 7800
batch: 7900
batch: 8000
batch: 8100
batch: 8200
batch: 8300
batc