## Create Embeddings from HF

In [6]:
!pip install optimum[onnxruntime] onnxruntime-gpu onnx -q
#RIAVVIA RUNTIME!!!!!!!!!!

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.0/16.0 MB[0m [31m38.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m26.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.8/194.8 kB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [4]:
!mkdir onnx

In [1]:
from optimum.onnxruntime import ORTModelForFeatureExtraction
from transformers import AutoTokenizer
from pathlib import Path

SAVE_PATH = Path("onnx")

model_id = "sentence-transformers/all-MiniLM-L6-v2"

# load vanilla transformers and convert to onnx
model = ORTModelForFeatureExtraction.from_pretrained(model_id, export=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# save onnx checkpoint and tokenizer
model.save_pretrained(SAVE_PATH)
tokenizer.save_pretrained(SAVE_PATH)

The argument `from_transformers` is deprecated, and will be removed in optimum 2.0.  Use `export` instead


config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

('onnx/tokenizer_config.json',
 'onnx/special_tokens_map.json',
 'onnx/vocab.txt',
 'onnx/added_tokens.json',
 'onnx/tokenizer.json')

## 1. Load Dataset

In [2]:
!pip install datasets -q

In [4]:
from datasets import load_dataset

imdb = load_dataset("imdb")

README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [5]:
imdb_train_pd = imdb["train"].to_pandas()
imdb_test_pd = imdb["test"].to_pandas()
imdb_train_pd

Unnamed: 0,text,label
0,I rented I AM CURIOUS-YELLOW from my video sto...,0
1,"""I Am Curious: Yellow"" is a risible and preten...",0
2,If only to avoid making this type of film in t...,0
3,This film was probably inspired by Godard's Ma...,0
4,"Oh, brother...after hearing about this ridicul...",0
...,...,...
24995,A hit at the time but now better categorised a...,1
24996,I love this movie like no other. Another time ...,1
24997,This film and it's sequel Barry Mckenzie holds...,1
24998,'The Adventures Of Barry McKenzie' started lif...,1


In [6]:
# getting 1000 for faster processing
samples_per_label = 10000 // imdb_train_pd['label'].nunique()
df_small_train_sample = imdb_train_pd.groupby('label').apply(lambda x: x.sample(samples_per_label)).reset_index(drop=True)
df_small_train_sample

  df_small_train_sample = imdb_train_pd.groupby('label').apply(lambda x: x.sample(samples_per_label)).reset_index(drop=True)


Unnamed: 0,text,label
0,My mother keeps a cassette of this film as a g...,0
1,"Following the appalling Attack Force, chances ...",0
2,The whole Biker Movie genre has to be made up ...,0
3,After apprehending the man responsible for the...,0
4,The sects that capitalise on this film are wel...,0
...,...,...
9995,An incredible little English film for so many ...,1
9996,"""Homeward Bound: The Incredible Journey"" is on...",1
9997,I was very fond of this film. It kept me guess...,1
9998,In watching how the two brothers interact and ...,1


## 2. Convert Text To Embeddings

In [7]:
df = df_small_train_sample

In [8]:
from tokenizers import Tokenizer
import onnxruntime as ort
import numpy as np
from typing import List

# Use pytorches default epsilon for division by zero
# https://pytorch.org/docs/stable/generated/torch.nn.functional.normalize.html
def normalize(v):
    norm = np.linalg.norm(v, axis=1)
    norm[norm == 0] = 1e-12
    return v / norm[:, np.newaxis]

# Sampel implementation of the default sentence-transformers model using ONNX
class DefaultEmbeddingModel():

    def __init__(self):
        # max_seq_length = 256, for some reason sentence-transformers uses 256 even though the HF config has a max length of 128
        # https://github.com/UKPLab/sentence-transformers/blob/3e1929fddef16df94f8bc6e3b10598a98f46e62d/docs/_static/html/models_en_sentence_embeddings.html#LL480
        self.tokenizer = Tokenizer.from_file("onnx/tokenizer.json")
        self.tokenizer.enable_truncation(max_length=256)
        self.tokenizer.enable_padding(pad_id=0, pad_token="[PAD]", length=256)
        self.model = ort.InferenceSession("onnx/model.onnx")


    def __call__(self, documents: List[str], batch_size: int = 32):
        all_embeddings = []

        for i in range(0, len(documents), batch_size):
            batch = documents[i:i + batch_size]
            encoded = [self.tokenizer.encode(d) for d in batch]
            input_ids = np.array([e.ids for e in encoded])
            attention_mask = np.array([e.attention_mask for e in encoded])
            onnx_input = {
                        "input_ids": np.array(input_ids, dtype=np.int64),
                        "attention_mask": np.array(attention_mask, dtype=np.int64),
                        "token_type_ids": np.array([np.zeros(len(e), dtype=np.int64) for e in input_ids], dtype=np.int64),
                        }
            model_output = self.model.run(None, onnx_input)
            last_hidden_state = model_output[0]
            # Perform mean pooling with attention weighting
            input_mask_expanded = np.broadcast_to(np.expand_dims(attention_mask, -1), last_hidden_state.shape)
            embeddings = np.sum(last_hidden_state * input_mask_expanded, 1) / np.clip(input_mask_expanded.sum(1), a_min=1e-9, a_max=None)
            embeddings = normalize(embeddings).astype(np.float32)
            all_embeddings.append(embeddings)

        return np.concatenate(all_embeddings)

In [9]:
model = DefaultEmbeddingModel()

def generate_embeddings(text):
    return model([text])[0]

In [13]:
from tqdm import tqdm

tqdm.pandas()

df['embeddings'] = df['text'].apply(generate_embeddings)
df

Unnamed: 0,text,label,embeddings
0,My mother keeps a cassette of this film as a g...,0,"[-0.08453036, 0.029612308, -0.008545775, -0.06..."
1,"Following the appalling Attack Force, chances ...",0,"[0.0086717745, 0.035316963, -0.003142024, 0.07..."
2,The whole Biker Movie genre has to be made up ...,0,"[0.0029011595, 0.04693485, -0.061057057, -0.03..."
3,After apprehending the man responsible for the...,0,"[-0.05496012, -0.0008805994, -0.042904574, -0...."
4,The sects that capitalise on this film are wel...,0,"[0.0051144445, 0.010276215, -0.048168007, -0.0..."
...,...,...,...
9995,An incredible little English film for so many ...,1,"[-0.039787658, 0.040039677, -0.057586174, -0.0..."
9996,"""Homeward Bound: The Incredible Journey"" is on...",1,"[-0.03021396, -0.038801964, 0.02921036, -0.012..."
9997,I was very fond of this film. It kept me guess...,1,"[-0.05967381, -0.031805467, -0.009821296, -0.0..."
9998,In watching how the two brothers interact and ...,1,"[-0.05373771, -0.0025991648, -0.024466867, 0.0..."


## 3. Let's Classify Using Sklearn on Embeddings

In [14]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

X = df["embeddings"].to_list()
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=667,
                                                    )


knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

In [15]:
from sklearn.metrics import accuracy_score

y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f'Accuracy: {accuracy}')

Accuracy: 0.7675
