# example of evaluating embeddings on text classification

In [1]:
import sys

# Make the project sources under ../src importable. The membership check keeps the cell
# idempotent: re-running it no longer appends a duplicate entry to sys.path each time.
if "../src" not in sys.path:
    sys.path.append("../src")
from text2sql import hello

# smoke test: printing the message confirms the text2sql package resolved
print(hello.message)

hello, world!


In [2]:
from dotenv import load_dotenv
# Bare call as the cell's last expression: the notebook displays its return value.
# python-dotenv is expected to read a .env file into the process environment here.
load_dotenv()

True

In [21]:
import os
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity

from text2sql.engine.embeddings import (
    AzureEmbedder, 
    BedrockCohereEmbedder, 
    BedrockTitanv2Embedder, 
    SentenceTransformerEmbedder,
)

# create embedders

In [4]:
# Read the Azure OpenAI connection settings from the process environment
# (presumably populated by the earlier load_dotenv() call; each value is None when unset).
api_key = os.environ.get("AZURE_OPENAI_API_KEY")
azure_endpoint = os.environ.get("AZURE_OPENAI_API_ENDPOINT")
api_version = os.environ.get("AZURE_OPENAI_API_VERSION")
model = os.environ.get("AZURE_OPENAI_MODEL")

# Report only whether the key is present: slicing it would raise TypeError when the
# variable is unset, and echoing key characters would persist them in the saved notebook output.
print(f"api_key: {'<set>' if api_key else '<missing>'}")
print(f"azure_endpoint: {azure_endpoint}")
print(f"api_version: {api_version}")
print(f"model: {model}")

api_key: 3dc
azure_endpoint: https://gena-gpt-2.openai.azure.com/
api_version: 2024-06-01
model: gena-text-embedding-3-small


In [12]:
# Azure embedder for the "gena-embedding" model; credentials, endpoint and API version
# are taken from the environment.
azure_ada_embedder = AzureEmbedder(
    model="gena-embedding",
    batch_size=8,
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    azure_endpoint=os.getenv("AZURE_OPENAI_API_ENDPOINT"),
    api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
)

In [13]:
# Azure embedder for the "gena-text-embedding-3-small" model; credentials, endpoint and
# API version are taken from the environment.
azure_te3_embedder = AzureEmbedder(
    model="gena-text-embedding-3-small",
    batch_size=8,
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    azure_endpoint=os.getenv("AZURE_OPENAI_API_ENDPOINT"),
    api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
)

In [14]:
# Bedrock Cohere multilingual v3 embedder configured with input_type "clustering"
bedrock_cohere_clustering_embedder = BedrockCohereEmbedder(
    model="cohere.embed-multilingual-v3",
    input_type="clustering",
    region_name="us-east-1",
    batch_size=8,
)

In [15]:
# Bedrock Cohere multilingual v3 embedder configured with input_type "classification"
bedrock_cohere_classification_embedder = BedrockCohereEmbedder(
    model="cohere.embed-multilingual-v3",
    input_type="classification",
    region_name="us-east-1",
    batch_size=8,
)

In [16]:
# Bedrock Cohere multilingual v3 embedder configured with input_type "search_document"
bedrock_cohere_document_embedder = BedrockCohereEmbedder(
    model="cohere.embed-multilingual-v3",
    input_type="search_document",
    region_name="us-east-1",
    batch_size=8,
)

In [17]:
# Bedrock Cohere multilingual v3 embedder configured with input_type "search_query"
bedrock_cohere_query_embedder = BedrockCohereEmbedder(
    model="cohere.embed-multilingual-v3",
    input_type="search_query",
    region_name="us-east-1",
    batch_size=8,
)

In [19]:
# Bedrock Titan text-embedding v2, requesting 1024 dimensions, one text per request
bedrock_titan_embedder = BedrockTitanv2Embedder(
    model="amazon.titan-embed-text-v2:0",
    dimensions=1024,
    region_name="us-east-1",
    batch_size=1,
)

In [20]:
# Sentence-transformers LaBSE embedder, one text per batch
sentence_transformer_embedder = SentenceTransformerEmbedder(
    batch_size=1,
    model_path="sentence-transformers/LaBSE",
)



# load dataset

We use the BANKING77 dataset, available at: https://huggingface.co/datasets/mteb/banking77/tree/main

In [33]:
import json
import os
import time

from collections import defaultdict

import numpy as np

In [31]:
# display the numpy version in use for this run
np.__version__

'1.26.4'

In [23]:
# Dataset root. Set the BANKING77_DATA_DIR environment variable to point at another copy;
# when it is unset the original location is used, so existing setups keep working.
data_path = os.environ.get("BANKING77_DATA_DIR", "/data/gena_data/BANKING77")
# the two JSONL splits live directly under the dataset root
train_file = os.path.join(data_path, "train.jsonl")
test_file = os.path.join(data_path, "test.jsonl")

In [25]:
def load_jsonl(file_path):
    """Load a JSON Lines file into a list of parsed records.

    Args:
        file_path: path to a UTF-8 text file holding one JSON document per line.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    # Explicit UTF-8 so the result does not depend on the platform's default encoding.
    with open(file_path, "r", encoding="utf-8") as f:
        # Strip all surrounding whitespace (not just "\n") so blank, whitespace-only and
        # CRLF-terminated lines are skipped instead of being handed to json.loads.
        stripped_lines = (line.strip() for line in f)
        return [json.loads(datum) for datum in stripped_lines if datum]

In [26]:
# parse both splits from disk, then report how many records each one holds
train_data, test_data = (load_jsonl(split_file) for split_file in (train_file, test_file))
print(f"train_data: {len(train_data)}, test_data: {len(test_data)}")

train_data: 10003, test_data: 3080


In [37]:
# keep at most the first n training examples of every label, preserving file order
n = 25
sorted_data = defaultdict(list)
for datum in train_data:
    # bucket each record under its label
    sorted_data[datum["label"]].append(datum)
train_sampled_data = [example for group in sorted_data.values() for example in group[:n]]
print(f"train_sampled_data: {len(train_sampled_data)}")

train_sampled_data: 1925


In [41]:
# keep at most the first n test examples of every label, preserving file order
n = 10
sorted_data = defaultdict(list)
for datum in test_data:
    # bucket each record under its label
    sorted_data[datum["label"]].append(datum)
test_sampled_data = [example for group in sorted_data.values() for example in group[:n]]
print(f"test_sampled_data: {len(test_sampled_data)}")

test_sampled_data: 770


In [42]:
# split the sampled records into parallel text / label lists (index i refers to the same example)
train_texts = [example["text"] for example in train_sampled_data]
test_texts = [example["text"] for example in test_sampled_data]
train_labels = [example["label"] for example in train_sampled_data]
test_labels = [example["label"] for example in test_sampled_data]

In [73]:
# texts appearing verbatim in both sampled splits would leak test data into the retrieval index
check_data_leaks = set(train_texts) & set(test_texts)
print(f"check_data_leaks: {len(check_data_leaks)}")

check_data_leaks: 0


# embed all data

In [39]:
# Embed the sampled training texts with each model and cache the result as a .npy file;
# models whose cache file already exists are skipped.
for emb_name, embedder in (
    ("openai-ada-002", azure_ada_embedder),
    ("openai-text-emb3-small", azure_te3_embedder),
    ("cohere-multi-clustering", bedrock_cohere_clustering_embedder),
    ("cohere-multi-classification", bedrock_cohere_classification_embedder),
    ("cohere-multi-document", bedrock_cohere_document_embedder),
    ("sentence-transformer-labse", sentence_transformer_embedder),
):
    save_dir = os.path.join(data_path, "train_embeddings", emb_name)
    fname = f"train-sampled-embeddings-{emb_name}.npy"
    save_path = os.path.join(save_dir, fname)
    os.makedirs(save_dir, exist_ok=True)
    if not os.path.isfile(save_path):
        print(f"embedding traindata with {emb_name}")
        # short pause before the progress bar starts (presumably to keep the print and the
        # tqdm output from interleaving; confirm)
        time.sleep(0.5)
        train_embeddings = embedder.embed(train_texts, verbose=True)
        np.save(save_path, train_embeddings)
    else:
        print(f"embedding traindata with {emb_name} already exists")


embedding traindata with openai-ada-002


100%|██████████| 241/241 [01:15<00:00,  3.20it/s]


embedding traindata with openai-text-emb3-small


100%|██████████| 241/241 [02:10<00:00,  1.85it/s]


embedding traindata with cohere-multi-clustering


  datetime_now = datetime.datetime.utcnow()
100%|██████████| 241/241 [01:58<00:00,  2.04it/s]


embedding traindata with cohere-multi-classification


100%|██████████| 241/241 [01:22<00:00,  2.94it/s]


embedding traindata with cohere-multi-document


100%|██████████| 241/241 [04:02<00:00,  1.01s/it]


embedding traindata with sentence-transformer-labse


100%|██████████| 1925/1925 [00:46<00:00, 41.59it/s]


In [43]:
# Embed the sampled test texts with each model and cache the result as a .npy file;
# models whose cache file already exists are skipped.
for emb_name, embedder in [
        ("openai-ada-002", azure_ada_embedder),
        ("openai-text-emb3-small", azure_te3_embedder),
        ("cohere-multi-clustering", bedrock_cohere_clustering_embedder),
        ("cohere-multi-classification", bedrock_cohere_classification_embedder),
        # test texts act as queries: embed them with the search_query embedder, but keep the
        # "cohere-multi-document" name so the file pairs with the train-side embeddings
        ("cohere-multi-document", bedrock_cohere_query_embedder),
        ("sentence-transformer-labse", sentence_transformer_embedder),
    ]:
    save_dir = os.path.join(data_path, "test_embeddings", emb_name)
    os.makedirs(save_dir, exist_ok=True)
    fname = f"test-embeddings-{emb_name}.npy"
    save_path = os.path.join(save_dir, fname)
    if os.path.isfile(save_path):
        # this message used to say "traindata", which was misleading for the test split
        print(f"embedding test data with {emb_name} already exists")
        continue
    print(f"embedding test data with {emb_name}")
    # short pause before the progress bar starts (presumably to keep the print and the
    # tqdm output from interleaving; confirm)
    time.sleep(0.5)
    test_embeddings = embedder.embed(test_texts, verbose=True)
    np.save(save_path, test_embeddings)


embedding test data with openai-ada-002


100%|██████████| 97/97 [02:02<00:00,  1.26s/it]


embedding test data with openai-text-emb3-small


100%|██████████| 97/97 [00:56<00:00,  1.70it/s]


embedding test data with cohere-multi-clustering


  datetime_now = datetime.datetime.utcnow()
100%|██████████| 97/97 [02:20<00:00,  1.45s/it]


embedding test data with cohere-multi-classification


100%|██████████| 97/97 [01:18<00:00,  1.24it/s]


embedding test data with cohere-multi-document


100%|██████████| 97/97 [00:33<00:00,  2.92it/s]


embedding test data with sentence-transformer-labse


100%|██████████| 770/770 [00:18<00:00, 41.77it/s]


In [46]:
# validate embedding outputs: every cached matrix must have one row per input text
for emb_name, _ in [
        ("openai-ada-002", azure_ada_embedder),
        ("openai-text-emb3-small", azure_te3_embedder),
        ("cohere-multi-clustering", bedrock_cohere_clustering_embedder),
        ("cohere-multi-classification", bedrock_cohere_classification_embedder),
        ("cohere-multi-document", bedrock_cohere_document_embedder),
        ("sentence-transformer-labse", sentence_transformer_embedder),
    ]:
    # only the name is needed here; the embedder object itself is ignored
    print(f"checking embeddings for {emb_name}")
    train_fname = f"train-sampled-embeddings-{emb_name}.npy"
    train_embedding_file = os.path.join(data_path, "train_embeddings", emb_name, train_fname)
    train_embeddings = np.load(train_embedding_file)
    if len(train_embeddings) != len(train_texts):
        # report the mismatch and move on to the next model without printing "ok"
        print(f"train_embeddings: {train_embeddings.shape}, train_texts: {len(train_texts)}")
        continue
    test_fname = f"test-embeddings-{emb_name}.npy"
    test_embedding_file = os.path.join(data_path, "test_embeddings", emb_name, test_fname)
    test_embeddings = np.load(test_embedding_file)
    if len(test_embeddings) != len(test_texts):
        # label corrected: the count printed here is the number of test texts
        print(f"test_embeddings: {test_embeddings.shape}, test_texts: {len(test_texts)}")
        continue
    print("ok")

checking embeddings for openai-ada-002
ok
checking embeddings for openai-text-emb3-small
ok
checking embeddings for cohere-multi-clustering
ok
checking embeddings for cohere-multi-classification
ok
checking embeddings for cohere-multi-document
ok
checking embeddings for sentence-transformer-labse
ok


In [54]:
import tqdm
from sklearn.preprocessing import normalize
from text2sql.engine.retrieval import LocalRetriever

In [56]:
# run tests: for every embedding model, retrieve the 10 nearest training examples of each
# test embedding under each (norm, distance) setting and record the retrieved labels
result_dict = {}
for emb_name, _ in [
        ("openai-ada-002", azure_ada_embedder),
        ("openai-text-emb3-small", azure_te3_embedder),
        ("cohere-multi-clustering", bedrock_cohere_clustering_embedder),
        ("cohere-multi-classification", bedrock_cohere_classification_embedder),
        ("cohere-multi-document", bedrock_cohere_document_embedder),
        ("sentence-transformer-labse", sentence_transformer_embedder),
    ]:
    # the cache file locations depend only on the model, so build them once per model
    train_fname = f"train-sampled-embeddings-{emb_name}.npy"
    train_embedding_file = os.path.join(data_path, "train_embeddings", emb_name, train_fname)
    test_fname = f"test-embeddings-{emb_name}.npy"
    test_embedding_file = os.path.join(data_path, "test_embeddings", emb_name, test_fname)
    for norm in (True, False):
        for distance in ("cosine", "euclidean"):
            test_name = f"{emb_name}_norm:{norm}_distance:{distance}"
            print(f"running test '{emb_name}' {norm=} {distance=}")
            # short pause before the progress bar starts (presumably to keep the print and
            # the tqdm output from interleaving; confirm)
            time.sleep(0.5)
            # reload for every configuration so each retriever is handed a fresh array
            train_embeddings = np.load(train_embedding_file)
            test_embeddings = np.load(test_embedding_file)
            # Bind the query matrix on BOTH branches. Previously it was only assigned when
            # norm was True, so norm=False runs silently reused the normalized queries left
            # over from the preceding iteration.
            test_fmt_embeddings = normalize(test_embeddings) if norm else test_embeddings
            retriever = LocalRetriever(train_embeddings, train_sampled_data, norm=norm, distance_metric=distance)
            top_labels = []
            for i in tqdm.trange(len(test_texts)):
                emb = test_fmt_embeddings[i]
                results = retriever.query(emb, top_k=10)
                # keep only the label of each retrieved training example, in the order returned
                top_label = [result["data"]["label"] for result in results]
                top_labels.append(top_label)
            result_dict[test_name] = top_labels


running test 'openai-ada-002' norm=True distance='cosine'


100%|██████████| 770/770 [00:03<00:00, 193.25it/s]


running test 'openai-ada-002' norm=True distance='euclidean'


100%|██████████| 770/770 [00:01<00:00, 485.99it/s]


running test 'openai-ada-002' norm=False distance='cosine'


100%|██████████| 770/770 [00:03<00:00, 203.44it/s]


running test 'openai-ada-002' norm=False distance='euclidean'


100%|██████████| 770/770 [00:01<00:00, 491.17it/s]


running test 'openai-text-emb3-small' norm=True distance='cosine'


100%|██████████| 770/770 [00:03<00:00, 196.02it/s]


running test 'openai-text-emb3-small' norm=True distance='euclidean'


100%|██████████| 770/770 [00:01<00:00, 489.58it/s]


running test 'openai-text-emb3-small' norm=False distance='cosine'


100%|██████████| 770/770 [00:03<00:00, 203.19it/s]


running test 'openai-text-emb3-small' norm=False distance='euclidean'


100%|██████████| 770/770 [00:01<00:00, 492.45it/s]


running test 'cohere-multi-clustering' norm=True distance='cosine'


100%|██████████| 770/770 [00:03<00:00, 236.94it/s]


running test 'cohere-multi-clustering' norm=True distance='euclidean'


100%|██████████| 770/770 [00:01<00:00, 733.96it/s]


running test 'cohere-multi-clustering' norm=False distance='cosine'


100%|██████████| 770/770 [00:02<00:00, 290.59it/s]


running test 'cohere-multi-clustering' norm=False distance='euclidean'


100%|██████████| 770/770 [00:01<00:00, 715.83it/s]


running test 'cohere-multi-classification' norm=True distance='cosine'


100%|██████████| 770/770 [00:02<00:00, 295.75it/s]


running test 'cohere-multi-classification' norm=True distance='euclidean'


100%|██████████| 770/770 [00:01<00:00, 716.50it/s]


running test 'cohere-multi-classification' norm=False distance='cosine'


100%|██████████| 770/770 [00:02<00:00, 260.77it/s]


running test 'cohere-multi-classification' norm=False distance='euclidean'


100%|██████████| 770/770 [00:01<00:00, 715.89it/s]


running test 'cohere-multi-document' norm=True distance='cosine'


100%|██████████| 770/770 [00:02<00:00, 289.63it/s]


running test 'cohere-multi-document' norm=True distance='euclidean'


100%|██████████| 770/770 [00:01<00:00, 716.09it/s]


running test 'cohere-multi-document' norm=False distance='cosine'


100%|██████████| 770/770 [00:02<00:00, 295.90it/s]


running test 'cohere-multi-document' norm=False distance='euclidean'


100%|██████████| 770/770 [00:01<00:00, 715.52it/s]


running test 'sentence-transformer-labse' norm=True distance='cosine'


100%|██████████| 770/770 [00:01<00:00, 757.20it/s]


running test 'sentence-transformer-labse' norm=True distance='euclidean'


100%|██████████| 770/770 [00:01<00:00, 768.77it/s]


running test 'sentence-transformer-labse' norm=False distance='cosine'


100%|██████████| 770/770 [00:01<00:00, 754.45it/s]


running test 'sentence-transformer-labse' norm=False distance='euclidean'


100%|██████████| 770/770 [00:01<00:00, 615.97it/s]


In [68]:
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score

In [57]:
# configuration names in the order the runs were recorded
test_names = list(result_dict)

In [67]:
# One row of metrics per test configuration. NOTE: despite its name this is a list of
# dicts; the name is kept because the cell that writes the results file reads it.
results_dict = []
for test_name in test_names:
    top_labels = result_dict[test_name]
    # the prediction for a test example is the label of its top-ranked retrieved neighbour
    test_preds = [row[0] for row in top_labels]
    macro_precision = precision_score(test_labels, test_preds, average="macro")
    macro_recall = recall_score(test_labels, test_preds, average="macro")
    micro_f1 = f1_score(test_labels, test_preds, average="micro")
    macro_f1 = f1_score(test_labels, test_preds, average="macro")
    print(f"{test_name}: {macro_precision=}, {macro_recall=}, {micro_f1=} {macro_f1=}")
    # test_name has the form "<emb_name>_norm:<bool>_distance:<metric>". Split from the right
    # so an embedding name that itself contains "_" no longer breaks the 3-way unpacking.
    test, norm_str, dist_str = test_name.rsplit("_", 2)
    norm_val = norm_str.split(":", 1)[1]
    dist_val = dist_str.split(":", 1)[1]
    results_dict.append({
        "test_name": test,
        "norm": norm_val,
        "distance": dist_val,
        "macro_precision": macro_precision,
        "macro_recall": macro_recall,
        "micro_f1": micro_f1,
        "macro_f1": macro_f1,
    })

openai-ada-002_norm:True_distance:cosine: macro_precision=0.904423102150375, macro_recall=0.9000000000000004, micro_f1=0.9 macro_f1=0.898537292633263
openai-ada-002_norm:True_distance:euclidean: macro_precision=0.904423102150375, macro_recall=0.9000000000000004, micro_f1=0.9 macro_f1=0.898537292633263
openai-ada-002_norm:False_distance:cosine: macro_precision=0.904423102150375, macro_recall=0.9000000000000004, micro_f1=0.9 macro_f1=0.898537292633263
openai-ada-002_norm:False_distance:euclidean: macro_precision=0.904423102150375, macro_recall=0.9000000000000004, micro_f1=0.9 macro_f1=0.898537292633263
openai-text-emb3-small_norm:True_distance:cosine: macro_precision=0.8972104014311807, macro_recall=0.8922077922077922, micro_f1=0.8922077922077922 macro_f1=0.8904681082423449
openai-text-emb3-small_norm:True_distance:euclidean: macro_precision=0.8972104014311807, macro_recall=0.8922077922077922, micro_f1=0.8922077922077922 macro_f1=0.8904681082423449
openai-text-emb3-small_norm:False_dista

In [72]:
# write the per-configuration metrics as a tab-separated file in the working directory
results_frame = pd.DataFrame(results_dict)
results_frame.to_csv("banking77_results.tsv", sep="\t", index=False)