In [1]:
# Enable IPython's autoreload extension in mode 2 (reload all modules before
# running each cell), so edits to the local `src` package imported below are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from huggingface_hub import login, whoami

# Authenticate with the Hugging Face Hub (renders an interactive login widget
# in the notebook). `whoami()` in the config cell is called after this.
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
# DATASET CONFIG
# Source dataset / model used for tokenization and as the endpoint repository.
DATASET = 'sentence-transformers/trivia-qa-triplet'
MODEL = 'Alibaba-NLP/gte-modernbert-base'
TEXT_COLUMN = 'positive'  # column that is tokenized, filtered on and saved

# Token-count bounds passed to tokenize_and_filter, and the sample size passed
# to sample_dataset.
MIN_TOKENS = 50
MAX_TOKENS = 8000
SAMPLES = 10_000

# Where the sampled text column is written; also handed to the k6 run as
# `dataset_path`.
DATASET_PATH = 'data/embedding-dataset.json'

# LOAD TEST CONFIG
MAX_VUS = 2000  # forwarded to optimal_vus(max_vus=...)
# K6_BIN = "/usr/bin/k6"
K6_BIN = "~/.local/bin/k6-sse"

# Hub repo that receives the results, namespaced under the logged-in user.
HUB_DATASET_PATH = f'{whoami()["name"]}/embedding-ie-optimization'

In [4]:
from datasets import load_dataset
from transformers import AutoTokenizer
from src.process_dataset import tokenize_and_filter, sample_dataset, save_dataset

# Load the train split of the 'triplet' configuration.
dataset = load_dataset(DATASET, 'triplet', split='train')

# Tokenizer of the model under test.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

# Filter on token count using the configured bounds.
dataset = tokenize_and_filter(
    dataset,
    tokenizer,
    text_column=TEXT_COLUMN,
    min_tokens=MIN_TOKENS,
    max_tokens=MAX_TOKENS,
    num_proc=8,
)

# Sample with a fixed seed, then persist only the text column.
dataset = sample_dataset(dataset, n_samples=SAMPLES, seed=42)
save_dataset(
    dataset.select_columns([TEXT_COLUMN]),
    DATASET_PATH,
)

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
[32m2025-01-28 10:51:34.771[0m | [32m[1mSUCCESS [0m | [36msrc.process_dataset[0m:[36msample_dataset[0m:[36m44[0m - [32m[1mSampled dataset down to 10000 samples[0m
[32m2025-01-28 10:51:34.826[0m | [32m[1mSUCCESS [0m | [36msrc.process_dataset[0m:[36msave_dataset[0m:[36m58[0m - [32m[1mSaved dataset to data/embedding-dataset.json[0m


In [13]:
import plotly.graph_objects as go
from IPython.display import display, HTML
import base64
import json

# Per-sample token counts for the sampled dataset.
num_tokens = dataset.to_pandas()['num_tokens']

# Create histogram
fig = go.Figure()

fig.add_trace(go.Histogram(
    x=num_tokens,
    nbinsx=30,  # Adjust number of bins as needed
    marker=dict(color="lightblue", line=dict(color="black", width=1))
))

# Add vertical lines for min and max values
min_val = num_tokens.min()
max_val = num_tokens.max()

fig.add_vline(x=min_val, line=dict(color="red", dash="dash"), annotation_text=f"Min: {min_val}", annotation_position="top left")
fig.add_vline(x=max_val, line=dict(color="blue", dash="dash"), annotation_text=f"Max: {max_val}", annotation_position="top right")

# Update layout
fig.update_layout(
    title="Distribution of num_tokens",
    xaxis_title="num_tokens",
    yaxis_title="Count",
    bargap=0.1
)

# # Show plot
# fig.show()

# Embed the figure as self-rendering HTML using Plotly's own exporter.
# The previous hand-written template loaded `plotly-latest.min.js`, which the
# CDN keeps frozen at an old plotly.js release; `include_plotlyjs="cdn"` instead
# references the plotly.js version bundled with the installed plotly package,
# so the figure JSON and the renderer always match. `full_html=False` emits
# only the chart <div>/<script>, which is what a notebook output cell needs.
chart_html = fig.to_html(include_plotlyjs="cdn", full_html=False)

html_code = f"<h1>Token Distribution</h1>\n{chart_html}"

# Display HTML
display(HTML(html_code))


In [6]:
from dataclasses import dataclass, field, asdict
from typing import Dict, Optional

@dataclass
class InstanceConfig:
    """Hardware/placement settings for one inference endpoint experiment.

    Instances are passed to ``deploy_endpoint`` as ``instance_config``.
    """
    repository: str  # model repository to serve (this notebook passes MODEL)
    accelerator: str  # e.g. 'gpu'
    instance_size: str  # e.g. 'x1'
    instance_type: str  # e.g. 'nvidia-l4'
    # Container image settings. This notebook constructs the config with
    # ``custom_image=None`` and assigns an ``ImageConfig`` just before
    # deployment, so the annotation reflects that usage rather than the
    # previous ``Dict[str, str]``.
    # NOTE(review): confirm the exact shape deploy_endpoint expects here.
    custom_image: Optional["ImageConfig"]
    vendor: str  # e.g. 'aws'
    region: str  # e.g. 'us-east-1'

@dataclass
class InfinityConfig:
    """String-valued ``INFINITY_*`` settings with their default values.

    Used as the ``env`` section of ``ImageConfig``.
    NOTE(review): presumably read as environment variables by the Infinity
    container image — confirm against the image's documentation.
    """
    # Serving
    INFINITY_PORT: str = "80"
    INFINITY_ENGINE: str = "torch"
    INFINITY_BATCH_SIZE: str = "16"
    # Numerics / pooling
    INFINITY_DTYPE: str = "auto"
    INFINITY_EMBEDDING_DTYPE: str = "float32"
    INFINITY_POOLING_METHOD: str = "auto"
    # Optional optimizations, disabled by default
    INFINITY_COMPILE: str = "false"
    INFINITY_BETTERTRANSFORMER: str = "false"
    # Model location
    INFINITY_MODEL_ID: str = "/repository"

    def to_dict(self) -> Dict[str, str]:
        """Return the settings as a plain ``{field name: value}`` dictionary."""
        settings = asdict(self)
        return settings

@dataclass
class ImageConfig:
    """Custom container image settings attached to an ``InstanceConfig``."""
    # Route used for health checks.
    health_route: str = "/health"
    # Container image reference, including its tag.
    url: str = "michaelf34/infinity:0.0.75"
    # Per-instance settings object; a factory is used so each ImageConfig
    # gets its own InfinityConfig rather than sharing one default instance.
    env: InfinityConfig = field(default_factory=InfinityConfig)


In [7]:
def set_env(batch_size: int, image_config: ImageConfig) -> Dict[str, str]:
    """Return ``image_config`` as a dict with the batch size overridden.

    The image config is converted with ``asdict`` (so ``image_config`` itself
    is not modified), and the ``INFINITY_BATCH_SIZE`` entry of its nested
    ``env`` dict is set to ``str(batch_size)``.

    Args:
        batch_size: Batch size to write into the ``env`` section.
        image_config: Image configuration to convert.

    Returns:
        The dictionary form of ``image_config``; its ``'env'`` value is itself
        a dictionary of settings.
    """
    image_dict = asdict(image_config)
    env_settings = image_dict['env']
    env_settings["INFINITY_BATCH_SIZE"] = str(batch_size)
    return image_dict

# Experiments

In [8]:
from src.deployment import deploy_endpoint
# IPython help syntax: displays deploy_endpoint's signature and docstring.
?deploy_endpoint

[0;31mSignature:[0m [0mdeploy_endpoint[0m[0;34m([0m[0minstance_config[0m[0;34m,[0m [0mendpoint_name[0m[0;34m,[0m [0mwait[0m[0;34m=[0m[0;32mFalse[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m Creates and deploys an inference endpoint using the given instance configuration.
[0;31mFile:[0m      ~/projects/encoder-analysis/src/deployment.py
[0;31mType:[0m      function

## GPUs

In [9]:
# One entry per hardware/provider combination to benchmark.
instance_config_experiment_dicts = [
    dict(
        accelerator='gpu',
        instance_size='x1',
        instance_type='nvidia-l4',
        vendor='aws',
        region='us-east-1',
    ),
]

# Build the typed configs; the custom image is left unset here and filled in
# per batch size by the experiment loop.
instance_config_experiments = [
    InstanceConfig(repository=MODEL, custom_image=None, **hardware_settings)
    for hardware_settings in instance_config_experiment_dicts
]

In [10]:
import copy
from pathlib import Path
from time import sleep

from loguru import logger

from src.k6 import call_k6, optimal_vus

# k6 script template and the location the rendered script is written to.
template_file = "embedding-analysis.js.j2"
output_file = Path("./generated").resolve() / "embedding-analysis.js"

start_vus = 32
batch_sizes = [16, 32, 64, 128, 256, 512, 1024]

for og_instance_config_experiment in instance_config_experiments:
    # Endpoints that have been created and not yet deleted. The `finally`
    # clause below deletes whatever is still in this list, so an exception or
    # a KeyboardInterrupt does not leave endpoints running.
    endpoints = []
    try:
        # Phase 1: request one endpoint per batch size without blocking, so
        # they all start up in parallel.
        for batch_size in batch_sizes:
            # Configure Infinity Settings, pass them up to the image
            infinity_config = InfinityConfig(INFINITY_BATCH_SIZE=str(batch_size))
            image_config = ImageConfig(env=infinity_config)

            # Add the image to the HW instance (copy so the template config
            # is not modified between iterations)
            instance_config_experiment = copy.deepcopy(og_instance_config_experiment)
            instance_config_experiment.custom_image = image_config

            # Deploy the endpoint
            logger.info(f'Creating endpoint with Batch Size:\t{batch_size}')
            vendor = instance_config_experiment.vendor
            endpoint = deploy_endpoint(instance_config_experiment, endpoint_name=f'ea-{vendor}-{batch_size}', wait=False)
            endpoints.append(endpoint)

        # Phase 2: benchmark the endpoints one at a time, in creation order.
        while endpoints:
            endpoint = endpoints[0]

            # Wait for THIS endpoint before testing it; previously only the
            # first endpoint was waited on, so later ones could be load-tested
            # before they were ready. No timeout is passed, matching the
            # original call.
            endpoint.wait()
            endpoint.fetch()

            endpoint_info = endpoint.raw
            batch_size = endpoint_info["model"]["env"]["INFINITY_BATCH_SIZE"]
            vendor = endpoint_info["provider"]["vendor"]
            instance_type = endpoint_info["compute"]["instanceType"]
            logger.success('Endpoint is ready!')
            logger.info(f'\tVendor: {vendor}')
            logger.info(f'\tInstance: {instance_type}')
            logger.info(f'\tBatch Size: {batch_size}')

            args_dict = dict(
                endpoint=endpoint,
                text_column=TEXT_COLUMN,
                total_requests=10_000,
                template_file=template_file,
                output_file=output_file,
                dataset_path=DATASET_PATH,
                k6_bin=K6_BIN
            )

            optimal_vus(max_vus=MAX_VUS, args_dict=args_dict, start_vus=start_vus)

            # Only drop the endpoint from the pending list once the delete
            # call has returned, so a failed delete is retried during cleanup.
            endpoint.delete()
            endpoints.pop(0)
            sleep(5)
    finally:
        # Best-effort cleanup of anything still deployed. Errors are logged
        # rather than raised so one failed delete does not prevent the others
        # and does not mask the original exception.
        for leftover_endpoint in endpoints:
            try:
                leftover_endpoint.delete()
            except Exception as cleanup_error:
                logger.warning(f'Could not delete endpoint during cleanup: {cleanup_error}')

[32m2025-01-28 10:51:35.106[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mCreating endpoint with Batch Size:	16[0m
[32m2025-01-28 10:51:36.200[0m | [1mINFO    [0m | [36msrc.deployment[0m:[36mdeploy_endpoint[0m:[36m21[0m - [1mCreating inference endpoint...[0m
[32m2025-01-28 10:51:36.481[0m | [1mINFO    [0m | [36msrc.deployment[0m:[36mdeploy_endpoint[0m:[36m40[0m - [1mWaiting for endpoint to be ready...[0m
[32m2025-01-28 10:51:36.482[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mCreating endpoint with Batch Size:	32[0m
[32m2025-01-28 10:51:37.037[0m | [1mINFO    [0m | [36msrc.deployment[0m:[36mdeploy_endpoint[0m:[36m21[0m - [1mCreating inference endpoint...[0m
[32m2025-01-28 10:51:37.421[0m | [1mINFO    [0m | [36msrc.deployment[0m:[36mdeploy_endpoint[0m:[36m40[0m - [1mWaiting for endpoint to be ready...[0m
[32m2025-01-28 10:51:37.422[0m | [1mINFO    [0m | [36m__main_

KeyboardInterrupt: 

In [54]:
from src.process_dataset import load_json_files

# Define cost per hour considering both hardware type and vendor.
# Keys are (vendor, hw_type) pairs matching the `vendor` / `hw_type` columns.
# NOTE(review): currency/unit is not stated in this notebook — presumably
# USD per hour; confirm against the provider's pricing.
cost_per_hour = {
    ('gcp', 'nvidia-t4'): 0.5,
    ('gcp', 'nvidia-l4'): 0.7,
    ('aws', 'nvidia-t4'): 0.5,
    ('aws', 'nvidia-l4'): 0.8,
}

# Load dataset
df = load_json_files('results/embedding')

# Compute cost
# Seconds needed to serve one billion requests at the measured throughput.
df['seconds_per_billion'] = 1e9 / df['throughput_req_per_sec']

# Hourly price converted to a per-second price. A (vendor, hw_type) pair that
# is missing from `cost_per_hour` yields NaN instead of 0: a zero price would
# make unpriced hardware look free and sort to the top of the ranking below,
# whereas NaN rows are placed last by sort_values.
unknown_price = float('nan')
df['cost_per_sec'] = [
    cost_per_hour.get(vendor_and_hw, unknown_price) / 3600
    for vendor_and_hw in zip(df['vendor'], df['hw_type'])
]
df['1B_cost'] = df['seconds_per_billion'] * df['cost_per_sec']

# Cheapest configurations first.
df.sort_values(by='1B_cost').head()

Unnamed: 0,total_requests,test_duration_sec,successful_requests,avg_latency_ms,p95_latency_ms,throughput_req_per_sec,avg_embedding_size,hw_type,batch_size,engine,vendor,vus,seconds_per_billion,cost_per_sec,1B_cost
3,10000,16.553394,10000,1205.6278,1819.0,604.10572,768,nvidia-l4,64,torch,aws,768,1655339.4,0.000222,367.8532
40,10000,16.68379,10000,1305.3264,2177.1,599.384193,768,nvidia-l4,64,torch,aws,832,1668379.0,0.000222,370.750889
51,10000,16.780754,10000,1228.5425,2100.1,595.920779,768,nvidia-l4,64,torch,aws,776,1678075.4,0.000222,372.905644
6,10000,16.841353,10000,1211.6202,2050.1,593.776521,768,nvidia-l4,64,torch,aws,772,1684135.3,0.000222,374.252289
1,10000,16.907216,10000,1266.5086,2114.0,591.463432,768,nvidia-l4,64,torch,aws,800,1690721.6,0.000222,375.715911


In [55]:
from datasets import Dataset
# Push to the hub
# A separate name is used so the tokenized `dataset` from the preparation cell
# (which the token-distribution plot reads) is not overwritten by the results.
results_dataset = Dataset.from_pandas(df)
results_dataset.push_to_hub(HUB_DATASET_PATH)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/derek-thomas/embedding-ie-optimization/commit/224017dc9e7cf723243ec4b7d5e79772635c7154', commit_message='Upload dataset', commit_description='', oid='224017dc9e7cf723243ec4b7d5e79772635c7154', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/derek-thomas/embedding-ie-optimization', endpoint='https://huggingface.co', repo_type='dataset', repo_id='derek-thomas/embedding-ie-optimization'), pr_revision=None, pr_num=None)