In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from huggingface_hub import login, whoami

login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
# DATASET CONFIG
DATASET='openbmb/RLAIF-V-Dataset'
MODEL='vidore/colqwen2-v1.0-merged'
MIN_TOKENS=50
MAX_TOKENS=8000
SAMPLES=1_000
MAX_VUS = 600
IMAGE_COLUMN='image'
DATASET_PATH='data/vision-embedding-dataset.jsonl'
K6_BIN = "/usr/bin/k6"

HUB_DATASET_PATH = f'{whoami()["name"]}/vision-embedding-ie-optimization'

In [4]:
from datasets import load_dataset
from transformers import AutoTokenizer
from src.process_dataset import pil_to_base64, save_dataset
from loguru import logger
import json

try:
    # Check if the file exists and count lines
    with open(DATASET_PATH, "r", encoding="utf-8") as f:
        line_count = sum(1 for _ in f)

    if line_count != SAMPLES:
        raise ValueError(f"Dataset has {line_count} rows instead of {SAMPLES}. Reprocessing...")

    logger.info(f"Loaded dataset from JSON with {SAMPLES} samples.")

    # Now load the dataset since we confirmed the row count is correct
    with open(DATASET_PATH, "r", encoding="utf-8") as f:
        dataset = json.load(f)

except (FileNotFoundError, ValueError):
    logger.warning("Dataset missing, incorrect size, or corrupted. Reprocessing...")

    dataset = load_dataset(DATASET, 'default', split=f"train[:{SAMPLES}]")
    dataset = dataset.map(lambda x: {"image_b64": pil_to_base64(x[IMAGE_COLUMN])})

    save_dataset(dataset.select_columns(["image_b64"]), DATASET_PATH)
    logger.info(f"Saved new dataset with {SAMPLES} samples.")

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
[32m2025-01-30 08:26:49.336[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m15[0m - [1mLoaded dataset from JSON with 1000 samples.[0m
[32m2025-01-30 08:26:55.957[0m | [1mINFO    [0m | [36msrc.process_dataset[0m:[36msave_dataset[0m:[36m56[0m - [1mSaved dataset to data/vision-embedding-dataset.jsonl in JSONL format[0m
[32m2025-01-30 08:26:55.977[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m28[0m - [1mSaved new dataset with 1000 samples.[0m


In [5]:
from dataclasses import dataclass, field, asdict
from typing import Dict

@dataclass
class InstanceConfig:
    repository: str
    accelerator: str
    instance_size: str
    instance_type: str
    custom_image: Dict[str, str]  # Dict of str -> str
    vendor: str
    region: str

@dataclass
class InfinityConfig:
    INFINITY_PORT: str = "80"
    INFINITY_ENGINE: str = "torch"
    INFINITY_BATCH_SIZE: str = "16"
    INFINITY_DTYPE: str = "auto"
    INFINITY_EMBEDDING_DTYPE: str = "float32"
    INFINITY_POOLING_METHOD: str = "auto"
    INFINITY_COMPILE: str = 'false'
    INFINITY_BETTERTRANSFORMER: str = 'false'
    INFINITY_MODEL_ID: str = "/repository"

    def to_dict(self) -> Dict[str, str]:
        """Converts the dataclass to a dictionary representation."""
        return asdict(self)

@dataclass
class ImageConfig:
    health_route: str = "/health"
    url: str = "michaelf34/infinity:0.0.75"
    env: InfinityConfig = field(default_factory=InfinityConfig)


In [6]:
def set_env(batch_size: int, image_config: ImageConfig) -> Dict[str, str]:
    config = asdict(image_config)
    config['env']["INFINITY_BATCH_SIZE"] = str(batch_size)
    return config

# Experiments

In [7]:
from src.deployment import deploy_endpoint
?deploy_endpoint

[0;31mSignature:[0m [0mdeploy_endpoint[0m[0;34m([0m[0minstance_config[0m[0;34m,[0m [0mendpoint_name[0m[0;34m,[0m [0mwait[0m[0;34m=[0m[0;32mFalse[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m Creates and deploys an inference endpoint using the given instance configuration.
[0;31mFile:[0m      ~/encoder-analysis/src/deployment.py
[0;31mType:[0m      function

## GPUs

In [8]:
instance_config_experiment_dicts = [
    {
        'accelerator': 'gpu',
        'instance_size': 'x1',
        'instance_type': 'nvidia-l4',
        'vendor': 'aws',
        'region': 'us-east-1',
    },
]
instance_config_experiments = [InstanceConfig(repository=MODEL, **instance_config_experiment_dict, custom_image=None) for instance_config_experiment_dict in instance_config_experiment_dicts]

In [9]:
import copy
from pathlib import Path
from time import sleep

from loguru import logger

from src.k6 import call_k6, optimal_vus

template_file = "vision-embedding-analysis.js.j2"
output_file = Path("./generated").resolve()/"vision-embedding-analysis.js"

start_vus = 1
batch_sizes = [1, 2, 4, 8, 16, 32]

for og_instance_config_experiment in instance_config_experiments:
    endpoints = []
    for batch_size in batch_sizes:
        # Configure Infinity Settings, pass them up to the image
        infinty_config = InfinityConfig(INFINITY_BATCH_SIZE=str(batch_size))
        image_config = ImageConfig(env=infinty_config)

        # Add the image to the HW instance
        instance_config_experiment = copy.deepcopy(og_instance_config_experiment)
        instance_config_experiment.custom_image = image_config

        # Deploy the endpoint
        logger.info(f'Creating endpoint with Batch Size:\t{batch_size}')
        vendor = instance_config_experiment.vendor
        endpoint = deploy_endpoint(instance_config_experiment, endpoint_name=f'ea-{vendor}-{batch_size}', wait=False)
        endpoints.append(endpoint)

    endpoints[0].wait()
    
    for endpoint in endpoints:
        endpoint.fetch()
        batch_size = endpoint.__dict__["raw"]["model"]["env"]["INFINITY_BATCH_SIZE"]
        vendor = endpoint.__dict__["raw"]["provider"]["vendor"]
        instance_type = endpoint.__dict__["raw"]["compute"]["instanceType"]
        logger.success(f'Endpoint is ready!')
        logger.info(f'\tVendor: {vendor}')
        logger.info(f'\tInstance: {instance_type}')
        logger.info(f'\tBatch Size: {batch_size}')

        args_dict = dict(
            endpoint=endpoint,
            text_column=IMAGE_COLUMN,
            total_requests=SAMPLES,
            template_file=template_file,
            output_file=output_file,
            dataset_path=DATASET_PATH,
            k6_bin=K6_BIN
        )

        optimal_vus(max_vus=MAX_VUS, args_dict=args_dict, start_vus=start_vus)
        endpoint.delete()
        sleep(5)

[32m2025-01-30 08:26:56.126[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mCreating endpoint with Batch Size:	1[0m
[32m2025-01-30 08:26:56.402[0m | [32m[1mSUCCESS [0m | [36msrc.deployment[0m:[36mdeploy_endpoint[0m:[36m15[0m - [32m[1mRe-using Endpoint: hw=nvidia-l4	bs=1	[0m
[32m2025-01-30 08:26:56.402[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mCreating endpoint with Batch Size:	2[0m
[32m2025-01-30 08:26:56.528[0m | [32m[1mSUCCESS [0m | [36msrc.deployment[0m:[36mdeploy_endpoint[0m:[36m15[0m - [32m[1mRe-using Endpoint: hw=nvidia-l4	bs=2	[0m
[32m2025-01-30 08:26:56.528[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mCreating endpoint with Batch Size:	4[0m
[32m2025-01-30 08:26:56.682[0m | [32m[1mSUCCESS [0m | [36msrc.deployment[0m:[36mdeploy_endpoint[0m:[36m15[0m - [32m[1mRe-using Endpoint: hw=nvidia-l4	bs=4	[0m
[32m2025-01-30 08:26:56.683[0m |

In [11]:
from src.process_dataset import load_json_files

# Define cost per hour considering both hardware type and vendor
cost_per_hour = {
    ('gcp', 'nvidia-t4'): 0.5,
    ('gcp', 'nvidia-l4'): 0.7,
    ('aws', 'nvidia-t4'): 0.5,
    ('aws', 'nvidia-l4'): 0.8,
}

# Load dataset
df = load_json_files('results/vision-embedding')

# Compute cost
df['seconds_per_billion'] = 1e9 / df['throughput_req_per_sec']
df['cost_per_sec'] = df.apply(lambda row: cost_per_hour.get((row['vendor'], row['hw_type']), 0) / 3600, axis=1)
df['1B_cost'] = df['seconds_per_billion'] * df['cost_per_sec']

df.sort_values(by='1B_cost').head()

Unnamed: 0,total_requests,test_duration_sec,successful_requests,avg_latency_ms,p95_latency_ms,throughput_req_per_sec,avg_num_vectors,min_num_vectors,max_num_vectors,invalid_embeddings,hw_type,batch_size,image,engine,vendor,vus,seconds_per_billion,cost_per_sec,1B_cost
12,301,60.270528,301,709.554817,1222.0,4.994149,365.445183,81,779,0,nvidia-l4,4,michaelf34/infinity:0.0.75,torch,aws,4,200234300.0,0.000222,44496.51414
2,292,60.472359,292,736.383562,1260.35,4.828652,365.832192,81,779,0,nvidia-l4,1,michaelf34/infinity:0.0.75,torch,aws,4,207097100.0,0.000222,46021.581982
7,294,60.93241,294,732.860544,1275.95,4.825018,370.377551,81,779,0,nvidia-l4,2,michaelf34/infinity:0.0.75,torch,aws,4,207253100.0,0.000222,46056.243394
27,293,60.945637,293,734.651877,1325.4,4.807563,368.901024,81,779,0,nvidia-l4,32,michaelf34/infinity:0.0.75,torch,aws,4,208005600.0,0.000222,46223.463621
17,291,60.621261,291,740.14433,1333.0,4.800296,369.158076,81,779,0,nvidia-l4,8,michaelf34/infinity:0.0.75,torch,aws,4,208320500.0,0.000222,46293.441279


In [12]:
from datasets import Dataset
# Push to the hub
dataset = Dataset.from_pandas(df)
dataset.push_to_hub(HUB_DATASET_PATH)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/derek-thomas/vision-embedding-ie-optimization/commit/89224a3966ac6a03601c1f39227563281ab2aecc', commit_message='Upload dataset', commit_description='', oid='89224a3966ac6a03601c1f39227563281ab2aecc', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/derek-thomas/vision-embedding-ie-optimization', endpoint='https://huggingface.co', repo_type='dataset', repo_id='derek-thomas/vision-embedding-ie-optimization'), pr_revision=None, pr_num=None)