In [1]:
import os
import sys
import numpy as np
import pandas as pd
import yaml 
from datetime import datetime
import time
import json

# Parse both YAML config files (paths are relative to the working directory):
# the ".local" variant first, then the base config.
with open("../../config.yaml.local", "r") as local_cfg:
    LOCAL_CONFIG = yaml.safe_load(local_cfg)

with open("../../config.yaml", "r") as base_cfg:
    CONFIG = yaml.safe_load(base_cfg)

# Add "../python" to the module search path; the project-local imports below
# are presumably resolved from there, so this line must run before them.
sys.path.append("../python")

# NOTE: `import globals` binds the name `globals` to a module in this notebook,
# shadowing Python's builtin globals() function for every later cell.
import globals
import data_tools as dt
import emb
import utils

# Directory roots are taken from the local config.
LOCAL_PATH = LOCAL_CONFIG["LOCAL_PATH"]
DATA_PATH = LOCAL_CONFIG["DATA_PATH"]

# Read the models metadata file and pick the entry keyed by the embedding model
# that `emb` is configured with.
models_json = os.path.join(LOCAL_PATH, 'metadata/models.json')
with open(models_json, 'r') as models_file:
    MODELS = json.load(models_file)

model = MODELS[emb.EMBEDDING_MODEL]

# Prices used by the cost estimate, where they are multiplied by tokens / 1e6.
input_cost = model['input_cost']
input_cost_batch = model['input_cost_batch']

# Passed as `overwrite=` to the emb calls; while False, chunks already found
# in the cache are skipped when building batches.
OVERWRITE = False


In [2]:
# Run mode.
#   ESTIMATE_COSTS: only tally requests, tokens and estimated storage.
#   BATCH (checked only when not estimating): queue uncached chunks into batch
#   jobs instead of requesting one embedding per sub name.
ESTIMATE_COSTS, BATCH = False, False

# Window of positions in the list of sub names to process; both ends are visited.
START_IDX, END_IDX = 0, 100000

# Number of queued texts that triggers creation of a batch job.
BATCH_SIZE = 50

# Run date as YYYY-MM-DD; it becomes part of every batch job filename.
DATESTR = f"{datetime.now():%Y-%m-%d}"

In [3]:
# Load the items table and treat a missing subName as an empty string.
items = dt.get_items()
items['subName'] = items['subName'].fillna('')

# Distinct, non-empty sub names as a plain Python list.
sub_names = items['subName']
subs = sub_names[sub_names.str.len()>0].unique().tolist()

In [4]:
t0 = time.time()

# Accumulators. `sub_embeddings` and the `total_*` counters are read by the
# reporting/saving cell that follows.
texts_to_submit, sub_embeddings = [], []
batch_num = 0
total_requests = total_input_tokens = total_bytes = 0

# Only positions START_IDX..END_IDX (both inclusive) of `subs` are visited.
# Both bounds are clamped at zero so a negative value cannot turn into a
# from-the-end slice.
first_idx = max(START_IDX, 0)
stop_idx = max(END_IDX + 1, 0)

for idx, sub in enumerate(subs[first_idx:stop_idx], start=first_idx):
    if ESTIMATE_COSTS:
        # Estimate mode: only count requests, tokens and expected storage.
        for piece in utils.split_to_max_length(sub):
            total_requests += 1
            total_input_tokens += utils.token_length(piece)
            total_bytes += 32 + emb.EMBEDDING_DIMENSION*4  # 32B per hash, 4B per embedding dim
    elif BATCH:
        # Batch mode: queue every chunk that is not cached (or all chunks when
        # OVERWRITE is set) and create a job each time the queue fills up.
        for piece in utils.split_to_max_length(sub):
            cached = emb.check_cache(utils.get_hash(piece))
            if not cached or OVERWRITE:
                texts_to_submit.append(piece)
                if len(texts_to_submit) >= BATCH_SIZE:
                    job_file = f"sub_embeddings_batch_{DATESTR}_{START_IDX}_{END_IDX}_{batch_num}.jsonl"
                    batch = emb.create_batch_job(texts_to_submit, job_file, overwrite=OVERWRITE)
                    batch_num += 1
                    texts_to_submit = []
    else:
        # Direct mode: one embedding per sub name, collected as a record.
        vector = emb.get_embedding_robust(sub, overwrite=OVERWRITE)
        sub_embeddings.append({'subName': sub, 'embedding': vector})

    # Progress line after every 1000th position.
    if (idx + 1) % 1000 == 0:
        print(f"Processed {idx+1:,} items ... elapsed time: {(time.time()-t0)/60:,.2f} minutes")

# Submit whatever is still queued once the loop is done (batch mode only).
if BATCH and texts_to_submit and not ESTIMATE_COSTS:
    job_file = f"sub_embeddings_batch_{DATESTR}_{START_IDX}_{END_IDX}_{batch_num}.jsonl"
    batch = emb.create_batch_job(texts_to_submit, job_file, overwrite=OVERWRITE)

print(f"Elapsed time: {(time.time()-t0)/60:,.2f} minutes")

Elapsed time: 0.00 minutes


In [5]:
if ESTIMATE_COSTS:
    # Both prices are applied per million input tokens.
    millions_of_tokens = total_input_tokens / 1e6
    total_input_cost = millions_of_tokens * input_cost
    total_input_cost_batch = millions_of_tokens * input_cost_batch

    report_lines = [
        f"Total requests: {total_requests:,}",
        f"Input tokens: {total_input_tokens:,.0f}",
        f"Total cost: ${total_input_cost:,.2f}",
        f"Total cost (batch): ${total_input_cost_batch:,.2f}",
        f"Total storage: {total_bytes / 1e9:,.2f} GB",
    ]
    print(*report_lines, sep="\n")
elif not BATCH:
    # Direct mode: persist the collected records as a pickled DataFrame
    # under DATA_PATH.
    outfilename = os.path.join(DATA_PATH, "sub_embeddings.pkl")
    sub_embeddings_df = pd.DataFrame(sub_embeddings)
    sub_embeddings_df.to_pickle(outfilename)


In [6]:
# Ask the emb module to close its connections now that processing is done.
# NOTE(review): which connections these are (cache, API client, ...) is not
# visible from this notebook — confirm in emb.
emb.close_connections()

In [9]:
# `sub_embeddings_df` only exists after a direct run (neither estimating nor
# batching), so the preview is limited to that mode.
if not (ESTIMATE_COSTS or BATCH):
    print(sub_embeddings_df.head(5))

   subName                                          embedding
0  bitcoin  [0.006226676050573587, -0.036271411925554276, ...
1     meta  [0.02716204710304737, 0.04521230608224869, -0....
2    nostr  [-0.002283795503899455, 0.030661288648843765, ...
3     jobs  [-0.03474509343504906, 0.035567715764045715, 0...
4     tech  [-0.02633531019091606, 0.0053547038696706295, ...
