In [1]:
import os
import sys
import numpy as np
import pandas as pd
import yaml 
from datetime import datetime
import time
import json

with open("../../config.yaml.local", "r") as f:
    LOCAL_CONFIG = yaml.safe_load(f)
with open("../../config.yaml", "r") as f:
    CONFIG = yaml.safe_load(f)

sys.path.append("../python")

import globals
import data_tools as dt
import emb
import utils

LOCAL_PATH = LOCAL_CONFIG["LOCAL_PATH"]
DATA_PATH = LOCAL_CONFIG["DATA_PATH"]

with open(os.path.join(LOCAL_PATH, 'metadata/models.json'), 'r') as f:
    MODELS = json.load(f)
model = MODELS[emb.EMBEDDING_MODEL]
input_cost = model['input_cost']
input_cost_batch = model['input_cost_batch']

OVERWRITE = False


In [2]:
ESTIMATE_COSTS = False
BATCH = True
START_IDX = 0
END_IDX = 99000
BATCH_SIZE = 4000
DATESTR = datetime.now().strftime("%Y-%m-%d")

In [3]:
posts = dt.get_posts()
posts['text'] = posts['text'].fillna('')

mask = (posts['invoiceActionState'] != 'FAILED') & \
    (~posts['bio']) & (~posts['freebie']) & (~posts['saloon']) & \
    (~posts['subName'].isin(['jobs', 'ama'])) & \
    (posts['title'] != 'deleted by author') 

posts = posts.loc[mask].reset_index(drop=True)
posts = posts.sort_values(by='itemId', ascending=True).reset_index(drop=True)
print(len(posts))

191541


In [4]:
t0 = time.time()

texts_to_submit = []
post_embeddings = []
batch_num = 0
total_requests = 0
total_input_tokens = 0
total_bytes = 0
for idx, row in posts.iterrows():
    if idx < START_IDX:
        continue
    if idx > END_IDX:
        break
    
    title = row['title']
    text = row['text']

    if ESTIMATE_COSTS:
        title_chunks = utils.split_to_max_length(title)
        text_chunks = utils.split_to_max_length(text)
        for chunk in title_chunks + text_chunks:
            total_requests += 1
            n_tokens = utils.token_length(chunk)
            total_input_tokens += n_tokens
            total_bytes += 64 + emb.EMBEDDING_DIMENSION*4  # 64B per hash, 4B per embedding dim
    elif BATCH:
        title_chunks = utils.split_to_max_length(title)
        text_chunks = utils.split_to_max_length(text)
        for chunk in title_chunks + text_chunks:
            chunk_hash = utils.get_hash(chunk)
            cached_response = emb.check_cache(chunk_hash)
            if cached_response and not OVERWRITE:
                continue
            texts_to_submit.append(chunk)
            if len(texts_to_submit) >= BATCH_SIZE:
                batch_filename = f"post_embeddings_batch_{DATESTR}_{START_IDX}_{END_IDX}_{batch_num}.jsonl"
                batch = emb.create_batch_job(texts_to_submit, batch_filename, overwrite=OVERWRITE)
                batch_num += 1
                texts_to_submit = []
    else:
        title_embedding = emb.get_embedding_robust(title, overwrite=OVERWRITE)
        text_embedding = emb.get_embedding_robust(text, overwrite=OVERWRITE)
        post_embeddings.append({
            'itemId': row['itemId'],
            'title_embedding': title_embedding,
            'text_embedding': text_embedding
        })
    if ((idx+1)%5000)==0:
        print(f"Processed {idx+1:,} items ... elapsed time: {(time.time()-t0)/60:,.2f} minutes")

if (not ESTIMATE_COSTS) and (BATCH) and (len(texts_to_submit)>0):
    batch_filename = f"post_embeddings_batch_{DATESTR}_{START_IDX}_{END_IDX}_{batch_num}.jsonl"
    batch = emb.create_batch_job(texts_to_submit, batch_filename, overwrite=OVERWRITE)

print(f"Elapsed time: {(time.time()-t0)/60:,.2f} minutes")

Processed 5,000 items ... elapsed time: 0.04 minutes
Batch input file created: /Users/ekung/Dropbox/projects/sn-research/processed_data/batch/post_embeddings_batch_2025-11-14_0_99000_0.jsonl (3967 requests)
Batch job created with ID: batch_69177e4e7bb48190b74b748edb2a50ca
Processed 10,000 items ... elapsed time: 0.14 minutes
Batch input file created: /Users/ekung/Dropbox/projects/sn-research/processed_data/batch/post_embeddings_batch_2025-11-14_0_99000_1.jsonl (3966 requests)
Batch job created with ID: batch_69177e53af5c8190baeb518c1c164caa
Processed 15,000 items ... elapsed time: 0.22 minutes
Batch input file created: /Users/ekung/Dropbox/projects/sn-research/processed_data/batch/post_embeddings_batch_2025-11-14_0_99000_2.jsonl (3966 requests)
Batch job created with ID: batch_69177e586f248190b93f810debe5f01d
Processed 20,000 items ... elapsed time: 0.30 minutes
Batch input file created: /Users/ekung/Dropbox/projects/sn-research/processed_data/batch/post_embeddings_batch_2025-11-14_0_9

In [5]:
if ESTIMATE_COSTS:
    total_input_cost = (total_input_tokens / 1e6) * input_cost
    total_input_cost_batch = (total_input_tokens / 1e6) * input_cost_batch
    print(f"Total requests: {total_requests:,}")
    print(f"Input tokens: {total_input_tokens:,.0f}")
    print(f"Total cost: ${total_input_cost:,.2f}")
    print(f"Total cost (batch): ${total_input_cost_batch:,.2f}")
    print(f"Total storage: {total_bytes / 1e9:,.2f} GB")
elif not BATCH:
    post_embeddings_df = pd.DataFrame(post_embeddings)
    outfilename = os.path.join(DATA_PATH, "post_embeddings.pkl")
    post_embeddings_df.to_pickle(outfilename)

In [6]:
emb.close_connections()