In [1]:
import os
import sys
import numpy as np
import pandas as pd
import yaml 
from datetime import datetime
import time
import json

# Load config.yaml.local into LOCAL_CONFIG and config.yaml into CONFIG.
# Both paths are relative to the notebook's working directory.
# The encoding is pinned to UTF-8 so that parsing does not depend on the
# platform's default locale encoding.
with open("../../config.yaml.local", "r", encoding="utf-8") as f:
    LOCAL_CONFIG = yaml.safe_load(f)
with open("../../config.yaml", "r", encoding="utf-8") as f:
    CONFIG = yaml.safe_load(f)

# Extend the import search path before importing the modules below; the
# relative path is resolved against the notebook's working directory.
sys.path.append("../python")

# Project modules (presumably located in ../python — TODO confirm).
# NOTE(review): `import globals` rebinds the name `globals` in this namespace,
# which shadows the built-in globals() function for the rest of the notebook.
import globals
import data_tools as dt
import emb
import utils

# Directory locations read from the local config file.
LOCAL_PATH = LOCAL_CONFIG["LOCAL_PATH"]
DATA_PATH = LOCAL_CONFIG["DATA_PATH"]

# Read the model metadata table and pick the entry for the embedding model
# configured in `emb`. The encoding is pinned to UTF-8 so the JSON file is
# decoded the same way regardless of the platform's default locale.
with open(os.path.join(LOCAL_PATH, 'metadata/models.json'), 'r', encoding='utf-8') as f:
    MODELS = json.load(f)
model = MODELS[emb.EMBEDDING_MODEL]

# Input prices for the regular and batch endpoints; the cost cell multiplies
# them by (tokens / 1e6), i.e. they are treated as prices per million tokens.
input_cost = model['input_cost']
input_cost_batch = model['input_cost_batch']

OVERWRITE = False


In [2]:
# --- Run parameters -------------------------------------------------------
# When True, token totals are accumulated and the cost summary is printed.
ESTIMATE_COSTS = True

# Batch settings.
BATCH = True
BATCH_SIZE = 16_000

# Row-index window over the filtered posts; the row at END_IDX is included.
START_IDX = 0
END_IDX = 100_000

# Today's date formatted as YYYY-MM-DD.
DATESTR = datetime.now().date().isoformat()

In [3]:
# Load the posts table; a missing body becomes an empty string so that it is
# rejected by the non-empty-text condition below.
posts = dt.get_posts()
posts['text'] = posts['text'].fillna('')

# Individual row conditions, combined below in the same order as listed.
invoice_not_failed = posts['invoiceActionState'] != 'FAILED'
not_bio = ~posts['bio']
not_freebie = ~posts['freebie']
not_saloon = ~posts['saloon']
sub_allowed = ~posts['subName'].isin(['jobs', 'ama'])
not_deleted = posts['title'] != 'deleted by author'
has_text = posts['text'].str.len() > 0

mask = (
    invoice_not_failed
    & not_bio
    & not_freebie
    & not_saloon
    & sub_allowed
    & not_deleted
    & has_text
)

# Keep matching rows, order them by itemId and renumber the index from 0.
posts = (
    posts.loc[mask]
    .sort_values(by='itemId', ascending=True)
    .reset_index(drop=True)
)
print(len(posts))

98098


In [4]:
# Count the input tokens (title + text) of every post whose row index lies in
# [START_IDX, END_IDX], both ends included.
t0 = time.time()

# These accumulators are initialised here but not filled by this cell; they
# are kept so that any other cell reading them still finds them defined.
texts_to_submit = []
title_embeddings = []
text_embeddings = []
batch_num = 0
total_input_tokens = 0

# Tokenizing is only needed for the cost estimate, so skip the whole pass
# when the estimate is switched off.
if ESTIMATE_COSTS:
    # `posts` carries a sorted 0..n-1 integer index (it is reset after the
    # sort in the loading cell), so a label slice with .loc selects exactly
    # the rows START_IDX..END_IDX inclusive and tolerates out-of-range bounds.
    selected = posts.loc[START_IDX:END_IDX, ['title', 'text']]

    # Walk the two columns directly instead of using iterrows(), which
    # builds a full Series object for every row.
    for title, text in zip(selected['title'], selected['text']):
        title_tokens = utils.token_length(title)
        text_tokens = utils.token_length(text)
        total_input_tokens += title_tokens + text_tokens
    

In [5]:
# Turn the accumulated token count into dollar figures for the regular and
# batch prices (both applied per 1e6 tokens) and print a three-line summary.
if ESTIMATE_COSTS:
    tokens_in_millions = total_input_tokens / 1e6
    total_input_cost = tokens_in_millions * input_cost
    total_input_cost_batch = tokens_in_millions * input_cost_batch
    print(
        f"Input tokens: {total_input_tokens:,.0f}",
        f"Total cost: ${total_input_cost:,.2f}",
        f"Total cost (batch): ${total_input_cost_batch:,.2f}",
        sep="\n",
    )


Input tokens: 24,990,252
Total cost: $0.50
Total cost (batch): $0.25


In [6]:
# Final cleanup step: call the close_connections() helper of the project's
# `emb` module (presumably releases connections that module opened — TODO confirm).
emb.close_connections()