In [1]:
import os
import sys
import numpy as np
import pandas as pd
import yaml 
from datetime import datetime
import time
import json

def _read_yaml(path):
    """Parse the YAML file at ``path`` with the safe loader and return the result."""
    with open(path, "r") as handle:
        return yaml.safe_load(handle)


# Machine-specific settings (LOCAL_PATH and DATA_PATH are read from this below).
LOCAL_CONFIG = _read_yaml("../../config.yaml.local")
# Shared project configuration.
CONFIG = _read_yaml("../../config.yaml")

# Extend the module search path so the project imports that follow can be
# resolved (presumably they live in ../python -- confirm).
sys.path.append("../python")

import globals
import data_tools as dt
import emb
import utils

# Filesystem locations: two from the local config, one from the emb module.
LOCAL_PATH = LOCAL_CONFIG["LOCAL_PATH"]
DATA_PATH = LOCAL_CONFIG["DATA_PATH"]
BATCH_PATH = emb.BATCH_PATH

# Load the model metadata table and pick the entry for the embedding model
# that the emb module is configured with.
models_json_path = os.path.join(LOCAL_PATH, 'metadata/models.json')
with open(models_json_path, 'r') as models_file:
    MODELS = json.load(models_file)
model = MODELS[emb.EMBEDDING_MODEL]

# Input cost figures for that model (regular and batch); units are not
# visible here -- see metadata/models.json.
input_cost = model['input_cost']
input_cost_batch = model['input_cost_batch']

# Not referenced by the cells shown in this notebook.
OVERWRITE = False


In [2]:
# Uncomment the next line to mark all current jobs as failed (clean restart):
# emb.fail_all_jobs()

In [3]:
# Update the status of batch jobs.
# Jobs whose recorded status is already one of TERMINAL_STATUSES are skipped;
# every other job is passed to emb.update_batch_status and the status of the
# returned batch object is printed.
TERMINAL_STATUSES = ('completed', 'failed', 'expired', 'cancelled', 'written')

batch_jobs = emb.get_batch_jobs_df().copy()
for _, row in batch_jobs.iterrows():
    if row['status'] in TERMINAL_STATUSES:
        continue
    # Only the job id is needed here. The input file path is deliberately not
    # built: it was unused, and joining a missing/NaN file name would raise.
    batch_id = row['id']
    batch = emb.update_batch_status(batch_id)
    print(f"Batch {batch_id} status updated to {batch.status}.")


Batch batch_691650164ed881908aca6185740fbca5 status updated to in_progress.
Batch batch_6916501c6e648190b9b57210119bd833 status updated to in_progress.
Batch batch_691650a34c5481909765172419a59301 status updated to in_progress.
Batch batch_69177e4e7bb48190b74b748edb2a50ca status updated to in_progress.
Batch batch_69177e53af5c8190baeb518c1c164caa status updated to in_progress.
Batch batch_69177e586f248190b93f810debe5f01d status updated to in_progress.
Batch batch_69177e5ccf008190ba90d5b3cd2ab358 status updated to in_progress.
Batch batch_69177e620dc481908b3f1aa64ef1fe67 status updated to completed.
Batch batch_69177e667e2c8190890a6028295eb28c status updated to in_progress.
Batch batch_69177e6bd2c081908c566bdf1964ce71 status updated to in_progress.
Batch batch_69177e70946081909897bb0ac28c817e status updated to in_progress.
Batch batch_69177e7498e8819099aa928dd8fc996b status updated to in_progress.
Batch batch_69177e793e5081908e41150ee6838519 status updated to in_progress.
Batch batch_69

In [4]:
# For completed (or expired) embedding jobs that are not yet written, write the
# results to the embedding store. When a job is written with zero errors it is
# marked 'written' and its output and input files under BATCH_PATH are deleted.
WRITABLE_STATUSES = ('completed', 'expired')

batch_jobs = emb.get_batch_jobs_df().copy()
batch_jobs = batch_jobs[batch_jobs['status'] != 'written'].reset_index(drop=True)
# na=False: rows without an input file name count as "not an embedding job"
# instead of producing a NaN mask, which boolean indexing rejects.
is_embedding_job = batch_jobs['input_file'].str.contains('embedding', na=False)
batch_jobs = batch_jobs[is_embedding_job].reset_index(drop=True)

writable_jobs = batch_jobs[batch_jobs['status'].isin(WRITABLE_STATUSES)]
for _, row in writable_jobs.iterrows():
    batch_id = row['id']
    n_errors = emb.write_batch_to_embedding_store(batch_id)
    if n_errors != 0:
        # Leave the job status and its files untouched when any request failed.
        continue
    emb.update_batch_job(batch_id, status='written')
    # Clean up the output file first, then the input file. A missing file name
    # or an already-deleted file is skipped so that one stale entry does not
    # abort the loop and leave the remaining jobs unprocessed.
    for filename in (row['output_file'], row['input_file']):
        if not isinstance(filename, str):
            continue
        filepath = os.path.join(BATCH_PATH, filename)
        if os.path.exists(filepath):
            os.remove(filepath)


Batch job batch_69177e620dc481908b3f1aa64ef1fe67 processed:
    Total requests: 3958
    Total written: 3958
    Total errors: 0
    Total already existing: 0


In [5]:
# Final step: ask the emb module to close its connections
# (presumably the stores used by the cells above -- confirm in emb).
emb.close_connections()