In [3]:
import pandas as pd
from sqlalchemy import create_engine, text
import os
from openai import OpenAI
import json

In [4]:
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
engine = create_engine(
    "postgresql://dhchoi:dhchoikvphil@internal.dh-choi.com:41991/kvphil_db"
)
df = pd.read_sql("SELECT * FROM sources", engine)

In [5]:
df.to_pickle("sources.pkl")

In [9]:
def generate_embeddings(x):
    custom_id = f"request-{x['id']}"
    model = "text-embedding-3-large"
    message = f"{x['content']}"
    request = {
        "custom_id": custom_id,
        "method": "POST",
        "url": "/v1/embeddings",
        "body": {"model": model, "input": message},
    }
    return request


pd.DataFrame(df.apply(generate_embeddings, axis=1).tolist()).to_json(
    "hume_batch.jsonl", orient="records", lines=True
)

hume_batch_file = client.files.create(
    file=open("hume_batch.jsonl", "rb"),
    purpose="batch",
)
hume_batch = client.batches.create(
    input_file_id=hume_batch_file.id,
    endpoint="/v1/embeddings",
    completion_window="24h",
    metadata={"description": "nightly eval job"},
)

In [13]:
output_file_id = client.batches.retrieve(hume_batch.id).output_file_id
output_file = client.files.retrieve(output_file_id)
output_file

FileObject(id='file-JttiFeKRbHHpnhAY458NLQ', bytes=711036825, created_at=1732356557, filename='batch_6741834c3364819082a995cdf75710e3_output.jsonl', object='file', purpose='batch_output', status='processed', status_details=None)

In [20]:
with open("hume_batch_output.jsonl", "w", encoding="utf-8") as f:
    data = client.files.content(output_file.id)
    data = data.read().decode("utf-8")
    f.write(data)

In [16]:
batch_data = []
for idx, row in pd.read_json("hume_batch_output.jsonl", lines=True).iterrows():
    custom_id = row["custom_id"].split("-")[1]
    embedding = row["response"]["body"]["data"][0]["embedding"]
    batch_data.append({"id": custom_id, "embedding": embedding})
batch_df = pd.DataFrame(batch_data)

Unnamed: 0,id,embedding
0,0,"[0.0023190142, -0.019855855000000002, 0.005896..."
1,1,"[-0.021769166000000003, -0.007441734, -0.00528..."
2,2,"[-0.010515945, -0.004091165, -0.0026601746, -0..."
3,3,"[0.0031086365000000003, 0.018671235, -0.005157..."
4,4,"[-0.030661957000000004, -0.012744579, -0.01623..."
...,...,...
16781,16781,"[0.002298085, 0.016510515, -0.02266849, -0.034..."
16782,16782,"[0.027077505, 0.03412655, -0.00020101323, 0.04..."
16783,16783,"[0.026228834000000003, 0.009013152, -0.0154178..."
16784,16784,"[0.014483221000000001, -0.0025272262, 0.001221..."


In [26]:
batch_df["id"] = batch_df["id"].astype("int64")
df = df.merge(batch_df, on="id", how="left")
df = df.drop(columns=["embedding_x"])
df = df.rename(columns={"embedding_y": "embedding"})

Unnamed: 0,id,section_id,paragraph_id,content,philosopher_id,url,embedding_x,embedding_y
0,0,SECTI.,"P 1.1, Bea 3",1. SOME objects produce immediately an agreea...,hume,https://davidhume.org/texts/p/1#1,,"[0.0023190142, -0.019855855000000002, 0.005896..."
1,1,SECTI.,"P 1.2, Bea 3","Some objects again, by being naturally confor...",hume,https://davidhume.org/texts/p/1#2,,"[-0.021769166000000003, -0.007441734, -0.00528..."
2,2,SECTI.,"P 1.3, Bea 3","2. All good or evil, whence-ever it arises, p...",hume,https://davidhume.org/texts/p/1#3,,"[-0.010515945, -0.004091165, -0.0026601746, -0..."
3,3,SECTI.,"P 1.4, Bea 3","When good is certain or very probable, it pro...",hume,https://davidhume.org/texts/p/1#4,,"[0.0031086365000000003, 0.018671235, -0.005157..."
4,4,SECTI.,"P 1.5, Bea 3","When either good or evil is uncertain, it giv...",hume,https://davidhume.org/texts/p/1#5,,"[-0.030661957000000004, -0.012744579, -0.01623..."
...,...,...,...,...,...,...,...,...
16781,16781,CHAP.LXXI.JAMES II.Conduct of the prince of Or...,H 71.83n23,"23. Brief observations, &c.",hume,https://davidhume.org/texts/h/71#83n23,,"[0.002298085, 0.016510515, -0.02266849, -0.034..."
16782,16782,CHAP.LXXI.JAMES II.Conduct of the prince of Or...,H 71.89n24,"24. Life of Clarendon, p. 237.",hume,https://davidhume.org/texts/h/71#89n24,,"[0.027077505, 0.03412655, -0.00020101323, 0.04..."
16783,16783,CHAP.LXXI.JAMES II.Conduct of the prince of Or...,H 71.94n25,"25. Scobell, i. 44, 134. ii. 88, 230.",hume,https://davidhume.org/texts/h/71#94n25,,"[0.026228834000000003, 0.009013152, -0.0154178..."
16784,16784,CHAP.LXXI.JAMES II.Conduct of the prince of Or...,H 71.100n26,26. The duke of Buckingham died on the 16th o...,hume,https://davidhume.org/texts/h/71#100n26,,"[0.014483221000000001, -0.0025272262, 0.001221..."


In [30]:
df.to_sql("sources", engine, if_exists="replace", index=False)

  self.meta.reflect(


786