In [None]:
# Switch the kernel's working directory to the parent folder; presumably so the
# project-local `prompts` and `utils` imports resolve from the repo root — confirm.
# NOTE(review): the path is relative, so re-running this cell moves up one more level each time.
%cd ..

In [None]:
import os

import pandas as pd
from dotenv import load_dotenv
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from tqdm import tqdm

from prompts import load_template
from utils import DataVersionManager

In [None]:
# Load variables from a .env file into the environment; OPENAI_API_KEY is read later via os.getenv.
load_dotenv()
# Project helper used below to fetch the training data and to build the output file path.
dvm = DataVersionManager()

In [None]:
# Fetch the training data through the version manager; presumably the most recent
# version, going by the method name — confirm against DataVersionManager.
df = dvm.search_latest_train_data()
# Preview the first rows.
df.head()

In [None]:
# Field names used to index each data row when building the model input in `infer`.
PARAGRAPH = "paragraph"
QUESTION = "question"
CHOICES = "choices"
QUESTION_PLUS = "question_plus"
ANSWER = "answer"
# Name of the column that holds the generated reasoning output.
REASONING = "reasoning"

In [None]:
# Prompt text is loaded through the project's template helper, then wrapped in a PromptTemplate.
reasoning_template = load_template(file_name="reasoning.txt", template_type="keyword_extraction")
prompt = PromptTemplate.from_template(template=reasoning_template)

# Model settings gathered in one place; temperature 0 asks for the least random sampling.
llm_settings = {
    "model": "gpt-4o-mini",
    "temperature": 0,
    "max_tokens": None,
    "timeout": None,
    "max_retries": 2,
    "api_key": os.getenv("OPENAI_API_KEY"),
}
llm = ChatOpenAI(**llm_settings)

# Pipe the rendered prompt into the chat model.
chain = prompt | llm

In [None]:
def infer(row: pd.Series) -> str:
    """Run the prompt/LLM chain on a single data row and return the reply content.

    Args:
        row: One record exposing the PARAGRAPH, QUESTION, QUESTION_PLUS,
            CHOICES and ANSWER fields.

    Returns:
        The ``content`` attribute of the response produced by ``chain.invoke``.
    """
    # Left: variable name handed to the chain. Right: field looked up on the row.
    column_by_variable = {
        "paragraph": PARAGRAPH,
        "question": QUESTION,
        "question_plus": QUESTION_PLUS,
        "choices": CHOICES,
        "answer": ANSWER,
    }
    payload = {variable: row[column] for variable, column in column_by_variable.items()}
    return chain.invoke(payload).content

In [None]:
# Register `progress_apply` on pandas objects so the per-row model calls show a progress bar.
tqdm.pandas()

# A small, seeded sample bounds the number of model calls and keeps the row selection reproducible.
SAMPLE_SIZE = 20
SAMPLE_SEED = 1004

# Cap at len(df): DataFrame.sample raises ValueError when asked for more rows than exist
# (sampling without replacement). With 20 or more rows this selects the same rows as before.
sampled_df: pd.DataFrame = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=SAMPLE_SEED)
# One chain call per row; `infer` is passed directly and the target column is named
# through the REASONING constant instead of a repeated string literal.
sampled_df[REASONING] = sampled_df.progress_apply(infer, axis=1)
sampled_df.head()

In [None]:
# Ask the version manager for the destination path, then write the sampled rows there
# without the index column.
# NOTE(review): the positional 1, 0 are presumably version numbers — confirm against DataVersionManager.
output_path = dvm.update_file_path(1, 0, is_experiment=False, save_in_experiment=True)
sampled_df.to_csv(output_path, index=False)