In [12]:
# import libraries
import re
import os
import json
import uuid
import time
import datetime
import importlib
import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI
from src.llm_factory import LLMFactory
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableMap

In [13]:
# Pull the SBS_* credentials from the local .env file into the process environment.
load_dotenv()

# API keys come from the environment; os.getenv yields None when a key is unset.
open_api_key = os.getenv("SBS_OPENAI_API_KEY")
deepseek_api_key = os.getenv("SBS_DEEPSEEK_API_KEY")

# File and folder locations shared by the cells below.
JSONL_FILE = "requests.jsonl"
OUTPUT_FILE = "output.jsonl"
VOCAB_FOLDER_PATH = "pali_class/vocab"
EXERCISE_FOLDER_PATH = "pali_class/exercises"

# OpenAI SDK client (single-request and Batch API cells).
openai_llm = OpenAI(api_key=open_api_key)

# DeepSeek client built through the project's LLMFactory.
# NOTE(review): the positional arguments look like (provider, framework, model,
# api_key, temperature) — confirm against src.llm_factory.
deekseek_llm = LLMFactory("deepseek", "langchain", "deepseek-chat", deepseek_api_key, 0.7).get_llm()

In [14]:
# Sutta lookup table. In this notebook it is only referenced by the commented-out
# `sutta_number` -> `sutta_name` lookup inside the inference cells.
new_suttas = pd.read_csv("pali_class/new_suttas.csv")

In [15]:
# Interactive alternative (disabled so the notebook can run unattended):
# keyword = input("Enter a Pali word: ").strip() # Remove extra spaces

# Headword used for the single-word walkthrough in the following cells.
keyword = "asesa"

def search_pali_in_csv(keyword, vocab_folder=None):
    """Look up a Pali headword in the vocabulary CSV files of a folder.

    Files are visited in sorted name order and the first file whose ``pali``
    column contains an exact match for ``keyword`` wins.

    Args:
        keyword: Headword to match exactly against the ``pali`` column.
        vocab_folder: Folder holding the vocabulary ``*.csv`` files. Defaults
            to the notebook-level ``VOCAB_FOLDER_PATH``.

    Returns:
        dict with keys ``id``, ``pali``, ``meaning``, ``pos``,
        ``exercise_number`` and ``prdc``. ``prdc`` lists the distinct
        ``<b>...</b>`` fragments found in the row's example columns, in order
        of first appearance. When nothing matches, ``id`` stays ``-1`` and the
        other fields keep their empty defaults.
    """
    if vocab_folder is None:
        vocab_folder = VOCAB_FOLDER_PATH

    result = {
        "id": -1,
        "pali": "",
        "meaning": "",
        "pos": "",
        "exercise_number": "",
        "prdc": []
    }

    # os.listdir returns entries in arbitrary order; sorting makes the winning
    # file the same on every run and every machine.
    for filename in sorted(os.listdir(vocab_folder)):
        if not filename.endswith(".csv"):  # Only search in CSV files
            continue

        file_path = os.path.join(vocab_folder, filename)
        df = pd.read_csv(file_path, dtype=str)

        # A stray CSV without a "pali" column cannot match; skip it rather than
        # aborting the whole search with a KeyError.
        if "pali" not in df.columns:
            continue

        match = df[df["pali"] == keyword]
        if match.empty:
            continue

        row = match.iloc[0]
        result["id"] = row["id"]
        result["pali"] = row["pali"]
        result["meaning"] = row["meaning"]
        result["pos"] = row["pos"]

        example_columns = [col for col in df.columns if "example" in col.lower()]
        for col in example_columns:
            sentence = row[col]
            if not isinstance(sentence, str):  # empty cells are read as NaN
                continue
            # Collect text inside <b>...</b>, keeping first-seen order.
            for ext in re.findall(r"<b>(.*?)</b>", sentence):
                if ext not in result["prdc"]:
                    result["prdc"].append(ext)

        print("Match found in file:", filename)

        # "vocab_class_2.csv" -> "2": text after the last "_" up to the first "."
        result["exercise_number"] = filename.split("_")[-1].split(".")[0]
        break

    if result["id"] == -1:
        print("No matches found")

    return result

# Look up the walkthrough keyword; `result` feeds the exercise lookup and the prompt cells below.
result = search_pali_in_csv(keyword)
result

Match found in file: vocab_class_2.csv


{'id': '9869',
 'pali': 'asesa',
 'meaning': 'complete; without remainder',
 'pos': 'adj',
 'exercise_number': '2',
 'prdc': ['asesa', 'asesā']}

In [16]:
# Find the exercise text file whose name carries the matched word's class number.
exercise_number = result['exercise_number']
found_exercise = False
target_exercise = ""
exercise_data = ""

# Several files can match (e.g. "exercises_class_2.txt" and
# "restructured_exercises_class_2.txt"). As before, the last match wins; sorting
# the listing makes "last" well defined instead of depending on the arbitrary
# order returned by os.listdir.
for filename in sorted(os.listdir(EXERCISE_FOLDER_PATH)):
    if filename.endswith(".txt") and f"_{exercise_number}." in filename:
        found_exercise = True
        target_exercise = filename
        print("Found:", filename)

if not found_exercise:
    print("Exercise not found")
else:
    # Explicit UTF-8: the exercises contain Pali diacritics, and the platform
    # default encoding is not UTF-8 everywhere. `with` closes the handle.
    with open(os.path.join(EXERCISE_FOLDER_PATH, target_exercise), "r", encoding="utf-8") as exercise_file:
        exercise_data = exercise_file.read().strip()
    print(exercise_data)

Found: exercises_class_2.txt
Found: restructured_exercises_class_2.txt
Topic: Class 2 Exercises

Pali: namo tassa bhagavato arahato sammā-sambuddhassa
English Translation: Homage to him, the Blessed One, the Worthy One, the fully Enlightened One.

Pali: avijjāya tv'eva asesa-virāga-nirodhā saṅkhāra-nirodho, saṅkhāra-nirodhā viññāṇa-nirodho, viññāṇa-nirodhā nāmarūpa-nirodho, nāmarūpa-nirodhā saḷāyatana-nirodho, saḷāyatana-nirodhā phassa-nirodho, phassa-nirodhā vedanā-nirodho, vedanā-nirodhā taṇhā-nirodho, taṇhā-nirodhā upādāna-nirodho, upādāna-nirodhā bhava-nirodho, bhava-nirodhā jāti-nirodho, jāti-nirodhā jarā-maraṇaṃ soka-parideva-dukkha-domanass'upāyāsā nirujjhanti.
English Translation: But from the complete fading away and cessation of ignorance there is cessation of volitional formations; from the cessation of volitional formations, cessation of consciousness; from the cessation of consciousness, cessation of name-and-form; from the cessation of name-and-form, cessation of the six 

In [17]:
# System prompt for the single-request cells (DeepSeek `.chat` and OpenAI chat
# completions). It contains a literal JSON template with braces and is passed to
# those APIs as-is.
SYSTEM_PROMPT = """
You are an AI assistant specialized in Pali language processing. Your primary task is to extract **full sentences** from an exercise dataset based on a given Pali word. Follow these instructions carefully:

1. **Extract and Structure Data:**
   - Identify sentences in the exercise dataset that contain the **given Pali word**, recognizing its various forms (e.g., declensions and conjugations).
   - Extract **full sentences** that provide meaningful context. Do **not** return single-word outputs (e.g., "<b>Anussarati</b>").
   - A valid **sutta reference number** (also known as the **sutta number**) must be **explicitly present within the selected sentence**.
   - **STRICT RULE**: 
      - If the **sutta reference number** (e.g., "AN 3.71") is **not found directly in the selected sentence itself**, **discard the selected sentence**.
   - Remove any **sutta reference number** from the selected sentence (e.g., "AN 3.71", "AN 3.71 (simpl)", or similar citation formats) once identified.
   - Store the **sutta reference number** in the `"class_source"` field.
   - **Handling `(simpl)` in `class_source` Formatting:**
      - If the extracted **sutta reference number** contains **"(simpl)"**, preserve it (e.g., `"DN 19.7 (simpl)"`).
      - If **"(simpl)"** is **not present**, return only the core sutta number (e.g., `"DN 19.7"`).

2. **Ensure Correct Sutta Reference Name Retrieval:**
   - The **sutta reference name** (also known as the **sutta name**) will not be available in the exercise dataset.
   - After obtaining the **sutta reference number**, retrieve the corresponding **sutta reference name**.
      - Example: If **sutta reference number** is `"AN 2.1"`, the **sutta reference name** should be `"vajjasuttaṃ"`.
      - If the correct **sutta reference name** cannot be determined, return `"unknown"` instead of making an incorrect guess.
   - Store the **sutta reference name** in the `"class_sutta"` field.

3. **Formatting Rule for `"class_example"`:**
   - The **selected Pali word** is **highlighted alone** within `<b>...</b>`.
   - The capitalization of the entire sentence must remain exactly as in the exercise dataset.

4. **`"pali"` Field Rule:**  
   - The `"pali"` field **must always** match the **exact given Pali word**, including numbering if present (e.g., `"anussarati 1.1"`).
   - Even if the extracted sentence contains a different **morphological form**, the `"pali"` field must **not** be altered.

5. **`"explanation"` Field Rule:**  
   - The `"explanation"` field should provide a brief justification for why the response was generated **if the user asks a question**.
   - If the user requests a solution, provide a practical suggestion to achieve the desired outcome.
   - If **no question is provided**, return an **empty string** (`""`).

6. **Output Structure and Formatting:**
   - If a valid sentence is found and the sutta reference number is explicitly present within it, return the output in **JSON format** with the following structure:
      {
         "id": "<Given Pali ID>",
         "pali": "<Given Pali word>",
         "selected_pali_word": "<Selected Pali word>",
         "class_source": "<Sutta Reference Number>",
         "class_sutta": "<Sutta Reference Name>",
         "class_example": "<A full sentence with <b>Selected Pali word</b> highlighted, preserving original capitalization>",
         "english_translation": "<English translation>",
         "explanation": "<A justification for why this response was generated and a solution if applicable, or an empty string if no question was asked>"
      }
   - If the sutta reference number is missing from the selected sentence or no valid sentence is found, return a **JSON object** with empty values for all fields except "id" and "pali":
      {
         "id": "<Given Pali ID>",
         "pali": "<Given Pali word>",
         "selected_pali_word": "",
         "class_source": "",
         "class_sutta": "",
         "class_example": "",
         "english_translation": "",
         "explanation": ""
      }

7. **Strict JSON Output rules:**
   **IMPORTANT: FOLLOW THESE RULES STRICTLY**
   - **DO NOT** wrap the JSON output in Markdown formatting (e.g., ` ```json ... ``` `).
   - **DO NOT** add any extra text before or after the JSON response.
   - **DO NOT** format the output as a code block.
   - **ONLY RETURN A PLAIN JSON OBJECT.**
   - **Failure to follow these rules will result in an invalid response.**
"""

In [18]:
# User message for the single-request inference cells: the looked-up word, its
# id, the full exercise text, and a follow-up question that is expected to be
# answered in the response's "explanation" field.
USER_PROMPT = (
    "\n"
    f'For the given Pali word: "{result["pali"]}" with ID: "{result["id"]}", '
    f'extract a relevant sentence in exercise data: "{exercise_data}". '
    "User question: Why this sentence is relevant as there are not any sutta reference number in it?\n"
)

print(USER_PROMPT)


For the given Pali word: "asesa" with ID: "9869", extract a relevant sentence in exercise data: "Topic: Class 2 Exercises

Pali: namo tassa bhagavato arahato sammā-sambuddhassa
English Translation: Homage to him, the Blessed One, the Worthy One, the fully Enlightened One.

Pali: avijjāya tv'eva asesa-virāga-nirodhā saṅkhāra-nirodho, saṅkhāra-nirodhā viññāṇa-nirodho, viññāṇa-nirodhā nāmarūpa-nirodho, nāmarūpa-nirodhā saḷāyatana-nirodho, saḷāyatana-nirodhā phassa-nirodho, phassa-nirodhā vedanā-nirodho, vedanā-nirodhā taṇhā-nirodho, taṇhā-nirodhā upādāna-nirodho, upādāna-nirodhā bhava-nirodho, bhava-nirodhā jāti-nirodho, jāti-nirodhā jarā-maraṇaṃ soka-parideva-dukkha-domanass'upāyāsā nirujjhanti.
English Translation: But from the complete fading away and cessation of ignorance there is cessation of volitional formations; from the cessation of volitional formations, cessation of consciousness; from the cessation of consciousness, cessation of name-and-form; from the cessation of name-and-

In [19]:
# Deepseek Inference
response_text = deekseek_llm.chat(system_prompt=SYSTEM_PROMPT, user_prompt=USER_PROMPT)

# Convert string to JSON. The model sometimes wraps its answer in a Markdown
# code fence despite the prompt, so a failed parse is retried with the fence
# stripped.
try:
    response_json = json.loads(response_text)
    # sutta_name = new_suttas[new_suttas['sutta_number'] == response_json['class_source'].replace(" ", "").replace("(simpl)", "")]["sutta_name"].tolist()
    # print(sutta_name)
except json.JSONDecodeError:
    print("Invalid JSON response:", response_text)

    # Remove a leading ```json / ``` fence and a trailing ``` fence, tolerating
    # surrounding whitespace.
    cleaned = re.sub(r'^```(?:json)?\s*|\s*```$', '', response_text.strip())
    try:
        response_json = json.loads(cleaned)
    except json.JSONDecodeError as exc:
        # Do not leave a stale `response_json` from an earlier run in scope.
        response_json = None
        print("Response is not valid JSON even after removing code fences:", exc)

if response_json is not None:
    print(json.dumps(response_json, indent=4, ensure_ascii=False)) # Pretty print

Invalid JSON response: ```json
{
   "id": "9869",
   "pali": "asesa",
   "selected_pali_word": "asesa-virāga-nirodhā",
   "class_source": "SN 10.8",
   "class_sutta": "unknown",
   "class_example": "avijjāya tv'eva <b>asesa-virāga-nirodhā</b> saṅkhāra-nirodho, saṅkhāra-nirodhā viññāṇa-nirodho, viññāṇa-nirodhā nāmarūpa-nirodho, nāmarūpa-nirodhā saḷāyatana-nirodho, saḷāyatana-nirodhā phassa-nirodho, phassa-nirodhā vedanā-nirodho, vedanā-nirodhā taṇhā-nirodho, taṇhā-nirodhā upādāna-nirodho, upādāna-nirodhā bhava-nirodho, bhava-nirodhā jāti-nirodho, jāti-nirodhā jarā-maraṇaṃ soka-parideva-dukkha-domanass'upāyāsā nirujjhanti.",
   "english_translation": "But from the complete fading away and cessation of ignorance there is cessation of volitional formations; from the cessation of volitional formations, cessation of consciousness; from the cessation of consciousness, cessation of name-and-form; from the cessation of name-and-form, cessation of the six sense bases; from the cessation of the s

In [20]:
# Headwords of the class-2 vocabulary file; `pick_palis` drives the batch cells below.
# NOTE(review): read without dtype=str, unlike search_pali_in_csv — confirm this is intended.
eval_vocab_df = pd.read_csv("pali_class/vocab/vocab_class_2.csv")
pick_palis = eval_vocab_df["pali"]
pick_palis

0               attha 2.1
1               anālaya 1
2                anuttara
3                 apara 1
4      aparena samayena 1
              ...        
118            vūpasama 1
119              vedanā 1
120              hattha 1
121            adhigama 2
122              ābādha 2
Name: pali, Length: 123, dtype: object

In [21]:
# Redefinition of SYSTEM_PROMPT for the LangChain batch cell below. Section 6
# lists the output fields as bullets instead of a literal JSON object, so this
# variant contains no curly braces.
# NOTE(review): presumably because ChatPromptTemplate would treat "{...}" as
# template variables — confirm.
SYSTEM_PROMPT = """
You are an AI assistant specialized in Pali language processing. Your primary task is to extract **full sentences** from an exercise dataset based on a given Pali word. Follow these instructions carefully:

1. **Extract and Structure Data:**
   - Identify sentences in the exercise dataset that contain the **given Pali word**, recognizing its various forms (e.g., declensions and conjugations).
   - Extract **full sentences** that provide meaningful context. Do **not** return single-word outputs (e.g., "<b>Anussarati</b>").
   - A valid **sutta reference number** (also known as the **sutta number**) must be **explicitly present within the selected sentence**.
   - **STRICT RULE**: 
      - If the **sutta reference number** (e.g., "AN 3.71") is **not found directly in the selected sentence itself**, **discard the selected sentence**.
   - Remove any **sutta reference number** from the selected sentence (e.g., "AN 3.71", "AN 3.71 (simpl)", or similar citation formats) once identified.
   - Store the **sutta reference number** in the `"class_source"` field.
   - **Handling `(simpl)` in `class_source` Formatting:**
      - If the extracted **sutta reference number** contains **"(simpl)"**, preserve it (e.g., `"DN 19.7 (simpl)"`).
      - If **"(simpl)"** is **not present**, return only the core sutta number (e.g., `"DN 19.7"`).

2. **Ensure Correct Sutta Reference Name Retrieval:**
   - The **sutta reference name** (also known as the **sutta name**) will not be available in the exercise dataset.
   - After obtaining the **sutta reference number**, retrieve the corresponding **sutta reference name**.
      - Example: If **sutta reference number** is `"AN 2.1"`, the **sutta reference name** should be `"vajjasuttaṃ"`.
      - If the correct **sutta reference name** cannot be determined, return `"unknown"` instead of making an incorrect guess.
   - Store the **sutta reference name** in the `"class_sutta"` field.

3. **Formatting Rule for `"class_example"`:**
   - The **selected Pali word** is **highlighted alone** within `<b>...</b>`.
   - The capitalization of the entire sentence must remain exactly as in the exercise dataset.

4. **`"pali"` Field Rule:**  
   - The `"pali"` field **must always** match the **exact given Pali word**, including numbering if present (e.g., `"anussarati 1.1"`).
   - Even if the extracted sentence contains a different **morphological form**, the `"pali"` field must **not** be altered.

5. **`"explanation"` Field Rule:**  
   - The `"explanation"` field should provide a brief justification for why the response was generated **if the user asks a question**.
   - If the user requests a solution, provide a practical suggestion to achieve the desired outcome.
   - If **no question is provided**, return an **empty string** (`""`).

6. **Output Structure and Formatting:**
   - If a valid sentence is found and the sutta reference number is explicitly present within it, return the output in **JSON format** with the following structure:  
      - id: <Given Pali ID>,
      - pali: <Given Pali word>,
      - selected_pali_word: <Selected Pali word>,
      - class_source: <Sutta Reference Number>,
      - class_sutta: <Sutta Reference Name>,
      - class_example: <A full sentence with <b>Selected Pali word</b> highlighted, preserving original capitalization>,
      - english_translation: <English translation>,
      - explanation: <A justification for why this response was generated and a solution if applicable, or an empty string if no question was asked>
   - If the sutta reference number is missing from the selected sentence or no valid sentence is found, return a **JSON object** with empty values for all fields except "id" and "pali":
      - id: <Given Pali ID>,
      - pali: <Given Pali word>,
      - selected_pali_word: "",
      - class_source: "",
      - class_sutta: "",
      - class_example: "",
      - english_translation: "",
      - explanation: ""

7. **Strict JSON Output rules:**
   **IMPORTANT: FOLLOW THESE RULES STRICTLY**
   - **DO NOT** wrap the JSON output in Markdown formatting (e.g., ` ```json ... ``` `).
   - **DO NOT** add any extra text before or after the JSON response.
   - **DO NOT** format the output as a code block.
   - **ONLY RETURN A PLAIN JSON OBJECT.**
   - **Failure to follow these rules will result in an invalid response.**      
"""

In [25]:
# Step 1: Setup DeepSeek LLM (reached through ChatOpenAI pointed at DeepSeek's base URL)
llm = ChatOpenAI(
    base_url="https://api.deepseek.com",
    api_key=deepseek_api_key,
    model="deepseek-chat",
)

# Step 2: Create LangChain-compatible prompt
prompt = ChatPromptTemplate([
    ("system", SYSTEM_PROMPT.strip()),
    ("human", "For the given Pali word: {pali} with ID: {id}, extract a relevant sentence in exercise data: {exercise}")
])
print(prompt.input_variables)

# Step 3: Create chain using `prompt | llm`
chain = prompt | llm

# Step 4: Define batch input — one entry per headword.
# NOTE(review): every entry reuses the `exercise_data` loaded by the exercise
# lookup cell, regardless of the `exercise_number` found for each word — confirm
# that all of `pick_palis` belongs to that exercise file.
batch_inputs = []
for pali in pick_palis:
# for pali in pick_palis[0:5]:
    result = search_pali_in_csv(pali)
    batch_inputs.append({
        "pali": result['pali'],
        "id": result['id'],
        "exercise": exercise_data,
    })

# Step 5: Use RunnableMap to batch it
batch_chain = RunnableMap({"response": chain})
results = batch_chain.batch(batch_inputs)

# Step 6: Clean + parse results
parsed_results = []
for r in results:
    raw = r["response"].content
    # Strip a leading ```json / ``` fence and a trailing ``` fence.
    cleaned = re.sub(r'^```(?:json)?\s*|\s*```$', '', raw.strip())
    try:
        parsed = json.loads(cleaned)
        # A double-encoded payload decodes to a str without raising, so the
        # second decode has to happen here rather than in the except branch.
        if isinstance(parsed, str):
            parsed = json.loads(parsed)
    except (json.JSONDecodeError, TypeError):
        parsed = {"error": "Failed to parse", "raw": raw}
    else:
        # Only JSON objects become rows of the result table.
        if not isinstance(parsed, dict):
            parsed = {"error": "Failed to parse", "raw": raw}
    parsed_results.append(parsed)

# for pr in parsed_results:
#     print(json.dumps(pr, indent=4, ensure_ascii=False))

['exercise', 'id', 'pali']
Match found in file: vocab_class_2.csv
Match found in file: vocab_class_2.csv
Match found in file: vocab_class_2.csv
Match found in file: vocab_class_2.csv
Match found in file: vocab_class_2.csv
Match found in file: vocab_class_2.csv
Match found in file: vocab_class_2.csv
Match found in file: vocab_class_2.csv
Match found in file: vocab_class_2.csv
Match found in file: vocab_class_2.csv
Match found in file: vocab_class_2.csv
Match found in file: vocab_class_2.csv
Match found in file: vocab_class_2.csv
Match found in file: vocab_class_2.csv
Match found in file: vocab_class_2.csv
Match found in file: vocab_class_2.csv
Match found in file: vocab_class_2.csv
Match found in file: vocab_class_2.csv
Match found in file: vocab_class_2.csv
Match found in file: vocab_class_2.csv
Match found in file: vocab_class_2.csv
Match found in file: vocab_class_2.csv
Match found in file: vocab_class_2.csv
Match found in file: vocab_class_2.csv
Match found in file: vocab_class_2.cs

In [26]:
# Step 7: Convert to DataFrame and save
output_csv_path = "pali_class/output/deepseek/output.csv"

# to_csv does not create missing parent directories; make sure the folder exists
# so a fresh checkout can run this cell.
os.makedirs(os.path.dirname(output_csv_path), exist_ok=True)

df = pd.DataFrame(parsed_results)
df.to_csv(output_csv_path, index=False, encoding="utf-8")
print(f"CSV file saved as {output_csv_path}")

CSV file saved as pali_class/output/deepseek/output.csv


In [None]:
# OpenAI Inference
# Candidate models; the first entry is the one used for the single request below.
llm_model = ["gpt-4o", "gpt-4-turbo"]

# Define single request
response = openai_llm.chat.completions.create(
    model=llm_model[0],
    messages=[
        {
            "role": "system",
            "content": SYSTEM_PROMPT
        },
        {
            "role": "user",
            "content": USER_PROMPT
        }
    ],
    max_tokens=1000
)

# Extract response content (fall back to "" so parsing fails cleanly when it is None)
response_text = response.choices[0].message.content or ""

# Convert string to JSON, retrying once with a Markdown code fence stripped.
try:
    response_json = json.loads(response_text)
    # sutta_name = new_suttas[new_suttas['sutta_number'] == response_json['class_source'].replace(" ", "").replace("(simpl)", "")]["sutta_name"].tolist()
    # print(sutta_name)
except json.JSONDecodeError:
    print("Invalid JSON response:", response_text)
    # Clean response by removing a leading ```json / ``` and a trailing ```
    cleaned = re.sub(r'^```(?:json)?\s*|\s*```$', '', response_text.strip())
    try:
        response_json = json.loads(cleaned)
    except json.JSONDecodeError as exc:
        # Do not leave a stale `response_json` from an earlier cell in scope.
        response_json = None
        print("Response is not valid JSON even after removing code fences:", exc)

if response_json is not None:
    print(json.dumps(response_json, indent=4, ensure_ascii=False)) # Pretty print

RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

In [None]:
# Reloads the class-2 headwords for the OpenAI Batch API cells below.
# This repeats the load done before the DeepSeek batch cell.
eval_vocab_df = pd.read_csv("pali_class/vocab/vocab_class_2.csv")
pick_palis = eval_vocab_df["pali"]
pick_palis

0               attha 2.1
1               anālaya 1
2                anuttara
3                 apara 1
4      aparena samayena 1
              ...        
118            vūpasama 1
119              vedanā 1
120              hattha 1
121            adhigama 2
122              ābādha 2
Name: pali, Length: 123, dtype: object

In [None]:
# Load the restructured class-2 exercise text.
# Alternative source: "pali_class/exercises/exercises_class_2.txt"
# NOTE(review): the request-building cell below interpolates `exercise_data`,
# not `eval_exercise_data` — confirm which text is meant to be sent.
eval_exercise_path = "pali_class/restructured_exercises/restructured_exercises_class_2.2.txt"

# Explicit UTF-8 (the text contains Pali diacritics); `with` closes the handle.
with open(eval_exercise_path, "r", encoding="utf-8") as eval_exercise_file:
    eval_exercise_data = eval_exercise_file.read().strip()
print(eval_exercise_data)

Class 2 Exercises

namo tassa bhagavato arahato sammā-sambuddhassa
Homage to him, the Blessed One, the Worthy One, the fully Enlightened One.

avijjāya tv'eva asesa-virāga-nirodhā saṅkhāra-nirodho, saṅkhāra-nirodhā viññāṇa-nirodho, viññāṇa-nirodhā nāmarūpa-nirodho, nāmarūpa-nirodhā saḷāyatana-nirodho, saḷāyatana-nirodhā phassa-nirodho, phassa-nirodhā vedanā-nirodho, vedanā-nirodhā taṇhā-nirodho, taṇhā-nirodhā upādāna-nirodho, upādāna-nirodhā bhava-nirodho, bhava-nirodhā jāti-nirodho, jāti-nirodhā jarā-maraṇaṃ soka-parideva-dukkha-domanass'upāyāsā nirujjhanti.
But from the complete fading away and cessation of ignorance there is cessation of volitional formations; from the cessation of volitional formations, cessation of consciousness; from the cessation of consciousness, cessation of name-and-form; from the cessation of name-and-form, cessation of the six sense bases; from the cessation of the six sense bases, cessation of contact; from the cessation of contact, cessation of feeling; f

In [None]:
def generate_unique_id(user_id):
    """Build a request id of the form ``<user_id>-<YYYYMMDDHHMMSS>-<uuid4 hex>``.

    The timestamp is the local time at the moment of the call and the last
    part is a fresh random UUID rendered as 32 hex characters.
    """
    stamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    random_part = uuid.uuid4().hex
    return "-".join([f"{user_id}", stamp, random_part])

# Build one Batch API request per headword, then write them all to JSONL_FILE
# in a single pass (one request per line).
batch_requests = []

# Use `pick_palis` instead of `pick_palis.head(3)` for the full run.
for p in pick_palis.head(3):
    result = search_pali_in_csv(p)
    # Generate a unique ID for the request (user-timestamp-unique)
    unique_id = generate_unique_id("user-1")

    # The literal's leading indentation is part of the prompt text and is kept as-is.
    USER_PROMPT = f"""
    For the given Pali word: "{result['pali']}" with ID: "{result['id']}", \
    extract a relevant sentence in exercise data: "{exercise_data}".
    """

    request = {
        "custom_id": unique_id,
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": "gpt-4-turbo",
            "messages": [
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": USER_PROMPT}
            ],
            "max_tokens": 1000
        }
    }
    batch_requests.append(request)

# Mode "w" truncates any previous file, so no separate "overwrite" step and no
# per-request reopen in append mode are needed.
with open(JSONL_FILE, "w", encoding="utf-8") as f:
    for request in batch_requests:
        json.dump(request, f) # Convert dictionary to JSON string
        f.write("\n") # Newline for the next JSON object
        print(f"Request saved to {JSONL_FILE}")

Match found in file: vocab_class_2.csv
Request saved to requests.jsonl
Match found in file: vocab_class_2.csv
Request saved to requests.jsonl
Match found in file: vocab_class_2.csv
Request saved to requests.jsonl
Match found in file: vocab_class_2.csv
Request saved to requests.jsonl
Match found in file: vocab_class_2.csv
Request saved to requests.jsonl
Match found in file: vocab_class_2.csv
Request saved to requests.jsonl
Match found in file: vocab_class_2.csv
Request saved to requests.jsonl
Match found in file: vocab_class_2.csv
Request saved to requests.jsonl
Match found in file: vocab_class_2.csv
Request saved to requests.jsonl
Match found in file: vocab_class_2.csv
Request saved to requests.jsonl
Match found in file: vocab_class_2.csv
Request saved to requests.jsonl
Match found in file: vocab_class_2.csv
Request saved to requests.jsonl
Match found in file: vocab_class_2.csv
Request saved to requests.jsonl
Match found in file: vocab_class_2.csv
Request saved to requests.jsonl
Match 

In [None]:
# Upload the JSONL request file for batch processing. The handle is opened in a
# `with` block so it is closed once the upload call returns.
with open(JSONL_FILE, "rb") as requests_file:
    batch_input_file = openai_llm.files.create(
        file=requests_file,
        purpose="batch"
    )

batch_input_file

FileObject(id='file-QuTXB7N9X3xt5PRwnxehVQ', bytes=1519429, created_at=1743597110, filename='requests.jsonl', object='file', purpose='batch', status='processed', status_details=None, expires_at=None)

In [None]:
# Id of the uploaded JSONL file; the batch job reads its requests from it.
batch_input_file_id = batch_input_file.id

# Submit the batch job against the chat-completions endpoint and keep the response.
batch_request = openai_llm.batches.create(
    metadata={"description": "Automated batch processing"},
    completion_window="24h",
    endpoint="/v1/chat/completions",
    input_file_id=batch_input_file_id,
)

# The batch id is what the polling and retrieval cells use.
batch_id = batch_request.id
print(f"Batch ID: {batch_id}")

Batch ID: batch_67ed2e3734ac8190b52cab123d98b067


In [None]:
# Function to check if the batch is completed
def wait_for_batch_completion(client, batch_id, check_interval=5, timeout=None):
    """Poll a batch job until it reaches a terminal status.

    Args:
        client: Object exposing ``batches.retrieve(batch_id)``; the returned
            object's ``status`` attribute is read on every poll.
        batch_id: Identifier of the batch job to poll.
        check_interval: Seconds to sleep between polls.
        timeout: Optional upper bound, in seconds, on the total wait.
            ``None`` (the default) waits indefinitely.

    Returns:
        The terminal status in lower case: ``"completed"``, ``"failed"``,
        ``"cancelled"`` or ``"expired"``.

    Raises:
        TimeoutError: If ``timeout`` elapses before a terminal status is seen.
    """
    # "expired" is terminal as well; without it an expired batch would be
    # polled forever.
    terminal_statuses = {"completed", "failed", "cancelled", "expired"}
    deadline = None if timeout is None else time.monotonic() + timeout

    while True:
        batch_status = client.batches.retrieve(batch_id)
        # `or ""` covers a status attribute that is present but None.
        status = (getattr(batch_status, "status", "") or "").lower()

        print(f"Batch Status: {status}")

        if status in terminal_statuses:
            return status # Stop polling when the batch is done

        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(
                f"Batch {batch_id} still '{status}' after {timeout} seconds"
            )

        time.sleep(check_interval) # Wait before checking again

In [None]:
# Block until the polling helper sees a terminal batch status.
# NOTE(review): the helper also stops on non-success statuses such as "failed" or
# "cancelled", so the message below is printed regardless of the outcome.
wait_for_batch_completion(openai_llm, batch_id)

# Proceed to the next step after completion
print("Batch processing is complete. Proceeding to the next step.")

Batch Status: validating
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status:

In [None]:
# Fetch the finished batch and download its output file (JSONL, one response per line).
finished_batch = openai_llm.batches.retrieve(batch_id)
output_file_id = finished_batch.output_file_id

# A batch that did not produce output has no output file id; fail with a clear
# message instead of passing None to the files API.
if output_file_id is None:
    raise RuntimeError(
        f"Batch {batch_id} has no output file (status: {finished_batch.status})"
    )

file_response = openai_llm.files.content(output_file_id)
print(file_response.text)

{"id": "batch_req_67ed2f27e2048190b15e1f22d49b961c", "custom_id": "user-1-20250402203034-49f624d423924127a1e770debbf50e1e", "response": {"status_code": 200, "request_id": "1ffec4565d0dd5b3f994c2e9345e332b", "body": {"id": "chatcmpl-BHrYBrDDTmfYho1hpdDIFXmrh3Lqh", "object": "chat.completion", "created": 1743597123, "model": "gpt-4-turbo-2024-04-09", "choices": [{"index": 0, "message": {"role": "assistant", "content": "{\n   \"id\": \"2603\",\n   \"pali\": \"attha 2.1\",\n   \"selected_pali_word\": \"atthamhi\",\n   \"class_source\": \"MN 35\",\n   \"class_sutta\": \"unknown\",\n   \"class_example\": \"[Good are] friends in [arisen] need [Good are] friends when need [arises]\",\n   \"english_translation\": \"[Good are] friends in [arisen] need [Good are] friends when need [arises]\",\n   \"explanation\": \"\"\n}", "refusal": null, "annotations": []}, "logprobs": null, "finish_reason": "stop"}], "usage": {"prompt_tokens": 3225, "completion_tokens": 116, "total_tokens": 3341, "prompt_token

In [None]:
# Parse the batch output (one JSON object per line) into a DataFrame.
# Columns expected in every model answer; they always come first in the frame.
expected_columns = ["id", "pali", "selected_pali_word", "class_source", "class_sutta", "class_example", "english_translation", "explanation"]

# Collect rows in a list and build the frame once instead of concatenating
# row by row.
records = []
for line in file_response.iter_lines():
    if not line: # Ignore empty lines
        continue

    try:
        data = json.loads(line) # Parse each JSON object

        # Extract the answer text. `or` fallbacks cover keys that are present
        # but null or empty (e.g. a request without a response body).
        body = (data.get("response") or {}).get("body") or {}
        choices = body.get("choices") or [{}]
        response_text = (choices[0].get("message") or {}).get("content") or "{}"

        # Remove a Markdown code fence if the model added one, then decode.
        cleaned = re.sub(r'^```(?:json)?\s*|\s*```$', '', response_text.strip())
        extracted_data = json.loads(cleaned)
    except json.JSONDecodeError as e:
        print(f"Skipping line that could not be decoded: {e}")
        continue

    if not isinstance(extracted_data, dict):
        print(f"Skipping non-object answer: {extracted_data!r}")
        continue

    records.append(extracted_data)

df = pd.DataFrame(records)
# Expected columns first (created empty if absent), then any extra keys the
# model returned.
extra_columns = [col for col in df.columns if col not in expected_columns]
df = df.reindex(columns=expected_columns + extra_columns)

print(df)

        id                pali selected_pali_word           class_source  \
0     2603           attha 2.1           atthamhi                  MN 35   
1     4036           anālaya 1            anālayo               SN 56.11   
2     4524            anuttara           anuttaro              VIN 1.1.3   
3     6258             apara 1                                             
4     6463  aparena samayena 1   aparena samayena            VIN 1.4.1.2   
..     ...                 ...                ...                    ...   
118  70132          vūpasama 1                                             
119  70238            vedanā 1             vedanā                SN 10.8   
120  70807            hattha 1           hatthena  VIN PAT SE 55 (simpl)   
121  74544          adhigama 2         adhigamāya        DN 22.1 (simpl)   
122  76068            ābādha 2                                             

    class_sutta                                      class_example  \
0       unknown  

In [None]:
# Save DataFrame to CSV. The path is defined once so the confirmation message
# always names the file that was actually written.
batch_output_csv_path = "pali_class/output/r-output-2.3.csv"

# to_csv does not create missing parent directories.
os.makedirs(os.path.dirname(batch_output_csv_path), exist_ok=True)
df.to_csv(batch_output_csv_path, index=False, encoding="utf-8")

print(f"CSV file saved as {batch_output_csv_path}")

CSV file saved as pali_class/output/r-output-2.2.csv
