In [41]:
# import libraries
import re
import os
import json
import uuid
import time
import datetime
import importlib
import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI
from src.llm_factory import LLMFactory
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableMap

In [42]:
# Define functions
def search_pali_in_csv(keyword, vocab_folder_path):
    """Search for a Pali word in all CSV files in the folder."""
    result = {
        "id": -1,
        "pali": "",
        "meaning": "",
        "pos": "",
        "exercise_number": "",
        "prdc": []
    }

    for filename in os.listdir(vocab_folder_path):
        if filename.endswith(".csv"): # Only search in CSV files
            file_path = os.path.join(vocab_folder_path, filename)
            df = pd.read_csv(file_path, dtype=str)
            example_columns = [col for col in df.columns if "example" in col.lower()]
            match = df[df["pali"] == keyword]

            if not match.empty:
                result["id"] = match["id"].values[0]
                result["pali"] = match["pali"].values[0]
                result["meaning"] = match["meaning"].values[0]
                result["pos"] = match["pos"].values[0]

                for col in example_columns:
                    sentence = match[col].values[0]
                    if isinstance(sentence, str): # Ensure it's a string before regex
                        # Extract text inside <b>...</b>
                        extracted = re.findall(r"<b>(.*?)</b>", sentence)
                        
                        for ext in extracted:
                            if ext not in result['prdc']:
                                result['prdc'].append(ext)
                    
                print("Match found in file:", filename)
                
                number = filename.split("_")[-1].split(".")[0] # Extract the number
                result["exercise_number"] = number

                break

    if result["id"] == -1:
        print("No matches found")

    return result

In [43]:
# Load environment variables from .env file
load_dotenv()

# Get API key from environment variables
open_api_key = os.getenv("SBS_OPENAI_API_KEY")
deepseek_api_key = os.getenv("SBS_DEEPSEEK_API_KEY")

# Get vocab and exercise path
VOCAB_FOLDER_PATH = "pali_class/vocab"
EXERCISE_FOLDER_PATH = "pali_class/exercises"

# Define variables for OpenAI bathch processing usage
JSONL_FILE = "requests.jsonl"
OUTPUT_FILE = "output.jsonl"

# Get sutta data
new_suttas = pd.read_csv("pali_class/new_suttas.csv")

# Initialize LLMFactory for OpenAI, temperature 0.7 will not be applied
openai_llm = LLMFactory("openai", "original", "gpt-4o", open_api_key, 0.7).get_llm()

# Initialize LLMFactory for DeepSeek
deekseek_llm = LLMFactory("deepseek", "langchain", "deepseek-chat", deepseek_api_key, 0.7).get_llm()

Given a Pali word, extract its meaning, part of speech (POS), exercise number, and the relevant example word enclosed in `<b></b>` from the 'example*' column, along with any other relevant information.

In [44]:
# keyword = input("Enter a Pali word: ").strip() # Remove extra spaces
keyword = "attha 2.1"

result = search_pali_in_csv(keyword, VOCAB_FOLDER_PATH)
result

Match found in file: vocab_class_2.csv


{'id': '2603',
 'pali': 'attha 2.1',
 'meaning': 'need (for); want (for)',
 'pos': 'masc',
 'exercise_number': '2',
 'prdc': ['atthamhi', 'attho']}

Retrieve the exercise data corresponding to the result after searching.

In [45]:
# Find exercise number
exercise_number = result['exercise_number']
found_exercise = False
target_exercise = ""
exercise_data = ""

for filename in os.listdir(EXERCISE_FOLDER_PATH):
    if filename.endswith(".txt") and f"_{exercise_number}." in filename:
        found_exercise = True
        target_exercise = filename
        print("Found:", filename)

if not found_exercise:
    print("Exercise not found")
else:
    exercise_data = open(os.path.join(EXERCISE_FOLDER_PATH, target_exercise), "r").read().strip()
    print(exercise_data)

Found: exercises_class_2.txt
Found: restructured_exercises_class_2.txt
Topic: Class 2 Exercises

Pali: namo tassa bhagavato arahato sammā-sambuddhassa
English Translation: Homage to him, the Blessed One, the Worthy One, the fully Enlightened One.

Pali: avijjāya tv'eva asesa-virāga-nirodhā saṅkhāra-nirodho, saṅkhāra-nirodhā viññāṇa-nirodho, viññāṇa-nirodhā nāmarūpa-nirodho, nāmarūpa-nirodhā saḷāyatana-nirodho, saḷāyatana-nirodhā phassa-nirodho, phassa-nirodhā vedanā-nirodho, vedanā-nirodhā taṇhā-nirodho, taṇhā-nirodhā upādāna-nirodho, upādāna-nirodhā bhava-nirodho, bhava-nirodhā jāti-nirodho, jāti-nirodhā jarā-maraṇaṃ soka-parideva-dukkha-domanass'upāyāsā nirujjhanti.
English Translation: But from the complete fading away and cessation of ignorance there is cessation of volitional formations; from the cessation of volitional formations, cessation of consciousness; from the cessation of consciousness, cessation of name-and-form; from the cessation of name-and-form, cessation of the six 

Define user and system prompt.

In [46]:
SYSTEM_PROMPT = """
You are an AI assistant specialized in Pali language processing. Your primary task is to extract **full sentences** from an exercise dataset based on a given Pali word. Follow these instructions carefully:

1. **Extract and Structure Data:**
   - Identify sentences in the exercise dataset that contain the **given Pali word**, recognizing its various forms (e.g., declensions and conjugations).
   - Extract **full sentences** that provide meaningful context. Do **not** return single-word outputs (e.g., "<b>Anussarati</b>").
   - A valid **sutta reference number** (also known as the **sutta number**) must be **explicitly present within the selected sentence**.
   - **STRICT RULE**: 
      - If the **sutta reference number** (e.g., "AN 3.71") is **not found directly in the selected sentence itself**, **discard the selected sentence**.
   - Remove any **sutta reference number** from the selected sentence (e.g., "AN 3.71", "AN 3.71 (simpl)", or similar citation formats) once identified.
   - Store the **sutta reference number** in the `"class_source"` field.
   - **Handling `(simpl)` in `class_source` Formatting:**
      - If the extracted **sutta reference number** contains **"(simpl)"**, preserve it (e.g., `"DN 19.7 (simpl)"`).
      - If **"(simpl)"** is **not present**, return only the core sutta number (e.g., `"DN 19.7"`).

2. **Ensure Correct Sutta Reference Name Retrieval:**
   - The **sutta reference name** (also known as the **sutta reference name**) will not be available in the exercise dataset.
   - After obtaining the **sutta reference number**, retrieve the corresponding **sutta reference name**.
      - Example: If **sutta reference number** is `"AN 2.1"`, the **sutta reference name** should be `"vajjasuttaṃ"`.
      - If the correct **sutta reference name** cannot be determined, return `"unknown"` instead of making an incorrect guess.
   - Store the **sutta reference name** in the `"class_sutta"` field.

3. **Formatting Rule for `"class_example"`:**
   - The **selected Pali word** is **highlighted alone** within `<b>...</b>`.
   - The capitalization of the entire sentence must remain exactly as in the exercise dataset.

4. **`"pali"` Field Rule:**  
   - The `"pali"` field **must always** match the **exact given Pali word**, including numbering if present (e.g., `"anussarati 1.1"`).
   - Even if the extracted sentence contains a different **morphological form**, the `"pali"` field must **not** be altered.

5. **`"explanation"` Field Rule:**  
   - The `"explanation"` field should provide a brief justification for why the response was generated **if the user asks a question**.
   - If the user requests a solution, provide a practical suggestion to achieve the desired outcome.
   - If **no question is provided**, return an **empty string** (`""`).

6. **Output Structure and Formatting:**
   - If a valid sentence is found and the sutta reference number is explicitly present within it, return the output in **JSON format** with the following structure:
      {
         "id": "<Given Pali ID>",
         "pali": "<Given Pali word>",
         "selected_pali_word": "<Selected Pali word>",
         "class_source": "<Sutta Reference Number>",
         "class_sutta": "<Sutta Reference Name>",
         "class_example": "<A full sentence with <b>Selected Pali word</b> highlighted, preserving original capitalization>",
         "english_translation": "<English translation>",
         "explanation": "<A justification for why this response was generated and a solution if applicable, or an empty string if no question was asked>"
      }
   - If the sutta reference number is missing from the selected sentence or no valid sentence is found, return a **JSON object** with empty values for all fields except "id" and "pali":
      {
         "id": "<Given Pali ID>",
         "pali": "<Given Pali word>",
         "selected_pali_word": "",
         "class_source": "",
         "class_sutta": "",
         "class_example": "",
         "english_translation": "",
         "explanation": ""
      }

7. **Strict JSON Output rules:**
   **IMPORTANT: FOLLOW THESE RULES STRICTLY**
   - **DO NOT** wrap the JSON output in Markdown formatting (e.g., ` ```json ... ``` `).
   - **DO NOT** add any extra text before or after the JSON response.
   - **DO NOT** format the output as a code block.
   - **ONLY RETURN A PLAIN JSON OBJECT.**
   - **Failure to follow these rules will result in an invalid response.**
"""

USER_PROMPT = f"""
For the given Pali word: "{result['pali']}" with ID: "{result['id']}", User question: Why this sentence is relevant as there are not any sutta reference numbers in it?
"""

print(USER_PROMPT)


For the given Pali word: "attha 2.1" with ID: "2603", User question: Why this sentence is relevant as there are not any sutta reference numbers in it?



Perform a single inference to monitor the result produced by DeepSeek.

In [47]:
# Deepseek Inference
response_text = deekseek_llm.chat(system_prompt=SYSTEM_PROMPT, user_prompt=USER_PROMPT)

# Convert string to JSON
try:
    response_json = json.loads(response_text)
    print(json.dumps(response_json, indent=4, ensure_ascii=False)) # Pretty print
    # sutta_name = new_suttas[new_suttas['sutta_number'] == response_json['class_source'].replace(" ", "").replace("(simpl)", "")]["sutta_name"].tolist()
    # print(sutta_name)
except json.JSONDecodeError:
    print("Invalid JSON response:", response_text)

    # Clean response by removing first ```json and last ```
    cleaned = re.sub(r'^```json\n|\n```$', '', response_text)
    response_json = json.loads(cleaned)

    print(json.dumps(response_json, indent=4, ensure_ascii=False)) # Pretty print

Invalid JSON response: ```json
{
   "id": "2603",
   "pali": "attha 2.1",
   "selected_pali_word": "",
   "class_source": "",
   "class_sutta": "",
   "class_example": "",
   "english_translation": "",
   "explanation": "The response was generated because no valid sentence containing the Pali word 'attha 2.1' with an explicit sutta reference number was found in the dataset. The strict rule requires the sutta reference number to be present within the selected sentence itself; otherwise, the sentence is discarded. Since no such sentence exists, the output fields are left empty."
}
```
{
    "id": "2603",
    "pali": "attha 2.1",
    "selected_pali_word": "",
    "class_source": "",
    "class_sutta": "",
    "class_example": "",
    "english_translation": "",
    "explanation": "The response was generated because no valid sentence containing the Pali word 'attha 2.1' with an explicit sutta reference number was found in the dataset. The strict rule requires the sutta reference number to b

Perform batch processing on all Pali words from a specified vocab .csv file and send the data to DeepSeek for inference the result.

In [48]:
vocab_df_for_evaluation = pd.read_csv("pali_class/vocab/vocab_class_2.csv")
pick_palis = vocab_df_for_evaluation["pali"]
pick_palis

0               attha 2.1
1               anālaya 1
2                anuttara
3                 apara 1
4      aparena samayena 1
              ...        
118            vūpasama 1
119              vedanā 1
120              hattha 1
121            adhigama 2
122              ābādha 2
Name: pali, Length: 123, dtype: object

In [49]:
SYSTEM_PROMPT_BATCH_PROCESSING = """
You are an AI assistant specialized in Pali language processing. Your primary task is to extract **full sentences** from an exercise dataset based on a given Pali word. Follow these instructions carefully:

1. **Extract and Structure Data:**
   - Identify sentences in the exercise dataset that contain the **given Pali word**, recognizing its various forms (e.g., declensions and conjugations).
   - Extract **full sentences** that provide meaningful context. Do **not** return single-word outputs (e.g., "<b>Anussarati</b>").
   - A valid **sutta reference number** (also known as the **sutta number**) must be **explicitly present within the selected sentence**.
   - **STRICT RULE**: 
      - If the **sutta reference number** (e.g., "AN 3.71") is **not found directly in the selected sentence itself**, **discard the selected sentence**.
   - Remove any **sutta reference number** from the selected sentence (e.g., "AN 3.71", "AN 3.71 (simpl)", or similar citation formats) once identified.
   - Store the **sutta reference number** in the `"class_source"` field.
   - **Handling `(simpl)` in `class_source` Formatting:**
      - If the extracted **sutta reference number** contains **"(simpl)"**, preserve it (e.g., `"DN 19.7 (simpl)"`).
      - If **"(simpl)"** is **not present**, return only the core sutta number (e.g., `"DN 19.7"`).

2. **Ensure Correct Sutta Reference Name Retrieval:**
   - The **sutta reference name** (also known as the **sutta reference name**) will not be available in the exercise dataset.
   - After obtaining the **sutta reference number**, retrieve the corresponding **sutta reference name**.
      - Example: If **sutta reference number** is `"AN 2.1"`, the **sutta reference name** should be `"vajjasuttaṃ"`.
      - If the correct **sutta reference name** cannot be determined, return `"unknown"` instead of making an incorrect guess.
   - Store the **sutta reference name** in the `"class_sutta"` field.

3. **Formatting Rule for `"class_example"`:**
   - The **selected Pali word** is **highlighted alone** within `<b>...</b>`.
   - The capitalization of the entire sentence must remain exactly as in the exercise dataset.

4. **`"pali"` Field Rule:**  
   - The `"pali"` field **must always** match the **exact given Pali word**, including numbering if present (e.g., `"anussarati 1.1"`).
   - Even if the extracted sentence contains a different **morphological form**, the `"pali"` field must **not** be altered.

5. **Output Structure and Formatting:**
   - If a valid sentence is found and the sutta reference number is explicitly present within it, return the output in **JSON format** with the following structure:  
      - id: <Given Pali ID>,
      - pali: <Given Pali word>,
      - selected_pali_word: <Selected Pali word>,
      - class_source: <Sutta Reference Number>,
      - class_sutta: <Sutta Reference Name>,
      - class_example: <A full sentence with <b>Selected Pali word</b> highlighted, preserving original capitalization>,
      - english_translation: <English translation>,
   - If the sutta reference number is missing from the selected sentence or no valid sentence is found, return a **JSON object** with empty values for all fields except "id" and "pali":
      - id: <Given Pali ID>,
      - pali: <Given Pali word>,
      - selected_pali_word: "",
      - class_source: "",
      - class_sutta: "",
      - class_example: "",
      - english_translation: "",

6. **Strict JSON Output rules:**
   **IMPORTANT: FOLLOW THESE RULES STRICTLY**
   - **DO NOT** wrap the JSON output in Markdown formatting (e.g., ` ```json ... ``` `).
   - **DO NOT** add any extra text before or after the JSON response.
   - **DO NOT** format the output as a code block.
   - **ONLY RETURN A PLAIN JSON OBJECT.**
   - **Failure to follow these rules will result in an invalid response.**      
"""

USER_PROMPT_BATCH_PROCESSING = "For the given Pali word: {pali} with ID: {id}, extract a relevant sentence in exercise data: {exercise}"

In [50]:
batch_input = []

for pali in pick_palis[0:5]:
    result = search_pali_in_csv(pali, VOCAB_FOLDER_PATH)
    batch_input.append({
        "pali": result['pali'],
        "id": result['id'], 
        "exercise": exercise_data,
    })

batch_result_deekseek = deekseek_llm.batch_processing(SYSTEM_PROMPT_BATCH_PROCESSING, USER_PROMPT_BATCH_PROCESSING, batch_input)

# Convert to DataFrame and save
df = pd.DataFrame(batch_result_deekseek)
df.to_csv("pali_class/output/deepseek/output.csv", index=False, encoding="utf-8")

print("CSV file saved as pali_class/output/deepseek/output.csv")

Match found in file: vocab_class_2.csv
Match found in file: vocab_class_2.csv
Match found in file: vocab_class_2.csv
Match found in file: vocab_class_2.csv
Match found in file: vocab_class_2.csv
CSV file saved as pali_class/output/deepseek/output.csv


In [51]:
ground_truth = pd.read_csv("pali_class/ground_truth/exercise-2_ground_truth.csv", encoding="ISO-8859-1")
evaluation_result = pd.read_csv("pali_class/output/deepseek/output.csv", encoding="utf-8")

# Rename columns for clarity
ground_truth = ground_truth.rename(columns={
    "target_word": "target_word_gt",
    "class_source": "class_source_gt",
    "class_sutta": "class_sutta_gt",
    "class_example": "class_example_gt",
    "english_translation": "english_translation_gt"
})
evaluation_result = evaluation_result.rename(columns={
    "selected_pali_word": "target_word_pred",
    "class_source": "class_source_pred",
    "class_sutta": "class_sutta_pred",
    "class_example": "class_example_pred",
    "english_translation": "english_translation_pred"
})

# Merge by ID
df = pd.merge(ground_truth, evaluation_result, on="id", how="inner")

# Strict comparison (no text normalization)
df["target_word_match"] = df["target_word_gt"] == df["target_word_pred"]
df["class_source_match"] = df["class_source_gt"] == df["class_source_pred"]
df["class_example_match"] = df["class_example_gt"] == df["class_example_pred"]
df["english_translation_match"] = df["english_translation_gt"] == df["english_translation_pred"]

# Accuracy summary
metrics = {
    "target_word_accuracy": df["target_word_match"].mean(),
    "class_source_accuracy": df["class_source_match"].mean(),
    "class_example_accuracy": df["class_example_match"].mean(),
    "english_translation_accuracy": df["english_translation_match"].mean()
}

# Output metrics
for k, v in metrics.items():
    print(f"{k}: {v:.2%}")

# Overall Match Accuracy
df["all_match"] = (
    df["target_word_match"] &
    df["class_source_match"] &
    df["class_example_match"] &
    df["english_translation_match"]
)

target_word_accuracy: 60.00%
class_source_accuracy: 0.00%
class_example_accuracy: 20.00%
english_translation_accuracy: 20.00%


Perform a single inference to monitor the result produced by OpenAI.

In [52]:
SYSTEM_PROMPT = """
You are an AI assistant specialized in Pali language processing. Your primary task is to extract **full sentences** from an exercise dataset based on a given Pali word. Follow these instructions carefully:

1. **Extract and Structure Data:**
   - Identify sentences in the exercise dataset that contain the **given Pali word**, recognizing its various forms (e.g., declensions and conjugations).
   - Extract **full sentences** that provide meaningful context. Do **not** return single-word outputs (e.g., "<b>Anussarati</b>").
   - A valid **sutta reference number** (also known as the **sutta number**) must be **explicitly present within the selected sentence**.
   - **STRICT RULE**: 
      - If the **sutta reference number** (e.g., "AN 3.71") is **not found directly in the selected sentence itself**, **discard the selected sentence**.
   - Remove any **sutta reference number** from the selected sentence (e.g., "AN 3.71", "AN 3.71 (simpl)", or similar citation formats) once identified.
   - Store the **sutta reference number** in the `"class_source"` field.
   - **Handling `(simpl)` in `class_source` Formatting:**
      - If the extracted **sutta reference number** contains **"(simpl)"**, preserve it (e.g., `"DN 19.7 (simpl)"`).
      - If **"(simpl)"** is **not present**, return only the core sutta number (e.g., `"DN 19.7"`).

2. **Ensure Correct Sutta Reference Name Retrieval:**
   - The **sutta reference name** (also known as the **sutta reference name**) will not be available in the exercise dataset.
   - After obtaining the **sutta reference number**, retrieve the corresponding **sutta reference name**.
      - Example: If **sutta reference number** is `"AN 2.1"`, the **sutta reference name** should be `"vajjasuttaṃ"`.
      - If the correct **sutta reference name** cannot be determined, return `"unknown"` instead of making an incorrect guess.
   - Store the **sutta reference name** in the `"class_sutta"` field.

3. **Formatting Rule for `"class_example"`:**
   - The **selected Pali word** is **highlighted alone** within `<b>...</b>`.
   - The capitalization of the entire sentence must remain exactly as in the exercise dataset.

4. **`"pali"` Field Rule:**  
   - The `"pali"` field **must always** match the **exact given Pali word**, including numbering if present (e.g., `"anussarati 1.1"`).
   - Even if the extracted sentence contains a different **morphological form**, the `"pali"` field must **not** be altered.

5. **`"explanation"` Field Rule:**  
   - The `"explanation"` field should provide a brief justification for why the response was generated **if the user asks a question**.
   - If the user requests a solution, provide a practical suggestion to achieve the desired outcome.
   - If **no question is provided**, return an **empty string** (`""`).

6. **Output Structure and Formatting:**
   - If a valid sentence is found and the sutta reference number is explicitly present within it, return the output in **JSON format** with the following structure:
      {
         "id": "<Given Pali ID>",
         "pali": "<Given Pali word>",
         "selected_pali_word": "<Selected Pali word>",
         "class_source": "<Sutta Reference Number>",
         "class_sutta": "<Sutta Reference Name>",
         "class_example": "<A full sentence with <b>Selected Pali word</b> highlighted, preserving original capitalization>",
         "english_translation": "<English translation>",
         "explanation": "<A justification for why this response was generated and a solution if applicable, or an empty string if no question was asked>"
      }
   - If the sutta reference number is missing from the selected sentence or no valid sentence is found, return a **JSON object** with empty values for all fields except "id" and "pali":
      {
         "id": "<Given Pali ID>",
         "pali": "<Given Pali word>",
         "selected_pali_word": "",
         "class_source": "",
         "class_sutta": "",
         "class_example": "",
         "english_translation": "",
         "explanation": ""
      }

7. **Strict JSON Output rules:**
   **IMPORTANT: FOLLOW THESE RULES STRICTLY**
   - **DO NOT** wrap the JSON output in Markdown formatting (e.g., ` ```json ... ``` `).
   - **DO NOT** add any extra text before or after the JSON response.
   - **DO NOT** format the output as a code block.
   - **ONLY RETURN A PLAIN JSON OBJECT.**
   - **Failure to follow these rules will result in an invalid response.**
"""

USER_PROMPT = f"""
For the Pali word: "{result['pali']}" and its id: "{result['id']}", \
which has "{result['pos']}" as its grammatical part of speech and "{result['meaning']}" as Pali word`s english meaning, \
find example sentence in exercise data: "{exercise_data}".
"""

print(USER_PROMPT)


For the Pali word: "aparena samayena 1" and its id: "6463", which has "idiom" as its grammatical part of speech and "at another time; later" as Pali word`s english meaning, find example sentence in exercise data: "Topic: Class 2 Exercises

Pali: namo tassa bhagavato arahato sammā-sambuddhassa
English Translation: Homage to him, the Blessed One, the Worthy One, the fully Enlightened One.

Pali: avijjāya tv'eva asesa-virāga-nirodhā saṅkhāra-nirodho, saṅkhāra-nirodhā viññāṇa-nirodho, viññāṇa-nirodhā nāmarūpa-nirodho, nāmarūpa-nirodhā saḷāyatana-nirodho, saḷāyatana-nirodhā phassa-nirodho, phassa-nirodhā vedanā-nirodho, vedanā-nirodhā taṇhā-nirodho, taṇhā-nirodhā upādāna-nirodho, upādāna-nirodhā bhava-nirodho, bhava-nirodhā jāti-nirodho, jāti-nirodhā jarā-maraṇaṃ soka-parideva-dukkha-domanass'upāyāsā nirujjhanti.
English Translation: But from the complete fading away and cessation of ignorance there is cessation of volitional formations; from the cessation of volitional formations, cessati

In [54]:
response_text = openai_llm.chat(SYSTEM_PROMPT, USER_PROMPT)

# Convert string to JSON
try:
    response_json = json.loads(response_text)
    print(json.dumps(response_json, indent=4, ensure_ascii=False)) # Pretty print
    # sutta_name = new_suttas[new_suttas['sutta_number'] == response_json['class_source'].replace(" ", "").replace("(simpl)", "")]["sutta_name"].tolist()
    # print(sutta_name)
except json.JSONDecodeError:
    print("Invalid JSON response:", response_text)
    # Clean response by removing first ```json and last ```
    print(json.dumps(re.sub(r'^```json\n|\n```$', '', response_text), indent=4, ensure_ascii=False))

{
    "id": "6463",
    "pali": "aparena samayena 1",
    "selected_pali_word": "aparena samayena",
    "class_source": "VIN 1.4.1.2",
    "class_sutta": "unknown",
    "class_example": "on <b>another occasion</b>, the Awakened One…",
    "english_translation": "on another occasion, the Awakened One…",
    "explanation": ""
}


In [55]:
openai = OpenAI(
    api_key=open_api_key
)

try:
    models = openai.models.list()
    print("Connection successful. Available models:")
    for model in models.data:
        print(model.id)
except Exception as e:
    print("Connection failed:", e)

Connection successful. Available models:
gpt-4-1106-preview
dall-e-3
dall-e-2
gpt-4o-audio-preview-2024-10-01
gpt-4-turbo-preview
text-embedding-3-small
gpt-4-turbo-2024-04-09
gpt-4-turbo
babbage-002
gpt-4
text-embedding-ada-002
chatgpt-4o-latest
text-embedding-3-large
gpt-4o-mini-audio-preview
gpt-4o-audio-preview
gpt-4o-mini-realtime-preview
gpt-4o-mini-realtime-preview-2024-12-17
gpt-4.1-nano
gpt-3.5-turbo-instruct-0914
gpt-4o-mini-search-preview
gpt-4.1-nano-2025-04-14
gpt-3.5-turbo-16k
gpt-4o-realtime-preview
davinci-002
gpt-3.5-turbo-1106
gpt-4o-search-preview
gpt-3.5-turbo-instruct
gpt-3.5-turbo
o3-mini-2025-01-31
gpt-4o-mini-search-preview-2025-03-11
gpt-4-0125-preview
gpt-4o-2024-11-20
gpt-4o-2024-05-13
o1-2024-12-17
o1
gpt-4-0613
o1-mini
gpt-4o-mini-tts
o1-pro
gpt-4o-transcribe
gpt-4.5-preview
o1-pro-2025-03-19
gpt-4.5-preview-2025-02-27
gpt-4o-search-preview-2025-03-11
gpt-image-1
o1-mini-2024-09-12
tts-1-hd
gpt-4o
tts-1-hd-1106
gpt-4o-2024-08-06
gpt-4o-mini-2024-07-18
gpt-4

In [56]:
def generate_unique_id():
    timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S") # Format: YYYYMMDDHHMMSS
    unique_str = uuid.uuid4().hex
    return f"{timestamp}-{unique_str}"

# Overwrite the file with nothing
with open(JSONL_FILE, "w") as f:
    # pass
    f.write("")

for p in pick_palis.head(3):
# for p in pick_palis:
    result = search_pali_in_csv(p, VOCAB_FOLDER_PATH)
    # Generate a unique ID for the request (user-timestamp-unique)
    unique_id = generate_unique_id()
    
    # if len(result["prdc"]) > 0:
    #     USER_PROMPT = f"""
    # For the Pali word: "{result['pali']}" and its id: "{result['id']}", \
    # which has "{result['pos']}" as its grammatical part of speech, "{result['meaning']}" as Pali word`s english meaning and \
    # **Found declensions and conjugations**: {', '.join(result['prdc'])}, \
    # find example sentence in exercise data: "{exercise_data}".
    # """
    # else:
    #     USER_PROMPT = f"""
    # For the Pali word: "{result['pali']}" and its id: "{result['id']}", \
    # which has "{result['pos']}" as its grammatical part of speech and "{result['meaning']}" as Pali word`s english meaning, \
    # find example sentence in exercise data: "{exercise_data}".
    # """

    USER_PROMPT = f"""
    For the given Pali word: "{result['pali']}" with ID: "{result['id']}", \
    extract a relevant sentence in exercise data: "{exercise_data}".
    """

    request = {
        "custom_id": unique_id, 
        "method": "POST", 
        "url": "/v1/chat/completions", 
        "body": {
            "model": "gpt-4-turbo", 
            "messages": [
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": USER_PROMPT}
            ],
            "max_tokens": 1000
        }
    }

    # Write to JSONL file (append mode for multiple requests)
    with open(JSONL_FILE, "a", encoding="utf-8") as f:
        json.dump(request, f) # Convert dictionary to JSON string
        f.write("\n") # Newline for the next JSON object

    print(f"Request saved to {JSONL_FILE}")

Match found in file: vocab_class_2.csv
Request saved to requests.jsonl
Match found in file: vocab_class_2.csv
Request saved to requests.jsonl
Match found in file: vocab_class_2.csv
Request saved to requests.jsonl


In [57]:
batch_input_file = openai.files.create(
    file=open(JSONL_FILE, "rb"),
    purpose="batch"
)

batch_input_file

FileObject(id='file-9dFw5ppdDjhB4txeKxZp4N', bytes=37030, created_at=1748759520, filename='requests.jsonl', object='file', purpose='batch', status='processed', expires_at=None, status_details=None)

In [58]:
batch_input_file_id = batch_input_file.id

# Start the batch process and store the response
batch_request = openai.batches.create(
    input_file_id=batch_input_file_id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={"description": "Automated batch processing"}
)

# Get Batch ID
batch_id = batch_request.id
print("Batch ID:", batch_id)

Batch ID: batch_683bf3eaea908190945887198a2e9443


In [59]:
# Function to check if the batch is completed
def wait_for_batch_completion(client, batch_id, check_interval=5):
    """Waits for the batch job to complete before proceeding."""
    while True:
        batch_status = client.batches.retrieve(batch_id)
        status = getattr(batch_status, "status", "").lower() # Access attribute directly

        print(f"Batch Status: {status}")

        if status in ["completed", "failed", "cancelled"]:
            break # Stop polling when the batch is done

        time.sleep(check_interval) # Wait before checking again

In [60]:
wait_for_batch_completion(openai, batch_id)

# Proceed to the next step after completion
print("Batch processing is complete. Proceeding to the next step.")

Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status: in_progress
Batch Status

In [62]:
output_file_id = openai.batches.retrieve(batch_id).output_file_id
file_response = openai.files.content(output_file_id)
print(file_response.text)

{"id": "batch_req_683bf4e4ab888190931d278a58fb1ad2", "custom_id": "20250601143158-89d3dcd543054122af6f74b7e4b39d69", "response": {"status_code": 200, "request_id": "0ff512e775a10e899b3fb87803c989b6", "body": {"id": "chatcmpl-BdWX8MlKq2fghWZMVxstq4q5qXS28", "object": "chat.completion", "created": 1748759550, "model": "gpt-4-turbo-2024-04-09", "choices": [{"index": 0, "message": {"role": "assistant", "content": "{\n   \"id\": \"2603\",\n   \"pali\": \"attha 2.1\",\n   \"selected_pali_word\": \"\",\n   \"class_source\": \"\",\n   \"class_sutta\": \"\",\n   \"class_example\": \"\",\n   \"english_translation\": \"\",\n   \"explanation\": \"\"\n}", "refusal": null, "annotations": []}, "logprobs": null, "finish_reason": "stop"}], "usage": {"prompt_tokens": 3225, "completion_tokens": 62, "total_tokens": 3287, "prompt_tokens_details": {"cached_tokens": 0, "audio_tokens": 0}, "completion_tokens_details": {"reasoning_tokens": 0, "audio_tokens": 0, "accepted_prediction_tokens": 0, "rejected_predic

In [63]:
# Initialize an empty DataFrame with predefined columns
df = pd.DataFrame(columns=["id", "pali", "selected_pali_word", "class_source", "class_sutta", "class_example", "english_translation", "explanation"])

for line in file_response.iter_lines():
    if line: # Ignore empty lines
        try:
            data = json.loads(line) # Parse each JSON object
            # Extract response text from OpenAI output
            response_text = data.get("response", {}).get("body", {}).get('choices', [{}])[0].get('message', {}).get('content', '{}')

            # Convert response_text (JSON string) into a dictionary
            extracted_data = json.loads(response_text)

            df = pd.concat([df, pd.DataFrame([extracted_data])], ignore_index=True)
        except json.JSONDecodeError as e:
            # print(f"Error decoding JSON: {e}, Line: {line}")

            data = json.loads(line) # Parse each JSON object
            
            # Extract response text from OpenAI output
            response_text = data.get("response", {}).get("body", {}).get('choices', [{}])[0].get('message', {}).get('content', '{}')
            
            # Clean response by removing first ```json and last ```
            response_text = json.dumps(re.sub(r'^```json\n|\n```$', '', response_text), indent=4, ensure_ascii=False)
            response_text = json.loads(response_text) # Decode first if double-encoded

            # Convert response_text (JSON string) into a dictionary
            extracted_data = json.loads(response_text)

            df = pd.concat([df, pd.DataFrame([extracted_data])], ignore_index=True)

print(df)

     id       pali selected_pali_word class_source class_sutta  \
0  2603  attha 2.1                                               
1  4036  anālaya 1            anālayo     SN 56.11     unknown   
2  4524   anuttara           anuttaro    VIN 1.1.3     unknown   

                                       class_example  \
0                                                      
1  The relinquishing, the abandonment, the <b>non...   
2  Itipi so bhagavā arahaṃ sammāsambuddho vijjāca...   

                                 english_translation explanation  
0                                                                 
1  The relinquishing, the abandonment, the non-at...              
2  It is so (thus) “He [is] the Blessed One, the ...              


In [66]:
# Save DataFrame to CSV
df.to_csv("pali_class/output/openai/output.csv", index=False, encoding="utf-8")

print("CSV file saved as pali_class/output/openai/output.csv")

CSV file saved as pali_class/output/openai/output.csv
