In [1]:
# import libraries
import re
import os
import json
import uuid
import datetime
import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv

In [2]:
# Load environment variables from .env file
load_dotenv()

# Get API key from environment variables
open_ai_key = os.getenv("SBS_OPENAI_API_KEY")

# Initialize OpenAI client
client = OpenAI()
client.api_key = open_ai_key

JSONL_FILE = "requests.jsonl"
OUTPUT_FILE = "output.jsonl"
VOCAB_FOLDER_PATH = "pali_class/vocab"
EXERCISE_FOLDER_PATH = "pali_class/exercises"

In [3]:
new_suttas = pd.read_csv("pali_class/new_suttas.csv")

In [4]:
# keyword = input("Enter a Pali word: ").strip() # Remove extra spaces

keyword = "anussarati 1.1"

def search_pali_in_csv(keyword):
    """Search for a Pali word in all CSV files in the folder."""
    result = {
        "id": -1,
        "pali": "",
        "meaning": "",
        "pos": "",
        "exercise_number": "",
        "prdc": []
    }

    for filename in os.listdir(VOCAB_FOLDER_PATH):
        if filename.endswith(".csv"): # Only search in CSV files
            file_path = os.path.join(VOCAB_FOLDER_PATH, filename)
            df = pd.read_csv(file_path, dtype=str)
            example_columns = [col for col in df.columns if "example" in col.lower()]
            match = df[df["pali"] == keyword]

            if not match.empty:
                result["id"] = match["id"].values[0]
                result["pali"] = match["pali"].values[0]
                result["meaning"] = match["meaning"].values[0]
                result["pos"] = match["pos"].values[0]

                for col in example_columns:
                    sentence = match[col].values[0]
                    if isinstance(sentence, str): # Ensure it's a string before regex
                        # Extract text inside <b>...</b>
                        extracted = re.findall(r"<b>(.*?)</b>", sentence)
                        
                        for ext in extracted:
                            if ext not in result['prdc']:
                                result['prdc'].append(ext)
                    
                print("Match found in file:", filename)
                
                number = filename.split("_")[-1].split(".")[0] # Extract the number
                result["exercise_number"] = number

                break

    if result["id"] == -1:
        print("No matches found")

    return result

result = search_pali_in_csv(keyword)
result

Match found in file: vocab_class_3.csv


{'id': '5326',
 'pali': 'anussarati 1.1',
 'meaning': 'remembers; recollects; bears in mind',
 'pos': 'pr',
 'exercise_number': '3',
 'prdc': ['anussarati', 'anussaretha', 'anussaranti', 'anussarāmi']}

In [5]:
# Find exercise number
exercise_number = result['exercise_number']
found_exercise = False
target_exercise = ""
exercise_data = ""

for filename in os.listdir(EXERCISE_FOLDER_PATH):
    if filename.endswith(".txt") and f"_{exercise_number}." in filename:
        found_exercise = True
        target_exercise = filename
        print("Found:", filename)

if not found_exercise:
    print("Exercise not found")
else:
    exercise_data = open(os.path.join(EXERCISE_FOLDER_PATH, target_exercise), "r").read().strip()
    print(exercise_data)

Found: exercises_class_3.txt
Class 3

ahaṃ bhavantaṃ gotamaṃ saraṇaṃ gacchāmi
I go for refuge to the master Gotama
 

DHP 130 sabbe tasanti daṇḍassa.
All are fearful of a stick.
 
MN 27 tathāgate saddhaṃ paṭilabhati
He obtains faith in the Tathāgata.
 
MN 64	 so tattha ṭhito āsavānaṃ khayaṃ pāpuṇāti.
Remaining there, he reaches the destruction of the defilements. 
 
DHP 354	sabbarasaṃ dhammaraso jināti
The flavour of the Dhamma surpasses all other flavours.
 

MN 39 so ‘idaṃ dukkhan’ti yathābhūtaṃ pajānāti
He knows clearly “this [is] suffering” as it truly is.


 

VIN PAT NP 10 paṭiggaṇhātu āyasmā cīvaracetāpannaṃ
Let the venerable receive the robe-fund.

 
DN 19.7 (simpl) rājā jotipālaṃ māṇavaṃ āmantayati
The king addresses the young gentleman (brahmin), Protector of the Light (Jotipāla). 
 
MN 39	 (simpl) so kāyaṃ vivekajena pītisukhena paripūreti
He completely fills up the body with joy and happiness produced by seclusion.
 

DN 22.18	 sukhaṃ ca kāyena paṭisaṃvedeti
He experiences 

In [85]:
SYSTEM_PROMPT = """
You are an AI assistant specialized in Pali language processing. Your primary task is to extract **full sentences** from an exercise dataset based on a given Pali word. Follow these instructions carefully:

1. **Extract and Structure Data:**
   - Identify sentences in the exercise dataset that contain the **given Pali word**, recognizing its various forms (e.g., declensions and conjugations).
   - Extract **full sentences** that provide meaningful context. Do **not** return single-word outputs (e.g., "<b>Anussarati</b>").
   - Remove any sutta references (e.g., 'AN 3.71', 'AN 3.71 (simpl)', or similar citation formats).
   - Ensure that the selected sentence remains **fully understandable** without its source reference while preserving its original meaning.

2. **Formatting Rule for `"class_example"`:**
   - The **selected Pali word** is **highlighted alone** within `<b>...</b>`.
   - The capitalization of the entire sentence must remain exactly as in the exercise dataset.

3. **`"pali"` Field Rule**  
   - The `"pali"` field **must always** match the **exact given Pali word**, including numbering if present (e.g., `"anussarati 1.1"`).
   - Even if the extracted sentence contains a different **morphological form**, the `"pali"` field must **not** be altered.

4. **`"explanation"` Field Rule:**  
   - The `"explanation"` field should provide a brief justification for why the response was generated **if the user asks a question**.
   - If the user requests a solution, provide a practical suggestion to achieve the desired outcome.
   - If **no question is provided**, return an **empty string** (`""`).

5. **Output Format:**
   - Return output in **JSON format** with the following structure:
      {
         "id": "<Given Pali ID>",
         "pali": "<Given Pali word>",
         "selected_pali_word": "<Selected Pali word>",
         "class_example": "<A full sentence with <b>Selected Pali word</b> highlighted, preserving original capitalization>",
         "english_translation": "<English translation>",
         "explanation": ""<A justification for why this response was generated and a solution if applicable, or an empty string if no question was asked>"
      }
   - If no valid sentence is found, return an empty JSON object `{}`.
   - Do **not** wrap JSON in triple backticks (```json ... ```).
"""

In [86]:
# if len(result["prdc"]) > 0:
#     USER_PROMPT = f"""
# For the Pali word: "{result['pali']}" and its id: "{result['id']}", \
# which has "{result['pos']}" as its grammatical part of speech, "{result['meaning']}" as Pali word`s english meaning and \
# **Found declensions and conjugations**: {', '.join(result['prdc'])}, \
# find example sentence in exercise data: "{exercise_data}".
# """
# else:
#     USER_PROMPT = f"""
# For the Pali word: "{result['pali']}" and its id: "{result['id']}", \
# which has "{result['pos']}" as its grammatical part of speech and "{result['meaning']}" as Pali word`s english meaning, \
# find example sentence in exercise data: "{exercise_data}".
# """

USER_PROMPT = f"""
For the given Pali word: "{result['pali']}" with ID: "{result['id']}", \
extract a relevant sentence in exercise data: "{exercise_data}". \
"""

print(USER_PROMPT)


For the given Pali word: "anussarati 1.1" with ID: "5326", extract a relevant sentence in exercise data: "Class 3

ahaṃ bhavantaṃ gotamaṃ saraṇaṃ gacchāmi
I go for refuge to the master Gotama
 

DHP 130 sabbe tasanti daṇḍassa.
All are fearful of a stick.
 
MN 27 tathāgate saddhaṃ paṭilabhati
He obtains faith in the Tathāgata.
 
MN 64	 so tattha ṭhito āsavānaṃ khayaṃ pāpuṇāti.
Remaining there, he reaches the destruction of the defilements. 
 
DHP 354	sabbarasaṃ dhammaraso jināti
The flavour of the Dhamma surpasses all other flavours.
 

MN 39 so ‘idaṃ dukkhan’ti yathābhūtaṃ pajānāti
He knows clearly “this [is] suffering” as it truly is.


 

VIN PAT NP 10 paṭiggaṇhātu āyasmā cīvaracetāpannaṃ
Let the venerable receive the robe-fund.

 
DN 19.7 (simpl) rājā jotipālaṃ māṇavaṃ āmantayati
The king addresses the young gentleman (brahmin), Protector of the Light (Jotipāla). 
 
MN 39	 (simpl) so kāyaṃ vivekajena pītisukhena paripūreti
He completely fills up the body with joy and happiness prod

In [87]:
llm_model = ["gpt-4o", "gpt-4-turbo"]

# Define single request
response = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {
            "role": "system", 
            "content": SYSTEM_PROMPT
        },
        {
            "role": "user", 
            "content": USER_PROMPT
        }
    ],
    max_tokens=1000
)

# Extract response content
response_text = response.choices[0].message.content

# Convert string to JSON
try:
    response_json = json.loads(response_text)
    print(json.dumps(response_json, indent=4, ensure_ascii=False)) # Pretty print
    # sutta_name = new_suttas[new_suttas['sutta_number'] == response_json['class_source'].replace(" ", "").replace("(simpl)", "")]["sutta_name"].tolist()
    # print(sutta_name)
except json.JSONDecodeError:
    print("Invalid JSON response:", response_text)

{
    "id": "5326",
    "pali": "anussarati 1.1",
    "selected_pali_word": "anussarati",
    "class_example": "sāvako dhammaṃ <b>anussarati</b>",
    "english_translation": "The disciple recollects the Dhamma.",
    "explanation": ""
}


In [82]:
eval_vocab_df = pd.read_csv("pali_class/vocab/vocab_class_4.csv")
pick_palis = eval_vocab_df["pali"]
pick_palis

0         agacchi
1            aggi
2      aṅgulimāla
3       aṭṭhāsi 1
4       atthi 1.1
          ...    
96         hoti 1
97          hotha
98          hontu
99       homa 2.2
100    chindati 5
Name: pali, Length: 101, dtype: object

In [93]:
eval_exercise_data = open("pali_class/exercises/exercises_class_4.txt", "r").read().strip()
print(eval_exercise_data)

Class 4


bālā pamādaṃ anuyuñjanti DHP 026 simpl
 
Fools indulge in negligence.



sugatiṃ saggaṃ lokaṃ upapajjāmi AN 3.66 simpl

I go towards a good destination, a heavenly plane of existence.
 


abhisaṅkharoti kāyena kusalaṃ. AN 3.70

He does good deed by means of the body.
 
DN 22.5	ime sālī ime vīhī ime muggā ime māsā ime tilā ime taṇḍulā’ti

“These are fine rices, paddy, mung beans, speckled beans, sesame, dehusked rice.”
 
UD 45 	atha kho bhagavā bhikkhū āmantesi

And then the Blessed One addressed the monks.
 

SN 21.9 (simpl) atha kho āyasmā tisso upasaṅkami,  ekamantaṃ nisīdi.

And then Venerable Tissa approached and sat down to one side. 
 
SN 35.121 āyasmā rāhulo bhagavato bhāsitaṃ abhinandi

Venerable Rahula delighted in the speech of the Blessed One.
 
MN 39	 (simpl) ahaṃ sakamhā gāmā amuṃ gāmaṃ agacchiṃ

I went from my own village to such and such a village.
 


AN 5.57	byādhi dhammo amhi
I am of the nature to get sick. (lit. I am of the sickness nature)

 

MN 39	 samaṇ

In [94]:
def generate_unique_id(user_id):
    timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S") # Format: YYYYMMDDHHMMSS
    unique_str = uuid.uuid4().hex
    return f"{user_id}-{timestamp}-{unique_str}"

for p in pick_palis:    
    result = search_pali_in_csv(p)
    # Generate a unique ID for the request (user-timestamp-unique)
    unique_id = generate_unique_id("user-1")
    
    if len(result["prdc"]) > 0:
        USER_PROMPT = f"""
    For the Pali word: "{result['pali']}" and its id: "{result['id']}", \
    which has "{result['pos']}" as its grammatical part of speech, "{result['meaning']}" as Pali word`s english meaning and \
    **Found declensions and conjugations**: {', '.join(result['prdc'])}, \
    find example sentence in exercise data: "{exercise_data}".
    """
    else:
        USER_PROMPT = f"""
    For the Pali word: "{result['pali']}" and its id: "{result['id']}", \
    which has "{result['pos']}" as its grammatical part of speech and "{result['meaning']}" as Pali word`s english meaning, \
    find example sentence in exercise data: "{exercise_data}".
    """
        
    request = {
        "custom_id": unique_id, 
        "method": "POST", 
        "url": "/v1/chat/completions", 
        "body": {
            "model": "gpt-4-turbo", 
            "messages": [
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": USER_PROMPT}
            ],
            "max_tokens": 1000
        }
    }

    # Write to JSONL file (append mode for multiple requests)
    with open(JSONL_FILE, "a", encoding="utf-8") as f:
        json.dump(request, f) # Convert dictionary to JSON string
        f.write("\n") # Newline for the next JSON object

    print(f"Request saved to {JSONL_FILE}")

Match found in file: vocab_class_4.csv
Request saved to requests.jsonl
Match found in file: vocab_class_4.csv
Request saved to requests.jsonl
Match found in file: vocab_class_4.csv
Request saved to requests.jsonl
Match found in file: vocab_class_4.csv
Request saved to requests.jsonl
Match found in file: vocab_class_4.csv
Request saved to requests.jsonl
Match found in file: vocab_class_4.csv
Request saved to requests.jsonl
Match found in file: vocab_class_4.csv
Request saved to requests.jsonl
Match found in file: vocab_class_4.csv
Request saved to requests.jsonl
Match found in file: vocab_class_4.csv
Request saved to requests.jsonl
Match found in file: vocab_class_4.csv
Request saved to requests.jsonl
Match found in file: vocab_class_4.csv
Request saved to requests.jsonl
Match found in file: vocab_class_4.csv
Request saved to requests.jsonl
Match found in file: vocab_class_4.csv
Request saved to requests.jsonl
Match found in file: vocab_class_4.csv
Request saved to requests.jsonl
Match 

In [95]:
batch_input_file = client.files.create(
    file=open(JSONL_FILE, "rb"),
    purpose="batch"
)

batch_input_file

FileObject(id='file-D9hSPKvLquyuysojkrBYrB', bytes=841991, created_at=1740378144, filename='requests.jsonl', object='file', purpose='batch', status='processed', status_details=None, expires_at=None)

In [96]:
batch_input_file_id = batch_input_file.id

# Start the batch process and store the response
batch_request = client.batches.create(
    input_file_id=batch_input_file_id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={"description": "Automated batch processing"}
)

# Get Batch ID
batch_id = batch_request.id
print("Batch ID:", batch_id)

Batch ID: batch_67bc1021f5cc81908e64376c673456c6


In [101]:
# Check batch status
client.batches.retrieve(batch_id)

Batch(id='batch_67bc1021f5cc81908e64376c673456c6', completion_window='24h', created_at=1740378145, endpoint='/v1/chat/completions', input_file_id='file-D9hSPKvLquyuysojkrBYrB', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1740378252, error_file_id=None, errors=None, expired_at=None, expires_at=1740464545, failed_at=None, finalizing_at=1740378246, in_progress_at=1740378146, metadata={'description': 'Automated batch processing'}, output_file_id='file-5g45BfFnMQHgEVa3V6my8j', request_counts=BatchRequestCounts(completed=101, failed=0, total=101))

In [102]:
output_file_id = client.batches.retrieve(batch_id).output_file_id
file_response = client.files.content(output_file_id)
print(file_response.text)

{"id": "batch_req_67bc10863da88190a8d9c82f66bd9d3f", "custom_id": "user-1-20250224142156-9fd8c51689f74ae8b86e2782ad13551a", "response": {"status_code": 200, "request_id": "efb06ae2474892aa42478ed9f4e7b40b", "body": {"id": "chatcmpl-B4M9DIz8ZQN3rk9NO5bMiVw9qiTXy", "object": "chat.completion", "created": 1740378147, "model": "gpt-4-turbo-2024-04-09", "choices": [{"index": 0, "message": {"role": "assistant", "content": "{\n  \"id\": \"468\",\n  \"pali\": \"agacchi\",\n  \"class_source\": \"DN 19.7 (simpl)\",\n  \"class_sutta\": \"mah\u0101govindasutta\u1e43\",\n  \"class_example\": \"r\u0101j\u0101 jotip\u0101la\u1e43 m\u0101\u1e47ava\u1e43 \u0101mantayati\",\n  \"english_translation\": \"The king addresses the young gentleman (brahmin), Protector of the Light (Jotip\u0101la).\"\n}", "refusal": null}, "logprobs": null, "finish_reason": "stop"}], "usage": {"prompt_tokens": 2309, "completion_tokens": 113, "total_tokens": 2422, "prompt_tokens_details": {"cached_tokens": 0, "audio_tokens": 0}

In [103]:
# Initialize an empty DataFrame with predefined columns
df = pd.DataFrame(columns=["id", "pali", "class_source", "class_sutta", "class_example", "english_translation"])

for line in file_response.iter_lines():
    if line: # Ignore empty lines
        try:
            data = json.loads(line) # Parse each JSON object
        
            # Extract response text from OpenAI output
            response_text = data.get("response", {}).get("body", {}).get('choices', [{}])[0].get('message', {}).get('content', '{}')

            # Convert response_text (JSON string) into a dictionary
            extracted_data = json.loads(response_text)

            df = pd.concat([df, pd.DataFrame([extracted_data])], ignore_index=True)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON: {e}, Line: {line}")

print(df)

Error decoding JSON: Expecting value: line 1 column 1 (char 0), Line: {"id": "batch_req_67bc1088bcf481908fdeb413cfe597dc", "custom_id": "user-1-20250224142205-486ca414649f41bf8aa4a6a5e54a8a37", "response": {"status_code": 200, "request_id": "169ad91f38d3a276e8d3fb254dee8aa7", "body": {"id": "chatcmpl-B4M9JVaJfu8nWem0f4UBtr2K4vtfS", "object": "chat.completion", "created": 1740378153, "model": "gpt-4-turbo-2024-04-09", "choices": [{"index": 0, "message": {"role": "assistant", "content": "```json\n{\n  \"id\": \"30807\",\n  \"pali\": \"tila\",\n  \"class_source\": \"DN 19.7 (simpl)\",\n  \"class_sutta\": \"mah\u0101govindasutta\u1e43\",\n  \"class_example\": \"r\u0101j\u0101 jotip\u0101la\u1e43 m\u0101\u1e47ava\u1e43 \u0101mantayati\",\n  \"english_translation\": \"The king addresses the young gentleman (brahmin), Protector of the Light (Jotip\u0101la).\"\n}\n```", "refusal": null}, "logprobs": null, "finish_reason": "stop"}], "usage": {"prompt_tokens": 2297, "completion_tokens": 117, "to

In [104]:
# Save DataFrame to CSV
df.to_csv("output.csv", index=False, encoding="utf-8")

print("CSV file saved as output.csv")

CSV file saved as output.csv
