In [61]:
# import libraries
import re
import os
import json
import uuid
import datetime
import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv

In [62]:
# Load environment variables from .env file
load_dotenv()

# Get API key from environment variables
open_ai_key = os.getenv("SBS_OPENAI_API_KEY")

# Initialize OpenAI client
client = OpenAI()
client.api_key = open_ai_key

JSONL_FILE = "requests.jsonl"
OUTPUT_FILE = "output.jsonl"
VOCAB_FOLDER_PATH = "pali_class/vocab"
EXERCISE_FOLDER_PATH = "pali_class/exercises"

In [63]:
new_suttas = pd.read_csv("pali_class/new_suttas.csv")

In [64]:
# keyword = input("Enter a Pali word: ").strip() # Remove extra spaces

keyword = "anussarati 1.1"

def search_pali_in_csv(keyword):
    """Search for a Pali word in all CSV files in the folder."""
    result = {
        "id": -1,
        "pali": "",
        "meaning": "",
        "pos": "",
        "exercise_number": "",
        "prdc": []
    }

    for filename in os.listdir(VOCAB_FOLDER_PATH):
        if filename.endswith(".csv"): # Only search in CSV files
            file_path = os.path.join(VOCAB_FOLDER_PATH, filename)
            df = pd.read_csv(file_path, dtype=str)
            example_columns = [col for col in df.columns if "example" in col.lower()]
            match = df[df["pali"] == keyword]

            if not match.empty:
                result["id"] = match["id"].values[0]
                result["pali"] = match["pali"].values[0]
                result["meaning"] = match["meaning"].values[0]
                result["pos"] = match["pos"].values[0]

                for col in example_columns:
                    sentence = match[col].values[0]
                    if isinstance(sentence, str): # Ensure it's a string before regex
                        # Extract text inside <b>...</b>
                        extracted = re.findall(r"<b>(.*?)</b>", sentence)
                        
                        for ext in extracted:
                            if ext not in result['prdc']:
                                result['prdc'].append(ext)
                    
                print("Match found in file:", filename)
                
                number = filename.split("_")[-1].split(".")[0] # Extract the number
                result["exercise_number"] = number

                break

    if result["id"] == -1:
        print("No matches found")

    return result

result = search_pali_in_csv(keyword)
result

Match found in file: vocab_class_3.csv


{'id': '5326',
 'pali': 'anussarati 1.1',
 'meaning': 'remembers; recollects; bears in mind',
 'pos': 'pr',
 'exercise_number': '3',
 'prdc': ['anussarati', 'anussaretha', 'anussaranti', 'anussarāmi']}

In [65]:
# Find exercise number
exercise_number = result['exercise_number']
found_exercise = False
target_exercise = ""
exercise_data = ""

for filename in os.listdir(EXERCISE_FOLDER_PATH):
    if filename.endswith(".txt") and f"_{exercise_number}." in filename:
        found_exercise = True
        target_exercise = filename
        print("Found:", filename)

if not found_exercise:
    print("Exercise not found")
else:
    exercise_data = open(os.path.join(EXERCISE_FOLDER_PATH, target_exercise), "r").read().strip()
    print(exercise_data)

Found: exercises_class_3.txt
Class 3

ahaṃ bhavantaṃ gotamaṃ saraṇaṃ gacchāmi
I go for refuge to the master Gotama
 

DHP 130 sabbe tasanti daṇḍassa.
All are fearful of a stick.
 
MN 27 tathāgate saddhaṃ paṭilabhati
He obtains faith in the Tathāgata.
 
MN 64	 so tattha ṭhito āsavānaṃ khayaṃ pāpuṇāti.
Remaining there, he reaches the destruction of the defilements. 
 
DHP 354	sabbarasaṃ dhammaraso jināti
The flavour of the Dhamma surpasses all other flavours.
 

MN 39 so ‘idaṃ dukkhan’ti yathābhūtaṃ pajānāti
He knows clearly “this [is] suffering” as it truly is.


 

VIN PAT NP 10 paṭiggaṇhātu āyasmā cīvaracetāpannaṃ
Let the venerable receive the robe-fund.

 
DN 19.7 (simpl) rājā jotipālaṃ māṇavaṃ āmantayati
The king addresses the young gentleman (brahmin), Protector of the Light (Jotipāla). 
 
MN 39	 (simpl) so kāyaṃ vivekajena pītisukhena paripūreti
He completely fills up the body with joy and happiness produced by seclusion.
 

DN 22.18	 sukhaṃ ca kāyena paṭisaṃvedeti
He experiences 

In [66]:
SYSTEM_PROMPT = """
You are an AI assistant specialized in Pali language processing. Follow these instructions carefully:

1. **Sentences Matching**:
   - Identify sentences in the exercise dataset that contain the given Pali word, including its declensions and conjugations.
   - Use the provided **Pali word`s english meaning** of the given Pali word to better understand its meaning and ensure accurate sentences selection.
   - If the Pali word's **Found declensions and conjugations** is provided, wisely use it to match the sentences in the provided exercise dataset.luding its declensions and conjugations, in the sentences. Ensure that the word appears in its correct grammatical form within the sentences, avoiding partial or irrelevant match.
   - Strictly match the Pali word, including its declensions and conjugations, in the sentences. Ensure that the word appears in its correct grammatical form within the sentences, avoiding partial or irrelevant match.

2. **Extract and Structure Data**:
   - Extract the **source reference** for the selected sentences.
   - Extract the **corresponding English translation** for the sentences.
   - After obtaining the **source reference**, retrieve the corresponding **sutta reference**.
   - Example: If **source reference** (known as sutta number) is `"AN2.1"`, the **sutta reference** (known as sutta name) should be `"vajjasuttaṃ"`.
   - If the **"(simpl)"** found around **source reference**, format the `class_source` as:  
     `"DN 19.7 (simpl)"`
   - If **"(simpl)"** is **not present** around **source reference**, format normally as:  
     `"DN 19.7"`
   - Preserve the **exact original form** of the given Pali word **without modifications**.
   - Do not select sentences marked with **"$" or "%"**, kindly pick another sentences without such marks.
   - Extracts exactly one relevant sentences per word.

3. **Special Formatting for `class_source`**:
   - If the **"(simpl)"** marker is present around the **source reference**, format it as:
      ```
      "class_source": "DN 19.7 (simpl)"
      ```
   - If **"(simpl)"** is **not present**, format normally as:
      ```
      "class_source": "DN 19.7"
      ```

4. **Text Formatting**:
   - **Bold the target Pali word in `class_example`** by wrapping the **exact matched form** of the word in `<b></b>`.
   - Ensure **all occurrences** of the word in the sentences are bolded.
   - Example: If the target word is **dhamma**,  
     - Sentences: `"Ayaṃ dhammo sanantano."`  
     - Output: `"Ayaṃ <b>dhammo</b> sanantano."`

5. **Formatting Rules**:
   - **Ensure strict spacing in `class_source`**:  
     - Always format it as **"AN 10.48"**, **"MN 1"**, **"DN 22"**, etc.
     - There must be **a space** between the collection (e.g., AN, MN, DN) and the number.
     - Do **not** return `"AN10.48"`, `"MN1"`, etc.

6. **Output Format**:
   - Return the result in **JSON format** with the following structure:
     {
       "id": "<Pali ID>",
       "pali": "<Pali word>",
       "class_source": "<Source reference>",
       "class_sutta": "<Sutta reference>",
       "class_example": "<Selected sentences with <b>Pali word</b> highlighted>",
       "english_translation": "<English translation>"
     }
   - If no valid sentences is found, return an empty JSON object `{}`.
   - Do **not** wrap JSON in triple backticks (```json ... ```).
   - Preserve the **exact original form** of the given Pali word **without modifications** for json["pali"].
"""

In [67]:
if len(result["prdc"]) > 0:
    USER_PROMPT = f"""
For the Pali word: "{result['pali']}" and its id: "{result['id']}", \
which has "{result['pos']}" as its grammatical part of speech, "{result['meaning']}" as Pali word`s english meaning and \
**Found declensions and conjugations**: {', '.join(result['prdc'])}, \
find example sentence in exercise data: "{exercise_data}".
"""
else:
    USER_PROMPT = f"""
For the Pali word: "{result['pali']}" and its id: "{result['id']}", \
which has "{result['pos']}" as its grammatical part of speech and "{result['meaning']}" as Pali word`s english meaning, \
find example sentence in exercise data: "{exercise_data}".
"""

print(USER_PROMPT)


For the Pali word: "anussarati 1.1" and its id: "5326", which has "pr" as its grammatical part of speech, "remembers; recollects; bears in mind" as Pali word`s english meaning and **Found declensions and conjugations**: anussarati, anussaretha, anussaranti, anussarāmi, find example sentence in exercise data: "Class 3

ahaṃ bhavantaṃ gotamaṃ saraṇaṃ gacchāmi
I go for refuge to the master Gotama
 

DHP 130 sabbe tasanti daṇḍassa.
All are fearful of a stick.
 
MN 27 tathāgate saddhaṃ paṭilabhati
He obtains faith in the Tathāgata.
 
MN 64	 so tattha ṭhito āsavānaṃ khayaṃ pāpuṇāti.
Remaining there, he reaches the destruction of the defilements. 
 
DHP 354	sabbarasaṃ dhammaraso jināti
The flavour of the Dhamma surpasses all other flavours.
 

MN 39 so ‘idaṃ dukkhan’ti yathābhūtaṃ pajānāti
He knows clearly “this [is] suffering” as it truly is.


 

VIN PAT NP 10 paṭiggaṇhātu āyasmā cīvaracetāpannaṃ
Let the venerable receive the robe-fund.

 
DN 19.7 (simpl) rājā jotipālaṃ māṇavaṃ āmantayati


In [68]:
# Define single request
response = client.chat.completions.create(
    model="gpt-4-turbo",
    messages=[
        {
            "role": "system", 
            "content": SYSTEM_PROMPT
        },
        {
            "role": "user", 
            "content": USER_PROMPT
        }
    ],
    max_tokens=1000
)

# Extract response content
response_text = response.choices[0].message.content

# Convert string to JSON
try:
    response_json = json.loads(response_text)
    print(json.dumps(response_json, indent=4, ensure_ascii=False)) # Pretty print
    sutta_name = new_suttas[new_suttas['sutta_number'] == response_json['class_source'].replace(" ", "").replace("(simpl)", "")]["sutta_name"].tolist()
    print(sutta_name)
except json.JSONDecodeError:
    print("Invalid JSON response:", response_text)

{
    "id": "5326",
    "pali": "anussarati",
    "class_source": "AN 3.71 (simpl)",
    "class_sutta": "raṇṇasuttaṃ",
    "class_example": "sāvako <b>dhammaṃ</b> <b>anussarati</b>",
    "english_translation": "The disciple recollects the Dhamma."
}
['uposathasuttaṃ']


In [69]:
eval_vocab_df = pd.read_csv("pali_class/vocab/vocab_class_3.csv")
pick_palis = eval_vocab_df["pali"]
pick_palis

0          attha 1.1
1     anussarati 1.1
2       abhivaḍḍhati
3          ājānāti 1
4           ānanda 2
           ...      
72          vadati 1
73        vibhaṅga 1
74          vivekaja
75        viharati 1
76       pītisukha 2
Name: pali, Length: 77, dtype: object

In [70]:
eval_exercise_data = open("pali_class/exercises/exercises_class_3.txt", "r").read().strip()
print(eval_exercise_data)

Class 3

ahaṃ bhavantaṃ gotamaṃ saraṇaṃ gacchāmi
I go for refuge to the master Gotama
 

DHP 130 sabbe tasanti daṇḍassa.
All are fearful of a stick.
 
MN 27 tathāgate saddhaṃ paṭilabhati
He obtains faith in the Tathāgata.
 
MN 64	 so tattha ṭhito āsavānaṃ khayaṃ pāpuṇāti.
Remaining there, he reaches the destruction of the defilements. 
 
DHP 354	sabbarasaṃ dhammaraso jināti
The flavour of the Dhamma surpasses all other flavours.
 

MN 39 so ‘idaṃ dukkhan’ti yathābhūtaṃ pajānāti
He knows clearly “this [is] suffering” as it truly is.


 

VIN PAT NP 10 paṭiggaṇhātu āyasmā cīvaracetāpannaṃ
Let the venerable receive the robe-fund.

 
DN 19.7 (simpl) rājā jotipālaṃ māṇavaṃ āmantayati
The king addresses the young gentleman (brahmin), Protector of the Light (Jotipāla). 
 
MN 39	 (simpl) so kāyaṃ vivekajena pītisukhena paripūreti
He completely fills up the body with joy and happiness produced by seclusion.
 

DN 22.18	 sukhaṃ ca kāyena paṭisaṃvedeti
He experiences pleasure by means of the body

In [71]:
def generate_unique_id(user_id):
    timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S") # Format: YYYYMMDDHHMMSS
    unique_str = uuid.uuid4().hex
    return f"{user_id}-{timestamp}-{unique_str}"

for p in pick_palis:    
    result = search_pali_in_csv(p)
    # Generate a unique ID for the request (user-timestamp-unique)
    unique_id = generate_unique_id("user-1")
    
    if len(result["prdc"]) > 0:
        USER_PROMPT = f"""
    For the Pali word: "{result['pali']}" and its id: "{result['id']}", \
    which has "{result['pos']}" as its grammatical part of speech, "{result['meaning']}" as Pali word`s english meaning and \
    **Found declensions and conjugations**: {', '.join(result['prdc'])}, \
    find example sentence in exercise data: "{exercise_data}".
    """
    else:
        USER_PROMPT = f"""
    For the Pali word: "{result['pali']}" and its id: "{result['id']}", \
    which has "{result['pos']}" as its grammatical part of speech and "{result['meaning']}" as Pali word`s english meaning, \
    find example sentence in exercise data: "{exercise_data}".
    """
        
    request = {
        "custom_id": unique_id, 
        "method": "POST", 
        "url": "/v1/chat/completions", 
        "body": {
            "model": "gpt-4-turbo", 
            "messages": [
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": USER_PROMPT}
            ],
            "max_tokens": 1000
        }
    }

    # Write to JSONL file (append mode for multiple requests)
    with open(JSONL_FILE, "a", encoding="utf-8") as f:
        json.dump(request, f) # Convert dictionary to JSON string
        f.write("\n") # Newline for the next JSON object

    print(f"Request saved to {JSONL_FILE}")

Match found in file: vocab_class_3.csv
Request saved to requests.jsonl
Match found in file: vocab_class_3.csv
Request saved to requests.jsonl
Match found in file: vocab_class_3.csv
Request saved to requests.jsonl
Match found in file: vocab_class_3.csv
Request saved to requests.jsonl
Match found in file: vocab_class_3.csv
Request saved to requests.jsonl
Match found in file: vocab_class_3.csv
Request saved to requests.jsonl
Match found in file: vocab_class_3.csv
Request saved to requests.jsonl
Match found in file: vocab_class_3.csv
Request saved to requests.jsonl
Match found in file: vocab_class_3.csv
Request saved to requests.jsonl
Match found in file: vocab_class_3.csv
Request saved to requests.jsonl
Match found in file: vocab_class_3.csv
Request saved to requests.jsonl
Match found in file: vocab_class_3.csv
Request saved to requests.jsonl
Match found in file: vocab_class_3.csv
Request saved to requests.jsonl
Match found in file: vocab_class_3.csv
Request saved to requests.jsonl
Match 

In [72]:
batch_input_file = client.files.create(
    file=open(JSONL_FILE, "rb"),
    purpose="batch"
)

batch_input_file

FileObject(id='file-Uki1Lu6HEXhDidepTA49Ui', bytes=643651, created_at=1740377508, filename='requests.jsonl', object='file', purpose='batch', status='processed', status_details=None, expires_at=None)

In [73]:
batch_input_file_id = batch_input_file.id

# Start the batch process and store the response
batch_request = client.batches.create(
    input_file_id=batch_input_file_id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={"description": "Automated batch processing"}
)

# Get Batch ID
batch_id = batch_request.id
print("Batch ID:", batch_id)

Batch ID: batch_67bc0da5fa9c8190b2121992992bf3f2


In [88]:
# Check batch status
client.batches.retrieve(batch_id)

Batch(id='batch_67bc0da5fa9c8190b2121992992bf3f2', completion_window='24h', created_at=1740377510, endpoint='/v1/chat/completions', input_file_id='file-Uki1Lu6HEXhDidepTA49Ui', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1740377753, error_file_id=None, errors=None, expired_at=None, expires_at=1740463910, failed_at=None, finalizing_at=1740377747, in_progress_at=1740377511, metadata={'description': 'Automated batch processing'}, output_file_id='file-KFR2Rr1ZyrNgpdUimDW9BG', request_counts=BatchRequestCounts(completed=77, failed=0, total=77))

In [89]:
output_file_id = client.batches.retrieve(batch_id).output_file_id
file_response = client.files.content(output_file_id)
print(file_response.text)

{"id": "batch_req_67bc0e93ac008190899ffe88aad93858", "custom_id": "user-1-20250224141130-bf9128e4092b4c55bf30526a54e1f38f", "response": {"status_code": 200, "request_id": "d8b19ffcdac3c1b00dc693bd224fea15", "body": {"id": "chatcmpl-B4LzUXFyrQLOemYEuKM97aJft6QjO", "object": "chat.completion", "created": 1740377544, "model": "gpt-4-turbo-2024-04-09", "choices": [{"index": 0, "message": {"role": "assistant", "content": "{\n  \"id\": \"2597\",\n  \"pali\": \"attha\",\n  \"class_source\": \"SN 22.1 (simpl)\",\n  \"class_sutta\": \"s\u0101riputtasutta\u1e43\",\n  \"class_example\": \"s\u0101dhu \u0101yasmanta\u1e43 s\u0101riputta\u1e43 pa\u1e6dibh\u0101tu etassa bh\u0101sitassa <b>attho</b>\",\n  \"english_translation\": \"It would be good if the meaning of this statement may become evident to Venerable S\u0101riputta. (so that he could explain it to us)\"\n}", "refusal": null}, "logprobs": null, "finish_reason": "stop"}], "usage": {"prompt_tokens": 2308, "completion_tokens": 140, "total_tok

In [90]:
# Initialize an empty DataFrame with predefined columns
df = pd.DataFrame(columns=["id", "pali", "class_source", "class_sutta", "class_example", "english_translation"])

for line in file_response.iter_lines():
    if line: # Ignore empty lines
        try:
            data = json.loads(line) # Parse each JSON object
        
            # Extract response text from OpenAI output
            response_text = data.get("response", {}).get("body", {}).get('choices', [{}])[0].get('message', {}).get('content', '{}')

            # Convert response_text (JSON string) into a dictionary
            extracted_data = json.loads(response_text)

            df = pd.concat([df, pd.DataFrame([extracted_data])], ignore_index=True)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON: {e}, Line: {line}")

print(df)

       id          pali      class_source                    class_sutta  \
0    2597         attha   SN 22.1 (simpl)                sāriputtasuttaṃ   
1    5326    anussarati   AN 3.71 (simpl)                     tiṇasuttaṃ   
2    8292  abhivaḍḍhati           AN 3.79             kusalādhammasuttaṃ   
3   11370     ājānāti 1             MN 43              mahāvedallasuttaṃ   
4   11718        ānanda    MN 104 (simpl)            āneñjasappāyasuttaṃ   
..    ...           ...               ...                            ...   
72  66122      vadati 1     MN 30 (simpl)                 brāhmaṇasuttaṃ   
73  68557    vibhaṅga 1    MN 133 (simpl)     mahākaccānābhidhammasuttaṃ   
74  69606      vivekaja     MN 39 (simpl)            Mahā-Assapura Sutta   
75  69661    viharati 1  AN 4.180 (simpl)  therānaṃsenāsanavandanāsuttaṃ   
76  75647     pītisukha     MN 39 (simpl)             mahā āsādāyīsuttaṃ   

                                        class_example  \
0   sādhu āyasmantaṃ sāriputta

In [91]:
# Save DataFrame to CSV
df.to_csv("output.csv", index=False, encoding="utf-8")

print("CSV file saved as output.csv")

CSV file saved as output.csv
