In [1]:
# import libraries
import re
import os
import json
import uuid
import datetime
import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv

In [2]:
# Load environment variables from .env file
load_dotenv()

# Get API key from environment variables
open_ai_key = os.getenv("SBS_OPENAI_API_KEY")

# Initialize OpenAI client.
# The key is handed to the constructor: OpenAI() validates credentials while
# it is being constructed, so building it without a key and assigning
# client.api_key afterwards fails whenever OPENAI_API_KEY itself is not set.
client = OpenAI(api_key=open_ai_key)

# File and folder locations shared by the cells below
JSONL_FILE = "requests.jsonl"
OUTPUT_FILE = "output.jsonl"
VOCAB_FOLDER_PATH = "pali_class/vocab"
EXERCISE_FOLDER_PATH = "pali_class/exercises"

In [3]:
# Sutta lookup table; the single-request cell filters it on the
# 'sutta_number' column and reads the matching 'sutta_name' values.
new_suttas = pd.read_csv("pali_class/new_suttas.csv")

In [4]:
# Ask interactively for the Pali headword to look up in the vocabulary files.
keyword = input("Enter a Pali word: ").strip() # Strip leading/trailing whitespace from the typed word

def search_pali_in_csv(keyword, folder_path=None):
    """Search for a Pali word in all CSV files in a folder.

    Files are visited in sorted name order and the first file containing a
    row whose ``pali`` column equals ``keyword`` exactly wins.

    Args:
        keyword: The Pali headword to look for (exact string match).
        folder_path: Folder holding the vocabulary CSV files. Defaults to
            the module-level ``VOCAB_FOLDER_PATH``.

    Returns:
        dict with the keys
        ``id`` (``-1`` when nothing matched), ``pali``, ``meaning``, ``pos``,
        ``exercise_number`` (text between the last ``_`` of the file name and
        the following ``.``) and ``prdc`` (unique ``<b>...</b>`` fragments
        collected from every column whose name contains "example").

    Raises:
        KeyError: If a matching file lacks the ``id``, ``meaning`` or ``pos``
            column.
    """
    if folder_path is None:
        folder_path = VOCAB_FOLDER_PATH

    result = {
        "id": -1,
        "pali": "",
        "meaning": "",
        "pos": "",
        "exercise_number": "",
        "prdc": []
    }

    # os.listdir() returns entries in arbitrary order; sorting makes the
    # "first matching file" deterministic across machines and runs.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".csv"): # Only search in CSV files
            continue

        df = pd.read_csv(os.path.join(folder_path, filename), dtype=str)
        if "pali" not in df.columns:
            # Not a vocabulary file; nothing to search in it.
            continue

        match = df[df["pali"] == keyword]
        if match.empty:
            continue

        row = match.iloc[0]
        result["id"] = row["id"]
        result["pali"] = row["pali"]
        # Empty cells are read as NaN (a float); keep the documented ""
        # default instead so "nan" never leaks into a prompt.
        for field in ("meaning", "pos"):
            value = row[field]
            result[field] = value if isinstance(value, str) else ""

        example_columns = [col for col in df.columns if "example" in col.lower()]
        for col in example_columns:
            sentence = row[col]
            if isinstance(sentence, str): # Ensure it's a string before regex
                # Extract text inside <b>...</b>, keeping first-seen order
                for form in re.findall(r"<b>(.*?)</b>", sentence):
                    if form not in result["prdc"]:
                        result["prdc"].append(form)

        print("Match found in file:", filename)

        # e.g. "vocab_class_2.csv" -> "2"
        result["exercise_number"] = filename.split("_")[-1].split(".")[0]
        break

    if result["id"] == -1:
        print("No matches found")

    return result

# Look up the entered word; the bare `result` on the last line makes the
# notebook display the returned dict as the cell output.
result = search_pali_in_csv(keyword)
result

Match found in file: vocab_class_2.csv


{'id': '2603',
 'pali': 'attha 2.1',
 'meaning': 'need (for); want (for)',
 'pos': 'masc',
 'exercise_number': '2',
 'prdc': ['atthamhi', 'attho']}

In [5]:
# Find the exercise file whose name contains "_<exercise number>." and load it
exercise_number = result['exercise_number']
found_exercise = False
target_exercise = ""
exercise_data = ""

# Every matching name is reported; if several match, the last one listed wins.
for filename in os.listdir(EXERCISE_FOLDER_PATH):
    if filename.endswith(".txt") and f"_{exercise_number}." in filename:
        found_exercise = True
        target_exercise = filename
        print("Found:", filename)

if not found_exercise:
    print("Exercise not found")
else:
    exercise_path = os.path.join(EXERCISE_FOLDER_PATH, target_exercise)
    # Context manager closes the handle; explicit UTF-8 because the text
    # contains Pali diacritics and the platform default encoding may differ.
    with open(exercise_path, "r", encoding="utf-8") as exercise_file:
        exercise_data = exercise_file.read().strip()
    print(exercise_data)

Found: exercises_class_2.txt
Class 2 Exercises

namo tassa bhagavato arahato sammā-sambuddhassa	 					
Homage to him, the Blessed One, the Worthy One, the fully Enlightened One.
 
avijjāya tv'eva asesa-virāga-nirodhā saṅkhāra-nirodho, saṅkhāra-nirodhā viññāṇa-nirodho, viññāṇa-nirodhā nāmarūpa-nirodho, nāmarūpa-nirodhā saḷāyatana-nirodho, saḷāyatana-nirodhā phassa-nirodho, phassa-nirodhā vedanā-nirodho, vedanā-nirodhā taṇhā-nirodho, taṇhā-nirodhā upādāna-nirodho, upādāna-nirodhā bhava-nirodho, bhava-nirodhā jāti-nirodho, jāti-nirodhā jarā-maraṇaṃ soka-parideva-dukkha-domanass'upāyāsā nirujjhanti. 
But from the complete fading away and cessation of ignorance there is cessation of volitional formations; from the cessation of volitional formations, cessation of consciousness; from the cessation of consciousness, cessation of name-and-form; from the cessation of name-and-form, cessation of the six sense bases; from the cessation of the six sense bases, cessation of contact; from the cessati

In [6]:
# System message sent with every chat-completion request (single and batch).
# It tells the model how to pick one example sentence for a Pali word and to
# answer with a JSON object holding id, pali, class_source, class_sutta,
# class_example and english_translation (or {} when nothing matches).
# The phrases "Pali word`s english meaning" and "Found declensions and
# conjugations" are also used verbatim by the user prompts, so keep them in sync.
SYSTEM_PROMPT = """
You are an AI assistant specialized in Pali language processing. Follow these instructions carefully:

1. **Sentences Matching**:
   - Identify sentences in the exercise dataset that contain the given Pali word, including its declensions and conjugations.
   - Use the provided **Pali word`s english meaning** of the given Pali word to better understand its meaning and ensure accurate sentences selection.
   - If the Pali word's **Found declensions and conjugations** is provided, wisely use it to match the sentences in the provided exercise dataset.
   - Strictly match the Pali word, including its declensions and conjugations, in the sentences. Ensure that the word appears in its correct grammatical form within the sentences, avoiding partial or irrelevant match.

2. **Extract and Structure Data**:
   - Extract the **source reference** for the selected sentences.
   - Extract the **corresponding English translation** for the sentences.
   - After obtaining the **source reference**, retrieve the corresponding **sutta reference**.
   - Example: If **source reference** (known as sutta number) is `"AN2.1"`, the **sutta reference** (known as sutta name) should be `"vajjasuttaṃ"`.
   - If the **"(simpl)"** found around **source reference**, format the `class_source` as:  
     `"DN 19.7 (simpl)"`
   - If **"(simpl)"** is **not present** around **source reference**, format normally as:  
     `"DN 19.7"`
   - Preserve the **exact original form** of the given Pali word **without modifications**.
   - Extracts exactly one relevant sentences per word.

3. **Special Formatting for `class_source`**:
   - If the **"(simpl)"** marker is present around the **source reference**, format it as:
      ```
      "class_source": "DN 19.7 (simpl)"
      ```
   - If **"(simpl)"** is **not present**, format normally as:
      ```
      "class_source": "DN 19.7"
      ```

4. **Text Formatting**:
   - **Bold the target Pali word in `class_example`** by wrapping the **exact matched form** of the word in `<b></b>`.
   - Ensure **all occurrences** of the word in the sentences are bolded.
   - Example: If the target word is **dhamma**,  
     - Sentences: `"Ayaṃ dhammo sanantano."`  
     - Output: `"Ayaṃ <b>dhammo</b> sanantano."`

5. **Formatting Rules**:
   - **Ensure strict spacing in `class_source`**:  
     - Always format it as **"AN 10.48"**, **"MN 1"**, **"DN 22"**, etc.
     - There must be **a space** between the collection (e.g., AN, MN, DN) and the number.
     - Do **not** return `"AN10.48"`, `"MN1"`, etc.

6. **Output Format**:
   - Return the result in **JSON format** with the following structure:
     {
       "id": "<Pali ID>",
       "pali": "<Pali word>",
       "class_source": "<Source reference>",
       "class_sutta": "<Sutta reference>",
       "class_example": "<Selected sentences with <b>Pali word</b> highlighted>",
       "english_translation": "<English translation>"
     }
   - If no valid sentences is found, return an empty JSON object `{}`.
   - Do **not** wrap JSON in triple backticks (```json ... ```).
   - Preserve the **exact original form** of the given Pali word **without modifications** for json["pali"].
"""

In [7]:
# if len(result["prdc"]) > 0:
#     SYSTEM_PROMPT += f"\n\n**Found declensions and conjugations**: {', '.join(result['prdc'])}"

In [8]:
# Build the user message for the looked-up word. The variant that lists the
# known inflected forms is used only when the vocabulary lookup found any.
if not result["prdc"]:
    USER_PROMPT = f"""
For the Pali word: "{result['pali']}" and its id: "{result['id']}", \
which has "{result['pos']}" as its grammatical part of speech and "{result['meaning']}" as Pali word`s english meaning, \
find example sentence in exercise data: "{exercise_data}".
"""
else:
    USER_PROMPT = f"""
For the Pali word: "{result['pali']}" and its id: "{result['id']}", \
which has "{result['pos']}" as its grammatical part of speech, "{result['meaning']}" as Pali word`s english meaning and \
**Found declensions and conjugations**: {', '.join(result['prdc'])}, \
find example sentence in exercise data: "{exercise_data}".
"""

print(USER_PROMPT)


For the Pali word: "attha 2.1" and its id: "2603", which has "masc" as its grammatical part of speech, "need (for); want (for)" as Pali word`s english meaning and **Found declensions and conjugations**: atthamhi, attho, find example sentence in exercise data: "Class 2 Exercises

namo tassa bhagavato arahato sammā-sambuddhassa	 					
Homage to him, the Blessed One, the Worthy One, the fully Enlightened One.
 
avijjāya tv'eva asesa-virāga-nirodhā saṅkhāra-nirodho, saṅkhāra-nirodhā viññāṇa-nirodho, viññāṇa-nirodhā nāmarūpa-nirodho, nāmarūpa-nirodhā saḷāyatana-nirodho, saḷāyatana-nirodhā phassa-nirodho, phassa-nirodhā vedanā-nirodho, vedanā-nirodhā taṇhā-nirodho, taṇhā-nirodhā upādāna-nirodho, upādāna-nirodhā bhava-nirodho, bhava-nirodhā jāti-nirodho, jāti-nirodhā jarā-maraṇaṃ soka-parideva-dukkha-domanass'upāyāsā nirujjhanti. 
But from the complete fading away and cessation of ignorance there is cessation of volitional formations; from the cessation of volitional formations, cessation of

In [9]:
# Send a single chat-completion request for the word looked up above
response = client.chat.completions.create(
    model="gpt-4-turbo",
    messages=[
        {
            "role": "system",
            "content": SYSTEM_PROMPT
        },
        {
            "role": "user",
            "content": USER_PROMPT
        }
    ],
    max_tokens=1000
)

# Extract response content; the content field may be None, in which case the
# empty string falls through to the "Invalid JSON" branch below.
response_text = response.choices[0].message.content or ""

sutta_name = []

# Convert string to JSON. Only the parse is guarded, so unrelated errors in
# the lookup below are not mistaken for a malformed answer.
try:
    response_json = json.loads(response_text)
except json.JSONDecodeError:
    print("Invalid JSON response:", response_text)
else:
    print(json.dumps(response_json, indent=4, ensure_ascii=False)) # Pretty print

    # The prompt asks for {} when no sentence matches, so 'class_source' may
    # legitimately be absent.
    class_source = response_json.get("class_source") if isinstance(response_json, dict) else None
    if class_source:
        # "DN 19.7 (simpl)" -> "DN19.7", the form compared against 'sutta_number'
        sutta_number = class_source.replace(" ", "").replace("(simpl)", "")
        sutta_name = new_suttas.loc[new_suttas["sutta_number"] == sutta_number, "sutta_name"].tolist()
        print(sutta_name)
    else:
        print("No class_source in response; sutta lookup skipped")

{
    "id": "2603",
    "pali": "attha",
    "class_source": "DHP 331",
    "class_sutta": "sahāyasuttaṃ",
    "class_example": "[Good are] friends in <b>atthamhi</b> sahāyā [Good are] friends when need [arises]",
    "english_translation": "[Good are] friends in [arisen] need"
}
[]


In [10]:
# Evaluation sample: the first 20 headwords of the class 2 vocabulary file
eval_vocab_df = pd.read_csv("pali_class/vocab/vocab_class_2.csv")
pick_palis = eval_vocab_df["pali"].head(20)
pick_palis

0                      attha 2.1
1                      anālaya 1
2                       anuttara
3                        apara 1
4             aparena samayena 1
5                         arahaṃ
6                      alamariya
7     alamariyañāṇadassanavisesa
8                          asesa
9                         avijjā
10                     ākāsa 1.2
11                     āsava 1.2
12                       āvāsa 1
13                         itipi
14                    uttari 1.1
15           uttarimanussadhamma
16                    uddisati 1
17                     upādāna 2
18                     upāyāsa 1
19                       upāsaka
Name: pali, dtype: object

In [11]:
# Load the class 2 exercise text used for the evaluation batch.
# Context manager closes the handle; explicit UTF-8 because the text contains
# Pali diacritics and the platform default encoding may differ.
with open("pali_class/exercises/exercises_class_2.txt", "r", encoding="utf-8") as exercise_file:
    eval_exercise_data = exercise_file.read().strip()
print(eval_exercise_data)

Class 2 Exercises

namo tassa bhagavato arahato sammā-sambuddhassa	 					
Homage to him, the Blessed One, the Worthy One, the fully Enlightened One.
 
avijjāya tv'eva asesa-virāga-nirodhā saṅkhāra-nirodho, saṅkhāra-nirodhā viññāṇa-nirodho, viññāṇa-nirodhā nāmarūpa-nirodho, nāmarūpa-nirodhā saḷāyatana-nirodho, saḷāyatana-nirodhā phassa-nirodho, phassa-nirodhā vedanā-nirodho, vedanā-nirodhā taṇhā-nirodho, taṇhā-nirodhā upādāna-nirodho, upādāna-nirodhā bhava-nirodho, bhava-nirodhā jāti-nirodho, jāti-nirodhā jarā-maraṇaṃ soka-parideva-dukkha-domanass'upāyāsā nirujjhanti. 
But from the complete fading away and cessation of ignorance there is cessation of volitional formations; from the cessation of volitional formations, cessation of consciousness; from the cessation of consciousness, cessation of name-and-form; from the cessation of name-and-form, cessation of the six sense bases; from the cessation of the six sense bases, cessation of contact; from the cessation of contact, cessation of f

In [12]:
def generate_unique_id(user_id):
    """Return ``<user_id>-<YYYYMMDDHHMMSS>-<uuid4 hex>`` for the current local time."""
    stamp = f"{datetime.datetime.now():%Y%m%d%H%M%S}"
    token = uuid.uuid4().hex
    return "-".join((f"{user_id}", stamp, token))

def _build_user_prompt(entry, exercise_text):
    """Return the user message for one vocabulary entry and an exercise text.

    The text is the same as the single-request prompt: the variant listing
    known inflected forms is used only when ``entry["prdc"]`` is non-empty.
    """
    if entry["prdc"]:
        return f"""
For the Pali word: "{entry['pali']}" and its id: "{entry['id']}", \
which has "{entry['pos']}" as its grammatical part of speech, "{entry['meaning']}" as Pali word`s english meaning and \
**Found declensions and conjugations**: {', '.join(entry['prdc'])}, \
find example sentence in exercise data: "{exercise_text}".
"""
    return f"""
For the Pali word: "{entry['pali']}" and its id: "{entry['id']}", \
which has "{entry['pos']}" as its grammatical part of speech and "{entry['meaning']}" as Pali word`s english meaning, \
find example sentence in exercise data: "{exercise_text}".
"""


# Write one Batch API request line per evaluation word.
# - The file is opened once in "w" mode so re-running this cell rewrites the
#   batch instead of appending duplicate requests to an old file.
# - The exercise text is eval_exercise_data (loaded for this evaluation), not
#   the text selected by the interactive lookup earlier in the notebook.
# - Loop-local names are used so the earlier `result` / `USER_PROMPT` values
#   are not overwritten.
with open(JSONL_FILE, "w", encoding="utf-8") as jsonl_file:
    for pali_word in pick_palis:
        entry = search_pali_in_csv(pali_word)
        if entry["id"] == -1:
            # No vocabulary entry: there is nothing meaningful to ask about.
            print(f"Skipping {pali_word!r}: no vocabulary entry found")
            continue

        request = {
            # Unique ID for the request (user-timestamp-unique)
            "custom_id": generate_unique_id("user-1"),
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "model": "gpt-4-turbo",
                "messages": [
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": _build_user_prompt(entry, eval_exercise_data)}
                ],
                "max_tokens": 1000
            }
        }

        json.dump(request, jsonl_file) # One JSON object per line
        jsonl_file.write("\n")

        print(f"Request saved to {JSONL_FILE}")

Match found in file: vocab_class_2.csv
Request saved to requests.jsonl
Match found in file: vocab_class_2.csv
Request saved to requests.jsonl
Match found in file: vocab_class_2.csv
Request saved to requests.jsonl
Match found in file: vocab_class_2.csv
Request saved to requests.jsonl
Match found in file: vocab_class_2.csv
Request saved to requests.jsonl
Match found in file: vocab_class_2.csv
Request saved to requests.jsonl
Match found in file: vocab_class_2.csv
Request saved to requests.jsonl
Match found in file: vocab_class_2.csv
Request saved to requests.jsonl
Match found in file: vocab_class_2.csv
Request saved to requests.jsonl
Match found in file: vocab_class_2.csv
Request saved to requests.jsonl
Match found in file: vocab_class_2.csv
Request saved to requests.jsonl
Match found in file: vocab_class_2.csv
Request saved to requests.jsonl
Match found in file: vocab_class_2.csv
Request saved to requests.jsonl
Match found in file: vocab_class_2.csv
Request saved to requests.jsonl
Match 

In [13]:
# Upload the request file for batch processing; the context manager closes
# the local file handle once the upload call returns.
with open(JSONL_FILE, "rb") as jsonl_file:
    batch_input_file = client.files.create(
        file=jsonl_file,
        purpose="batch"
    )

batch_input_file

FileObject(id='file-RLZ5PAKwM5jg6TJqy5g3r7', bytes=198215, created_at=1740374207, filename='requests.jsonl', object='file', purpose='batch', status='processed', status_details=None, expires_at=None)

In [14]:
# ID of the uploaded request file
batch_input_file_id = batch_input_file.id

# Create the batch job against the chat-completions endpoint and keep the
# returned object
batch_request = client.batches.create(
    metadata={"description": "Automated batch processing"},
    completion_window="24h",
    endpoint="/v1/chat/completions",
    input_file_id=batch_input_file_id,
)

# Keep the batch ID for the status / download cells
batch_id = batch_request.id
print("Batch ID:", batch_id)

Batch ID: batch_67bc00c0f51c8190b11e1912ccb71202


In [35]:
# Check batch status: fetch the current Batch object for batch_id; as the
# last expression of the cell it is displayed as the cell output.
client.batches.retrieve(batch_id)

Batch(id='batch_67bc00c0f51c8190b11e1912ccb71202', completion_window='24h', created_at=1740374209, endpoint='/v1/chat/completions', input_file_id='file-RLZ5PAKwM5jg6TJqy5g3r7', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1740374384, error_file_id=None, errors=None, expired_at=None, expires_at=1740460609, failed_at=None, finalizing_at=1740374381, in_progress_at=1740374210, metadata={'description': 'Automated batch processing'}, output_file_id='file-P7gK5Chs4v4Fxa4BtHcA2q', request_counts=BatchRequestCounts(completed=20, failed=0, total=20))

In [36]:
# Download the batch output. output_file_id is only populated once the batch
# has produced results, so fail with a clear message instead of passing None
# on to the files API.
batch = client.batches.retrieve(batch_id)
output_file_id = batch.output_file_id
if output_file_id is None:
    raise RuntimeError(
        f"Batch {batch_id} has no output file yet (status: {batch.status}); "
        "re-run this cell once the batch has completed."
    )

file_response = client.files.content(output_file_id)
print(file_response.text)

{"id": "batch_req_67bc016e31c48190b73ee0ea6e4d31e9", "custom_id": "user-1-20250224131644-9860a50b055f4a97afb6dc3fc3f7e9e9", "response": {"status_code": 200, "request_id": "adf22f59bd4ab0bee5377b6491598d57", "body": {"id": "chatcmpl-B4L8pKhfk8Xar4RKBdN0X98mbelK1", "object": "chat.completion", "created": 1740374279, "model": "gpt-4-turbo-2024-04-09", "choices": [{"index": 0, "message": {"role": "assistant", "content": "{\n  \"id\": \"2603\",\n  \"pali\": \"attha\",\n  \"class_source\": \"DN 22.5\",\n  \"class_sutta\": \"mah\u0101satipa\u1e6d\u1e6dh\u0101nasutta\u1e43\",\n  \"class_example\": \"<b>atthi</b> imasmi\u1e43 k\u0101ye kes\u0101 lom\u0101 nakh\u0101 dant\u0101 taco\",\n  \"english_translation\": \"There are in this body, hair [on the head], hairs of the body, nails, teeth, skin.\"\n}", "refusal": null}, "logprobs": null, "finish_reason": "stop"}], "usage": {"prompt_tokens": 2727, "completion_tokens": 122, "total_tokens": 2849, "prompt_tokens_details": {"cached_tokens": 0, "audi

In [38]:
# Parse the batch output (one JSON object per line) into a DataFrame.
# Rows are collected in a list and the frame is built once, which avoids
# repeatedly concatenating onto a growing (initially empty) DataFrame.
expected_columns = ["id", "pali", "class_source", "class_sutta", "class_example", "english_translation"]
records = []

for line in file_response.iter_lines():
    if not line: # Ignore empty lines
        continue

    try:
        data = json.loads(line) # Parse each JSON object

        # Extract response text from OpenAI output. Any level may be missing
        # or null (e.g. a failed request), so fall back step by step; a
        # missing answer is treated as "{}" and yields an all-empty row.
        body = (data.get("response") or {}).get("body") or {}
        choices = body.get("choices") or [{}]
        response_text = (choices[0].get("message") or {}).get("content") or "{}"

        # Convert response_text (JSON string) into a dictionary
        extracted_data = json.loads(response_text)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e}, Line: {line}")
        continue

    if not isinstance(extracted_data, dict):
        print(f"Unexpected response payload (not a JSON object), Line: {line}")
        continue

    records.append(extracted_data)

# Expected columns first (present even when no row supplied them), followed
# by any additional keys the model returned.
df = pd.DataFrame(records)
extra_columns = [col for col in df.columns if col not in expected_columns]
df = df.reindex(columns=expected_columns + extra_columns)

print(df)

       id                        pali      class_source  \
0    2603                       attha           DN 22.5   
1    4036                   anālaya 1  SN 56.11 (simpl)   
2    4524                    anuttara   DN 22.1 (simpl)   
3    6258                     apara 1          AN 10.48   
4    6463            aparena samayena       VIN 1.4.1.2   
5    8943                      arahaṃ   DN 22.1 (simpl)   
6    9175                   alamariya          AN 10.48   
7    9177  alamariyañāṇadassanavisesa          AN 10.48   
8    9869                       asesa           SN 10.8   
9   10520                      avijjā   DN 22.1 (simpl)   
10  11040                       ākāsa          AN 10.48   
11  12812                       āsava   DN 22.1 (simpl)   
12  13049                     āvāsa 1         VIN 1.1.4   
13  13479                       itipi   DN 22.1 (simpl)   
14  14712                      uttari          AN 10.48   
15  14727         uttarimanussadhamma          AN 10.48 

In [39]:
# Persist the parsed results as a UTF-8 CSV, leaving out the DataFrame index
df.to_csv("output.csv", encoding="utf-8", index=False)

print("CSV file saved as output.csv")

CSV file saved as output.csv
