In [238]:
# import libraries
import os
import json
import uuid
import datetime
import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv

In [239]:
# Load environment variables from .env file
load_dotenv()

# Get API key from environment variables
open_ai_key = os.getenv("SBS_OPENAI_API_KEY")

# Initialize OpenAI client.
# The key is handed to the constructor: a bare OpenAI() raises when the
# OPENAI_API_KEY environment variable is missing, before a key could be
# assigned afterwards. If SBS_OPENAI_API_KEY is unset (None), the client
# falls back to OPENAI_API_KEY.
client = OpenAI(api_key=open_ai_key)

# File names used for the batch request / result files
JSONL_FILE = "requests.jsonl"
OUTPUT_FILE = "output.jsonl"

# Folders holding the per-class vocabulary CSVs and exercise text files
VOCAB_FOLDER_PATH = "pali_class/vocab"
EXERCISE_FOLDER_PATH = "pali_class/exercises"

In [None]:
# Sutta lookup table; the request cell matches `sutta_number` to get `sutta_name`.
new_suttas = pd.read_csv("pali_class/new_suttas.csv")

Unnamed: 0,sutta_number,sutta_name
0,AN2.1,vajjasuttaṃ
1,AN2.10,vassūpanāyikasuttaṃ
2,AN2.2,padhānasuttaṃ
3,AN2.3,tapanīyasuttaṃ
4,AN2.4,atapanīyasuttaṃ
...,...,...
4807,VV81,kaṇḍakavimānavatthuṃ
4808,VV82,anekavaṇṇavimānavatthuṃ
4809,VV83,maṭṭhakuṇḍalīvimānavatthuṃ
4810,VV84,serīsakavimānavatthuṃ


In [240]:
# Ask for the word to look up; the vocabulary search compares it for exact equality.
keyword = input("Enter a Pali word: ").strip() # Drop leading/trailing whitespace

def search_pali_in_csv(keyword, folder_path=None):
    """Look up a Pali word in the vocabulary CSV files of a folder.

    Every ``*.csv`` file in the folder is read and its ``pali`` column is
    compared for exact equality with ``keyword``. The first file containing
    the word wins; files are visited in sorted name order so the result does
    not depend on the arbitrary order returned by ``os.listdir``.

    Args:
        keyword: The Pali word to search for (exact match).
        folder_path: Folder containing the vocabulary CSVs. Defaults to the
            module-level ``VOCAB_FOLDER_PATH`` when omitted.

    Returns:
        dict with keys ``id``, ``pali``, ``meaning``, ``pos`` and
        ``exercise_number``. On a match the values come from the first
        matching row (read as strings) and ``exercise_number`` is the part of
        the file name between the last ``_`` and the first following ``.``.
        When nothing matches, ``id`` is ``-1`` and the other values are
        empty strings.
    """
    if folder_path is None:
        folder_path = VOCAB_FOLDER_PATH

    result = {
        "id": -1,
        "pali": "",
        "meaning": "",
        "pos": "",
        "exercise_number": ""
    }

    # sorted(): os.listdir() makes no ordering guarantee, which would make
    # "first match" differ between machines when a word is in several files.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".csv"): # Only search in CSV files
            continue

        file_path = os.path.join(folder_path, filename)
        df = pd.read_csv(file_path, dtype=str)
        match = df[df["pali"] == keyword]
        if match.empty:
            continue

        first_row = match.iloc[0]
        for column in ("id", "pali", "meaning", "pos"):
            result[column] = first_row[column]

        print("Match found in file:", filename)

        # e.g. "vocab_class_2.csv" -> "2"
        result["exercise_number"] = filename.split("_")[-1].split(".")[0]
        break

    if result["id"] == -1:
        print("No matches found")

    return result

# Run the lookup for the entered keyword; the bare `result` on the last line
# displays the returned dict as the cell output.
result = search_pali_in_csv(keyword)
result

Match found in file: vocab_class_2.csv


{'id': '13479',
 'pali': 'itipi',
 'meaning': '(item being named); it is so; thus; just like this',
 'pos': 'sandhi',
 'exercise_number': '2'}

In [241]:
# Find the exercise file that belongs to the class the word was found in
exercise_number = result['exercise_number']
found_exercise = False
target_exercise = ""
exercise_data = ""

# A file qualifies when it is a .txt whose name contains "_<number>."
# (e.g. "exercises_class_2.txt" for exercise_number "2").
# No break: if several files qualify, the last one listed is used.
for filename in os.listdir(EXERCISE_FOLDER_PATH):
    if filename.endswith(".txt") and f"_{exercise_number}." in filename:
        found_exercise = True
        target_exercise = filename
        print("Found:", filename)

if not found_exercise:
    print("Exercise not found")
else:
    # Context manager closes the handle; explicit UTF-8 keeps the Pali
    # diacritics intact regardless of the platform's default encoding.
    exercise_path = os.path.join(EXERCISE_FOLDER_PATH, target_exercise)
    with open(exercise_path, "r", encoding="utf-8") as exercise_file:
        exercise_data = exercise_file.read().strip()
    print(exercise_data)

Found: exercises_class_2.txt
Class 2 Exercises

namo tassa bhagavato arahato sammā-sambuddhassa	 					
Homage to him, the Blessed One, the Worthy One, the fully Enlightened One.
 
avijjāya tv'eva asesa-virāga-nirodhā saṅkhāra-nirodho, saṅkhāra-nirodhā viññāṇa-nirodho, viññāṇa-nirodhā nāmarūpa-nirodho, nāmarūpa-nirodhā saḷāyatana-nirodho, saḷāyatana-nirodhā phassa-nirodho, phassa-nirodhā vedanā-nirodho, vedanā-nirodhā taṇhā-nirodho, taṇhā-nirodhā upādāna-nirodho, upādāna-nirodhā bhava-nirodho, bhava-nirodhā jāti-nirodho, jāti-nirodhā jarā-maraṇaṃ soka-parideva-dukkha-domanass'upāyāsā nirujjhanti. 
But from the complete fading away and cessation of ignorance there is cessation of volitional formations; from the cessation of volitional formations, cessation of consciousness; from the cessation of consciousness, cessation of name-and-form; from the cessation of name-and-form, cessation of the six sense bases; from the cessation of the six sense bases, cessation of contact; from the cessati

In [279]:
# System prompt for the example-sentence extraction task. It is sent as the
# "system" message of both the direct chat completion call and the batch
# request body.
# NOTE(review): sections 2 and 3 state the same "(simpl)" formatting rule twice.
# NOTE(review): the model is asked for the sutta name, but no sutta table is
# part of either prompt; unless the exercise text itself carries the names,
# `class_sutta` cannot be grounded by the model - confirm before trusting it.
SYSTEM_PROMPT = """
You are an AI assistant specialized in Pali language processing. Follow these instructions carefully:

1. **Sentence Matching**:
   - Identify sentences in the exercise dataset that contain the given Pali word, including its declensions and conjugated forms.
   - Use the provided **English translation** of the given Pali word to better understand its meaning and ensure accurate sentence selection.

2. **Extract and Structure Data**:
   - Extract the **source reference** for the selected sentence.
   - Extract the **corresponding English translation** for the sentence.
   - After obtaining the **source reference**, retrieve the corresponding **sutta reference**.
   - Example: If **source reference** (known as sutta number) is `"AN2.1"`, the **sutta reference** (known as sutta name) should be `"vajjasuttaṃ"`.
   - If the **"(simpl)"** found around **source reference**, format the `class_source` as:  
     `"DN 19.7 (simpl)"`
   - If **"(simpl)"** is **not present** around **source reference**, format normally as:  
     `"DN 19.7"`

3. **Special Formatting for `class_source`**:
   - If the **"(simpl)"** marker is present around the **source reference**, format it as:
      ```
      "class_source": "DN 19.7 (simpl)"
      ```
   - If **"(simpl)"** is **not present**, format normally as:
      ```
      "class_source": "DN 19.7"
      ```

4. **Text Formatting**:
   - **Bold the target Pali word in `class_example`** by wrapping the **exact matched form** of the word in `<b></b>`.
   - Ensure **all occurrences** of the word in the sentence are bolded.
   - Example: If the target word is **dhamma**,  
     - Sentence: `"Ayaṃ dhammo sanantano."`  
     - Output: `"Ayaṃ <b>dhammo</b> sanantano."`

5. **Formatting Rules**:
   - **Ensure strict spacing in `class_source`**:  
     - Always format it as **"AN 10.48"**, **"MN 1"**, **"DN 22"**, etc.
     - There must be **a space** between the collection (e.g., AN, MN, DN) and the number.
     - Do **not** return `"AN10.48"`, `"MN1"`, etc.

6. **Output Format**:
   - Return the result in **JSON format** with the following structure:
     {
       "id": "<Pali ID>",
       "pali": "<Pali word>",
       "class_source": "<Source reference>",
       "class_sutta": "<Sutta reference>",
       "class_example": "<Selected sentence with <b>Pali word</b> highlighted>",
       "english_translation": "<English translation>"
     }
   - If no valid sentence is found, return an empty JSON object `{}`.
   - Do **not** wrap JSON in triple backticks (```json ... ```).
"""

In [None]:
# User message: interpolates the looked-up word, its id, part of speech and
# meaning together with the full exercise text. The f-string is evaluated when
# this cell runs, so re-run it whenever `result` or `exercise_data` changes.
USER_PROMPT = f"""
For the Pali word: "{result['pali']}" and its id: "{result['id']}", which has "{result['pos']}" as its grammatical part of speech and "{result['meaning']}" as English translation, find example sentence in exercise data: "{exercise_data}".
"""

In [285]:
# Send a single (non-batch) chat completion request
response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {
            "role": "system", 
            "content": SYSTEM_PROMPT
        },
        {
            "role": "user", 
            "content": USER_PROMPT
        }
    ],
    max_tokens=1000
)

# Extract response content; `content` can be None, so fall back to ""
response_text = response.choices[0].message.content or ""

# The model sometimes wraps the JSON in ```json ... ``` fences even though the
# system prompt forbids it; peel them off before parsing.
json_text = response_text.strip()
if json_text.startswith("```"):
    json_text = json_text.strip("`")
    if json_text.lower().startswith("json"):
        json_text = json_text[len("json"):]
    json_text = json_text.strip()

# Convert string to JSON
try:
    response_json = json.loads(json_text)
except json.JSONDecodeError:
    print("Invalid JSON response:", response_text)
else:
    print(json.dumps(response_json, indent=4, ensure_ascii=False)) # Pretty print

    # The system prompt allows an empty object `{}` when no sentence is found,
    # so `class_source` may legitimately be absent.
    class_source = response_json.get("class_source") if isinstance(response_json, dict) else None
    sutta_name = []
    if class_source:
        # "DN 19.7 (simpl)" -> "DN19.7", the key format used in new_suttas
        sutta_number = class_source.replace(" ", "").replace("(simpl)", "")
        sutta_name = new_suttas.loc[new_suttas["sutta_number"] == sutta_number, "sutta_name"].tolist()
        print(sutta_name)
    else:
        print("No example sentence returned (response has no 'class_source')")

{
    "id": "13479",
    "pali": "itipi",
    "class_source": "DN 22.5",
    "class_sutta": "mahāsatipaṭṭhānasuttaṃ",
    "class_example": "atthi imasmiṃ kāye kesā lomā nakhā dantā taco. It is in this body, hair, hair, nails, teeth, and skin.",
    "english_translation": "There are in this body, hair [on the head], hairs of the body, nails, teeth, skin."
}
[]


In [None]:
def generate_unique_id(user_id):
    """Return ``<user_id>-<YYYYMMDDHHMMSS>-<32 hex chars>``.

    The middle part is the current local time, the last part a random UUID4
    in hex form, so two calls yield different ids.
    """
    return "{}-{:%Y%m%d%H%M%S}-{}".format(
        user_id,
        datetime.datetime.now(),
        uuid.uuid4().hex,
    )

# Generate a unique ID for the request (user-timestamp-unique).
# It gets its own name so the generate_unique_id() helper is not overwritten
# by its return value and stays callable for further requests.
custom_id = generate_unique_id("user-1")

# One Batch API request line: a chat completion with the same prompts as the
# single request
request = {
    "custom_id": custom_id, 
    "method": "POST", 
    "url": "/v1/chat/completions", 
    "body": {
        "model": "gpt-3.5-turbo-0125", 
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": USER_PROMPT}
        ],
        "max_tokens": 1000
    }
}

# Write to JSONL file (append mode for multiple requests)
with open(JSONL_FILE, "a", encoding="utf-8") as f:
    json.dump(request, f) # Convert dictionary to JSON string
    f.write("\n") # Newline for the next JSON object

print(f"Request saved to {JSONL_FILE}")

Request saved to requests.jsonl


In [85]:
# Upload the JSONL request file for batch processing.
# The context manager closes the local file handle once the upload returns.
with open(JSONL_FILE, "rb") as jsonl_file:
    batch_input_file = client.files.create(
        file=jsonl_file,
        purpose="batch"
    )

batch_input_file

FileObject(id='file-LAEqKjyGAPTrEXGxArwc2n', bytes=15116, created_at=1740116690, filename='requests.jsonl', object='file', purpose='batch', status='processed', status_details=None, expires_at=None)

In [86]:
# Kick off the batch job against the uploaded JSONL file and keep the response
batch_input_file_id = batch_input_file.id
batch_request = client.batches.create(
    metadata={"description": "Automated batch processing"},
    completion_window="24h",
    endpoint="/v1/chat/completions",
    input_file_id=batch_input_file_id,
)

# Keep the batch id at hand for the status check
batch_id = batch_request.id
print("Batch ID:", batch_id)

Batch ID: batch_67b812d7c18881909b030da0a490c343


In [97]:
# Check batch status: the returned Batch object is displayed as the cell output
# (re-run this cell to refresh it).
client.batches.retrieve(batch_id)

Batch(id='batch_67b812d7c18881909b030da0a490c343', completion_window='24h', created_at=1740116695, endpoint='/v1/chat/completions', input_file_id='file-LAEqKjyGAPTrEXGxArwc2n', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1740116789, error_file_id=None, errors=None, expired_at=None, expires_at=1740203095, failed_at=None, finalizing_at=1740116788, in_progress_at=1740116696, metadata={'description': 'Automated batch processing'}, output_file_id='file-3FNMaqZDxxS6uMEGkMK8mb', request_counts=BatchRequestCounts(completed=2, failed=0, total=2))

In [98]:
# Download the results of the batch started above. The output file id is read
# from the batch itself instead of being pasted in from an earlier run, so the
# cell always fetches the results that belong to `batch_id`.
finished_batch = client.batches.retrieve(batch_id)

if finished_batch.output_file_id is None:
    # No output file has been attached to the batch (yet)
    print(f"Batch {batch_id} has no output file yet (status: {finished_batch.status})")
else:
    file_response = client.files.content(finished_batch.output_file_id)
    print(file_response.text)

{"id": "batch_req_67b81334c77c8190be644166a8e81b78", "custom_id": "user-1-20250221134424-bc7948efd66b4e0e870a75a64a950ce5", "response": {"status_code": 200, "request_id": "de88c96277332f548f0199f2eea352b5", "body": {"id": "chatcmpl-B3G9gE1c5IOOXWKxC13QIrEZynXCm", "object": "chat.completion", "created": 1740116784, "model": "gpt-3.5-turbo-0125", "choices": [{"index": 0, "message": {"role": "assistant", "content": "```json\n{\n  \"pali_sentence\": \"<b>attha 2.1</b>\",\n  \"translation\": \"But from the complete fading away and cessation of ignorance there is cessation of volitional formations; from the cessation of volitional formations, cessation of consciousness; from the cessation of consciousness, cessation of name-and-form; from the cessation of name-and-form, cessation of the six sense bases; from the cessation of the six sense bases, cessation of contact; from the cessation of contact, cessation of feeling; from the cessation of feeling, cessation of craving; from the cessation o