In [23]:
# import libraries
import re
import csv
import pandas as pd

In [24]:
# Load vocab_class_3.csv and suttas.csv
vocab_file = "pali_class/vocab/vocab_class_3.csv"
sutta_file = "pali_class/suttas.csv"

vocab_df = pd.read_csv(vocab_file)
sutta_df = pd.read_csv(sutta_file, sep="\t", encoding="utf-8")

# Load exercises_class_3.txt
exercises_file = "pali_class/exercises/exercises_class_3.txt"

with open(exercises_file, "r", encoding="utf-8") as f:
    lines = f.readlines()

# Display first few lines
for line in lines[:10]:
    print(line.strip())

Class 3

ahaṃ bhavantaṃ gotamaṃ saraṇaṃ gacchāmi
I go for refuge to the master Gotama


DHP 130 sabbe tasanti daṇḍassa.
All are fearful of a stick.

MN 27 tathāgate saddhaṃ paṭilabhati


In [25]:
user_input = "attha 1.1"

In [26]:
match_word = vocab_df[vocab_df["pali"] == user_input]
search_result = {
    'id': -1,
    'pali': '',
    'class_source': '',
    'class_sutta': '',
    'class_example': '', 
    'english_translation': ''
}

if len(match_word) > 0:
    # Possible relavant declension and conjugation
    prdc = []
    search_result['id'] = match_word['id'].values[0]
    search_result['pali'] = match_word['pali'].values[0]
    
    example_columns = [col for col in vocab_df.columns if "example" in col.lower()]

    for col in example_columns:
        sentence = match_word[col].values[0]

        # Extract text inside <b>...</b>
        match = re.findall(r"<b>(.*?)</b>", sentence)
        
        for m in match:
            if m not in prdc:
                prdc.append(m)
else:
    print("No match found")

prdc

['attho', 'atthaṃ']

In [27]:
def extract_sutta_info(text):
    """Extract Sutta reference and (simpl) marker if present."""
    match = re.search(r'([A-Z]+ \d+(?:\.\d+)?)\s*(\(simpl\))?', text)

    if match:
        sutta_reference = match.group(1) # Extract Sutta reference
        simpl_marker = match.group(2) if match.group(2) else "" # Extract (simpl) if found
        return {"sutta_reference": sutta_reference, "simpl_marker": simpl_marker}

    return None

def clean_sentence(text, target_word):
    """Removes leading numbering and sutta reference from a sentence."""
    # Remove leading number + tab (e.g., "4.\t")
    text = re.sub(r'^\d+\.\t', '', text)

    # Remove sutta reference + (simpl) if present (e.g., "AN 3.71 (simpl)")
    text = re.sub(r'([A-Z]+ \d+(?:\.\d+)?)(?: \(simpl\))?$', '', text).strip()

    # Highlight target word using <b></b>
    if target_word:
        text = re.sub(fr'\b{re.escape(target_word)}\b', fr'<b>{target_word}</b>', text)

    return text

In [28]:
with open(exercises_file, encoding='utf-8') as f:
    # content = f.read()
    lines = f.readlines() # Read all lines into a list

    for i, line in enumerate(lines):
            if prdc[0] in line: # If the word is found
                print(f"'{prdc[0]}' found at line {i+1}: {line.strip()}")

                # Get the next sentence (if available)
                if i + 1 < len(lines):
                    next_sentence = lines[i + 1].strip()
                    print(f"Next sentence: {next_sentence}")

                # Extract Sutta reference
                sutta_info = extract_sutta_info(line.strip())

                if sutta_info:
                    if sutta_info['simpl_marker'] != '':     
                        search_result['class_source'] = sutta_info['sutta_reference'] + " " + sutta_info['simpl_marker']
                    else:
                        search_result['class_source'] = sutta_info['sutta_reference']
                
                    # Get Sutta name
                    # Remove spaces in `sutta_number` (e.g., "AN 2.1" → "AN2.1")
                    sutta_number = sutta_info['sutta_reference'].replace(" ", "")
                    sutta_name = sutta_df[sutta_df["sutta_number"] == sutta_number]

                    search_result['class_sutta'] = sutta_name['sutta_name'].values[0]
                
                # Example Usage
                sentence = line.strip()
                cleaned_sentence = clean_sentence(sentence, prdc[0])

                search_result['class_example'] = cleaned_sentence
                search_result['english_translation'] = next_sentence
                
                break

'attho' found at line 64: 13.	sādhu āyasmantaṃ sāriputtaṃ paṭibhātu etassa bhāsitassa attho SN 22.1 (simpl)
Next sentence: It would be good if the meaning of this statement may become evident to Venerable Sāriputta. (so that he could explain it to us)


In [29]:
search_result

{'id': 2597,
 'pali': 'attha 1.1',
 'class_source': 'SN 22.1 (simpl)',
 'class_sutta': 'nakulapitusuttaṃ',
 'class_example': 'sādhu āyasmantaṃ sāriputtaṃ paṭibhātu etassa bhāsitassa <b>attho</b>',
 'english_translation': 'It would be good if the meaning of this statement may become evident to Venerable Sāriputta. (so that he could explain it to us)'}

In [None]:
batch_process = vocab_df["pali"].tolist()

for b in batch_process:
    match_word = vocab_df[vocab_df["pali"] == b]
    search_result = {
        'id': -1,
        'pali': '',
        'class_source': '',
        'class_sutta': '',
        'class_example': '', 
        'english_translation': ''
    }

    # Possible relavant declension and conjugation
    prdc = []
    search_result['id'] = match_word['id'].values[0]
    search_result['pali'] = match_word['pali'].values[0]
    
    example_columns = [col for col in vocab_df.columns if "example" in col.lower()]
    
    for col in example_columns:
        sentence = match_word[col].values[0]

        # Extract text inside <b>...</b>
        match = re.findall(r"<b>(.*?)</b>", sentence)
        
        for m in match:
            if m not in prdc:
                prdc.append(m)

TypeError: expected string or bytes-like object, got 'float'