In [None]:
from bs4 import BeautifulSoup
import requests
from myanmartools import ZawgyiDetector
from icu import Transliterator
import time
from tqdm import tqdm
import pandas as pd
from datasets import Dataset

In [43]:
# Shared Zawgyi detector and 'Zawgyi-my' transliterator, created once and
# reused by every call to zawgyi_to_unicode.
detector = ZawgyiDetector()
converter = Transliterator.createInstance('Zawgyi-my')


def zawgyi_to_unicode(text):
    """Transliterate ``text`` with the 'Zawgyi-my' rule when it looks like Zawgyi.

    The detector's probability for ``text`` is compared against 0.5: a score
    above the threshold means the text is run through ``converter``; otherwise
    the text is returned exactly as given (it is treated as already Unicode).
    """
    looks_like_zawgyi = detector.get_zawgyi_probability(text) > 0.5
    return converter.transliterate(text) if looks_like_zawgyi else text

# Scrape Dhamma Articles

In [None]:
page_url = "https://www.dhammaransi.com"

def get_page_links():
    """Collect absolute article URLs from the locally saved listing page.

    Reads the saved HTML file (presumably a browser copy of
    https://www.dhammaransi.com/index.php/new.html -- confirm), finds every
    ``<td headers="categorylist_header_title">`` cell and prefixes the ``href``
    of its first ``<a>`` with ``page_url``.

    Returns:
        list[str]: one URL per listing cell that contains a link, in document
        order. Cells without an ``<a>`` or without an ``href`` are skipped.

    Raises:
        FileNotFoundError: if the saved HTML file is not in the working directory.
    """
    # Decode explicitly as UTF-8: the page holds Burmese text and the platform
    # default encoding is not UTF-8 everywhere.
    with open("ဓမ္မအမေးအဖြေများ - New - Dhammaransi.com.html", encoding="utf-8") as file:
        html_data = file.read()

    soup = BeautifulSoup(html_data, 'html')

    links = []
    for title_cell in soup.find_all("td", headers="categorylist_header_title"):
        anchor = title_cell.find('a')
        href = anchor.get("href") if anchor is not None else None
        if href:  # a cell without a usable link would otherwise crash the concatenation
            links.append(page_url + href)

    return links

page_links = get_page_links()

In [4]:
page_links

['https://www.dhammaransi.com/index.php/new/2603-2024-12-24-01-20-48.html',
 'https://www.dhammaransi.com/index.php/new/2602-2024-12-23-22-58-28.html',
 'https://www.dhammaransi.com/index.php/new/2578-2024-01-03-01-08-14.html',
 'https://www.dhammaransi.com/index.php/new/2577-2024-01-03-01-01-25.html',
 'https://www.dhammaransi.com/index.php/new/2575-2023-12-27-10-57-19.html',
 'https://www.dhammaransi.com/index.php/new/2574-2023-12-27-10-53-41.html',
 'https://www.dhammaransi.com/index.php/new/2565-2022-03-19-07-59-20.html',
 'https://www.dhammaransi.com/index.php/new/2564-2022-03-19-07-55-35.html',
 'https://www.dhammaransi.com/index.php/new/2563-2022-03-19-07-52-10.html',
 'https://www.dhammaransi.com/index.php/new/2562-2022-03-19-07-49-55.html',
 'https://www.dhammaransi.com/index.php/new/2561-2022-03-19-07-48-57.html',
 'https://www.dhammaransi.com/index.php/new/2560-2022-03-19-07-45-09.html',
 'https://www.dhammaransi.com/index.php/new/2559-2022-03-19-07-44-02.html',
 'https://ww

In [27]:
# Browser-like request headers sent with every article download.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Referer': 'https://www.google.com',
}

def get_page_data(link, timeout=30):
    """Download one article page and return its title and body as Unicode text.

    The body is the newline-joined text of every ``<div dir="auto">``. When that
    yields an empty string, the text of the ``<p>`` tags inside
    ``<div class="art-article">`` is used instead.

    Args:
        link: absolute URL of the article page.
        timeout: seconds to wait for the server before giving up, so a stalled
            connection cannot hang the crawl.

    Returns:
        tuple[str, str]: ``(title, body)``, each passed through
        ``zawgyi_to_unicode``. ``body`` may be an empty string when neither
        layout is present on the page.

    Raises:
        requests.RequestException: on connection errors, timeouts, or an HTTP
            error status.
        ValueError: if the page has no ``<h2 class="art-postheader">`` title.
    """
    response = requests.get(link, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as an article.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html')

    title_tag = soup.find("h2", class_="art-postheader")
    if title_tag is None:
        raise ValueError(f"No article title found at {link}")
    title = title_tag.get_text()

    body = "\n".join(div_tag.get_text() for div_tag in soup.find_all("div", dir="auto"))

    if body == "":
        # Fallback layout: paragraphs inside the article container.
        article_div = soup.find("div", class_="art-article")
        paragraphs = article_div.find_all("p") if article_div is not None else []
        body = "\n".join(
            paragraph.get_text(separator="\n", strip=True) for paragraph in paragraphs
        )

    return zawgyi_to_unicode(title), zawgyi_to_unicode(body)

In [32]:
# Column-oriented store for the scraped articles (one list entry per page).
data = {"url": [], "title": [], "body": []}
q_and_a_count = 0
# (url, error) pairs for pages that could not be downloaded or parsed, so one
# bad page does not abort the whole crawl.
failed_links = []

for page_link in tqdm(page_links, desc="Downloading Articles"):
    time.sleep(3)  # fixed pause before every request
    try:
        title, body = get_page_data(page_link)
    except (requests.RequestException, AttributeError, ValueError) as error:
        # Network/HTTP failures, or a page whose markup lacks the expected tags.
        failed_links.append((page_link, repr(error)))
        continue

    # Count articles whose body contains the answer marker.
    if "( ဖြေ )" in body:
        q_and_a_count += 1

    data["url"].append(page_link)
    # Non-breaking spaces are dropped from the stored text.
    data["title"].append(title.replace('\xa0', ''))
    data["body"].append(body.replace('\xa0', ''))

if failed_links:
    print(f"Skipped {len(failed_links)} of {len(page_links)} pages; inspect `failed_links` for details.")


Downloading Articles: 100%|██████████| 1434/1434 [1:40:09<00:00,  4.19s/it]  


# Upload the Dhamma Article Dataset to HuggingFace

In [None]:
# Build a Dataset from the scraped columns in `data` (url / title / body)
# and push it to the Hub repository named below.
ds = Dataset.from_dict(data)
ds.push_to_hub("chuuhtetnaing/dhamma-article-dataset")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

# Generate the Question and Answer Dataset with Gemma 3

In [1]:
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
import pandas as pd
from datasets import load_dataset
from tqdm import tqdm
import json

In [None]:
# Fetch the quantized Gemma 3 GGUF file; the call's return value (the local
# file path) is displayed as the cell output.
hf_hub_download(
    repo_id="google/gemma-3-27b-it-qat-q4_0-gguf",
    filename="gemma-3-27b-it-q4_0.gguf",
)

gemma-3-27b-it-q4_0.gguf:   0%|          | 0.00/17.2G [00:00<?, ?B/s]

'/root/.cache/huggingface/hub/models--google--gemma-3-27b-it-qat-q4_0-gguf/snapshots/17cf0f6ad611f1a57a1640daa57eb427d6e67ed6/gemma-3-27b-it-q4_0.gguf'

In [2]:
# Resolve the GGUF weights through the Hugging Face cache instead of a
# hard-coded absolute snapshot path, which only exists for one user on one
# machine. hf_hub_download returns the local file path; the revision pins the
# same snapshot the hard-coded path pointed at.
model_path = hf_hub_download(
    repo_id="google/gemma-3-27b-it-qat-q4_0-gguf",
    filename="gemma-3-27b-it-q4_0.gguf",
    revision="17cf0f6ad611f1a57a1640daa57eb427d6e67ed6",
)

llm = Llama(
    model_path=model_path,
    n_gpu_layers=-1,
    n_ctx=1024 * 20,  # 20480-token context window
    verbose=False,
)

# Articles scraped and uploaded earlier in this notebook; rows are read from ds['train'].
ds = load_dataset("chuuhtetnaing/dhamma-article-dataset")

llama_init_from_model: n_ctx_per_seq (20480) < n_ctx_train (131072) -- the full capacity of the model will not be utilized


README.md:   0%|          | 0.00/3.52k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/4.84M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1434 [00:00<?, ? examples/s]

In [3]:
# System message for the Q&A generation chat. It is sent as-is (never passed
# through str.format), so the literal braces in the JSON example are safe.
SYSTEM_PROMPT = """
You are a helpful assistant that generates comprehension-style question and answer pairs from a given article written in Burmese (Myanmar language). Only generate questions that can be answered using information found directly in the provided title and body. Do not include any questions that require external context or background knowledge (such as the author's name, source, or publication date).

Output the questions and answers in the following JSON format:
[
  {"question": "question 1", "answer": "answer 1"},
  {"question": "question 2", "answer": "answer 2"},
  ...
]

Generate about 10 questions that test a good understanding of the content, covering key concepts, causes, effects, and definitions found in the text. Do not include any extra text or explanation outside the JSON format.
""".strip()

# User message template; `{title}` and `{body}` are filled in with str.format.
USER_PROMPT = """
Title:
{title}

Body:
{body}
""".strip()

In [4]:
def _parse_qa_pairs(raw_content):
    """Turn the model's reply text into a validated list of question/answer dicts.

    Markdown code fences are removed, and when the reply contains both ``[`` and
    ``]`` only the span from the first ``[`` to the last ``]`` is parsed, so
    stray prose around the array does not cause a parse failure.

    Raises:
        json.JSONDecodeError: if the text is not valid JSON, or if it is valid
            JSON but not a list of objects that each have "question" and
            "answer" keys. The same exception type is used for both so callers
            that retry on JSONDecodeError also retry on a wrong shape.
    """
    text = (raw_content or "").replace("```json", "").replace("```", "")

    first, last = text.find("["), text.rfind("]")
    if first != -1 and last > first:
        text = text[first:last + 1]

    pairs = json.loads(text)

    well_formed = isinstance(pairs, list) and all(
        isinstance(pair, dict) and "question" in pair and "answer" in pair
        for pair in pairs
    )
    if not well_formed:
        raise json.JSONDecodeError(
            "Expected a JSON array of objects with 'question' and 'answer' keys", text, 0
        )
    return pairs


def generate(title, body):
    """Ask the model for question/answer pairs about one article.

    Args:
        title: article title inserted into USER_PROMPT.
        body: article body; non-breaking spaces are removed before prompting.

    Returns:
        list[dict]: parsed pairs, each with "question" and "answer" keys.

    Raises:
        json.JSONDecodeError: if the reply cannot be parsed into that shape.
    """
    result = llm.create_chat_completion(
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {
                "role": "user",
                "content": USER_PROMPT.format(title=title, body=body.replace('\xa0', '')),
            },
        ],
        max_tokens=4096,
        temperature=0.1,
    )

    return _parse_qa_pairs(result['choices'][0]['message']['content'])

In [5]:
json_file_path = 'q_and_a_output.jsonl'

def load_jsonl(file_path):
    """Load a JSON Lines file of single-entry objects into one merged dict.

    Each non-blank line is parsed as a JSON object and merged into the result
    with ``dict.update``, so when a key appears on several lines the last line
    wins.

    Args:
        file_path: path to the ``.jsonl`` file.

    Returns:
        dict: the merged entries; an empty dict when the file does not exist
        yet (first run, nothing cached).

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    data = {}
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                if line.strip():  # Skip empty lines
                    data.update(json.loads(line))
    except FileNotFoundError:
        # No cache file has been written yet; start from an empty mapping.
        return {}
    return data

# Load the data
q_and_a_data = load_jsonl(json_file_path)

In [6]:
# Upper bound on generation attempts per article, so one article that never
# yields parseable JSON cannot stall the whole run.
MAX_GENERATION_ATTEMPTS = 3
# URLs for which every attempt failed; they are not cached, so re-running this
# cell tries them again.
failed_urls = []

for row in tqdm(ds['train'], desc="Generate Q&A Pairs"):
    url = row['url']
    title = row['title']
    body = row['body']

    # Resume support: skip articles already present in the cache.
    if url in q_and_a_data:
        continue

    for _attempt in range(MAX_GENERATION_ATTEMPTS):
        try:
            result = generate(title, body)
        except json.JSONDecodeError:
            continue  # unparseable reply; ask again
        break
    else:
        # Every attempt failed to parse.
        failed_urls.append(url)
        continue

    q_and_a_data[url] = result
    # Append immediately so progress survives an interrupted run.
    with open(json_file_path, 'a', encoding='utf-8') as f:
        new_entry = {url: result}
        f.write(json.dumps(new_entry) + '\n')

if failed_urls:
    print(f"No valid Q&A JSON for {len(failed_urls)} article(s); inspect `failed_urls`.")

Generate Q&A Pairs: 100%|██████████| 1434/1434 [00:19<00:00, 73.86it/s]


# Upload the Dhamma Question and Answer Dataset to Hugging Face

In [20]:
from datasets import Dataset

In [None]:
q_and_a_data = load_jsonl(json_file_path)

In [12]:
len(q_and_a_data)

1434

In [16]:
# Flatten the {url: [pair, ...]} mapping into three parallel columns, one
# entry per question/answer pair, keeping the article URL as the source.
data = {"question": [], "answer": [], "source": []}

for url, q_and_as in q_and_a_data.items():
    data['question'] += [pair['question'] for pair in q_and_as]
    data['answer'] += [pair['answer'] for pair in q_and_as]
    data['source'] += [url] * len(q_and_as)

In [22]:
# Build a Dataset from the flattened question / answer / source columns in
# `data` and push it to the Hub repository named below.
ds = Dataset.from_dict(data)
ds.push_to_hub("chuuhtetnaing/dhamma-question-answer-dataset")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/chuuhtetnaing/dhamma-question-answer-dataset/commit/f6f862e87e3b27b094905ea744f814ba7de06adf', commit_message='Upload dataset', commit_description='', oid='f6f862e87e3b27b094905ea744f814ba7de06adf', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/chuuhtetnaing/dhamma-question-answer-dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='chuuhtetnaing/dhamma-question-answer-dataset'), pr_revision=None, pr_num=None)