In [1]:
import csv
import os
import re

# Input CSV files to read and the text file the cleaned corpus is written to.
csv_files = ["train_tharu.csv"]
output_file = "tharu_corpus.txt"

# Characters deleted from every sentence:
#   - English letters
#   - English digits
#   - Devanagari digits (U+0966-U+096F)
#   - Special characters (ASCII punctuation and symbols)
# The danda characters are not in this class, so they survive until the split below.
CLEAN_RE = re.compile(
    r"[A-Za-z0-9\u0966-\u096F,.\-!?;:\"'()\[\]{}<>@#$%^&*_+=/\\|~`]"
)

# `unique_sentences` gives O(1) membership checks; `final_sentences` keeps
# first-seen order for writing.
unique_sentences = set()
final_sentences = []

for csv_path in csv_files:
    if not os.path.exists(csv_path):
        print(f"File not found: {csv_path}")
        continue

    print(f"Reading: {csv_path}")

    with open(csv_path, newline='', encoding='utf-8') as f:
        rows = csv.reader(f)
        next(rows, None)  # the first row is treated as a header and discarded

        for row in rows:
            # The text is taken from the second column; shorter rows are ignored.
            if len(row) <= 1:
                continue

            text = row[1].strip()
            if not text:
                continue

            # A cell may hold several sentences separated by danda / double danda.
            for fragment in re.split(r"[।॥]", text):
                cleaned = fragment.strip().rstrip("|")            # drop trailing |
                cleaned = CLEAN_RE.sub("", cleaned)               # drop unwanted characters
                cleaned = re.sub(r"\s+", " ", cleaned).strip()    # collapse whitespace runs

                # Fragments shorter than three characters are discarded.
                if len(cleaned) < 3:
                    continue

                # Keep only the first occurrence of each sentence.
                if cleaned in unique_sentences:
                    continue
                unique_sentences.add(cleaned)
                final_sentences.append(cleaned)

# Write the cleaned corpus, one sentence per line.
with open(output_file, "w", encoding="utf-8") as out:
    out.writelines(sentence + "\n" for sentence in final_sentences)

print("\nDone!")
print(f"Total unique cleaned sentences: {len(final_sentences):,}")
print(f"Saved to: {output_file}")


Reading: train_tharu.csv

Done!
Total unique cleaned sentences: 8,100
Saved to: tharu_corpus.txt


In [15]:
import requests
from bs4 import BeautifulSoup
import re
import time
from pathlib import Path
from typing import List, Optional


# Version id substituted into BASE_URL's {vid} slot (Tharu, Dāngaurā Tharu).
VERSION_ID = 3062
# File the scraped text is written to.
OUTPUT_FILE = Path("tharu_bible.txt")

# Request headers sent with every page fetch.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/128.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9",
}

# Chapter URL template; filled with vid / book / chapter.
# NOTE(review): the trailing "THL" is presumably the version abbreviation — confirm
# before reusing this template with a different VERSION_ID.
BASE_URL = "https://www.bible.com/bible/{vid}/{book}.{chapter}.THL"

# New Testament books as (USFM code, number of chapters).
BOOKS = [
    # Gospels and Acts
    ("MAT", 28),
    ("MRK", 16),
    ("LUK", 24),
    ("JHN", 21),
    ("ACT", 28),
    # Letters
    ("ROM", 16),
    ("1CO", 16),
    ("2CO", 13),
    ("GAL", 6),
    ("EPH", 6),
    ("PHP", 4),
    ("COL", 4),
    ("1TH", 5),
    ("2TH", 3),
    ("1TI", 6),
    ("2TI", 4),
    ("TIT", 3),
    ("PHM", 1),
    ("HEB", 13),
    ("JAS", 5),
    ("1PE", 5),
    ("2PE", 3),
    ("1JN", 5),
    ("2JN", 1),
    ("3JN", 1),
    ("JUD", 1),
    # Revelation
    ("REV", 22),
]

def clean_verse_text(text: str) -> str:
    """Normalise the raw text of one verse.

    Applies, in order: strip surrounding whitespace, drop a leading run of
    digits (the verse number), drop every ``#``-prefixed token (footnote
    marker), collapse whitespace runs to single spaces, and remove one
    trailing danda together with surrounding whitespace.

    Args:
        text: Raw verse text.

    Returns:
        The cleaned verse text; may be an empty string.
    """
    # (pattern, replacement) pairs; order matters because later rules rely on
    # the output of earlier ones.
    substitutions = (
        (r'^\d+\s*', ''),           # leading verse number
        (r'#.*?(?=\s|$)', ''),      # footnote marker up to the next whitespace
        (r'\s+', ' '),              # multiple spaces / newlines
        # NOTE(review): the class lists the danda twice; a double danda may
        # have been intended for the second entry — confirm.
        (r'\s*[\।।]?\s*$', ''),     # trailing punctuation
    )

    cleaned = text.strip()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()


# Class-name pieces that mark spans whose text is not part of the verse itself.
_NON_VERSE_MARKERS = ("label", "note", "heading")


def _inside_non_verse_span(node, verse_span) -> bool:
    """Return True if ``node`` lies in a label/note/heading span inside ``verse_span``."""
    for ancestor in node.parents:
        # Only ancestors below the verse span are relevant.
        if ancestor is verse_span:
            break
        if ancestor.name != "span":
            continue
        for css_class in ancestor.get("class") or []:
            # The page uses generated class names such as
            # "ChapterContent_label__R2PLt", so compare the underscore-separated
            # pieces rather than the whole class name (a plain "label" also matches).
            pieces = css_class.split("_")
            if any(marker in pieces for marker in _NON_VERSE_MARKERS):
                return True
    return False


def scrape_chapter(book_code: str, chapter: int) -> Optional[List[str]]:
    """Fetch one chapter page and return its cleaned verse texts.

    Args:
        book_code: USFM book code substituted into ``BASE_URL``.
        chapter: Chapter number substituted into ``BASE_URL``.

    Returns:
        A list of cleaned verse strings (possibly empty), or ``None`` when the
        request fails or the chapter container is not found in the page.
    """
    url = BASE_URL.format(vid=VERSION_ID, book=book_code, chapter=chapter)

    try:
        resp = requests.get(url, headers=HEADERS, timeout=12)
        resp.raise_for_status()
    except requests.RequestException as e:
        print(f" Failed to fetch {book_code} {chapter}  ({e})")
        return None

    soup = BeautifulSoup(resp.text, "html.parser")

    # Main content container.
    # NOTE(review): these class names carry generated suffixes and will stop
    # matching if the site changes them — verify when the scraper returns nothing.
    container = soup.find("div", class_="ChapterContent_bible-reader__LmLUa")
    if not container:
        print(f"  → No chapter content found for {book_code} {chapter}")
        return None

    verses = []

    for verse_span in container.find_all("span", class_="ChapterContent_verse__57FIw"):
        text_parts = []
        for child in verse_span.descendants:
            # Only text nodes carry content. Skipping a tag while iterating
            # `.descendants` would not skip its text nodes, so the exclusion is
            # decided per text node by looking at its ancestors instead.
            if not isinstance(child, str):
                continue
            piece = child.strip()
            if piece and not _inside_non_verse_span(child, verse_span):
                text_parts.append(piece)

        verse_text = clean_verse_text(" ".join(text_parts))
        if verse_text:
            verses.append(verse_text)

    return verses


def main():
    """Scrape every chapter listed in ``BOOKS`` and write the text to ``OUTPUT_FILE``.

    For each book a ``=== CODE ===`` header is emitted, followed by a
    ``--- Chapter N ---`` header and the verses of each chapter, or a
    placeholder line when ``scrape_chapter`` returns nothing. All lines are
    collected in memory and written once at the end; entries that are blank
    after stripping are not written. The reported "Total lines" is the number
    of collected entries, which includes those blank separators.
    """
    total_chapters = sum(chapter_count for _, chapter_count in BOOKS)

    print(f"Scraping Tharu (THL / 3062) – {len(BOOKS)} books ≈ {total_chapters} chapters\n")

    output_lines = []
    processed = 0

    for book_code, max_chap in BOOKS:
        print(f"→ {book_code} ({max_chap} chapters)")
        output_lines.append(f"\n=== {book_code} ===\n")

        for chap in range(1, max_chap + 1):
            processed += 1
            verses = scrape_chapter(book_code, chap)

            if not verses:
                # Covers both a failed fetch (None) and a chapter with no verses ([]).
                output_lines.append(f"(Chapter {chap} failed or not available)")
            else:
                output_lines += [f"--- Chapter {chap} ---", *verses, ""]

            print(f"  {chap:2d}  done  ({processed}/{total_chapters})")
            time.sleep(2.4)   # polite delay (~25 req/min)

    # Save: trailing whitespace is trimmed and blank entries are dropped.
    kept_lines = (line.rstrip() for line in output_lines if line.strip())
    with OUTPUT_FILE.open("w", encoding="utf-8") as f:
        f.write("\n".join(kept_lines) + "\n")

    print(f"\nDone. Saved to: {OUTPUT_FILE.absolute()}")
    print(f"Total lines: {len(output_lines):,}")


if __name__ == "__main__":
    main()

Scraping Tharu (THL / 3062) – 27 books ≈ 260 chapters

→ MAT (28 chapters)
   1  done  (1/260)
   2  done  (2/260)
   3  done  (3/260)
   4  done  (4/260)
   5  done  (5/260)
   6  done  (6/260)
   7  done  (7/260)
   8  done  (8/260)
   9  done  (9/260)
  10  done  (10/260)
  11  done  (11/260)
  12  done  (12/260)
  13  done  (13/260)
  14  done  (14/260)
  15  done  (15/260)
  16  done  (16/260)
  17  done  (17/260)
  18  done  (18/260)
  19  done  (19/260)
  20  done  (20/260)
  21  done  (21/260)
  22  done  (22/260)
  23  done  (23/260)
  24  done  (24/260)
  25  done  (25/260)
  26  done  (26/260)
  27  done  (27/260)
  28  done  (28/260)
→ MRK (16 chapters)
   1  done  (29/260)
   2  done  (30/260)
   3  done  (31/260)
   4  done  (32/260)
   5  done  (33/260)
   6  done  (34/260)
   7  done  (35/260)
   8  done  (36/260)
   9  done  (37/260)
  10  done  (38/260)
  11  done  (39/260)
  12  done  (40/260)
  13  done  (41/260)
  14  done  (42/260)
  15  done  (43/260)
  16  done 

In [20]:
import os
import re
from collections import OrderedDict
from pathlib import Path

# Input files, processed in this order.
files_to_merge = [
    "tharu_bible.txt",
    "tharu_corpus.txt"
]

output_file = Path("tharu_cleaned.txt")

# Characters deleted from every line:
#   - English letters
#   - English digits
#   - Devanagari digits (U+0966-U+096F)
#   - Special characters (ASCII punctuation and symbols)
clean_pattern = re.compile(r"[A-Za-z0-9\u0966-\u096F,.\-!?;:\"'()\[\]{}<>@#$%^&*_+=/\\|~`]")

# Used as an ordered set: keys are the sentences already written, values are unused.
seen = OrderedDict()

total_sentences_read = 0
total_duplicates_removed = 0

with output_file.open('w', encoding='utf-8') as outfile:
    for fname in files_to_merge:
        if not os.path.exists(fname):
            print(f"File not found: {fname}")
            continue

        with open(fname, 'r', encoding='utf-8') as infile:
            for raw_line in infile:
                stripped = raw_line.strip()
                if not stripped:
                    continue

                # Delete unwanted characters, then collapse whitespace runs.
                normalized = re.sub(r'\s+', ' ', clean_pattern.sub('', stripped)).strip()
                if not normalized:
                    continue

                # A line may hold several sentences separated by danda / double danda.
                for part in re.split(r'[।॥]', normalized):
                    sentence = part.strip()
                    if not sentence:
                        continue

                    total_sentences_read += 1

                    # Only the first occurrence of a sentence is written.
                    if sentence in seen:
                        total_duplicates_removed += 1
                        continue
                    seen[sentence] = None
                    outfile.write(sentence + "\n")

print("Done!")
print(f"Total sentences read: {total_sentences_read}")
print(f"Duplicates removed: {total_duplicates_removed}")
print(f"Total unique sentences saved: {len(seen)}")
print(f"Saved cleaned corpus to: {output_file.resolve()}")

✅ Done!
Total sentences read: 23973
Duplicates removed: 2157
Total unique sentences saved: 21816
Saved cleaned corpus to: /Users/Dell/Desktop/finalproj/nepal-lang/tharu_cleaned.txt
