In [4]:
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin

def get_article_links(home_url="https://www.tamangdajang.com/"):
    try:
        r = requests.get(home_url, timeout=12)
        r.raise_for_status()
    except Exception as e:
        print("Homepage failed ", e)
        return []

    soup = BeautifulSoup(r.text, "html.parser")
    links = set()

    for a in soup.find_all("a", href=True):
        href = a["href"].strip()
        if re.search(r'/(whaiim|tamsor|sengrap|mhasengrap|lehmngo|bichar|rimthim|gyalthim)/\d+', href):
            full = urljoin(home_url, href)
            if "tamangdajang.com" in full:
                links.add(full)

    return sorted(links)


def scrape_article(url):
    try:
        r = requests.get(url, timeout=12)
        r.raise_for_status()
    except Exception as e:
        print(f"  Failed {url} → {e}")
        return None

    soup = BeautifulSoup(r.text, "html.parser")

    # Title
    title_tag = soup.find("h1", class_="single-heading") or soup.find("h1")
    title = title_tag.get_text(strip=True) if title_tag else "No title"

    # Author & date
    meta = soup.find("div", class_="post-info")
    author = "Unknown"
    date_str = "Unknown"
    if meta:
        author_a = meta.find("a")
        if author_a:
            author = author_a.get_text(strip=True)
        date_span = meta.find("span", class_="rddate")
        if date_span:
            date_str = date_span.get_text(strip=True)

    # Main content
    content_div = soup.find("div", class_="news-body-content") or \
                  soup.find("div", class_="news-body") or \
                  soup.find("div", itemprop="articleBody")

    if not content_div:
        return {"url": url, "title": title, "author": author, "date": date_str, "content": "No content found"}

    # Clean junk
    for bad in content_div.find_all(["script", "style", "iframe", "form",
                               {"class": re.compile("sharedaddy|sd-|advert|banner", re.I)}]):
        bad.decompose()

    # ---- Extract paragraphs and remove English letters/digits ----
    paragraphs = [
    re.sub(r'[A-Za-z0-9]', '', p.get_text(strip=True))  # removes English letters & digits
    for p in content_div.find_all("p")
    if p.get_text(strip=True)
    ]
    return {
    "url": url,
    "title": title,
    "content": "\n\n".join(paragraphs) if paragraphs else "No readable text"
    }


def main():
    print("Collecting recent articles from homepage...\n")
    urls = get_article_links()

    if not urls:
        print("No article links found on homepage.")
        return

    print(f"Found {len(urls)} potential articles\n")

    saved = 0
    with open("tamangnews_scraped1.txt", "w", encoding="utf-8") as f:
        for i, url in enumerate(urls[:], 1): 
            data = scrape_article(url)
            if data and len(data["content"]) > 120:
                saved += 1
                print(data['title'])
                f.write(data['title'] + "\n")

                for sentence in data["content"].split("।"):
                    s = sentence.strip()
                    if s:
                        f.write(s + "\n")

    print(f"\nDone. Saved {saved} readable articles tamangnews_scraped1.txt")


if __name__ == "__main__":
    main()

Collecting recent articles from homepage...

Found 55 potential articles

असारे बजेट : भ्रष्टाचारला खेती


  for bad in content_div.find_all(["script", "style", "iframe", "form",


NameError: name 'all_sentences' is not defined

In [3]:
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin
import time
import os

# Configuration
BASE_URL = "https://media.ipsapps.org/taj/osa/hymns/C01/"
CHAPTER_PATTERN = r"Deva-01-XXE-\d{3}\.html"   # matches Deva-01-XXE-001.html etc.
START = 1
END = 53
DELAY = 1.2  # seconds between requests (polite to server)

def get_all_chapter_urls():
    print("Trying to list chapters from base URL...")
    try:
        r = requests.get(BASE_URL, timeout=12)
        r.raise_for_status()
    except Exception as e:
        print("Cannot access directory index →", e)
        print("Falling back to direct URL pattern (assuming 001–053 exist)...")
        urls = []
        for i in range(START, END + 1):
            num = f"{i:03d}"
            url = urljoin(BASE_URL, f"Deva-01-XXE-{num}.html")
            urls.append(url)
        return urls

    soup = BeautifulSoup(r.text, "html.parser")
    links = set()

    for a in soup.find_all("a", href=True):
        href = a["href"].strip()
        if re.search(CHAPTER_PATTERN, href):
            full = urljoin(BASE_URL, href)
            links.add(full)

    if not links:
        print("No links found in index → generating expected URLs")
        for i in range(START, END + 1):
            num = f"{i:03d}"
            url = urljoin(BASE_URL, f"Deva-01-XXE-{num}.html")
            links.add(url)

    return sorted(links)


def scrape_chapter(url, chapter_num):
    print(f"  Scraping {chapter_num:02d} → {url}")
    try:
        r = requests.get(url, timeout=12)
        r.encoding = 'utf-8'
        r.raise_for_status()
    except Exception as e:
        print(f"    → Failed: {e}")
        return None, None

    soup = BeautifulSoup(r.text, "html.parser")

    # Title
    title_tag = soup.find("title") or soup.find("div", class_="mt")
    title = title_tag.get_text(strip=True) if title_tag else f"भजन {chapter_num}"

    # Lyrics content: <div class="li">, <div class="li2">, <div class="s">
    verses = []
    for tag in soup.find_all(["div"], class_=["li", "li2", "s"]):
        text = tag.get_text(strip=True)
        if text and text.strip("।। "):  # skip empty/decorative
            verses.append(text)

    content = "\n".join(verses) if verses else "(कुनै सामग्री फेला परेन)"

    return title, content


def main():
    print("भजनगदे (Deva-01-XXE) – पूर्ण संग्रह डाउनलोड सुरु हुँदैछ...\n")
    
    urls = get_all_chapter_urls()
    print(f"Found / generated {len(urls)} chapter URLs\n")

    total_saved = 0

    with open("Tamang_bible2.txt", "w", encoding="utf-8") as f:
        for idx, url in enumerate(urls, 1):
            chapter_num = idx  # assuming order 1 to 53
            title, content = scrape_chapter(url, chapter_num)

            if title and content and "फेला परेन" not in content:
                total_saved += 1
                f.write(f"भजन {chapter_num:02d}   –   {title}\n")
                f.write(content + "\n\n")
            else:
                f.write(f"भजन {chapter_num:02d}   –   (उपलब्ध छैन वा खाली)\n")


            time.sleep(DELAY)

    print(f"सम्पन्न! {total_saved} भजनहरू सफलतापूर्वक बचत गरियो")
    print(f"फाइल नाम:Tamang_bible2.txt")

if __name__ == "__main__":
    main()

भजनगदे (Deva-01-XXE) – पूर्ण संग्रह डाउनलोड सुरु हुँदैछ...

Trying to list chapters from base URL...
No links found in index → generating expected URLs
Found / generated 53 chapter URLs

  Scraping 01 → https://media.ipsapps.org/taj/osa/hymns/C01/Deva-01-XXE-001.html
  Scraping 02 → https://media.ipsapps.org/taj/osa/hymns/C01/Deva-01-XXE-002.html
  Scraping 03 → https://media.ipsapps.org/taj/osa/hymns/C01/Deva-01-XXE-003.html
  Scraping 04 → https://media.ipsapps.org/taj/osa/hymns/C01/Deva-01-XXE-004.html
  Scraping 05 → https://media.ipsapps.org/taj/osa/hymns/C01/Deva-01-XXE-005.html
  Scraping 06 → https://media.ipsapps.org/taj/osa/hymns/C01/Deva-01-XXE-006.html
  Scraping 07 → https://media.ipsapps.org/taj/osa/hymns/C01/Deva-01-XXE-007.html
  Scraping 08 → https://media.ipsapps.org/taj/osa/hymns/C01/Deva-01-XXE-008.html
  Scraping 09 → https://media.ipsapps.org/taj/osa/hymns/C01/Deva-01-XXE-009.html
  Scraping 10 → https://media.ipsapps.org/taj/osa/hymns/C01/Deva-01-XXE-010.html
  S

In [10]:
import requests
from bs4 import BeautifulSoup
import re
import time
from pathlib import Path

# Configuration
VERSION_ID = 1177
OUTPUT_FILE = Path("tamang_bible2.txt")
BASE_URL = "https://www.bible.com/bible/{vid}/{book}.{chap}.TAJNT"

books = [
    ("MAT", 28), ("MRK", 16), ("LUK", 24), ("JHN", 21),
    ("ACT", 28), ("ROM", 16), ("1CO", 16), ("2CO", 13),
    ("GAL", 6),  ("EPH", 6),  ("PHP", 4),  ("COL", 4),
    ("1TH", 5),  ("2TH", 3),  ("1TI", 6),  ("2TI", 4),
    ("TIT", 3),  ("PHM", 1),  ("HEB", 13), ("JAS", 5),
    ("1PE", 5),  ("2PE", 3),  ("1JN", 5),  ("2JN", 1),
    ("3JN", 1),  ("JUD", 1),  ("REV", 22)
]

def clean_text(text):
    """Remove extra spaces, normalize, remove verse numbers from inline text"""
    text = re.sub(r'\s+', ' ', text.strip())
    text = re.sub(r'^\d+\s*', '', text)  # remove leading verse number if any
    return text

def scrape_chapter(book_code, chapter_num):
    url = BASE_URL.format(vid=VERSION_ID, book=book_code, chap=chapter_num)
    try:
        resp = requests.get(url, timeout=12, headers={"User-Agent": "Mozilla/5.0"})
        if resp.status_code != 200:
            print(f"  → Failed {book_code} {chapter_num} (status {resp.status_code})")
            return None

        soup = BeautifulSoup(resp.text, "html.parser")
        content_div = soup.find("div", class_="ChapterContent_bible-reader__LmLUa")
        if not content_div:
            print(f"  → No content found for {book_code} {chapter_num}")
            return None

        verses = []
        for verse_span in content_div.find_all("span", class_="ChapterContent_verse__57FIw"):
            label = verse_span.find("span", class_="ChapterContent_label__R2PLt")
            verse_num = label.get_text(strip=True) if label else "?"
            
            # Get all text after the label (exclude notes, headings, etc.)
            text_parts = []
            for child in verse_span.children:
                if child.name == "span" and "label" in child.get("class", []):
                    continue
                if child.name == "span" and "note" in child.get("class", []):
                    continue  # skip footnotes
                text_parts.append(child.get_text(strip=True))
            
            verse_text = clean_text(" ".join(text_parts))
            if verse_text:
                verses.append(f"{verse_num} {verse_text}")

        return verses

    except Exception as e:
        print(f"  → Error on {book_code} {chapter_num}: {e}")
        return None


def main():
    all_lines = []
    total_chapters = sum(ch for _, ch in books)
    processed = 0

    print(f"Starting scrape of Eastern Tamang NT (tajNT) – {len(books)} books, ~{total_chapters} chapters\n")

    for book_code, max_chap in books:
        print(f"→ {book_code} ({max_chap} chapters)")
        book_lines = [f"\n=== {book_code} ===\n"]

        for chap in range(1, max_chap + 1):
            processed += 1
            verses = scrape_chapter(book_code, chap)
            if verses:
                book_lines.append(f"\n--- Chapter {chap} ---\n")
                book_lines.extend(verses)
                book_lines.append("")  # blank line
            else:
                book_lines.append(f"(Chapter {chap} not available or failed)")
            
            print(f"  Chapter {chap:2d}  done  ({processed}/{total_chapters})")
            time.sleep(1.8)  # polite delay ~0.5–0.6 req/sec

        all_lines.extend(book_lines)

    # Save to file
    with OUTPUT_FILE.open("w", encoding="utf-8") as f:
        f.write("\n".join(all_lines).strip() + "\n")

    print(f"\nDone! Saved to: {OUTPUT_FILE.absolute()}")
    print(f"Total lines written: {len(all_lines):,}")


if __name__ == "__main__":
    main()

Starting scrape of Eastern Tamang NT (tajNT) – 27 books, ~260 chapters

→ MAT (28 chapters)
  Chapter  1  done  (1/260)
  Chapter  2  done  (2/260)
  Chapter  3  done  (3/260)
  Chapter  4  done  (4/260)
  Chapter  5  done  (5/260)
  Chapter  6  done  (6/260)
  Chapter  7  done  (7/260)
  Chapter  8  done  (8/260)
  Chapter  9  done  (9/260)
  Chapter 10  done  (10/260)
  Chapter 11  done  (11/260)
  Chapter 12  done  (12/260)
  Chapter 13  done  (13/260)
  Chapter 14  done  (14/260)
  Chapter 15  done  (15/260)
  Chapter 16  done  (16/260)
  Chapter 17  done  (17/260)
  Chapter 18  done  (18/260)
  Chapter 19  done  (19/260)
  Chapter 20  done  (20/260)
  Chapter 21  done  (21/260)
  Chapter 22  done  (22/260)
  Chapter 23  done  (23/260)
  Chapter 24  done  (24/260)
  Chapter 25  done  (25/260)
  Chapter 26  done  (26/260)
  Chapter 27  done  (27/260)
  Chapter 28  done  (28/260)
→ MRK (16 chapters)
  Chapter  1  done  (29/260)
  Chapter  2  done  (30/260)
  Chapter  3  done  (31/260

In [17]:
import os
import re
from collections import OrderedDict   #to preserve insertion order while removing duplicates

# List of input files
files_to_merge = [
    "tamangnews_scraped1.txt",
    "tamang_bible.txt",
    "tamang_bible2.txt",
    "tamang_scraped2.txt"
]

output_file = "tamang_cleaned.txt"

# Pattern: remove English letters, ASCII digits, Devanagari digits, and many common punctuation/symbols
# You can adjust this pattern if you want to keep more/less characters
clean_pattern = re.compile(r"[A-Za-z0-9\u0966-\u096F,.\-!?;:\"'()\[\]{}<>@#$%^&*_+=/\\|~`]")

# track unique sentences while preserving order
seen = OrderedDict()   # key = sentence, value = None (just using it as ordered set)

total_lines_read = 0
total_sentences_before_dedup = 0
total_duplicates_removed = 0

for fname in files_to_merge:
    if not os.path.exists(fname):
        print(f"File not found, skipping: {fname}")
        continue
    
    print(f"Processing: {fname}")
    
    with open(fname, 'r', encoding='utf-8') as infile:
        for line in infile:
            line = line.strip()
            if not line:
                continue
            
            total_lines_read += 1
            
            # Remove unwanted characters
            cleaned = clean_pattern.sub('', line)
            
            # Normalize multiple spaces → single space
            cleaned = re.sub(r'\s+', ' ', cleaned).strip()
            
            if not cleaned:
                continue
            
            # Split into sentences using Devanagari danda (। and ॥)
            sentences = re.split(r'[।॥]', cleaned)
            
            for sent in sentences:
                sent = sent.strip()
                if not sent:
                    continue
                
                total_sentences_before_dedup += 1
                
                #Only keep if we haven't seen it before
                if sent not in seen:
                    seen[sent] = None
                else:
                    total_duplicates_removed += 1

print(f"Files processed           : {len(files_to_merge)}")
print(f"Total lines read          : {total_lines_read:,}")
print(f"Total sentences before dedup : {total_sentences_before_dedup:,}")
print(f"Duplicate sentences removed  : {total_duplicates_removed:,}")
print(f"Unique sentences kept        : {len(seen):,}")

# Write unique sentences in the order they first appeared
with open(output_file, 'w', encoding='utf-8') as outfile:
    for sentence in seen:
        outfile.write(sentence + "\n")

print(f"Done!")
print(f"Cleaned & deduplicated corpus saved to: {output_file}")
print(f"Final sentence count: {len(seen):,}")

Processing: tamangnews_scraped1.txt
Processing: tamang_bible.txt
Processing: tamang_bible2.txt
Processing: tamang_scraped2.txt

Files processed           : 4
Total lines read          : 9,792
Total sentences before dedup : 16,388
Duplicate sentences removed  : 1,669
Unique sentences kept        : 14,719

Done!
Cleaned & deduplicated corpus saved to: tamang_cleaned.txt
Final sentence count: 14,719
