In [3]:
#Parse with BeautifulSoup
from bs4 import BeautifulSoup

soup = BeautifulSoup(response.text, 'html.parser')
print(soup.prettify())  # prints well-formatted HTML

<!DOCTYPE html>
<html lang="en">
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <title>
   Newa News - NepalBhasa News Portal
  </title>
  <link href="https://newanews.com/wp-content/themes/creation_news_theme/assets/style.css" rel="stylesheet"/>
  <link href="https://newanews.com/wp-content/themes/creation_news_theme/style.css" rel="stylesheet"/>
  <link href="https://fonts.googleapis.com" rel="preconnect"/>
  <link crossorigin="" href="https://fonts.gstatic.com" rel="preconnect"/>
  <link href="https://newanews.com/wp-content/themes/creation_news_theme/assets/favicon.png" rel="shortcut icon" type="image/x-icon"/>
  <link crossorigin="anonymous" href="https://cdn.jsdelivr.net/npm/bootstrap@5.2.3/dist/css/bootstrap.min.css" integrity="sha384-rbsA2VBKQhggwzxH7pPCaAqO46MgnOM80zW1RWuH61DGLwZJEdK2Kadq2F9CUG65" rel="stylesheet"/>
  <link href="https://fonts.googleapis.com/css2?family=Poppins:wght@300;500&amp;display=swap" rel="sty

In [4]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

BASE_URL = "https://newanews.com/"

headers = {
    "User-Agent": "Mozilla/5.0"
}

response = requests.get(BASE_URL, headers=headers)
response.encoding = "utf-8"
soup = BeautifulSoup(response.text, "html.parser")

article_links = set()

for a in soup.find_all("a", href=True):
    href = a["href"]

    # WordPress article pattern
    if href.startswith("https://newanews.com/20"):
        article_links.add(href)

print("Articles found:", len(article_links))


Articles found: 54


In [1]:
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin

#Get article URLs from homepage
def get_article_links():
    base_url = "https://newanews.com/"
    try:
        r = requests.get(base_url, timeout=10)
        r.raise_for_status()
    except Exception as e:
        print("Cannot reach homepage", e)
        return []

    soup = BeautifulSoup(r.text, 'html.parser')
    article_links = set()

    for a in soup.find_all('a', href=True):
        href = a['href'].strip()
        if not href:
            continue

        #Convert relative to absolute link
        full_url = urljoin(base_url, href)

        #Flexible article URL pattern 
        if re.search(r'newanews\.com/\d{4}/\d{2}/', full_url):
            article_links.add(full_url.rstrip('/'))

    return list(article_links)

#scrape individual article content
def scrape_article_content(article_url):
    try:
        r = requests.get(article_url, timeout=10)
        r.raise_for_status()
    except Exception as e:
        print(f"Failed → {article_url} | {e}")
        return None

    soup = BeautifulSoup(r.text, 'html.parser')

    #title
    title_div = soup.find('div', class_='show-title')
    title = title_div.get_text(strip=True) if title_div else 'No Title Found'

    #content
    content_div = soup.find('div', class_='show-desc')
    if not content_div:
        return None

    #remove unwanted elements
    for bad in content_div.find_all([
        'script', 'style', 'figure', 'iframe', 'form',
        'button', 'nav', 'footer'
    ]):
        bad.decompose()

    content = ' '.join(content_div.stripped_strings)

    return {
        'url': article_url,
        'title': title,
        'content': content
    }

def main():
    article_links = get_article_links()
    print(f"Found {len(article_links)} article links.\n")

    all_articles = []

    for i, link in enumerate(article_links, 1):
        article = scrape_article_content(link)
        if article and len(article['content']) > 50:
            all_articles.append(article)
            print(f"[{i}] Scraped → {article['title'][:60]}")
        else:
            print(f"[{i}] Skipped → no content")

    #SAVE TO FILE
    with open('newari_scraped1.txt', 'w', encoding='utf-8') as f:
        for article in all_articles:
            f.write(article['title'] + "\n")

            #remove end-of-sentence syntax (|)
            for sentence in article['content'].split('।'):
                s = sentence.strip()
                if s:
                    f.write(s + "\n")

    print("\nScraping complete.")
    print(f"Saved {len(all_articles)} articles to newari_scraped1.txt")

if __name__ == "__main__":
    main()

Found 54 article links.

[1] Scraped → स्लोभेनियाया नायः पदय् वीरमान श्रेष्ठ, एनआरएनए नेटवर्क ९० गू
[2] Scraped → केपी ओली कपय् मच्छिन्द्र त्यात, सनिश श्रेष्ठ उत्कृष्ट कासामि
[3] Scraped → बसन्तपुरय् चीर स्वाना होलि न्ह्यात
[4] Scraped → लितवःगु पौभा तइगु इतुम्बहाल म्यूजियमयात १ करोड बजेट : उपमेयर
[5] Scraped → चिकित्सा परिषदय् ५ म्ह दुजः नियुक्त
[6] Scraped → स्वयम्भू महाेत्सवय् चित्रकला धेधेबल्ला क्वचाल
[7] Scraped → अधिकारया खँ जक ल्हाना समाज खःगु लँपुइ वनी मखु – उपप्रधानमन्त
[8] Scraped → लुँया भाः प्रतितोला १२०० तका थहाँ वन
[9] Scraped → १६ करोड बराबरया रंग आयात
[10] Scraped → अन्तर्राष्ट्रिय मिसा फुटबलया नितिं नेपाली टिम तयार
[11] Scraped → येँ जिल्ला प्रशासनं हाेलीया नितिं सचेत यात
[12] Scraped → इजरायली सेनाया गाजा ब्वनेकुथिइ आक्रमणं निम्ह सित
[13] Scraped → थाैंनिसें खेँ दनिगु
[14] Scraped → सुमी मल्लं ‘मिसेज भोग नेपाल–२०२५’ त्याकल
[15] Scraped → पेट्रोलया ५ तका, डिजेल व मचिकंया ४ तका दन
[16] Scraped → थौं सर्वोच्च अदालतया छुं नं इजलास मच्वनीगु
[17] Scraped → देय् न्यंकं डेंगु

In [7]:
import requests
from bs4 import BeautifulSoup
import re

def get_article_urls():
    url = "https://nepalbhasatimes.com/"
    try:
        r = requests.get(url, timeout=10)
        r.raise_for_status()
    except Exception as e:
        print("Cannot reach homepage →", e)
        return []

    soup = BeautifulSoup(r.text, "html.parser")
    links = set()

    for a in soup.find_all("a", href=True):
        href = a["href"]
        if re.search(r'/\d{4,6}$', href) or re.search(r'/\d{4}/\d{1,2}/[^/]+/?$', href):
            full = "https://nepalbhasatimes.com" + href if href.startswith("/") else href
            links.add(full)
    return sorted(links)

def scrape_one_article(url):
    try:
        r = requests.get(url, timeout=10)
        r.raise_for_status()
    except Exception as e:
        print("Failed")
        return None

    soup = BeautifulSoup(r.text, "html.parser")

    #Title 
    title_tag = soup.find("h1") or soup.find("title")
    title = title_tag.get_text(strip=True) if title_tag else "No title"

    #content 
    body = (
        soup.find("div", class_="blog-single-details") or
        soup.find("article") or
        soup.find("div", {"itemprop": "articleBody"})
    )

    if not body:
        return {"url": url, "title": title, "content": "No content found"}


    #Clean unwanted elements
    for junk in body.find_all(["script", "style", "iframe", "form",
                               {"class": re.compile("sharedaddy|sd-|advert|banner", re.I)}]):
        junk.decompose()

    # ---- Extract paragraphs and remove English letters/digits ----
    paragraphs = [
    re.sub(r'[A-Za-z0-9]', '', p.get_text(strip=True))  # removes English letters & digits
    for p in body.find_all("p")
    if p.get_text(strip=True)
    ]
    return {
    "url": url,
    "title": title,
    "content": "\n\n".join(paragraphs) if paragraphs else "No readable text"
    }
    
def main():
    print("Looking for articles on homepage...\n")
    urls = get_article_urls()

    if not urls:
        print("Found 0 articles")
        return

    print(f"Found {len(urls)} possible articles\n")
    results = []
    for i, url in enumerate(urls[:], 1):  
        data = scrape_one_article(url)
        if data and len(data["content"]) > 100:
            results.append(data)
            print(data['title'])
        else:
            print("Failed")

    #Save  text file
    if results:
        with open("newari_scraped2.txt", "w", encoding="utf-8") as f:
            for art in results:
                f.write(art['title'] + "\n")
                for sentence in art["content"].split("।"):
                    s = sentence.strip()
                    if s:
                        f.write(s + "\n")
        print(f"\nSaved {len(results)} articles to newari_scraped2.txt")
    else:
        print("\nNothing useful was scraped.")

if __name__ == "__main__":
    main()

Looking for articles on homepage...

Found 38 possible articles



  for junk in body.find_all(["script", "style", "iframe", "form",


बुद्धवचन – पालिभाषा
पासाया जंक्व
विश्व सांस्कृतिक उत्सव–२०२३ नेपाःया ज्वःमदुगु प्रस्तुति
बुँख्याचा
तिथिइ जुयाच्वंगु गोलमाल व समाधानया उपाय
नेपालभाषा न्ह्यसःलिसः कासा व न्ह्यसःलिसः
फिफा विश्वकप २०२६ : उद्घाटन कासा मेक्सिको व फाइनल अमेरिकाय्
ग्रामी अवार्ड २०२४ य् टेलर स्वीफ्ट उत्कृष्ट
बेलायतया जुजु चाल्र्सयात क्यान्सर
छगू युगय् छक्वः वःगु भाषिक अधिकार
अर्जेन्टिनायात कीर्तिमानी  कोपा अमेरिकाया उपाधि
थौं आइफोन १६ सार्वजनिक जुइगु
राजमतिया नेपाली भर्सन पिदन
यलया येँयाः
‘कतांमरि’ म्येचाःया पितब्वज्या
प्रमयात रास्वपाया गुहार
महागुथि सहकारीया रजत दँ हन
कर्णालीयात बुकाः जनकपुर फाइनलय्
विश्व साहित्य दबुली नेपालभाषाया उपन्यास
थीथी संस्था व व्यक्तित्वयात हन
लुँ व डलरया भावय् न्हूगु रेकर्ड
प्यम्ह मिसा साहित्यकारया अनुवाद श्रृंखला सफू पिदन
ट्राफिक व्यवस्थापनया निंतिं न्हूगु योजना हइगु
राष्ट्रपतियात भाषा आयोगया प्रतिवेदन
भुखाचं क्षति जूगु सम्पदा पुनःनिर्माण क्वचाल : महर्जन
यलया न्हू पुखूयाः अवैध संरचना थुनाबिल
नेपाल साहित्य मन्दिरया झिंन्याक्वःगु तःमुँज्या
ख्वपय् सांस्कृतिक सम्पदा पदयात्रा क्वचाल
Fail

In [14]:
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/120.0.0.0"
}

def get_article_links():
    homepage = "https://subhaypost.com/"
    try:
        r = requests.get(homepage, headers=headers, timeout=10)
        r.raise_for_status()
    except Exception as e:
        print("Cannot load homepage →", e)
        return []

    soup = BeautifulSoup(r.text, "html.parser")
    links = set()

    # Find article links
    for a in soup.find_all("a", href=True):
        href = a["href"].strip()
        if re.search(r'/\d{4}/\d{2}/\d{2}/\d+/?$', href):
            full_url = urljoin(homepage, href)
            links.add(full_url)

    return sorted(links)

def scrape_article(url):
    try:
        r = requests.get(url, headers=headers, timeout=10)
        r.raise_for_status()
    except Exception as e:
        print("Failed")
        return None

    soup = BeautifulSoup(r.text, "html.parser")

    # Title - usually in <h1>
    title_tag = soup.find("h1")
    title = title_tag.get_text(strip=True) if title_tag else "No title"

    # Main content block - most common on this site
    content_block = (
        soup.find("div", class_="inner-left") or
        soup.find("div", class_="seto_admin") or
        soup.find("div", class_="inner-featured-image") or
        soup.find("article")
    )

    if not content_block:
        return {"url": url, "title": title, "content": "No content found"}

    # Clean junk
    for bad in content_block.find_all(["script", "style", "iframe", "form",
                               {"class": re.compile("sharedaddy|sd-|advert|banner", re.I)}]):
        bad.decompose()

    # ---- Extract paragraphs and remove English letters/digits ----
    paragraphs = [
    re.sub(r'[A-Za-z0-9]', '', p.get_text(strip=True))  # removes English letters & digits
    for p in content_block.find_all("p")
    if p.get_text(strip=True)
    ]
    return {
    "url": url,
    "title": title,
    "content": "\n\n".join(paragraphs) if paragraphs else "No readable text"
    }
    
def main():
    print("Collecting article links from homepage...\n")
    urls = get_article_links()

    if not urls:
        print("No articles found on homepage. Site may load links with JavaScript.")
        return

    print(f"Found {len(urls)} possible articles\n")

    saved_count = 0
    with open("newari_scraped3.txt", "w", encoding="utf-8") as f:
        for i, url in enumerate(urls[:], 1): 
            data = scrape_article(url)
            if data and len(data["content"]) > 150:
                saved_count += 1
                print(data['title'])
                f.write(data['title']+"\n")
                for sentence in data["content"].split("।"):
                    s = sentence.strip()
                    if s:
                        f.write(s + "\n")

    print(f"\nDone! Saved {saved_count} articles to newari_scraped3.txt")


if __name__ == "__main__":
    main()

Collecting article links from homepage...

Found 75 possible articles



  for bad in content_block.find_all(["script", "style", "iframe", "form",


स्थानीयको कडा विरोधका वावजूद महानगरले भत्कायो बांगेमुढाको ‘त्वाःछेँ’, मंका: खल:द्वारा घोर भत्सर्ना
महानगरले त्वा:छेंमा डोजर चलाएको विरोधमा १८ संस्थाद्वारा विरोध प्रदर्शन , बालेनको पुतला दहन !
ताम्सिपाखा सडक महोत्सव -शुक्रबार
शंखरापुरमा धिमे बाजा र रञ्जना लिपि प्रशिक्षण
लाच्छि लर्निङ सेन्टरद्वारा इन्द्र जात्रा (ञेया)काे विषयवस्तुमा चित्रकलाहरुलाई प्रोत्साहन पुरस्कार
अष्ट महास्थानहरू-बुद्धको जीवनसँग सम्बन्धित आठ पवित्र स्थलहरू
समाजसेवामा शंखरापुर नगरका प्रवक्ता : राधाकृष्ण श्रेष्ठ
सम्राट अशोक: विजयादशमी र शान्तिको यात्रा
मालदिभ्समा सम्पन्न शारीरिक सुगठन प्रतियोगितामा पदक जित्न सफल नेपाली खेलाडीहरूलाई प्रधानमन्त्री केपी शर्मा ओलीको सम्मान
रुसी सेनापतिको मस्कोमै हत्या गर्न कसरी सफल भए युक्रेनी जासुस ?
सिटीवान टेलिभिजनको अध्यक्षमा पुनः पवन प्रजापति
सन् २०२४ : विमान दुर्घटनाको दुःखद वर्ष
Failed
नासाले सर्वाधिक तातो वर्षको पुष्टि
सदनमा थन्किएर बसेको शिक्षा विधेयक निर्माणका लागि कांग्रेसले दियो छलफल तीव्रता
कात्तिकेमा स्कुल बस दुर्घटना तीनजनाको मृत्यु, ४० घाइते
बेलून दुर्घटनामा परेका उपप्रधानम

In [21]:
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin

def get_article_links():
    url = "https://elohanprakashan.com.np/"
    try:
        r = requests.get(url, timeout=10)
        r.raise_for_status()
    except Exception as e:
        print("Homepage error:", e)
        return []

    soup = BeautifulSoup(r.text, "html.parser")
    links = set()

    for a in soup.find_all("a", href=True):
        href = a["href"]
        if re.search(r'/\?p=\d+', href) or re.search(r'/\d{4}/\d{2}/[^/]+/?$', href):
            full = urljoin(url, href)
            if "elohanprakashan.com.np" in full:
                links.add(full)

    return sorted(links)


def scrape_article(url):
    try:
        r = requests.get(url, timeout=10)
        r.raise_for_status()
    except Exception as e:
        print("  Failed:", url, "→", e)
        return None

    soup = BeautifulSoup(r.text, "html.parser")

    #Title
    title_tag = soup.find("h1", class_="entry-title") or soup.find("h1")
    title = title_tag.get_text(strip=True) if title_tag else "No title"

    #Main content area
    content_area = (
        soup.find("div", class_="entry-content") or
        soup.find("article") or
        soup.find("div", class_="post-content") or
        soup.find("div", id="content")
    )

    if not content_area:
        return {"url": url, "title": title, "content": "No content found"}

    # Clean junk
    for bad in content_area.find_all(["script", "style", "iframe", "form",
                               {"class": re.compile("sharedaddy|sd-|advert|banner", re.I)}]):
        bad.decompose()

    #Extract paragraphs and remove English letters/digits
    paragraphs = [
    re.sub(r'[A-Za-z0-9]', '', p.get_text(strip=True))  #removes English letters & digits
    for p in content_area.find_all("p")
    if p.get_text(strip=True)
    ]
    return {
    "url": url,
    "title": title,
    "content": "\n\n".join(paragraphs) if paragraphs else "No readable text"
    }

def main():
    print("Finding articles on homepage...\n")
    urls = get_article_links()

    if not urls:
        print("No article links found.")
        return

    print(f"Found {len(urls)} possible articles\n")

    with open("newari_scraped4.txt", "w", encoding="utf-8") as f:
        count = 0
        for i, url in enumerate(urls[:], 1): 
            data = scrape_article(url)
            if data and len(data["content"]) > 100:
                count += 1
                print(data['title'])
                f.write(data['title'] + "\n")
                for sentence in data["content"].split("।"):
                    s = sentence.strip()
                    if s:
                        f.write(s + "\n")

    print(f"\nDone! Saved {count} articles to newari_scraped4.txt")


if __name__ == "__main__":
    main()

Finding articles on homepage...

Found 53 possible articles



  for bad in content_area.find_all(["script", "style", "iframe", "form",


नेवाः संकिपा ख्यलय् लकडाउनया लिच्वः
‘संगीतया क्षेत्र धइगु सुयागुं नकल यानाः वा करपिन्सं यात जिं नं याये धकाः यायेगु ज्या खँ थे ज्यागु क्षेत्र मखु । थ्व पवित्र क्षेत्र खः । ’ सरिता शाही, म्येहालामि
‘.. जिगु सलं जाःगु म्ये न्यानाब्यू धकाः इनाप यायेमाःगु । थ्व खँ लुमना वइबलय् ला फर्निचर हे मिइ अपू धाये..’ – सानुबाबु महर्जन, म्येहालामि
नेपालभाषाया उत्थान जुइमाः धकाः अभिनय यानागु खः । अथेजुयाः दुःख मताः ।’ – तुल्सी डंगोल (कलाकार)
नेवाः उद्यमीलिसे पिंके म्होतिं नं १० लाख तकाया विज्ञापन यायेगु ध्यबा दइ । उकिं १० प्रतिशत जक नेवाः ज्याझ्वःयात फ्यानाबिउ धकाः इनाप याना तर वय्कःपिसं यानामदी ।
लाखे अ डिफरेन्ट मि, छगू न्हूगु प्रयाेग
“तापालय् थ्वःगु सः” डायस्पोरिक बाखं
“पलाःख्वाँय्” त्याजिके माःगु पला
“नीलः” चिबाखंया न्हू ख्वाःपाः
“फूस्वां” सिद्धिचरणया म्हसीका सफू
नां ह्युपिं नेपालभाषाया झिम्ह कलाकार
नेपालभाषाया म्यूजिक भिडियाे ख्यलय् मस्तय्‌गु छ्यलाबुला
गन वन हिराेइन त ?
राजामति – इतिहास व मिथ्या
निक्वःगू हलिं नेवाः न्ह्यसः लिसः कासा ११४३ – अपडेट
छगू फिल्म हे थःगु म्हसीका
समाजया किपा – सफू पितब्वज्य

In [24]:
import requests
from bs4 import BeautifulSoup
import re
import time
from pathlib import Path
from typing import List, Optional

# ────────────────────────────────────────────────
#  CONFIGURATION
# ────────────────────────────────────────────────

VERSION_ID = 1457          
OUTPUT_FILE = Path("newari_bible.txt")

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
}

BASE_URL = "https://www.bible.com/bible/{vid}/{book}.{chapter}.NEW"

#New Testament books (USFM code + number of chapters)
BOOKS = [
    ("MAT", 28), ("MRK", 16), ("LUK", 24), ("JHN", 21),
    ("ACT", 28), ("ROM", 16), ("1CO", 16), ("2CO", 13),
    ("GAL", 6),  ("EPH", 6),  ("PHP", 4),  ("COL", 4),
    ("1TH", 5),  ("2TH", 3),  ("1TI", 6),  ("2TI", 4),
    ("TIT", 3),  ("PHM", 1),  ("HEB", 13), ("JAS", 5),
    ("1PE", 5),  ("2PE", 3),  ("1JN", 5),  ("2JN", 1),
    ("3JN", 1),  ("JUD", 1),  ("REV", 22)
]

def clean_verse_text(text: str) -> str:
    """Remove verse numbers, multiple spaces, footnotes, English words, etc."""
    #Remove leading verse number
    text = re.sub(r'^\d+\s*', '', text.strip())
    #Remove footnote markers and content inside them
    text = re.sub(r'#.*?(?=\s|$)', '', text)
    #Collapse multiple spaces,newlines
    text = re.sub(r'\s+', ' ', text)
    #Remove trailing period 
    text = re.sub(r'\s*\।?\s*$', '', text)
    return text.strip()


def scrape_chapter(book_code: str, chapter: int) -> Optional[List[str]]:
    url = BASE_URL.format(vid=VERSION_ID, book=book_code, chapter=chapter)
    
    try:
        resp = requests.get(url, headers=HEADERS, timeout=10)
        resp.raise_for_status()
    except requests.RequestException as e:
        print(f"  → Failed to fetch {book_code} {chapter}  ({e})")
        return None

    soup = BeautifulSoup(resp.text, "html.parser")

    #Main content container
    container = soup.find("div", class_="ChapterContent_bible-reader__LmLUa")
    if not container:
        print(f"  → No chapter content found for {book_code} {chapter}")
        return None

    verses = []

    #Find all verse spans
    for verse_span in container.find_all("span", class_="ChapterContent_verse__57FIw"):
        # Verse number
        label = verse_span.find("span", class_="ChapterContent_label__R2PLt")
        verse_num = label.get_text(strip=True) if label else "?"

        #Collect text
        text_parts = []
        for child in verse_span.descendants:
            if child.name == "span":
                classes = child.get("class", [])
                if "label" in classes or "note" in classes or "heading" in classes:
                    continue
            if isinstance(child, str) and child.strip():
                text_parts.append(child.strip())

        verse_text = clean_verse_text(" ".join(text_parts))
        if verse_text:
            verses.append(verse_text)

    return verses


def main():
    output_lines = []
    total_chapters = sum(chap for _, chap in BOOKS)

    print(f"Scraping {VERSION_ID} – {len(BOOKS)} books ≈ {total_chapters} chapters\n")

    processed = 0

    for book_code, max_chap in BOOKS:
        print(f"→ {book_code} ({max_chap} chapters)")
        book_lines = [f"\n=== {book_code} ===\n"]

        for chap in range(1, max_chap + 1):
            processed += 1
            verses = scrape_chapter(book_code, chap)

            if verses:
                book_lines.append(f"--- Chapter {chap} ---")
                book_lines.extend(verses)
                book_lines.append("")
            else:
                book_lines.append(f"(Chapter {chap} failed or not available)")

            print(f"  {chap:2d}  done  ({processed}/{total_chapters})")
            time.sleep(2.1)   # polite delay (~0.47 req/s)

        output_lines.extend(book_lines)

    # Save
    with OUTPUT_FILE.open("w", encoding="utf-8") as f:
        f.write("\n".join(line.rstrip() for line in output_lines if line.strip()) + "\n")

    print(f"\nDone. Saved to: {OUTPUT_FILE.absolute()}")
    print(f"Total lines: {len(output_lines):,}")


if __name__ == "__main__":
    main()

Scraping 1457 – 27 books ≈ 260 chapters

→ MAT (28 chapters)
   1  done  (1/260)
   2  done  (2/260)
   3  done  (3/260)
   4  done  (4/260)
   5  done  (5/260)
   6  done  (6/260)
   7  done  (7/260)
   8  done  (8/260)
   9  done  (9/260)
  10  done  (10/260)
  11  done  (11/260)
  12  done  (12/260)
  13  done  (13/260)
  14  done  (14/260)
  15  done  (15/260)
  16  done  (16/260)
  17  done  (17/260)
  18  done  (18/260)
  19  done  (19/260)
  20  done  (20/260)
  21  done  (21/260)
  22  done  (22/260)
  23  done  (23/260)
  24  done  (24/260)
  25  done  (25/260)
  26  done  (26/260)
  27  done  (27/260)
  28  done  (28/260)
→ MRK (16 chapters)
   1  done  (29/260)
   2  done  (30/260)
   3  done  (31/260)
   4  done  (32/260)
   5  done  (33/260)
   6  done  (34/260)
   7  done  (35/260)
   8  done  (36/260)
   9  done  (37/260)
  10  done  (38/260)
  11  done  (39/260)
  12  done  (40/260)
  13  done  (41/260)
  14  done  (42/260)
  15  done  (43/260)
  16  done  (44/260)
→ LU

In [29]:
import os
import re
from collections import OrderedDict

# List of files to merge
files_to_merge = [
    "newari_scraped1.txt",
    "newari_scraped2.txt",
    "newari_scraped3.txt",
    "newari_scraped4.txt",
    "newari_bible.txt"
]

output_file = "newari_cleaned.txt"

'''Remove unwanted characters:
# English letters A-Z, a-z
# English digits 0-9
# Devanagari digits \u0966-\u096F (०-९)
# Common punctuation and special characters
'''
clean_pattern = re.compile(r"[A-Za-z0-9\u0966-\u096F,.\-!?;:\"'()\[\]{}<>@#$%^&*_+=/\\|~`]")

# Track unique sentences while preserving order
seen = OrderedDict()

total_sentences_read = 0
total_duplicates_removed = 0

with open(output_file, 'w', encoding='utf-8') as outfile:
    for fname in files_to_merge:
        if not os.path.exists(fname):
            print(f"File not found: {fname}")
            continue

        with open(fname, 'r', encoding='utf-8') as infile:
            for line in infile:
                line = line.strip()
                if not line:
                    continue

                #Remove unwanted characters
                line = clean_pattern.sub('', line)
                #Normalize spaces
                line = re.sub(r'\s+', ' ', line).strip()
                if not line:
                    continue

                sentences = re.split(r'[।॥]', line)
                for sentence in sentences:
                    sentence = sentence.strip()
                    if not sentence:
                        continue

                    total_sentences_read += 1

                    #remove end-of-sentence syntax (|)
                    if sentence not in seen:
                        seen[sentence] = None
                        outfile.write(sentence + "\n")
                    else:
                        total_duplicates_removed += 1

print("Done!")
print(f"Total sentences read: {total_sentences_read}")
print(f"Duplicates removed: {total_duplicates_removed}")
print(f"Total unique sentences saved: {len(seen)}")
print(f"Saved to: {output_file}")


Done!
Total sentences read: 21414
Duplicates removed: 2494
Total unique sentences saved: 18920
Saved to: newari_cleaned.txt


In [6]:
import os
import re
from collections import OrderedDict

files_to_merge = [
    "newari_scraped1.txt",
    "newari_scraped2.txt",
    "newari_scraped3.txt",
    "newari_scraped4.txt",
    "newari_bible.txt"
]

output_file = "newari_cleaned1.txt"

# Remove English letters, digits, Devanagari digits, punctuation
clean_pattern = re.compile(r"[A-Za-z0-9\u0966-\u096F,.\-!?;:\"'()\[\]{}<>@#$%^&*_+=/\\|~`]")

seen = OrderedDict()
total_lines_read = 0
duplicates_removed = 0

for fname in files_to_merge:
    if not os.path.exists(fname):
        print(f"File not found: {fname}")
        continue

    with open(fname, 'r', encoding='utf-8') as infile:
        for line in infile:
            total_lines_read += 1

            #Clean characters 
            cleaned_line = clean_pattern.sub('', line)

            #Remove duplicates based on full cleaned line
            if cleaned_line not in seen:
                seen[cleaned_line] = None
            else:
                duplicates_removed += 1

#Write cleaned unique lines
with open(output_file, 'w', encoding='utf-8') as outfile:
    for line in seen:
        outfile.write(line)

print("Done!")
print("Total lines read:", total_lines_read)
print("Duplicates removed:", duplicates_removed)
print("Total unique lines saved:", len(seen))
print("Saved to:", output_file)

Done!
Total lines read: 14774
Duplicates removed: 771
Total unique lines saved: 14003
Saved to: newari_cleaned1.txt


In [48]:
with open("newari_cleaned1.txt", "r", encoding="utf-8") as f:
    text = f.read()

words = text.split()
print("Total words:", len(words))

Total words: 218558


In [49]:
#Downsampling data
import random
import re

target_size = 179000  

lang_file = "newari_cleaned1.txt"
with open(lang_file, "r", encoding="utf-8") as f:
    words = f.read().split()

#Clean words: keep only Devanagari
def clean_word(word):
    word = re.sub(r'[^ऀ-ॿ]', '', word)  #remove non-Devanagari
    return word.strip()

#Clean all words
cleaned_words = [clean_word(w) for w in words if clean_word(w)]

#Downsample to exact target size
if len(cleaned_words) > target_size:
    cleaned_words = random.sample(cleaned_words, target_size)
elif len(cleaned_words) < target_size:
    
    cleaned_words = cleaned_words * (target_size // len(cleaned_words)) + cleaned_words[:target_size % len(cleaned_words)]

#Save
out_file = "newari_balanced.txt"
with open(out_file, "w", encoding="utf-8") as f:
    f.write(" ".join(cleaned_words))

print(f"Newari done: {len(cleaned_words)} words saved as {out_file}")

Newari done: 179000 words saved as newari_balanced.txt
