<a href="https://colab.research.google.com/github/gbeyderman/gbeyderman/blob/gh-pages/Chassidic_mBERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

We’ll use a combination of Hebrew Chassidus texts and English translations/commentaries to maintain mBERT’s bilingual strengths.

Action Steps:
Hebrew Texts: Scrape digitized Hebrew texts from the Chabad Library (https://chabadlibrary.org/books).
English Texts: Collect English translations of Chabad texts (e.g., Lessons in Tanya, Chassidic discourses from Kehot, and other sources).
Combine Datasets:
Merge Hebrew and English texts into a bilingual dataset.
Ensure a balance between the two languages to retain mBERT's multilingual capabilities.

In [2]:
import requests
from bs4 import BeautifulSoup
import os

# One output folder per language; each is created right after it is named so
# later writes never hit a missing directory.
OUTPUT_DIR_HE = "chabad_texts_hebrew"
os.makedirs(OUTPUT_DIR_HE, exist_ok=True)

OUTPUT_DIR_EN = "chabad_texts_english"
os.makedirs(OUTPUT_DIR_EN, exist_ok=True)

# Hebrew source pages to download. The single entry is a placeholder:
# replace it with real book URLs and extend the list as needed.
hebrew_links = ["https://chabadlibrary.org/books/book1"]


def scrape_and_save(url, output_dir, timeout=30):
    """Download one page, strip its markup and save the text as UTF-8.

    The output file is named after the last path segment of ``url`` with
    ``.txt`` appended, and is written inside ``output_dir``.

    Args:
        url: Address of the page to download.
        output_dir: Directory that receives the ``.txt`` file; it must
            already exist.
        timeout: Seconds to wait for the server before giving up, so an
            unresponsive host cannot stall the notebook indefinitely.

    Returns:
        The path of the file that was written, or ``None`` when the page
        could not be fetched (network error or a non-200 status).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException as e:
        # Report and carry on so one unreachable URL does not abort a loop
        # over many links.
        print(f"Failed to fetch: {url} ({e})")
        return None

    if response.status_code != 200:
        print(f"Failed to fetch: {url}")
        return None

    # Hand BeautifulSoup the raw bytes so it picks the encoding from the
    # document itself; response.text depends on the HTTP header charset,
    # which can mis-decode non-Latin pages when the header omits it.
    soup = BeautifulSoup(response.content, "html.parser")
    text = soup.get_text()

    # Ignore a trailing slash so ".../book1/" still yields "book1.txt"
    # instead of a file called ".txt".
    page_name = url.rstrip("/").split("/")[-1]
    filename = os.path.join(output_dir, page_name + ".txt")
    with open(filename, "w", encoding="utf-8") as out:
        out.write(text)
    print(f"Saved: {filename}")
    return filename

# Fetch every Hebrew source page and store its text in the Hebrew output folder
for link in hebrew_links:
    scrape_and_save(url=link, output_dir=OUTPUT_DIR_HE)


Saved: chabad_texts_hebrew/book1.txt


In [4]:
import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

OUTPUT_DIR_EN = "chabad_texts_english"
os.makedirs(OUTPUT_DIR_EN, exist_ok=True)

# Index page that lists the English texts
base_url = "https://www.chabad.org/library/article_cdo/aid/1757026/jewish/Chassidic-Texts.htm"

# Seconds to wait on any single request; without a limit an unresponsive
# server would hang the cell forever.
REQUEST_TIMEOUT = 30

# Always defined, even when the index page cannot be fetched.
links = []

# NOTE(review): the recorded run of this cell received HTTP 403 from the index
# page. Presumably the site rejects this client; confirm whether scraping is
# permitted and whether specific request headers are required.
try:
    response = requests.get(base_url, timeout=REQUEST_TIMEOUT)
except requests.exceptions.RequestException as e:
    # Same reporting style as the per-link fetches below.
    response = None
    print(f"Failed to fetch the index page: {str(e)}")

if response is None:
    # The network-level failure was already reported above.
    pass
elif response.status_code == 200:
    soup = BeautifulSoup(response.text, "html.parser")

    # urljoin resolves root-relative hrefs against the site and leaves
    # absolute hrefs untouched (plain concatenation would double-prefix them).
    # dict.fromkeys drops repeated links while keeping their page order.
    links = list(dict.fromkeys(
        urljoin(base_url, a["href"])
        for a in soup.find_all("a", href=True)
        if "/library/article_cdo/" in a["href"]
    ))

    # Scrape and save each linked text
    for link in links:
        try:
            res = requests.get(link, timeout=REQUEST_TIMEOUT)
            if res.status_code == 200:
                page_soup = BeautifulSoup(res.text, "html.parser")
                text = page_soup.get_text()
                # Ignore a trailing slash so the name is never just ".txt".
                # NOTE(review): two links that share a final path segment
                # overwrite each other's file; verify that names are unique.
                page_name = link.rstrip("/").split("/")[-1]
                filename = os.path.join(OUTPUT_DIR_EN, page_name + ".txt")
                with open(filename, "w", encoding="utf-8") as file:
                    file.write(text)
                print(f"Saved: {filename}")
            else:
                print(f"Failed to fetch {link}: HTTP Status {res.status_code}")
        except requests.exceptions.RequestException as e:
            print(f"Failed to fetch {link}: {str(e)}")
else:
    print(f"Failed to fetch the index page. HTTP Status {response.status_code}")
    if response.status_code == 404:
        print("Reason: The page does not exist (404).")
    elif response.status_code == 403:
        print("Reason: Access is forbidden (403).")
    elif response.status_code == 500:
        print("Reason: Server error (500).")
    else:
        print("Reason: Unknown error occurred.")


Failed to fetch the index page. HTTP Status 403
Reason: Access is forbidden (403).
