In [47]:
import html
from bs4 import BeautifulSoup
from glob import glob
from itertools import chain
from pathlib import Path
import pandas as pd
import requests
import numpy as np
from tqdm import tqdm
import json

## Arabic - Hebrew

In [16]:
# Crawl the paginated arabit.me index and keep every HTTP response in memory.
arabit_browse_template = "https://arabit.me/page/{page_number}/"
# Browser-like User-Agent sent with the requests in this notebook.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
}
LAST_BROWSE_PAGE = 231  # listing pages 1..231 (inclusive) are fetched
BROWSE_TIMEOUT_S = 30   # without a timeout a stalled connection blocks the cell forever
page_htmls = []
for page_number in tqdm(range(1, LAST_BROWSE_PAGE + 1)):
    page_url = arabit_browse_template.format(page_number=page_number)
    page_html = requests.get(page_url, headers=headers, timeout=BROWSE_TIMEOUT_S)
    # Fail loudly rather than storing an error page that would later parse to zero entries.
    page_html.raise_for_status()
    page_htmls.append(page_html)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 231/231 [05:58<00:00,  1.55s/it]


In [17]:
# Cache every listing page on disk; the file name is the 1-based position in page_htmls.
browse_dir = Path('data/arabit/browse')
browse_dir.mkdir(parents=True, exist_ok=True)  # a fresh checkout has no data directory yet
for page_number, page in enumerate(page_htmls, start=1):
    # Explicit UTF-8: the pages hold Hebrew text, which the platform default
    # encoding (e.g. cp1252) cannot necessarily represent.
    with open(browse_dir / str(page_number), 'w', encoding='utf-8') as f:
        f.write(page.text)

In [23]:
# Collect a (title, link) pair for every <h2 class="entry-title"> heading
# found on the fetched listing pages, in page order.
word_pages = []
for response in page_htmls:
    listing = BeautifulSoup(response.text, 'html.parser')
    word_pages.extend(
        (heading.text.strip(), heading.find('a').get("href"))
        for heading in listing.find_all('h2', class_='entry-title')
    )

In [24]:
# Tabulate the scraped (title, link) pairs, one row per heading collected above.
wp_df = pd.DataFrame(word_pages, columns=['title', 'link'])
wp_df.tail()

Unnamed: 0,title,link
2297,איכס,https://arabit.me/%d7%90%d7%99%d7%9b%d7%a1/
2298,כלוא,https://arabit.me/%d7%9b%d7%9c%d7%95%d7%90/
2299,נעל ישנה,https://arabit.me/%d7%a0%d7%a2%d7%9c-%d7%99%d7...
2300,טבח,https://arabit.me/%d7%98%d7%91%d7%97/
2301,נביא,https://arabit.me/%d7%a0%d7%91%d7%99%d7%90/


In [25]:
# (rows, columns) of the word-page table.
wp_df.shape

(2302, 2)

In [28]:
# Number of distinct titles; compare with the row count to see how many titles repeat.
wp_df.title.nunique()

2167

In [27]:
# Number of distinct links; equal to the row count when every link is unique.
wp_df.link.nunique()

2302

In [26]:
# Persist the (title, link) table; the DataFrame index is not written.
wp_df.to_csv('data/arabit/pages.csv', index=False)

In [33]:
import os

In [35]:
from concurrent.futures import ThreadPoolExecutor

def save_page(params):
    """Download one word page and cache it as ``data/arabit/word_pages/<n>.html``.

    Args:
        params: An ``(index, title, link)`` triple, such as a row yielded by
            ``wp_df.itertuples()``. The file number ``n`` is ``index + 1``.

    Pages whose output file already exists are skipped, so re-running resumes
    an interrupted crawl instead of downloading everything again.
    """
    i, title, link = params
    output_path = f'data/arabit/word_pages/{i + 1}.html'
    if os.path.exists(output_path):
        return
    # A timeout keeps one stalled connection from blocking a worker forever.
    word_page_html = requests.get(link, headers=headers, timeout=30)
    # NOTE(review): the body is written whatever the HTTP status is, so an error
    # page would be cached and never retried; kept because later cells open
    # every numbered file unconditionally.
    # Explicit UTF-8 so the Hebrew/Arabic content does not depend on the platform default.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(word_page_html.text)

# Fetch the word pages on 10 worker threads. list() drains the lazy map so that
# tqdm advances and any exception raised in a worker surfaces here.
with ThreadPoolExecutor(max_workers=10) as executor:
    list(tqdm(executor.map(save_page, wp_df.itertuples()), total=len(wp_df)))

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2302/2302 [17:21<00:00,  2.21it/s]


In [95]:
import re
import unicodedata


def normalize_text(text):
    """Return ``text`` with all non-spacing combining marks removed.

    The string is first decomposed to NFD so that accents and vowel points
    become separate code points, then every character whose Unicode category
    is ``Mn`` is dropped. The result stays in decomposed form.
    """
    decomposed = unicodedata.normalize('NFD', text)
    kept = []
    for ch in decomposed:
        if unicodedata.category(ch) == 'Mn':
            continue
        kept.append(ch)
    return ''.join(kept)


def find_image(soup):
    """Return ``(caption, url)`` of the first ``ImageObject`` in the page's Yoast schema.

    The JSON is read from the ``<script class="yoast-schema-graph">`` element
    and its ``"@graph"`` list is scanned in order.

    Returns:
        A ``(caption, url)`` tuple, or ``None`` when the graph contains no
        ``ImageObject`` entry.

    Raises:
        AttributeError: if the script element is missing.
        KeyError: if ``"@graph"`` is absent or the image entry lacks
            ``"caption"`` or ``"url"``.
    """
    schema_tag = soup.find("script", class_="yoast-schema-graph")
    graph = json.loads(schema_tag.string)["@graph"]
    first_image = next(
        (node for node in graph if node.get("@type") == 'ImageObject'),
        None,
    )
    if first_image is None:
        return None
    return first_image["caption"], first_image["url"]
        
        
# Characters that end the leading part of a caption.
SEPS = "();,"
# rsplit(text) splits text at every separator character; each one is
# backslash-escaped and the alternatives are joined with "|".
_seps_pattern = "|".join("\\" + sep for sep in SEPS)
rsplit = re.compile(_seps_pattern).split


def find_arabic_words(s):
    """Return the Arabic-script phrases found in ``s``, in order of appearance.

    A phrase is a maximal run of characters from the Unicode Arabic block
    (U+0600-U+06FF) and whitespace, so multi-word expressions stay together.
    Each run is stripped of surrounding whitespace and kept only if more than
    one character remains.

    Stripping before filtering matters because the pattern also matches
    whitespace: otherwise runs of spaces or tabs alone (for example line
    indentation) would be returned as "words", and real matches would carry
    stray leading and trailing spaces.
    """
    pattern = r'[\u0600-\u06FF\s]+'
    phrases = (run.strip() for run in re.findall(pattern, s))
    return [phrase for phrase in phrases if len(phrase) > 1]


# For every cached word page: read the image caption/URL from the Yoast schema,
# then look in the body text for the line containing the caption and take the
# first Arabic phrase on that line.
words = []
for i, title, link in tqdm(list(wp_df.itertuples()), total=len(wp_df)):
    wp_path = f'data/arabit/word_pages/{i + 1}.html'
    # Explicit UTF-8 so decoding does not depend on the platform default.
    with open(wp_path, 'r', encoding='utf-8') as f:
        wp_html = f.read()
    soup = BeautifulSoup(wp_html, 'html.parser')
    try:
        caption, image = find_image(soup)
    except Exception:
        # `except Exception` (not a bare except) so Ctrl-C / kernel interrupt
        # still stops the loop; pages without usable image metadata are skipped.
        print(f"Failed: {link}")
        continue
    # Leading part of the caption (before any separator), unquoted and with
    # combining marks removed, used as the search key below.
    caption_part = normalize_text(rsplit(caption)[0].strip(" “\""))

    # Find the first body line that contains the key and also has Arabic text.
    body = soup.find("body")
    body_lines = body.text.split("\n") if body is not None else []
    arabic_word = None
    for line in body_lines:
        line_norm = normalize_text(line)
        if caption_part in line_norm:
            arabic_words = find_arabic_words(line)
            if arabic_words:
                arabic_word = arabic_words[0]
                break
    words.append((i+1, title, link, image, caption, caption_part, arabic_word))


 26%|████████████████████████████████████████▌                                                                                                                | 610/2302 [00:27<01:01, 27.45it/s]

Failed: https://arabit.me/%d7%9e%d7%9b%d7%95%d7%9c%d7%aa/


 29%|███████████████████████████████████████████▋                                                                                                             | 657/2302 [00:30<01:05, 25.02it/s]

Failed: https://arabit.me/%d7%a4%d7%9c%d7%99%d7%a9%d7%94/


 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                 | 1558/2302 [01:10<00:26, 28.11it/s]

Failed: https://arabit.me/%d7%a9%d7%9c%d7%95%d7%9d-2/
Failed: https://arabit.me/%d7%94%d7%a9%d7%aa%d7%97%d7%a8%d7%a8-%d7%94%d7%aa%d7%a8%d7%95%d7%a4%d7%a3/
Failed: https://arabit.me/%d7%a4%d7%a8%d7%97%d7%97%d7%99%d7%9d/


 69%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                              | 1596/2302 [01:12<00:25, 27.95it/s]

Failed: https://arabit.me/%d7%92%d7%9c%d7%a2%d7%99%d7%9f/


 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                              | 1602/2302 [01:12<00:38, 18.00it/s]

Failed: https://arabit.me/%d7%99%d7%95%d7%a6%d7%90-%d7%9e%d7%92%d7%93%d7%a8-%d7%94%d7%a8%d7%92%d7%99%d7%9c/
Failed: https://arabit.me/%d7%9c%d7%9b%d7%9c-%d7%94%d7%a8%d7%95%d7%97%d7%95%d7%aa/
Failed: https://arabit.me/%d7%96%d7%99%d7%95%d7%a3-%d7%91%d7%a9%d7%99%d7%a8%d7%94/
Failed: https://arabit.me/%d7%90%d7%a1%d7%95%d7%a8/
Failed: https://arabit.me/%d7%a9%d7%90%d7%9c%d7%9c%d7%94-%d7%99%d7%a2%d7%96%d7%95%d7%a8-%d7%9c%d7%9a/
Failed: https://arabit.me/%d7%97%d7%9c%d7%a9/


 70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                             | 1608/2302 [01:12<00:31, 21.88it/s]

Failed: https://arabit.me/%d7%9e%d7%a6%d7%95%d7%99%d7%9f/
Failed: https://arabit.me/%d7%90%d7%97%d7%95%d7%a9%d7%9c%d7%95%d7%a7%d7%99/
Failed: https://arabit.me/%d7%a9%d7%99%d7%a4%d7%95%d7%a8/


 77%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                   | 1768/2302 [01:19<00:24, 21.63it/s]

Failed: https://arabit.me/%d7%a4%d7%99%d7%aa%d7%aa-%d7%9c%d7%a4%d7%94-2/


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2302/2302 [01:44<00:00, 22.12it/s]


## Download images


In [102]:
# Keep the image field (position 3) of every record whose value starts with "http".
image_urls = []
for w in words:
    iu = w[3]
    if iu.startswith("http"):
        image_urls.append(iu)

In [104]:
from concurrent.futures import ThreadPoolExecutor
import shutil

def save_image(image_url):
    """Download ``image_url`` into ``data/arabit/images/`` unless it is already there.

    The local file name is the last path segment of the URL. Responses with a
    status other than 200 are ignored and nothing is written.

    Args:
        image_url: Absolute URL of the image.
    """
    image_name = image_url.split("/")[-1]
    output_path = f'data/arabit/images/{image_name}'
    if os.path.exists(output_path):
        return
    # The context manager releases the streamed connection; the timeout keeps
    # one stalled download from blocking a worker thread forever.
    with requests.get(image_url, headers=headers, stream=True, timeout=30) as r:
        if r.status_code != 200:
            return
        try:
            with open(output_path, 'wb') as f:
                # Larger chunks than the 128-byte default of iterating the response.
                for chunk in r.iter_content(chunk_size=64 * 1024):
                    f.write(chunk)
        except Exception:
            # Remove a truncated file, otherwise the exists() check above would
            # skip this image on every later run.
            if os.path.exists(output_path):
                os.remove(output_path)
            raise

# Download the images on 10 worker threads. list() drains the lazy map so that
# tqdm advances and any exception raised in a worker surfaces here.
with ThreadPoolExecutor(max_workers=10) as executor:
    list(tqdm(executor.map(save_image, image_urls), total=len(image_urls)))

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2286/2286 [02:36<00:00, 14.61it/s]


## Reduce image size

In [117]:
from PIL import Image
OUTPUT_FOLDER = 'data/arabit/images_smaller/'
def compress_image(image, max_size=100000, scale=0.9,
                   input_folder='data/arabit/images/', output_folder=None, quality=85):
    """Write a copy of ``image`` that is at most ``max_size`` bytes where possible.

    Files that already fit are copied unchanged. Larger ones are repeatedly
    downscaled by ``scale`` (in place, via ``Image.thumbnail``) and re-saved
    until the saved file fits or the image cannot shrink any further.

    Args:
        image: File name (not path) inside ``input_folder``.
        max_size: Target upper bound for the output file size, in bytes.
        scale: Factor applied to width and height on every iteration.
        input_folder: Directory holding the source file.
        output_folder: Destination directory; defaults to the module-level
            ``OUTPUT_FOLDER`` at call time.
        quality: ``quality`` argument passed to ``Image.save``.

    Returns:
        None. Does nothing if the output file already exists.
    """
    if output_folder is None:
        output_folder = OUTPUT_FOLDER
    os.makedirs(output_folder, exist_ok=True)
    output_path = os.path.join(output_folder, image)
    if os.path.exists(output_path):
        return
    image_path = os.path.join(input_folder, image)

    # Cheap check first: small files are copied without being decoded at all.
    if os.path.getsize(image_path) <= max_size:
        shutil.copy(image_path, output_path)
        return

    # Context manager closes the underlying file handle (the original leaked one per image).
    with Image.open(image_path) as img:
        while True:
            width, height = img.size
            img.thumbnail((max(1, int(width * scale)), max(1, int(height * scale))))
            img.save(output_path, optimize=True, quality=quality)
            if os.path.getsize(output_path) <= max_size:
                break
            if img.size == (width, height):
                # No further shrinking possible (e.g. 1x1 or scale >= 1): stop
                # instead of looping forever.
                break

# Shrink every downloaded image into OUTPUT_FOLDER.
for image in tqdm(os.listdir('data/arabit/images/')):
    # Skip entries whose name contains '.DS_Store' or '.ipynb': they are not
    # images and opening them with Pillow would abort the whole pass.
    if '.DS_Store' in image or '.ipynb' in image:
        continue
    compress_image(image)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2254/2254 [06:35<00:00,  5.69it/s]


In [119]:
from PIL import Image
OUTPUT_FOLDER = 'data/arabit/images_xs/'
def compress_image(image, max_size=25000, scale=0.9,
                   input_folder='data/arabit/images_smaller/', output_folder=None, quality=75):
    """Write a copy of ``image`` that is at most ``max_size`` bytes where possible.

    Files that already fit are copied unchanged. Larger ones are repeatedly
    downscaled by ``scale`` (in place, via ``Image.thumbnail``) and re-saved
    until the saved file fits or the image cannot shrink any further.

    Args:
        image: File name (not path) inside ``input_folder``.
        max_size: Target upper bound for the output file size, in bytes.
        scale: Factor applied to width and height on every iteration.
        input_folder: Directory holding the source file.
        output_folder: Destination directory; defaults to the module-level
            ``OUTPUT_FOLDER`` at call time.
        quality: ``quality`` argument passed to ``Image.save``.

    Returns:
        None. Does nothing if the output file already exists.
    """
    if output_folder is None:
        output_folder = OUTPUT_FOLDER
    os.makedirs(output_folder, exist_ok=True)
    output_path = os.path.join(output_folder, image)
    if os.path.exists(output_path):
        return
    image_path = os.path.join(input_folder, image)

    # Cheap check first: small files are copied without being decoded at all.
    if os.path.getsize(image_path) <= max_size:
        shutil.copy(image_path, output_path)
        return

    # Context manager closes the underlying file handle (the original leaked one per image).
    with Image.open(image_path) as img:
        while True:
            width, height = img.size
            img.thumbnail((max(1, int(width * scale)), max(1, int(height * scale))))
            img.save(output_path, optimize=True, quality=quality)
            if os.path.getsize(output_path) <= max_size:
                break
            if img.size == (width, height):
                # No further shrinking possible (e.g. 1x1 or scale >= 1): stop
                # instead of looping forever.
                break

# Second pass: run compress_image over every file in images_smaller/.
for image in tqdm(os.listdir('data/arabit/images_smaller/')):
    # Skip entries whose name contains '.DS_Store' or '.ipynb' (presumably
    # Finder metadata and notebook checkpoints rather than images).
    if '.DS_Store' in image or '.ipynb' in image:
        continue
    # NOTE(review): filename and extension are not used anywhere in this cell.
    filename, extension = os.path.splitext(image)
    compress_image(image)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2255/2255 [02:54<00:00, 12.94it/s]


## Find matching audio for each page

In [141]:
# Generic http(s) URL matcher. Raw strings keep the regex escapes (\., \+, \b)
# from being interpreted as (invalid) Python string escapes.
url_re = re.compile(r"https?:\/\/(?:w+\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&\/=]*)")
# URLs on a soundcloud.* host only. Assigned on its own: the previous chained
# assignment (soundcloud_re = url_re = ...) silently replaced the generic
# matcher above with this one.
soundcloud_re = re.compile(r"https?://(?:www\.)?soundcloud\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&/=]*)")

In [145]:
# For every cached word page, record the SoundCloud URLs found in the raw HTML
# and the title of the first <iframe> whose title matches ".* by Arabit".
iframe_title_re = re.compile(".* by Arabit")  # compiled once instead of on every page
sounds = []
for i, title, link in tqdm(list(wp_df.itertuples()), total=len(wp_df)):
    wp_path = f'data/arabit/word_pages/{i + 1}.html'
    # Explicit UTF-8 so decoding does not depend on the platform default.
    with open(wp_path, 'r', encoding='utf-8') as f:
        wp_html = f.read()
    # Always a list (possibly empty) of matched URL strings.
    sound_link = soundcloud_re.findall(wp_html)
    soup = BeautifulSoup(wp_html, 'html.parser')
    sound_iframe = soup.find("iframe", title=iframe_title_re)
    # Title attribute of the matching iframe, or None when the page has none.
    sound = sound_iframe.get("title") if sound_iframe is not None else None
    sounds.append((i+1, title, sound, sound_link))

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2302/2302 [01:42<00:00, 22.49it/s]


In [146]:
# Number of pages for which a matching iframe title was found.
sum(1 for _, _, iframe_title, _ in sounds if iframe_title is not None)

683

In [149]:
# Number of pages with an iframe title or at least one SoundCloud link.
sum(
    1
    for _, _, iframe_title, links in sounds
    if iframe_title is not None or (links is not None and links != [])
)

773

In [108]:
# Number of entries in the local soundcloud/ directory.
len(os.listdir('data/arabit/soundcloud/'))

1143

In [None]:
# Number of entries in the cached word_pages/ directory.
len(os.listdir('data/arabit/word_pages/'))

## Match iframe titles to SoundCloud file names

In [None]:
#TODO

## Scrape title of soundcloud links and match to soundcloud file names

In [None]:
#TODO

## Look for the SoundCloud (Arabit) transliteration in the body text

In [127]:
# Build a table of the local .mp3 files. File names have the form
# "<first part> (<second part>).mp3"; the parenthesised part is optional.
SOUND_DIR = 'data/arabit/soundcloud/'
sound_files = []
for sound_file in os.listdir(SOUND_DIR):
    if not sound_file.endswith('.mp3'):
        continue
    # splitext removes only the final extension; split('.')[0] truncated any
    # file name that contained a dot before ".mp3".
    name = os.path.splitext(sound_file)[0]
    name_parts = name.split('(')
    arab_translit = name_parts[0].strip()
    hebrew = name_parts[1].strip(') ') if len(name_parts) > 1 else None
    # Modification time of the local file, in seconds since the epoch.
    timestamp = os.path.getmtime(os.path.join(SOUND_DIR, sound_file))
    sound_files.append((sound_file, arab_translit, hebrew, timestamp))

sfdf = pd.DataFrame(sound_files, columns=['sound_file', 'arab_translit', 'hebrew', 'timestamp'])
sfdf['timestamp'] = pd.to_datetime(sfdf['timestamp'], unit='s')
sfdf = sfdf.sort_values('timestamp')
sfdf.tail()

Unnamed: 0,sound_file,arab_translit,hebrew,timestamp
672,אנצ׳ם (הצטרף).mp3,אנצ׳ם,הצטרף,2024-01-13 05:39:50
406,האתף (טלפון).mp3,האתף,טלפון,2024-01-13 05:40:12
91,כרסי רג׳אג׳ (כסא נדנדה).mp3,כרסי רג׳אג׳,כסא נדנדה,2024-01-13 05:40:31
401,עבאס (זעוף פנים).mp3,עבאס,זעוף פנים,2024-01-13 05:40:53
1045,תצ׳ביט בנאת (להתחיל עם בחורות).mp3,תצ׳ביט בנאת,להתחיל עם בחורות,2024-01-13 05:41:10


In [128]:
# Rows with the earliest timestamps (the frame is sorted by timestamp).
sfdf.head()

Unnamed: 0,sound_file,arab_translit,hebrew,timestamp
655,אסמר.mp3,אסמר,,2020-12-14 03:06:04
563,חג'אב.mp3,חג'אב,,2020-12-14 03:07:07
882,מפרוצ'.mp3,מפרוצ',,2020-12-14 03:07:39
663,סג'ין.mp3,סג'ין,,2020-12-14 03:07:59
153,ראא'ס.mp3,ראא'ס,,2020-12-14 03:08:15


In [None]:
# try to 

## Clyp.it sounds

In [None]:
def download(mp3_url, title):
    """Stream the audio at ``mp3_url`` into ``<title>.mp3`` in the working directory.

    Best-effort: any exception is printed rather than raised.

    Args:
        mp3_url: Direct URL of the mp3 file.
        title: Base name of the output file (``.mp3`` is appended).
    """
    filename = "%s.mp3" %(title)
    print("{*} Saving file to %s" %(filename))
    try:
        # Context manager releases the streamed connection; timeout avoids hanging forever.
        with requests.get(url=mp3_url, stream=True, timeout=30) as r:
            with open(filename, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)
    except Exception as e:
        print("{-} Something has gone horribly wrong! Please report on the github issue tracker with the following backtrace: \n%s" %(e))
    else:
        # Only report success when the download actually completed.
        print("{*} Done!")

def get_mp3_url(url):
    """Resolve a ``https://clyp.it/<id>`` page URL to ``(song_title, mp3_url)``.

    Queries ``https://api.clyp.it/<id>`` and reads the ``Title``, ``Mp3Url``
    and ``Status`` fields of the JSON reply.

    Raises:
        Exception: whatever the HTTP request raised, after printing it. The
            original code continued with an unbound response object instead.
        KeyError: if the reply lacks one of the expected fields.
    """
    content_id = url.replace("https://clyp.it/", "")
    try:
        r = requests.get(url="https://api.clyp.it/%s" %(content_id), timeout=30)
    except Exception as e:
        print("{-} Something has gone horribly wrong! Please report on the github issue tracker with the following backtrace: \n%s" %(e))
        raise
    clip_info = json.loads(r.text)
    song_title = clip_info['Title']
    mp3_url = clip_info['Mp3Url']
    if clip_info['Status'] == "DownloadDisabled":
        print("{i} Uploader has disabled downloading. Who fucking cares.")
    print("{*} Got song title: %s" %(song_title))
    print("{*} Got mp3 url: %s" %(mp3_url))
    return song_title, mp3_url

def main(args):
    """Command-line style entry point: ``args`` is ``[program_name, clyp_url]``.

    Exits with a usage message when the argument count is wrong.
    """
    import sys  # local import: sys is not imported elsewhere in the notebook

    if len(args) != 2:
        sys.exit("%s https://clyp.it/lolwat" %(args[0]))
    song_title, mp3_url = get_mp3_url(url=args[1])
    download(mp3_url=mp3_url, title=song_title)


## Collect all to one csv

In [96]:
# One row per successfully parsed word page; column order mirrors the tuples
# appended to `words`.
wdf = pd.DataFrame(words, columns=["page_num", "title", "link", "image", "caption", "caption_part", "arabic_word"])
wdf.head()

Unnamed: 0,page_num,title,link,image,caption,caption_part,arabic_word
0,1,תודה,https://arabit.me/%d7%aa%d7%95%d7%93%d7%94/,https://arabit.me/wp-content/uploads/2024/01/ת...,תודה,תודה,شُكْرًا
1,2,מערה,https://arabit.me/%d7%9e%d7%a2%d7%a8%d7%94/,https://arabit.me/wp-content/uploads/2024/01/מ...,מערה,מערה,مغارة
2,3,כיסא נדנדה,https://arabit.me/%d7%9b%d7%99%d7%a1%d7%90-%d7...,https://arabit.me/wp-content/uploads/2024/01/כ...,כיסא נדנדה,כיסא נדנדה,كُرْسِيّ رَجَّاج
3,4,הצטרף,https://arabit.me/%d7%94%d7%a6%d7%98%d7%a8%d7%a3/,https://arabit.me/wp-content/uploads/2024/01/ה...,הצטרף,הצטרף,اِنْضَمّ
4,5,טלפון,https://arabit.me/%d7%98%d7%9c%d7%a4%d7%95%d7%9f/,https://arabit.me/wp-content/uploads/2024/01/ט...,טלפון,טלפון,هاتف


In [110]:
# Number of distinct page links among the parsed words.
wdf.link.nunique()

2286

In [None]:
# add local image path
# add local audio

In [97]:
# Persist the combined word table; the DataFrame index is not written.
wdf.to_csv("data/arabit/words.csv", index=False)

## Create Anki Deck