In [2]:
from bs4 import BeautifulSoup
import requests
import random
from fpdf import FPDF
from datetime import datetime
import os

In [3]:
def scrape_word(_WORD, timeout=10):
    """
    Gets the page from the Cambridge dictionary

    Args:
        _WORD: the word to look up; it is appended to the dictionary URL as-is
        timeout: seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the notebook forever

    Returns:
        BeautifulSoup object holding the parsed HTML of the entry page

    Raises:
        requests.exceptions.Timeout: the server did not answer within `timeout`
        requests.exceptions.HTTPError: the server answered with a 4xx/5xx status
    """
    base_url = "https://dictionary.cambridge.org/us/dictionary/"
    # Not all languages in the dictionary have the same class tags
    # This notebook is currently set up for Chinese (Simplified)
    lang_path = "english-chinese-simplified/"
    url = base_url + lang_path + _WORD
    print(url)

    # Send a browser-like User-Agent instead of the default python-requests one
    headers = requests.utils.default_headers()
    headers.update(
        {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
        }
    )
    page = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on an error response rather than parsing an error page
    page.raise_for_status()
    return BeautifulSoup(page.content, "html.parser")

def get_file(filename):
    """
    Reads a text file

    The file is decoded as UTF-8 so the result does not depend on the
    platform's default encoding.

    Args:
        filename: path of the text file to read

    Returns:
        list of the lines in the file, in order; each line keeps its line
        ending (the last line has none if the file does not end with a newline)
    """
    with open(filename, 'r', encoding='utf-8') as f:
        return f.readlines()

def diff_lists(li1, li2):
    """
    Returns the items of li1 that do not appear in li2

    Order and duplicates of li1 are kept; neither input list is modified.
    """
    return [item for item in li1 if item not in li2]


In [4]:
# Loads a number (num) of words from the text file dictionary (wordsfile)
wordsfile = 'words.txt'
num = 10
# Strip line endings and skip blank lines, so that a word compares equal in
# both files even when it sits on a last line that has no trailing newline
db = [line.strip() for line in get_file(wordsfile) if line.strip()]
# Loads and eliminates learned words
burned = [line.strip() for line in get_file('burned.txt') if line.strip()]
db = diff_lists(db, burned)
# Picks a random list of words. random.sample raises ValueError when asked for
# more items than the list holds, so never ask for more than what is left.
words = random.sample(db, min(num, len(db)))
# Preview them
for i, w in enumerate(words, start=1):
    print(i,w)
    


1 tall

2 abroad

3 living

4 subject

5 active

6 wander

7 besides

8 merely

9 space

10 state



In [5]:
def scrape_defn(word):
    """
    Scrapes a single definition of the word from the Cambridge dictionary
    Does not include all definitions of a word if it has multiple
    Includes one example only

    The definition block, and the example inside it, are picked at random.

    Args:
        word: the word to look up (passed on to scrape_word)

    Returns:
        dict with the keys "english_defn", "lang_defn", "english_example" and
        "lang_example". A part that cannot be found on the page is left as an
        empty string instead of raising an error.
    """
    scrape = {"english_defn": "", "lang_defn": "", "english_example": "", "lang_example": ""}
    soup = scrape_word(word)

    # Find the definition blocks
    # A block includes the English definition and the Other Language definition
    # Sometimes includes an Example
    # Example may not include a Translation
    defns = soup.find_all("div", class_="def-block ddef_block")
    if not defns:
        # Nothing to choose from on this page
        return scrape
    # May have multiple definitions. Select one.
    defn_choice = random.choice(defns)

    # English definition (should always be "def ddef_d db", regardless of language)
    english_defn = defn_choice.find("div", class_="def ddef_d db")
    if english_defn is not None:
        scrape["english_defn"] = english_defn.text
    # Other language translation (should always have tags "trans dtrans", but other tags may be different)
    lang_defn = defn_choice.find("span", class_="trans dtrans dtrans-se break-cj")
    if lang_defn is not None:
        scrape["lang_defn"] = lang_defn.text

    # Example (should always be "examp dexamp", regardless of language)
    examples = defn_choice.find_all("div", class_="examp dexamp")
    if not examples:
        # This definition has no example; keep the empty example fields
        return scrape
    # A definition block may have multiple examples. Select one.
    example_choice = random.choice(examples)

    # English example
    english_example = example_choice.find("span", class_="eg deg")
    if english_example is not None:
        scrape["english_example"] = english_example.text
    # Other language translation example may not exist
    lang_example = example_choice.find("span", class_="trans dtrans dtrans-se hdb break-cj")
    if lang_example is not None:
        scrape["lang_example"] = lang_example.text

    return scrape

In [7]:
# Look up one dictionary entry for each of the selected words.
# An entry can lack some of the expected elements. The definition is picked at
# random, so running this cell again usually gets past such a failure.
scrapes = []

for word in words:
    term = word.strip()  # drop any surrounding whitespace / line ending
    scrape = scrape_defn(term)
    scrape['word'] = term
    scrapes.append(scrape)

https://dictionary.cambridge.org/us/dictionary/english-chinese-simplified/tall
https://dictionary.cambridge.org/us/dictionary/english-chinese-simplified/abroad
https://dictionary.cambridge.org/us/dictionary/english-chinese-simplified/living
https://dictionary.cambridge.org/us/dictionary/english-chinese-simplified/subject
https://dictionary.cambridge.org/us/dictionary/english-chinese-simplified/active
https://dictionary.cambridge.org/us/dictionary/english-chinese-simplified/wander
https://dictionary.cambridge.org/us/dictionary/english-chinese-simplified/besides
https://dictionary.cambridge.org/us/dictionary/english-chinese-simplified/merely
https://dictionary.cambridge.org/us/dictionary/english-chinese-simplified/space
https://dictionary.cambridge.org/us/dictionary/english-chinese-simplified/state


In [8]:
# Outputs every scrape to PDF
linespacing = 5
words_per_page = 5
lang_font_path = './fonts/fireflysung-1.3.0/fireflysung.ttf'
output_dir = 'pdfs'

pdf = FPDF()
pdf.set_margins(15, 15, 15)
# Register the Unicode font for the translated text once, not on every word
pdf.add_font('fireflysung', '', lang_font_path, uni=True)
pdf.add_page()
on_page = 0
for s in scrapes:
    # Start a new page only when another word follows a full page. Resetting
    # the counter makes the break repeat every words_per_page words, and
    # breaking before the word avoids a trailing blank page.
    if on_page == words_per_page:
        pdf.add_page()
        on_page = 0

    pdf.set_text_color(0)
    pdf.set_font('Times', 'B', 12)
    pdf.cell(0, linespacing, s["word"], 0, 1)

    pdf.set_font('Times', '', 12)
    # multi_cell takes an alignment (not a line-break flag) after the border
    pdf.multi_cell(0, linespacing, s["english_defn"], 0, 'L')

    pdf.set_font('fireflysung', '', 12)
    pdf.cell(0, linespacing, s["lang_defn"], 0, 1)

    pdf.set_font('Times', 'U', 12)
    pdf.cell(63, linespacing, "Translate to English using the word:", 0, 0)

    pdf.set_font('fireflysung', '', 12)
    pdf.multi_cell(0, linespacing, s["lang_example"], 0, 'L')

    # As practice, the English translation is made nearly invisible
    pdf.set_font('Times', '', 10)
    pdf.set_text_color(235)
    pdf.cell(0, 20, s["english_example"], 0 ,1)

    on_page += 1

# The output folder may not exist yet on a fresh checkout
os.makedirs(output_dir, exist_ok=True)
pdf_output_name = datetime.today().strftime("%Y-%m-%d")+".pdf"
pdf.output(os.path.join(output_dir, pdf_output_name), 'F')



''