In [1]:
import lexicon

In [3]:
lexicon.lookup_word('talossa', 'Finnish')

{'url': 'https://en.wiktionary.org/wiki/talossa#Finnish',
 'ipa': None,
 'img': None,
 'definitions': {'Noun': {'inessive singular of talo': None}}}

In [None]:
# links used in dev
'''
https://en.wiktionary.org/wiki/min%C3%A4#Finnish
https://en.wiktionary.org/wiki/koira
https://en.wiktionary.org/wiki/kyll%C3%A4#Finnish
https://en.wiktionary.org/wiki/sinua#Finnish
https://en.wiktionary.org/wiki/hyv%C3%A4#Finnish
https://en.wiktionary.org/api/rest_v1/page/html/p%C3%A4iv%C3%A4%C3%A4
'''

---

In [2]:
import llm
import requests
from bs4 import BeautifulSoup, element

In [5]:

WORD_TYPES = ['Adjective', 'Noun', 'Verb', 'Adverb', 'Article', 'Interjection']

In [246]:
# word = 'huomen'
# word = 'koira'
# word = 'minä'
# word = 'hyvä'
# word = 'talo'
# word = 'päivää'
word = 'päivä'
language = 'Finnish'

html = requests.get(f'https://en.wiktionary.org/api/rest_v1/page/html/{word}').text
soup = BeautifulSoup(html, 'html.parser')

In [247]:
# --- get sections ---

section = soup.select_one(f'section:has(> h2:-soup-contains("{language}"))')

pronunciation = section.select_one('section:has(> h3:-soup-contains("Pronunciation"))')
word_type_data = {
    t: section.select_one(f'section:has(> *:-soup-contains("{t}"))')
    for t in WORD_TYPES
}

# --- handle section titles, all must be handled ---

section_titles = [x.text for x in section.select('h3', recursive=False)]
for title in WORD_TYPES + ['Etymology', 'Etymology 1', 'Etymology 2', 'Pronunciation', 'References', 'Further reading', 'Anagrams', 'See also']:
    if title in section_titles:
        section_titles.remove(title)

if section_titles:
    print('UNHANDLED SECTION TITLES:', section_titles)

# --- parse IPA ---

ipa = None
if pronunciation:
    ipa = next(ipa.text for ipa in pronunciation.select('.IPA') if ipa.text[0]=='/')

# --- parse img ---

img = section.select_one('figure[typeof="mw:File/Thumb"] img')
if img:
    img = 'https:' + img['src']


In [248]:
# --- parse definitions ---

definitions = {}

for word_type, data in word_type_data.items():
    if not data: continue
    sub_defs = definitions[word_type] = {}

    for li in data.select_one('ol').select('li'):
        # print('\n---\n')
        # print(li.text)

        for element in soup.find_all(class_="citation-whole"):
            element.decompose()

        dl = li.select_one('ul')
        if not dl:
            dl = li.select_one('dl')
        if not dl:
            definition = []
            for e in li.children:
                if e.name not in ['ul', 'ol', 'dl', 'style']:
                    definition += e.text
            definition = ''.join(definition).strip()
            if definition:
                sub_defs[definition] = None
        else:
            definition = ''
            for e in li.children:
                if e.name == 'style': continue
                # print(e.name, e, e.text)
                if e.name == 'dl' or e.name == 'ul': break
                definition += e.text

            examples = []
            for ex in dl.children:
                ex = ex.text.strip()
                if not ex: continue
                examples.append(ex)

            # print('(')
            # print(definition)
            # print()
            # print(examples)
            # print(')')

            if definition:
                sub_defs[definition] = examples

import json
print(
    json.dumps(
        definitions,
        indent=4,
        ensure_ascii=False
    )
)
definitions

{
    "Noun": {
        "day (period between sunrise and sunset)\n": [
            "Siitä on nyt vuoden päivät.It was about one year ago."
        ],
        "day (from midnight to midnight)": null,
        "day (period of 24 hours)\n": [
            "Synonym: vuorokausi"
        ],
        "day (the part of a day period which one spends at one’s job, school, etc.)": null,
        "(poetic, archaic) the sun\n": [
            "Synonym: aurinko",
            "Päivä paistaa. ― The sun is shining.",
            "Päivä painuu mailleen. ― The sun sets."
        ],
        "event, occasion, symposium, forum, fair, etc. (in the singular only if the event lasts one day, otherwise in the plural)\n": [
            "vanhan kirjallisuuden päivät ― antiquarian book fair",
            "tieteen päivät ― (biennial) science forum"
        ]
    }
}


{'Noun': {'day (period between sunrise and sunset)\n': ['Siitä on nyt vuoden päivät.It was about one year ago.'],
  'day (from midnight to midnight)': None,
  'day (period of 24 hours)\n': ['Synonym: vuorokausi'],
  'day (the part of a day period which one spends at one’s job, school, etc.)': None,
  '(poetic, archaic) the sun\n': ['Synonym: aurinko',
   'Päivä paistaa. ― The sun is shining.',
   'Päivä painuu mailleen. ― The sun sets.'],
  'event, occasion, symposium, forum, fair, etc. (in the singular only if the event lasts one day, otherwise in the plural)\n': ['vanhan kirjallisuuden päivät ― antiquarian book fair',
   'tieteen päivät ― (biennial) science forum']}}