In [1]:
# A test with the Wiktionary RESTful API.

from tools.datasets import *

In [47]:
import requests

def fetch_word(word, timeout=10.0):
    """Fetch the definitions of ``word`` from the English Wiktionary REST API.

    Parameters
    ----------
    word : str
        Page title to look up, e.g. ``"dog"`` or ``"ain't"``.
    timeout : float, optional
        Seconds to wait for the server before giving up. ``requests`` applies
        no timeout by default, which can block the notebook indefinitely.

    Returns
    -------
    dict
        The decoded JSON body of the response.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (``raise_for_status``).
    requests.Timeout
        If the server does not answer within ``timeout`` seconds.
    """
    from urllib.parse import quote

    wiktionary_url = "https://en.wiktionary.org/api/rest_v1"
    # Percent-encode the title, "/" included, so that it always remains a
    # single path segment and characters such as "?" or "#" are not
    # interpreted as URL syntax.
    encoded_word = quote(word, safe="")
    r = requests.get(
        wiktionary_url + "/page/definition/" + encoded_word,
        timeout=timeout,
    )
    r.raise_for_status()
    return r.json()

# Fetch the entry for "dog" once; the following cells reuse `response`.
response = fetch_word("dog")

In [48]:
# List the top-level keys of the payload (language codes such as "en", plus "other").
response.keys()

dict_keys(['en', 'af', 'da', 'nl', 'other', 'nv', 'pt', 'sv', 'vo'])

In [4]:
import pprint
# Shared pretty-printer; later cells reuse `pp` to dump nested API payloads.
pp = pprint.PrettyPrinter(indent=1)

# Inspect the entries stored under the "af" key of the "dog" payload.
pp.pprint(response["af"])

[{'definitions': [{'definition': '<span class="form-of-definition '
                                 'use-with-mention" about="#mwt283" '
                                 'typeof="mw:Transclusion">Alternative form of '
                                 '<span class="form-of-definition-link"><i '
                                 'class="Latn mention" lang="af"><a '
                                 'rel="mw:WikiLink" href="/wiki/dag#Afrikaans" '
                                 'title="dag">dag</a></i></span></span> (<span '
                                 'class="form-of-definition use-with-mention" '
                                 'about="#mwt284" '
                                 'typeof="mw:Transclusion">preterite of <span '
                                 'class="form-of-definition-link"><i '
                                 'class="Latn mention" lang="af"><a '
                                 'rel="mw:WikiLink" '
                                 'href="/wiki/dink#Afrikaans" '
 

In [7]:
# Inspect the entries stored under the "en" key of the "dog" payload.
pp.pprint(response["en"])

[{'definitions': [{'definition': 'A <a rel="mw:WikiLink" href="/wiki/mammal" '
                                 'title="mammal">mammal</a>, <i><a '
                                 'rel="mw:WikiLink" '
                                 'href="/wiki/Canis_lupus_familiaris" '
                                 'title="Canis lupus familiaris">Canis lupus '
                                 'familiaris</a></i>, that has been <a '
                                 'rel="mw:WikiLink" href="/wiki/domesticated" '
                                 'title="domesticated">domesticated</a> for '
                                 'thousands of <a rel="mw:WikiLink" '
                                 'href="/wiki/years" title="years">years</a>, '
                                 'of highly variable appearance due to <a '
                                 'rel="mw:WikiLink" href="/wiki/human" '
                                 'title="human">human</a> breeding.',
                   'examples': ['The <b>dog</b>

In [40]:
import pandas as pd

def replace_nan(df, column):
    """Replace NaNs in a column with empty lists, modifying ``df`` in place.

    Required because pd.Series.fillna() does not accept lists.
    See https://stackoverflow.com/a/61944174

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify.
    column : str
        Name of the column whose missing values are replaced.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If ``column`` is not a column of ``df``.
    """
    is_nan = df[column].isna()
    n_missing = int(is_nan.sum())
    if n_missing == 0:
        # Nothing to fill; leave the frame untouched.
        return
    # Build one *distinct* list per missing cell. ``[[]] * n`` would store the
    # same list object in every cell, so appending to one row's list would
    # silently change all the others.
    to_replace = pd.Series([[] for _ in range(n_missing)], dtype=object).values
    df.loc[is_nan, column] = to_replace
    
def definition_to_df(response, language="en"):
    """Flatten the definitions of one language into a DataFrame.

    The entries of ``response[language]`` are normalised into rows, their
    ``definitions`` lists are exploded to one row per definition, each
    definition dict is expanded into columns, and the result is joined with
    the ``partOfSpeech`` of the entry it came from.

    Parameters
    ----------
    response : dict
        Payload as returned by ``fetch_word``.
    language : str, optional
        Key of ``response`` to process. Defaults to ``"en"``.

    Returns
    -------
    pandas.DataFrame
        One row per definition with ``partOfSpeech`` followed by the keys of
        the definition dicts (``parsedExamples`` removed). ``examples`` is
        always present and holds a list, empty when the definition has no
        examples. The index is not reset: every row keeps the index of the
        entry it was exploded from.

    Raises
    ------
    KeyError
        If ``language`` is not a key of ``response``.
    """
    response_pd = pd.json_normalize(response[language]).explode("definitions")
    # Not every definition carries "parsedExamples" (or "examples"); when no
    # definition of the word does, the column does not exist at all, so the
    # drop must tolerate its absence.
    exploded_columns = (
        response_pd["definitions"]
        .apply(pd.Series)
        .drop(columns="parsedExamples", errors="ignore")
    )

    if "examples" not in exploded_columns.columns:
        # Create the column as all-missing so it is filled with empty lists
        # below and callers always receive the same schema.
        exploded_columns["examples"] = None
    replace_nan(exploded_columns, "examples")

    return pd.concat([response_pd['partOfSpeech'], exploded_columns], axis=1)

# Tabulate the definitions of "dog" for the default language ("en").
definition_to_df(response)

Unnamed: 0,partOfSpeech,definition,examples
0,Noun,"A <a rel=""mw:WikiLink"" href=""/wiki/mammal"" tit...",[The <b>dog</b> barked all night long.]
0,Noun,"Any member of the Family <a rel=""mw:WikiLink"" ...",[]
0,Noun,"A male dog, <a rel=""mw:WikiLink"" href=""/wiki/w...",[]
0,Noun,"A <a rel=""mw:WikiLink"" href=""/wiki/dull"" title...",[She’s a real <b>dog</b>.]
0,Noun,"A <a rel=""mw:WikiLink"" href=""/wiki/man"" title=...","[You lucky <b>dog</b>!, He's a silly <b>dog</b>.]"
0,Noun,"A <a rel=""mw:WikiLink"" href=""/wiki/coward"" tit...","[Come back and fight, you <b>dogs</b>!]"
0,Noun,Someone who is morally reprehensible.,[You dirty <b>dog</b>.]
0,Noun,"A sexually aggressive man (cf. <i><a rel=""mw:W...",[]
0,Noun,"Any of various mechanical devices for holding,...",[]
0,Noun,"<span class=""maintenance-line"" style=""color: #...",[]


In [49]:
from bs4 import BeautifulSoup

def parse_definition(word: str, language: str = "en"):
    """Fetch a Wiktionary entry and return its definitions as plain text.

    Parameters
    ----------
    word : str
        Page title to look up.
    language : str, optional
        Key of the payload whose definitions are parsed. Defaults to ``"en"``.

    Returns
    -------
    pandas.Series
        The ``definition`` column produced by ``definition_to_df`` with the
        HTML markup removed, one element per definition.
    """
    fetched_word = fetch_word(word)
    result_df = definition_to_df(fetched_word, language)

    # Name the parser explicitly: without it BeautifulSoup emits a warning and
    # uses whichever parser happens to be installed, so results could differ
    # between environments.
    definition_stripped = result_df["definition"].apply(
        lambda definition: BeautifulSoup(definition, "html.parser").get_text()
    )
    return definition_stripped

In [50]:
# Plain-text definitions of "dog", with the HTML markup stripped.
parse_definition("dog")

0    A mammal, Canis lupus familiaris, that has bee...
0    Any member of the Family Canidae, including do...
0    A male dog, wolf or fox, as opposed to a bitch...
0                  A dull, unattractive girl or woman.
0                   A man (derived from definition 2).
0                                            A coward.
0                Someone who is morally reprehensible.
0               A sexually aggressive man (cf. horny).
0    Any of various mechanical devices for holding,...
0    (Can we clean up this sense?) A click or palle...
0             A metal support for logs in a fireplace.
0                       The eighteenth Lenormand card.
0                                           A hot dog.
0                                            Underdog.
0                                                Foot.
0         (from "dog and bone") Phone or mobile phone.
0    One of the cones used to divide up a racetrack...
0                          shortened form of dog meat.
0    A flo

In [52]:
# Dump the raw payload for a word that contains an apostrophe.
pp.pprint(fetch_word("ain't"))

{'en': [{'definitions': [{'definition': '<a rel="mw:WikiLink" href="/wiki/am" '
                                        'title="am">Am</a> <a '
                                        'rel="mw:WikiLink" href="/wiki/not" '
                                        'title="not">not</a>.'},
                         {'definition': '<a rel="mw:WikiLink" href="/wiki/are" '
                                        'title="are">Are</a> <a '
                                        'rel="mw:WikiLink" href="/wiki/not" '
                                        'title="not">not</a>, <a '
                                        'rel="mw:WikiLink" '
                                        'href="/wiki/aren\'t" '
                                        'title="aren\'t">aren’t</a>; <a '
                                        'rel="mw:WikiLink" href="/wiki/is" '
                                        'title="is">is</a> not, <a '
                                        'rel="mw:WikiLink" href="/wiki/isn\'

In [53]:
# Dump the raw payload for an inflected form (a plural).
pp.pprint(fetch_word("families"))

{'ast': [{'definitions': [{'definition': '<span class="form-of-definition '
                                         'use-with-mention" about="#mwt7" '
                                         'typeof="mw:Transclusion"><a '
                                         'rel="mw:WikiLink" '
                                         'href="/wiki/Appendix:Glossary#plural_number" '
                                         'title="Appendix:Glossary">plural</a> '
                                         'of <span '
                                         'class="form-of-definition-link"><i '
                                         'class="Latn mention" lang="ast"><a '
                                         'rel="mw:WikiLink" '
                                         'href="/wiki/familia#Asturian" '
                                         'title="familia">familia</a></i></span></span>'}],
          'language': 'Asturian',
          'partOfSpeech': 'Noun'}],
 'da': [{'definitions': [{'definiti

## On Appendix:Glossary

It would be good to actually "start" building a knowledge graph (KG), deriving some grammatical categories whenever possible. At the moment, Wikidata offers a very extensive glossary that could be used for Entity Linking with Wikidata Lexemes.

The Wiktionary glossary includes a bunch of grammatical concepts, including parts of speech and forms. In Wikidata, they are distinct (the latter being (indirect) instances of Q980357, "grammatical category").
Indeed, let us start looking up wikidata grammatical categories.