# Wikidata lexicon

In [1]:
from tools.datasets import *

# Lexeme ID to inspect; the notes below describe L536 as "book" used as a noun.
book = "L536"

# Fetch the lexeme through the "wikidata" provider. fetch_dataset comes from the
# star import of tools.datasets; the recorded output suggests it skips the
# download when the JSON is already on disk -- TODO confirm.
# NOTE(review): despite the `_df` suffix, the result is indexed like a nested
# mapping (["entities"][...]) further down, not like a DataFrame.
book_df = fetch_dataset(book, provider="wikidata")

Dataset wikidata/L536.json already downloaded. Skipping...


## Wikidata lexemes breakdown structure

(cfr. Wikidata.ipynb). The online documentation on lexemes is pretty limited.

A lexeme is a unit of lexical meaning. Morphologically speaking, it can only belong to one grammatical category. Homograph lexemes (P5402) are stored as different lexemes. In this case, L536 refers to "book" as a noun.

- `lemmas`: array of lemmas of a lexeme.
    - `#lang` (e.g. `en`): contains the basic lemma in one or more languages (lang->value). In general, the word could be valid in several languages.
    - `lexicalCategory`: an entity describing the grammatical category (verb, noun...)
    - `language`: a lexeme corresponds to exactly one language. As above, the value is just an entity.
- `claims`: Structured like normal wikidata claims, contains grammatical features of the main lexeme and other relationships not related to senses, glosses or morphological forms. For example, `P5185` is the grammatical gender, `P5402` is a homograph lexeme.
- `forms`: an array of morphological forms. Each form is called L{ENTITY_NAME}-F{NO} with NO starting from 1.
    - `id`
    - `representations`: like for `#lang` above, but this time it represents a morphological variation.
    - `grammaticalFeatures`: an array of grammatical features
- `senses`: array of senses (either a translation or a definition, depending on the start and end language)
    - `claims`: the structure is similar to a normal claim in wikidata, but the number of predicates is circumscribed to:
        - `P5972: translation`: associate with other lexeme forms to provide translation. The values follow the form `wd:LX-SN` where `LX` is a lexeme and `SN` is the sense number starting from S1. Human-readable annotations can be found by querying their label (`rdfs:label` or `skos:definition` on the dataset).
        - `P5137: item for this sense`: the corresponding Wikidata Entity
        - a few others (`P18: image`, ...)
    - `glosses`: categorises the noun. Mainly used to disambiguate word senses. Like for `#lang` above.

In [2]:
import pandas as pd

# Flatten the lexeme's JSON record into a one-row DataFrame. The record is
# looked up with `book` rather than a repeated literal, so this cell keeps
# following the lexeme ID chosen in the first cell.
book_pd = pd.json_normalize(book_df["entities"][book])

In [3]:
# List the flattened column names produced by json_normalize for this lexeme.
book_pd.columns

Index(['pageid', 'ns', 'title', 'lastrevid', 'modified', 'type', 'id',
       'lexicalCategory', 'language', 'forms', 'senses', 'lemmas.en.language',
       'lemmas.en.value', 'claims.P5402'],
      dtype='object')

In [4]:
def senses(entity_df):
    """Map every sense ID of a lexeme to the list of its glosses.

    `entity_df` is expected to expose a "senses" entry whose element 0 is the
    list of sense records (as in the one-row frame built with
    ``pd.json_normalize``). Each sense record must provide an "id" and a
    "glosses" mapping; the glosses are returned as a list of that mapping's
    values.

    Returns an empty dict when `entity_df` has no "senses" entry.
    """
    if "senses" not in entity_df:
        return {}

    return {
        entry["id"]: list(entry["glosses"].values())
        for entry in entity_df["senses"][0]
    }
    

# Show the glosses attached to each sense of the flattened lexeme.
senses(book_pd)

{'L536-S1': [{'language': 'en', 'value': 'document'},
  {'language': 'ru', 'value': 'документ'},
  {'language': 'es', 'value': 'documento'},
  {'language': 'it', 'value': 'documento'},
  {'language': 'pt', 'value': 'documento'},
  {'language': 'pt-br', 'value': 'documento'},
  {'language': 'nl',
   'value': 'een ingebonden bundel bedrukte of beschreven vellen papier'},
  {'language': 'zh', 'value': '書'},
  {'language': 'zh-hant', 'value': '書'},
  {'language': 'zh-tw', 'value': '書'},
  {'language': 'nan', 'value': 'tsu'},
  {'language': 'de', 'value': 'Buch'},
  {'language': 'tg', 'value': 'ҳуҷҷат'},
  {'language': 'fr', 'value': 'document'}]}

In [5]:
# senses() looks for a top-level "senses" entry, but the fetched payload nests
# the lexeme record under ["entities"][<id>] (see how `book_pd` is built).
# Passing the raw payload therefore always yields {}, whatever the lexeme
# contains, so the record is extracted and flattened first.
portrait_df = fetch_dataset("L12", provider="wikidata")
portrait_pd = pd.json_normalize(portrait_df["entities"]["L12"])
senses(portrait_pd)

Dataset wikidata/L12.json already downloaded. Skipping...


{}

## Testing data

We are collecting the top 1000 most used words according to Wiktionary. The counts are based on the absolute word frequency extracted from TV series and movie scripts in the public domain up to 2006. More details [here](https://en.wiktionary.org/wiki/Wiktionary:Frequency_lists/TV/2006/1-1000). From here on, we'll refer to them as WDTV.

Similarly, we compare with an extraction from Project Gutenberg (synced 2006 - is there anything more modern?) (WDPG) and hunspell-en-gb -ise (HUN-GB).

We also test against WordNet and the full Wiktionary.

In [6]:
# scrape wdtv

from bs4 import BeautifulSoup
from requests import get
from os.path import join


def scrape_wiktionary(url, timeout=30):
    """Scrape the words listed in the first table of a Wiktionary frequency page.

    The first table row is treated as a header and skipped. For every other
    row the word is read from the second ``<td>`` cell: the text of the first
    link in that cell, or the cell's own stripped text when it has no link.
    Rows with fewer than two ``<td>`` cells are ignored.

    Args:
        url: address of the frequency-list page.
        timeout: seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        The list of words, in table order.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        IndexError: if the page contains no table.
    """
    response = get(url, timeout=timeout)
    # Fail loudly on an error response instead of trying to parse an error page.
    response.raise_for_status()

    parsed = BeautifulSoup(response.content, "html.parser")
    table = parsed.find_all("table")[0]

    words = []
    for row in table.find_all("tr")[1:]:
        cells = row.find_all("td")
        if len(cells) < 2:
            # Not a data row (e.g. a row made only of header cells).
            continue
        link = cells[1].find("a")
        if link is not None:
            words.append(link.text)
        else:
            # Unlinked entry: keep the word rather than crashing on None.
            words.append(cells[1].get_text(strip=True))

    return words


    
# WDTV: the TV/movie-script frequency list (page "1-1000").
wdtv_list = scrape_wiktionary("https://en.wiktionary.org/wiki/Wiktionary:Frequency_lists/TV/2006/1-1000")
# WDPG: the Project Gutenberg frequency list (page "1-10000").
wdpg_list = scrape_wiktionary("https://en.wiktionary.org/wiki/Wiktionary:Frequency_lists/PG/2006/04/1-10000")

In [7]:
# HUN-GB: the hunspell en_GB (-ise) word list. wrap_open comes from the star
# import of tools.datasets -- presumably it resolves the path inside the data
# directory; TODO confirm.
with wrap_open("wordlists/en_GB-ise.txt") as fp:
    # The first and the last line are dropped.
    # NOTE(review): presumably a header/count line and a trailing non-word
    # entry -- confirm the last line is not a real word.
    # NOTE(review): if fp is a regular text file object, readlines() keeps the
    # trailing newline on every entry.
    hun_en_gb = fp.readlines()[1:-1]

In [10]:
# Naive smoke test: lemmatise a random handful of WDTV words with spaCy.
# (The original author flags this approach as knowingly crude.)

import random
import spacy
# No seed is set in the visible cells, so the selection changes on every run.
wdtv_sample = random.sample(wdtv_list, 10)
# Load the small English model with the listed pipeline components disabled.
nlp = spacy.load('en_core_web_sm',
                     disable=["tagger", "parser", "ner", "entity_linker"])

print(wdtv_sample)
# The words are joined into one space-separated string and processed as a
# single document, so one word may yield several tokens (in the recorded
# output "isn't" becomes 'be', 'not').
sample_lemmas = [tok.lemma_ for tok in nlp(" ".join(wdtv_sample))]
print(sample_lemmas)

['box', 'pick', 'probably', 'write', 'face', 'twenty', 'relationship', 'my', "isn't", 'buddy']
['box', 'pick', 'probably', 'write', 'face', 'twenty', 'relationship', 'my', 'be', 'not', 'buddy']


In [11]:
import os

# TODO: replace with pyspark pipeline to ditch sparql
def all_lexemes():
    """Return the flattened table of English lexemes, lemmas and forms.

    The table is cached as a pickle at ``wikidata/flattened_lexemes.pkl``
    (resolved through ``get_filename_path``). When the cache file exists it is
    loaded and returned; otherwise the table is fetched with a SPARQL query
    through ``wikidata_sparql.run_query``, written to the cache, and returned.

    A status line is printed in both cases.
    """
    lexemes_file = get_filename_path("wikidata/flattened_lexemes.pkl")

    # Cache hit: reuse the previously dumped table.
    if os.path.isfile(lexemes_file):
        print(f"File {lexemes_file} dumped")
        return pd.read_pickle(lexemes_file)

    # Cache miss: query every lexeme whose language is wd:Q1860, together
    # with its lemma, its forms and each form's representation.
    print(f"File {lexemes_file} not found, dumping from SPARQL...")
    sparql_text = """
        SELECT ?lexeme ?lemma ?form ?word
        WHERE
        {
          ?lexeme dct:language wd:Q1860;
                  wikibase:lemma ?lemma;
                  ontolex:lexicalForm ?form.
          ?form ontolex:representation ?word .
        }
        """
    lexemes = wikidata_sparql.run_query(sparql_text)
    lexemes.to_pickle(lexemes_file)
    return lexemes

# Load the lexeme table (from the pickle cache when present, else via SPARQL).
samples = all_lexemes()

File data/wikidata/flattened_lexemes.pkl not found, dumping from SPARQL...


In [12]:
# Preview the first rows: one row per (lexeme, form, representation) result.
samples.head()

Unnamed: 0,lexeme.type,lexeme.value,lemma.xml:lang,lemma.type,lemma.value,form.type,form.value,word.xml:lang,word.type,word.value
0,uri,http://www.wikidata.org/entity/L16917,en,literal,hide,uri,http://www.wikidata.org/entity/L16917-F1,en,literal,hide
1,uri,http://www.wikidata.org/entity/L16917,en,literal,hide,uri,http://www.wikidata.org/entity/L16917-F2,en,literal,hides
2,uri,http://www.wikidata.org/entity/L13310,en,literal,compromise,uri,http://www.wikidata.org/entity/L13310-F1,en,literal,compromise
3,uri,http://www.wikidata.org/entity/L13310,en,literal,compromise,uri,http://www.wikidata.org/entity/L13310-F2,en,literal,compromises
4,uri,http://www.wikidata.org/entity/L133,en,literal,evaluate,uri,http://www.wikidata.org/entity/L133-F1,en,literal,evaluate


In [13]:
# Column names of the flattened SPARQL result; the coverage helpers below
# read "lemma.value" and "word.value".
samples.columns

Index(['lexeme.type', 'lexeme.value', 'lemma.xml:lang', 'lemma.type',
       'lemma.value', 'form.type', 'form.value', 'word.xml:lang', 'word.type',
       'word.value'],
      dtype='object')

In [15]:
# for each sampled lemma, is there a fetched wikidata lemma covering it?
def is_covered_lemma(dataframe, lemma):
    """Tell whether `lemma` equals at least one entry of the "lemma.value" column.

    The comparison is an exact, case-sensitive equality test against every
    row of `dataframe`.
    """
    lemma_column = dataframe["lemma.value"]
    return lemma_column.eq(lemma).any()

# for each sampled lemma, is there a fetched wikidata form covering it?
def is_covered_form(dataframe, form):
    """Tell whether `form` equals at least one entry of the "word.value" column.

    The comparison is an exact, case-sensitive equality test against every
    row of `dataframe`.
    """
    form_column = dataframe["word.value"]
    return form_column.eq(form).any()

In [16]:
def strip_proper_nouns(tok):
    """Filter predicate: True for tokens to keep, False for likely proper nouns.

    A token is rejected when its POS tag is "PROPN" or when its text starts
    with an upper-case character. The text is only inspected when the tag is
    not "PROPN".
    """
    looks_proper = tok.pos_ == "PROPN" or tok.text[0].isupper()
    return not looks_proper

def test_dataset(wordlist, sample_size=100,
                    spacy_language_model="en_core_web_sm"):
    """Measure how much of `wordlist` is covered by the extracted lexemes.

    Args:
        wordlist: list of words (strings) to test.
        sample_size: number of words to draw at random from `wordlist`. It is
            capped at ``len(wordlist)``; a falsy value (0, None) tests the
            whole list.
        spacy_language_model: name of the spaCy model used to lemmatise and
            POS-tag each word.

    Returns:
        A 4-tuple ``(matched_lemmas, lemmas, matched_forms, words)`` where
        `words` is the number of sampled words left after filtering out
        proper nouns, `lemmas` the number of distinct lemmas of those words,
        and the two `matched_*` counts say how many of them appear in the
        "lemma.value" / "word.value" columns of ``all_lexemes()``.
    """
    # Known limitation: lemmatising a single word out of context is ambiguous
    # for homographs, and POS tagging is unreliable too, although it is mainly
    # used here to filter out proper nouns. For some languages (e.g. Latin)
    # this is a serious issue.
    nlp = spacy.load(spacy_language_model,
                     disable=["parser", "ner", "entity_linker"])

    if sample_size:
        # random.sample raises ValueError when asked for more items than the
        # list holds, so cap the request at the list length.
        wordlist_sample = random.sample(wordlist,
                                        min(sample_size, len(wordlist)))
    else:
        wordlist_sample = wordlist

    # Only the very first token of each word is considered.
    tokens = []
    for word in wordlist_sample:
        doc = nlp(word)
        if len(doc) == 0:
            # An empty entry produces no token at all; skip it rather than
            # failing on doc[0].
            continue
        first_token = doc[0]
        if strip_proper_nouns(first_token):
            tokens.append(first_token)
    wordlist_filtered = [tok.text for tok in tokens]

    # Lemmas are de-duplicated; the word forms are deliberately kept as-is.
    lemmas_sample = list({tok.lemma_ for tok in tokens})
    extracted_lexemes = all_lexemes()

    # 1. How many distinct lemmas have a matching Wikidata lemma?
    matched_lemmas = [lemma for lemma in lemmas_sample
                      if is_covered_lemma(extracted_lexemes, lemma)]

    # 2. How many of the original words match a Wikidata form?
    matched_forms = [form for form in wordlist_filtered
                     if is_covered_form(extracted_lexemes, form)]

    return (len(matched_lemmas), len(lemmas_sample),
            len(matched_forms), len(wordlist_filtered))
    

In [17]:
# WDTV coverage over the whole list (sample_size=0 disables sampling).
# Result: (matched lemmas, distinct lemmas, matched forms, filtered words).
test_dataset(wdtv_list, sample_size=0)

File data/wikidata/flattened_lexemes.pkl dumped


(586, 618, 817, 833)

In [18]:
# WDPG coverage on a random sample of 500 words.
# Result: (matched lemmas, distinct lemmas, matched forms, filtered words).
test_dataset(wdpg_list, sample_size=500)

File data/wikidata/flattened_lexemes.pkl dumped


(355, 370, 422, 427)

In [19]:
# HUN-GB coverage on a random sample of 1000 entries.
# Result: (matched lemmas, distinct lemmas, matched forms, filtered words).
test_dataset(hun_en_gb, sample_size=1000)

File data/wikidata/flattened_lexemes.pkl dumped


(274, 520, 281, 526)