# Wiktionary RDF dump

In [1]:
import os
# Location of the parquet-index jar, resolved against the current working directory,
# so the notebook has to be started from the directory that contains `data/`.
parquet_pos = os.path.join(os.getcwd(), "data/wiktionary/parquet-index_2.11-0.4.1-SNAPSHOT.jar")

In [2]:
from pyspark.sql import SparkSession
# Create (or reuse) the Spark session with the parquet-index jar registered
# through the "spark.jars" setting.
spark = (
    SparkSession.builder
    .config("spark.jars", parquet_pos)
    .getOrCreate()
)

# The same jar also carries Python modules; ship it as a Python file so that
# they can be imported from this notebook.
spark.sparkContext.addPyFile(parquet_pos)

In [3]:
from lightcopy.index import QueryContext

In [4]:
context = QueryContext(spark)

In [5]:
from tools.providers import WiktionaryProvider

provider = WiktionaryProvider()

# Parquet file to index. The name suggests a 1000-entry sample — TODO confirm
# against WiktionaryProvider.get_filename_path.
sample_1000 = os.path.join("data", provider.get_filename_path("sample_1000", "parquet"))
# Build the index on the "word" column. NOTE(review): mode('overwrite') presumably
# replaces an index left by a previous run — confirm in the parquet-index docs.
context.index.create.mode('overwrite').indexBy("word").parquet(sample_1000)

In [6]:
context.index.exists.parquet(sample_1000)

True

In [7]:
wiktionary_df_sample = context.index.parquet(sample_1000)

In [12]:
from tools.extractors import extract_form, extract_df

In [24]:
# Import all the available namespaces in a handy dict
from rdflib import Namespace, namespace

# Prefixes that rdflib does not ship, wrapped directly as Namespace objects.
namespaces = {
    prefix: Namespace(uri)
    for prefix, uri in {
        "dct": "http://purl.org/dc/terms/",
        "ontolex": "http://www.w3.org/ns/lemon/ontolex#",
        "wikibase": "http://wikiba.se/ontology#",
        "wd": "http://www.wikidata.org/entity/",
        "wdt": "http://www.wikidata.org/prop/direct/",
        "kgl": "http://grill-lab.org/kg/entity/",
        "kglprop": "http://grill-lab.org/kg/property/"
    }.items()
}

# Add every namespace object exported by rdflib.namespace, keyed by its
# lower-cased attribute name. On a key clash the rdflib entry wins.
for ns in dir(namespace):
    imported = getattr(namespace, ns)
    if isinstance(imported, (Namespace, namespace.ClosedNamespace)):
        namespaces[ns.lower()] = imported

In [25]:
namespaces

{'dct': Namespace('http://purl.org/dc/terms/'),
 'ontolex': Namespace('http://www.w3.org/ns/lemon/ontolex#'),
 'wikibase': Namespace('http://wikiba.se/ontology#'),
 'wd': Namespace('http://www.wikidata.org/entity/'),
 'wdt': Namespace('http://www.wikidata.org/prop/direct/'),
 'kgl': Namespace('http://grill-lab.org/kg/entity/'),
 'kglprop': Namespace('http://grill-lab.org/kg/property/'),
 'csvw': Namespace('http://www.w3.org/ns/csvw#'),
 'dc': Namespace('http://purl.org/dc/elements/1.1/'),
 'dcat': Namespace('http://www.w3.org/ns/dcat#'),
 'dcterms': Namespace('http://purl.org/dc/terms/'),
 'doap': Namespace('http://usefulinc.com/ns/doap#'),
 'foaf': rdf.namespace.ClosedNamespace('http://xmlns.com/foaf/0.1/'),
 'odrl2': Namespace('http://www.w3.org/ns/odrl/2/'),
 'org': Namespace('http://www.w3.org/ns/org#'),
 'owl': Namespace('http://www.w3.org/2002/07/owl#'),
 'prof': Namespace('http://www.w3.org/ns/dx/prof/'),
 'prov': rdf.namespace.ClosedNamespace('http://www.w3.org/ns/prov#'),
 'rd

In [26]:
from rdflib import Graph
g = Graph()

In [27]:
import pandas as pd
from tools.dumps import wrap_open

# Grammatical categories exported from Wikidata. populate_categories reads the
# 'entity.value' and 'entityLabel.value' columns of this frame.
with wrap_open("wikidata/grammatical_categories.json") as fp:
    wikidata_grammatical_categories = pd.read_json(fp)

# Presumably the part-of-speech categories from Wikidata (inferred from the file
# name); the frame is loaded here but not referenced again in the visible cells.
with wrap_open("wikidata/pos_categories.json") as fp:
    pos_categories = pd.read_json(fp)

In [29]:
# Person/number categories; add_verb_forms tags verb forms with these labels

wikidata_grammatical_categories[wikidata_grammatical_categories['entityLabel.value'].str.contains("person")]

Unnamed: 0,entity.value,entityLabel.value
123,http://www.wikidata.org/entity/Q27918551,masculine personal
128,http://www.wikidata.org/entity/Q52943193,masculine animate non-personal
131,http://www.wikidata.org/entity/Q54152717,not masculine personal
345,http://www.wikidata.org/entity/Q51929218,first-person singular
346,http://www.wikidata.org/entity/Q51929290,first-person plural
347,http://www.wikidata.org/entity/Q51929369,second-person singular
348,http://www.wikidata.org/entity/Q51929403,second-person plural
349,http://www.wikidata.org/entity/Q51929447,third-person singular
350,http://www.wikidata.org/entity/Q51929517,third-person plural
351,http://www.wikidata.org/entity/Q52431955,third-person masculine singular


In [30]:
wikidata_grammatical_categories[wikidata_grammatical_categories['entityLabel.value'].str.contains("present")]

# Interesting for verb forms: "present tense" and "present participle"

Unnamed: 0,entity.value,entityLabel.value
136,http://www.wikidata.org/entity/Q192613,present tense
144,http://www.wikidata.org/entity/Q1240211,present perfect
152,http://www.wikidata.org/entity/Q3502553,present subjunctive
153,http://www.wikidata.org/entity/Q3686414,conditional present
154,http://www.wikidata.org/entity/Q3910936,simple present
156,http://www.wikidata.org/entity/Q7240943,present continuous
157,http://www.wikidata.org/entity/Q9062494,present perfect in English
158,http://www.wikidata.org/entity/Q10345583,present participle
174,http://www.wikidata.org/entity/Q52434162,present imperative
175,http://www.wikidata.org/entity/Q52434245,present infinitive


In [31]:
# Short aliases for the namespaces and predicates used by the graph-building helpers.
# Project namespaces: entities live under `kgl`, properties under `kglprop`.
kgl = namespaces["kgl"]
kgl_prop = namespaces["kglprop"]
# Lexeme -> form link, as the ontolex predicate and as the project's own property.
form_link = namespaces["ontolex"].lexicalForm
kgl_form_link = kgl_prop.form
# Lexeme -> sense link, again in both vocabularies.
sense_link = namespaces["ontolex"].sense
kgl_sense_link = kgl_prop.sense
# Predicates for the written representation of a form, generic labels and typing.
form_label = namespaces["ontolex"].representation
rdfs_label = namespaces["rdfs"].label
rdf_type = namespaces["rdf"].type
# Lexeme -> part-of-speech node.
pos_link = kgl_prop.pos

# Link from a category node to the Wikidata entity it corresponds to.
sameAs = namespaces["owl"].sameAs
# Sense definitions are written with both the SKOS and the project predicate.
definition = namespaces["skos"].definition
kgl_definition = kgl_prop.definition
# Attaches a grammatical category node to a lexeme or to a form.
grammaticalFeature = kgl_prop.grammaticalFeature
kgl_label = kgl_prop.label
# Sense -> example sentence.
example_link = kgl_prop.example

from rdflib import URIRef

In [32]:
def add_category(g, label):
    """Register a grammatical category called ``label`` in graph ``g``.

    The node identifier is ``hash(label, "grammatical_category")`` inside the
    ``kgl`` namespace. The node gets ``label`` as a plain literal under both
    ``rdfs_label`` and ``kgl_label``, and is typed ``kgl.GrammaticalCategory``.

    Returns:
        The identifier of the category node.
    """
    category_node = kgl[hash(label, "grammatical_category")]
    statements = (
        (rdfs_label, Literal(label)),
        (kgl_label, Literal(label)),
        (rdf_type, kgl.GrammaticalCategory),
    )
    for predicate, value in statements:
        g.add((category_node, predicate, value))
    return category_node

def populate_categories(g: Graph):
    """Create every grammatical category node in ``g``.

    Each row of ``wikidata_grammatical_categories`` becomes a category node
    (see ``add_category``) linked to its Wikidata entity through ``sameAs``.
    A fixed list of extra labels is then added without a Wikidata link.

    Returns:
        dict mapping a category label to the identifier of its node.
    """
    label_to_node = {}

    for _, entry in wikidata_grammatical_categories.iterrows():
        label = entry['entityLabel.value']
        wikidata_identifier = entry['entity.value']
        node = add_category(g, label)
        g.add((node, sameAs, URIRef(wikidata_identifier)))
        label_to_node[label] = node

    # Some heavily used categories are not (in)direct subclasses of
    # "grammatical categories" in Wikidata, so the export above does not
    # contain them; they are listed by hand here.
    extra_labels = [
        # nouns
        "countable", "uncountable", "irregular",
        "usually uncountable", "unattested plural", "uncertain plural",
        # verbs
        "defective",
        # adjectives
        "positive", "comparative", "superlative",
        "not comparable", "comparable-only", "generally not comparable",
    ]
    for label in extra_labels:
        label_to_node[label] = add_category(g, label)

    return label_to_node

In [33]:
import base64
import mmh3

def hash(word, pos):
    """Build the short textual identifier for a (word, part-of-speech) pair.

    ``word`` and ``pos`` are concatenated, hashed with ``mmh3.hash64``
    (unsigned), and the first of the two returned integers is written as
    8 big-endian bytes. Those bytes are base32-encoded, stripped of ``=``
    padding and lower-cased.

    Note: this function shadows the builtin ``hash`` in the notebook.
    """
    first_half, _ = mmh3.hash64(word + pos, signed=False)
    raw_bytes = first_half.to_bytes(8, "big")
    encoded = base64.b32encode(raw_bytes).decode()
    return encoded.rstrip("=").lower()

In [45]:
from rdflib.extras.infixowl import Class
from rdflib import Literal
from collections import defaultdict

def reset():
    """Start over with an empty graph and empty per-word counters.

    Rebinds the module-level ``g`` to a new ``Graph``, binds every prefix of
    ``namespaces`` on it, and resets ``form_counter`` / ``sense_counter`` to
    empty ``defaultdict(int)`` objects. It then creates infixowl ``Class``
    objects for Lexeme, Form, Sense, Subsense, Usage and GrammaticalCategory
    with ``graph=g``.

    Only ``lexemeClass`` and ``formClass`` are declared global; the other
    ``Class`` objects stay local to this function.
    """
    global g
    global form_counter
    global sense_counter
    global lexemeClass
    global formClass
    
    # Counters used by add_form / add_sense to number the forms and senses of a word.
    form_counter = defaultdict(int)
    sense_counter = defaultdict(int)
    
    g = Graph()
    # Register every known prefix on the new graph.
    for k,v in namespaces.items():
        g.bind(k, v)
    
    # NOTE(review): constructing a Class with graph=g presumably writes the class
    # declaration into g — confirm in the rdflib.extras.infixowl documentation.
    lexemeClass = Class(kgl.Lexeme,
                            nameAnnotation=Literal("Lexeme"),
                            graph=g)
    lexemeClass.comment = Literal("A lexeme is the main entry of the dictionary")
    
    formClass = Class(kgl.Form, nameAnnotation=Literal("Form"), graph=g)
    formClass.comment = Literal("A form is a morphological form that appears when the lexeme is a declinable or conjugable noun")
    
    senseClass = Class(kgl.Sense, nameAnnotation=Literal("Sense"), graph=g)
    senseClass.comment = Literal("A sense, or synset, is a unit of meaning of a lexeme")
    
    subsenseClass = Class(kgl.Subsense, nameAnnotation=Literal("Subsense"), graph=g)
    subsenseClass.comment = Literal("A subsense is a possible refinement on a sense")
    
    usageClass = Class(kgl.Usage, nameAnnotation=Literal("Usage"), graph=g)
    usageClass.comment = Literal("An usage is a linguistic 'usage' of a lexeme")
    
    
    # Created without a name annotation or comment; the local variable is not used afterwards.
    grammaticalCategory = Class(kgl.GrammaticalCategory, graph=g)

In [41]:
from tools.providers import WiktionaryProjectGutenberg

wdpg_list = WiktionaryProjectGutenberg.get_wordlist()

[]

In [42]:
wdpg_df = spark.createDataFrame([[word] for word in wdpg_list], ['word'])

In [43]:
wiktionary_full_df = spark.read.parquet("data/wiktionary/senses_examples_quotations_v2.parquet")

In [55]:
wiktionary_full_df.select(['word', 'pos']).filter('pos == "suffix"').limit(10).collect()

[Row(word="-'d", pos='suffix'),
 Row(word='-ock', pos='suffix'),
 Row(word='-lepsy', pos='suffix'),
 Row(word='-ile', pos='suffix'),
 Row(word='-adic', pos='suffix'),
 Row(word='-meter', pos='suffix'),
 Row(word='-ent', pos='suffix'),
 Row(word='-ory', pos='suffix'),
 Row(word='-sky', pos='suffix'),
 Row(word='-off', pos='suffix')]

In [57]:
wiktionary_full_df.select(['word', 'pos']).filter('pos == "circumfix"').limit(10).collect()

[Row(word='en- -en', pos='circumfix'),
 Row(word='em- -en', pos='circumfix'),
 Row(word='a- -ing', pos='circumfix')]

In [56]:
# TODO: move this analysis to the WiktionaryFull notebook

wiktionary_full_df.select("pos").distinct().collect()

[Row(pos='abbrev'),
 Row(pos='conj'),
 Row(pos='det'),
 Row(pos='circumfix'),
 Row(pos='name'),
 Row(pos='prep'),
 Row(pos='suffix'),
 Row(pos='postp'),
 Row(pos='pron'),
 Row(pos='infix'),
 Row(pos='particle'),
 Row(pos='symbol'),
 Row(pos='letter'),
 Row(pos='num'),
 Row(pos='phrase'),
 Row(pos='adj'),
 Row(pos='article'),
 Row(pos='adv'),
 Row(pos='prep_phrase'),
 Row(pos='interfix'),
 Row(pos='proverb'),
 Row(pos='verb'),
 Row(pos='intj'),
 Row(pos='noun'),
 Row(pos='prefix'),
 Row(pos='affix')]

In [None]:
# Keep only the entries whose word appears in wdpg_df and persist the result as Parquet.
# NOTE(review): no write mode is set, so this presumably fails when the target
# path already exists — confirm before re-running.
wiktionary_full_df.join(wdpg_df, "word", "inner").write.parquet("data/wiktionary/wiktionary_wdpg.parquet")

In [48]:
from rdflib import Literal
from collections import defaultdict
from enum import Enum

def is_in_graph(x):
    """Tell whether the module-level graph ``g`` has a triple with subject ``x``."""
    for _ in g.triples((x, None, None)):
        return True
    return False

def add_form(g: Graph, word_id: str, lexeme_id: URIRef, label: str):
    """Create a new form node for a lexeme and return its identifier.

    Forms of one word are numbered through ``form_counter``: the node is
    ``kgl["<word_id>-F<n>"]``. The lexeme is linked to the form with both
    ``form_link`` and ``kgl_form_link``; the form is typed ``kgl.Form`` and
    carries ``label`` as an English literal under the project label
    property, ``rdfs_label`` and ``form_label``.
    """
    index = form_counter[word_id]
    form_id = kgl[f"{word_id}-F{index}"]
    form_counter[word_id] = index + 1

    # Lexeme -> form, in both vocabularies.
    for link in (form_link, kgl_form_link):
        g.add((lexeme_id, link, form_id))

    g.add((form_id, namespaces['rdf'].type, kgl.Form))

    # The same written form is stored under three label predicates.
    for predicate in (kgl_prop['label'], rdfs_label, form_label):
        g.add((form_id, predicate, Literal(label, lang="en")))
    return form_id


class SenseType(Enum):
    """Kind of meaning node that add_sense creates.

    SENSE is attached directly to the lexeme; SUBSENSE and USAGE are attached
    to a parent sense. Every member has a plain integer value (no trailing
    commas, which would silently turn a value into a one-element tuple).
    """
    SENSE = 1
    SUBSENSE = 2
    USAGE = 3

    
def add_sense(g: Graph, word_id: str, lexeme_id: URIRef, sense_definition: str, parent_sense_id=None, sense_type=SenseType.SENSE):
    """Create a sense, subsense or usage node and return its identifier.

    Nodes of one word are numbered through ``sense_counter``: the node is
    ``kgl["<word_id>-S<n>"]``.

    * ``SenseType.SENSE``: the lexeme is linked to the node with ``sense_link``
      and ``kgl_sense_link``; the node is typed ``kgl.Sense``.
    * ``SenseType.SUBSENSE``: ``parent_sense_id`` is linked to the node with
      the ``subsense`` property; the node is typed ``kgl.Subsense``.
    * anything else: ``parent_sense_id`` is linked with the ``usage``
      property; the node is typed ``kgl.Usage``.

    ``sense_definition`` is stored as an English literal under
    ``rdfs_label``, ``definition`` and ``kgl_definition``.
    """
    index = sense_counter[word_id]
    sense_id = kgl[f"{word_id}-S{index}"]
    sense_counter[word_id] = index + 1

    if sense_type == SenseType.SENSE:
        for link in (sense_link, kgl_sense_link):
            g.add((lexeme_id, link, sense_id))
        node_class = kgl.Sense
    else:
        if sense_type == SenseType.SUBSENSE:
            parent_link, node_class = kgl_prop['subsense'], kgl.Subsense
        else:
            parent_link, node_class = kgl_prop['usage'], kgl.Usage
        g.add((parent_sense_id, parent_link, sense_id))
    g.add((sense_id, namespaces['rdf'].type, node_class))

    # The definition text is stored under three predicates.
    for predicate in (rdfs_label, definition, kgl_definition):
        g.add((sense_id, predicate, Literal(sense_definition, lang="en")))

    return sense_id

def add_sense_rec(g: Graph, senses, word_id, lexeme_id, depth=0, parent_sense=None):
    """Add a (possibly nested) list of senses to ``g``.

    Args:
        g: graph that receives the triples.
        senses: iterable of sense records; each one is read through the keys
            'glosses', 'examples' and, when present, 'subsenses' and 'usages'.
        word_id: textual identifier of the word, used to number the nodes.
        lexeme_id: node of the lexeme that owns the senses.
        depth: nesting level. 0 creates senses, 1 subsenses, and 2 or more
            creates usages.
        parent_sense: node that subsenses/usages of this level attach to.

    Only the first gloss of a record is used. A record without a gloss
    creates no node; its children are then processed at the current level
    with the current parent, so that they are neither lost nor attached to
    an unrelated sibling.
    """
    # Levels deeper than 2 are treated as usages instead of raising KeyError.
    type_by_depth = {0: SenseType.SENSE, 1: SenseType.SUBSENSE}
    sense_type = type_by_depth.get(depth, SenseType.USAGE)

    for sense in senses:
        # TODO: now that senses are hierarchically structured, glosses should become a single string
        gloss = sense['glosses'][0] if sense['glosses'] else ""
        examples = sense['examples']

        if gloss:
            sense_id = add_sense(g, word_id, lexeme_id, gloss, parent_sense, sense_type)
            for example in examples or ():
                if example:
                    g.add((sense_id, example_link, Literal(example, lang="en")))
            child_depth, child_parent = depth + 1, sense_id
        else:
            # No node exists for this record: never reuse the node of a previous
            # sibling (or an unbound name) as the parent of its children.
            child_depth, child_parent = depth, parent_sense

        for key in ('subsenses', 'usages'):
            if key in sense and sense[key] is not None:
                add_sense_rec(g, sense[key], word_id, lexeme_id, child_depth, child_parent)
                    
       
    
def add_grammatical_categories(g,  word_id, cats):
    """Attach each category named in ``cats`` to the node ``word_id``.

    Labels are resolved through the module-level ``categories`` mapping, so an
    unknown label raises ``KeyError``. Despite its name, ``word_id`` is used as
    the subject of the triple; callers pass a lexeme or form node.
    """
    for category_node in (categories[cat] for cat in cats):
        g.add((word_id, grammaticalFeature, category_node))

def add_noun_forms(g: Graph, word, word_id, lexeme_id, noun_forms):
    """Describe the grammar of a noun: lexeme-level features and its forms.

    ``noun_forms`` is read through the fields ``irregular``, ``countable``,
    ``always``, ``optional`` and ``plurals``. Category labels are resolved
    through the module-level ``categories`` mapping.
    """
    def tag(node, label):
        g.add((node, grammaticalFeature, categories[label]))

    if noun_forms.irregular:
        tag(lexeme_id, "irregular")

    # `countable` can be "no", "yes" or something in between ("sometimes"),
    # in which case the lexeme gets both a countable and an uncountable tag.
    countable = noun_forms.countable
    if countable != "no":
        tag(lexeme_id, "countable")
    if countable != "yes":
        tag(lexeme_id, "uncountable" if noun_forms.always else "usually uncountable")

    # A non-empty `optional` value names the category "<optional> plural".
    plural_note = noun_forms["optional"]
    if plural_note:
        add_grammatical_categories(g, lexeme_id, [plural_note + " plural"])

    # The headword itself is the singular form.
    tag(add_form(g, word_id, lexeme_id, word), 'singular')

    for plural in noun_forms["plurals"] or ():
        plural_form = add_form(g, word_id, lexeme_id, plural)
        add_grammatical_categories(g, plural_form, ['plural'])


def add_adj_forms(g: Graph, word,  word_id, lexeme_id, adj_forms):
    """Describe the grammar of an adjective: comparability and its forms.

    ``adj_forms`` is read through the keys 'optional', 'comparatives' and
    'superlatives'. A non-empty 'optional' value is attached to the lexeme as
    a category. Unless that value is "not comparable", the headword is added
    as the positive form. Every comparative and superlative becomes a form
    tagged accordingly.
    """
    opt = adj_forms['optional']
    if opt:
        add_grammatical_categories(g, lexeme_id, [opt])

    # None compares unequal to the string as well, so no separate None check is needed.
    if opt != "not comparable":
        add_grammatical_categories(g, add_form(g, word_id, lexeme_id, word), ["positive"])

    for field, degree in (('comparatives', "comparative"), ('superlatives', "superlative")):
        for written in adj_forms[field] or ():
            degree_form = add_form(g, word_id, lexeme_id, written)
            add_grammatical_categories(g, degree_form, [degree])

            
def add_verb_forms(g: Graph, word: str, word_id, lexeme_id, verb_forms):
    """Add the forms of a verb and tag them with grammatical categories.

    Args:
        g: graph that receives the triples.
        word: the headword; it is added as the infinitive / base present form.
        word_id: textual identifier of the word, used to number the forms.
        lexeme_id: node of the lexeme that owns the forms.
        verb_forms: record read through the keys 'pres_3sg', 'pres_ptc',
            'past' and 'past_ptc'.

    A verb without a third-person singular is tagged "defective" on the
    lexeme. Any other form whose value is missing or empty is skipped, so
    that no form labelled with the text "None" ends up in the graph.
    """
    infinitive = add_form(g, word_id, lexeme_id, word)
    add_grammatical_categories(g, infinitive,
                               ["present tense", "infinitive",
                                "first-person singular",
                                "second-person singular",
                                "first-person plural",
                                "second-person plural",
                                "third-person plural"])

    pres_3sg = verb_forms["pres_3sg"]
    if pres_3sg:
        pres_3sg_form = add_form(g, word_id, lexeme_id, pres_3sg)
        add_grammatical_categories(g, pres_3sg_form,
                                   ["present tense", "third-person singular"])
    else:
        add_grammatical_categories(g, lexeme_id, ["defective"])

    remaining_forms = (
        ("pres_ptc", ["present participle"]),
        ("past", ["past tense", "simple past"]),
        ("past_ptc", ["past participle"]),
    )
    for field, labels in remaining_forms:
        written = verb_forms[field]
        # Literal() stringifies non-string values, so a missing form would
        # otherwise be stored as the word "None".
        if written:
            form_id = add_form(g, word_id, lexeme_id, written)
            add_grammatical_categories(g, form_id, labels)
        
def add_to_graph(g: Graph, row):
    """Add one dictionary entry to ``g``.

    Args:
        g: graph that receives the triples.
        row: record read through the keys 'word', 'pos', 'senses',
            'noun_forms', 'adj_forms' and 'verb_forms'.

    The lexeme node is ``kgl[hash(word, pos)]``. If the graph already has a
    node with that identifier but a different label, the hash has collided:
    a message is printed and the identifier is derived from ``word + "$42"``
    instead. Only the word label is compared; a collision that also needs a
    different part of speech is not detected, and a second collision on the
    fallback identifier is not handled.
    """
    word = row['word']
    senses = row['senses']
    pos = row['pos']
    noun_forms = row['noun_forms']
    adj_forms = row['adj_forms']
    verb_forms = row['verb_forms']

    word_id = hash(word, pos)
    lexeme_id = kgl[word_id]

    # Look the lexeme up by its URI: the bare hash string is never a subject
    # in the graph. The stored label is a language-tagged Literal, which never
    # equals a plain str, so compare its text.
    known_label = g.value(lexeme_id, rdfs_label)
    if known_label is not None and str(known_label) != word:
        print(f"Detected collision between {known_label} and {word}")
        word_id = hash(word + "$42", pos)
        lexeme_id = kgl[word_id]

    # A graph is a set of triples, so repeating these statements for a lexeme
    # that was already seen (e.g. another etymology of the same word) is a no-op.
    g.add((lexeme_id, namespaces['rdf'].type, kgl.Lexeme))
    g.add((lexeme_id, pos_link, kgl[pos]))
    g.add((lexeme_id, kgl_label, Literal(word, lang="en")))
    g.add((lexeme_id, rdfs_label, Literal(word, lang="en")))
    # TODO: also record the language of the lexeme (e.g. with dct:language).

    if senses:
        add_sense_rec(g, senses, word_id, lexeme_id)

    # Nouns
    if noun_forms:
        add_noun_forms(g, word, word_id, lexeme_id, noun_forms)

    # Adjectives
    if adj_forms:
        add_adj_forms(g, word, word_id, lexeme_id, adj_forms)

    # Verbs
    if verb_forms:
        add_verb_forms(g, word, word_id, lexeme_id, verb_forms)

In [49]:
wiktionary_wdpg_df = spark.read.parquet("data/wiktionary/wiktionary_wdpg.parquet")

# Start from an empty graph and register the grammatical categories first:
# the add_* helpers look them up in the module-level `categories`.
reset()
categories = populate_categories(g)

# Rows are streamed to the driver and added one by one. This is slow, but
# will do until SANSA (or anything more suitable) can be made to work.
for row in wiktionary_wdpg_df.rdd.toLocalIterator():
    # Skip entries built from the generic 'head' template: these are
    # automatically generated single forms.
    if row['head']['template_name'] == 'head':
        continue
    add_to_graph(g, row)

In [50]:
g.serialize("data/output/wdpg_10000_common.ttl", "turtle")

In [52]:
wiktionary_v2_df = spark.read.parquet("data/wiktionary/senses_examples_quotations_v2.parquet")

In [53]:
wiktionary_v2_df.filter("word = 'lay'").count()

9

In [93]:
# Rebuild the graph from scratch, this time from the full v2 dataset.
reset()
categories = populate_categories(g)

# Rows are streamed to the driver and added one by one. This is slow, but
# will do until SANSA (or anything more suitable) can be made to work.
for row in wiktionary_v2_df.rdd.toLocalIterator():
    # Skip entries built from the generic 'head' template: these are
    # automatically generated single forms.
    if row['head']['template_name'] == 'head':
        continue
    add_to_graph(g, row)

In [94]:
g.serialize("data/wiktionary/full_v1.ttl", "turtle")

In [40]:
wiktionary_2_wdpg = wiktionary_v2_df.join(wdpg_df, "word", "inner")#.write.saveAsTable("wiktionary_2_wdpg")

In [73]:
# Restrict the v2 dataset to the words present in wdpg_df.
# (Alternative source, currently disabled: spark.read.table("wiktionary_2_wdpg").)
wiktionary_2_wdpg = wiktionary_v2_df.join(wdpg_df, "word", "inner")

reset()
categories = populate_categories(g)

# Rows are streamed to the driver and added one by one. This is slow, but
# will do until SANSA (or anything more suitable) can be made to work.
for row in wiktionary_2_wdpg.rdd.toLocalIterator():
    # Skip entries built from the generic 'head' template: these are
    # automatically generated single forms.
    if row['head']['template_name'] == 'head':
        continue
    add_to_graph(g, row)

g.serialize("data/wiktionary/wdpg_v2.ttl", "turtle")