# Parquet-Index test

In [1]:
import os
parquet_pos = os.path.join(os.getcwd(), "data/wiktionary/parquet-index_2.11-0.4.1-SNAPSHOT.jar")

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder \
                        .config("spark.jars", parquet_pos) \
                        .getOrCreate()

# Add the python modules within the jar
spark.sparkContext.addPyFile(parquet_pos)

In [3]:
from lightcopy.index import QueryContext

In [4]:
context = QueryContext(spark)

In [5]:
from tools.providers import WiktionaryProvider

provider = WiktionaryProvider()
sample_1000 = os.path.join("data", provider.get_filename_path("sample_1000", "parquet"))
context.index.create.mode('overwrite').indexBy("word").parquet(sample_1000)

In [6]:
context.index.exists.parquet(sample_1000)

True

In [7]:
wiktionary_df_sample = context.index.parquet(sample_1000)

In [8]:
wiktionary_df_sample.filter('word == "decolorate"').collect()

[Row(abbreviations=None, alternate=None, antonyms=None, categories=None, compounds=None, conjugation=None, derived=None, enum=None, heads=[Row(1='decolorat', 10=None, 11=None, 12=None, 13=None, 14=None, 2=None, 3=None, 4=None, 5=None, 6=None, 7=None, 8=None, 9=None, cat2=None, cat3=None, desc=None, f2qual=None, f3qual=None, f4qual=None, g=None, head=None, head2=None, past1=None, past2=None, past2_qual=None, past3=None, past3_qual=None, past4=None, past4_qual=None, past_ptc2=None, past_ptc2_qual=None, past_ptc3=None, past_ptc3_qual=None, past_ptc4=None, past_ptc4_qual=None, past_ptc5=None, past_ptc5_qual=None, past_ptc_qual=None, past_qual=None, pl=None, pl2qual=None, pl3qual=None, pl4qual=None, pl5qual=None, plqual=None, pres_3sg2=None, pres_3sg2_qual=None, pres_ptc2=None, pres_ptc2_qual=None, pres_ptc3=None, pres_ptc_qual=None, sc=None, sg=None, sort=None, suff=None, sup=None, sup1=None, sup2=None, sup3=None, sup4=None, template_name='en-verb', tr=None)], hypernyms=None, hyphenation=N

In [10]:
from pyspark.sql.functions import *

hijacker_df.select(explode('translations')).collect()

[Row(col=Row(alt=None, lang='nl', roman=None, script=None, sense='someone who hijacks', tags=['m'], word='kaper')),
 Row(col=Row(alt=None, lang='eo', roman=None, script=None, sense='someone who hijacks', tags=None, word='aerpirato')),
 Row(col=Row(alt=None, lang='de', roman=None, script=None, sense='someone who hijacks', tags=['m'], word='Entführer')),
 Row(col=Row(alt=None, lang='de', roman=None, script=None, sense='someone who hijacks', tags=['f'], word='Entführerin')),
 Row(col=Row(alt=None, lang='nrf', roman=None, script=None, sense='someone who hijacks', tags=['m'], word='haïjatcheux')),
 Row(col=Row(alt=None, lang='nb', roman=None, script=None, sense='someone who hijacks', tags=['m'], word='kaprer')),
 Row(col=Row(alt=None, lang='nn', roman=None, script=None, sense='someone who hijacks', tags=['m'], word='kaprar')),
 Row(col=Row(alt=None, lang='pl', roman=None, script=None, sense='someone who hijacks', tags=['m'], word='porywacz')),
 Row(col=Row(alt=None, lang='ru', roman=None, s

In [17]:
from tools.extractors import extract_form, extract_df

hijacker_df = extract_form(extract_df(wiktionary_df_sample, 'hijacker'))

In [12]:
# Import all the available namespaces in a handy dict
from rdflib import Namespace, namespace

namespaces = {
    "dct": "http://purl.org/dc/terms/",
    "ontolex": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    "wikibase": "http://wikiba.se/ontology#",
    "wd": "http://www.wikidata.org/entity/",
    "wdt": "http://www.wikidata.org/prop/direct/",
    "kgl": "http://knowledge-glue.ir.dcs.gla.ac.uk/ontology/entity/",
    "kgl-prop": "http://knowledge-glue.ir.dcs.gla.ac.uk/ontology/property/"
}

namespaces = dict((key, Namespace(val)) for (key, val) in namespaces.items())

for ns in dir(namespace):
    imported = getattr(namespace, ns)
    if isinstance(imported, Namespace) or isinstance(imported, namespace.ClosedNamespace):
        namespaces[ns.lower()] = imported

In [13]:
namespaces

{'dct': Namespace('http://purl.org/dc/terms/'),
 'ontolex': Namespace('http://www.w3.org/1999/02/22-rdf-syntax-ns#'),
 'wikibase': Namespace('http://wikiba.se/ontology#'),
 'wd': Namespace('http://www.wikidata.org/entity/'),
 'wdt': Namespace('http://www.wikidata.org/prop/direct/'),
 'kgl': Namespace('http://knowledge-glue.ir.dcs.gla.ac.uk/ontology/entity/'),
 'kgl-prop': Namespace('http://knowledge-glue.ir.dcs.gla.ac.uk/ontology/property/'),
 'csvw': Namespace('http://www.w3.org/ns/csvw#'),
 'dc': Namespace('http://purl.org/dc/elements/1.1/'),
 'dcat': Namespace('http://www.w3.org/ns/dcat#'),
 'dcterms': Namespace('http://purl.org/dc/terms/'),
 'doap': Namespace('http://usefulinc.com/ns/doap#'),
 'foaf': rdf.namespace.ClosedNamespace('http://xmlns.com/foaf/0.1/'),
 'odrl2': Namespace('http://www.w3.org/ns/odrl/2/'),
 'org': Namespace('http://www.w3.org/ns/org#'),
 'owl': Namespace('http://www.w3.org/2002/07/owl#'),
 'prof': Namespace('http://www.w3.org/ns/dx/prof/'),
 'prov': rdf.name

In [14]:
from rdflib import Graph
g = Graph()
for k,v in namespaces.items():
    g.bind(k, v)

In [15]:
import base64
import mmh3

def hash(word, pos):
    return bytes.decode(base64.b32encode(mmh3.hash_bytes(word + pos))).rstrip("=").lower()

In [32]:
kgl = namespaces["kgl"]
kgl_prop = namespaces["kgl-prop"]
form_link = namespaces["ontolex"].lexicalForm
form_label = namespaces["ontolex"].representation
pos_link = kgl_prop.pos

from rdflib import Literal

def is_in_graph(x):
    try:
        next(g.triples((x, None, None)))
        return True
    except StopIteration:
        return False


def add_to_graph(word, senses, pos, noun_forms, adj_forms, verb_forms):
    global g # I guess Python's GIL makes this thread-safe?
    word_id = kgl[hash(word, pos)]
    if not is_in_graph(word_id):
        g.add((word_id, pos_link, kgl[pos]))
        g.add((word_id, namespaces['rdfs'].label, Literal(word, lang="en")))
        
    
    # Detect collision by just looking at the word label.
    # In theory we should also check that different pos may cause a collision
    # but it looks extremely unlikely
    else:
        label = g.label(word_id)
        if label != word:
            print(f"Collision detected between {label} and {word}")
            word_id = kgl[hash(word + "$42", pos)]
            g.add((word_id, pos_link, kgl[pos]))
            g.add((word_id, namespaces['rdfs'].label, Literal(word, lang="en")))

In [38]:
from pyspark.sql.functions import udf, struct
from pyspark.sql.types import NullType

add_to_graph_udf = udf(lambda l: add_to_graph(*l), NullType())

# This is dumb, but until I get SANSA or anything more decent to work...
for row in hijacker_df.rdd.toLocalIterator():
    add_to_graph(row['word'], row['senses'], row['pos'],
                    row['noun_forms'], row['adj_forms'], row['verb_forms'])

In [40]:
for s,v,o in g.triples((None, None, None)):
    print(s, v, o)

http://knowledge-glue.ir.dcs.gla.ac.uk/ontology/entity/qtyv2msjtvlufiegcbw4lvqlu4 http://knowledge-glue.ir.dcs.gla.ac.uk/ontology/property/pos http://knowledge-glue.ir.dcs.gla.ac.uk/ontology/entity/noun
http://knowledge-glue.ir.dcs.gla.ac.uk/ontology/entity/qtyv2msjtvlufiegcbw4lvqlu4 http://www.w3.org/2000/01/rdf-schema#label hijacker


In [69]:
next(g.triples((None, None, None)))

(rdflib.term.URIRef('http://knowledge-glue.ir.dcs.gla.ac.uk/ontology/entityL1'),
 rdflib.term.URIRef('http://knowledge-glue.ir.dcs.gla.ac.uk/ontology/propertypos'),
 rdflib.term.URIRef('http://knowledge-glue.ir.dcs.gla.ac.uk/ontology/entitynoun'))