In [1]:
import os
# Directory that holds the BabelNet API jar and, under lib/, the jars it depends on.
prefix =  "/nfs/knowledge-glue/notebooks/3rdparty/lucene-index"
# Scala runtime library; appended to the JVM classpath below.
scala_path = "/usr/share/scala-2.11/lib/scala-library.jar"
import jnius_config
# The notebook's working directory is put on the classpath as well
# (presumably so the local `tools.*` Java classes imported later can be resolved -- TODO confirm).
local_tools = os.getcwd()
# NOTE: pyjnius expects the classpath to be configured before `jnius` itself is
# imported, which is why that import is the last statement of this cell.
jnius_config.set_classpath(prefix + "/babelnet-api-4.0.1.jar", prefix + "/lib/*", local_tools, scala_path )
print(jnius_config.get_classpath())
import jnius

['/nfs/knowledge-glue/notebooks/3rdparty/lucene-index/babelnet-api-4.0.1.jar', '/nfs/knowledge-glue/notebooks/3rdparty/lucene-index/lib/*', '/usr/local/lib/python3.7/dist-packages/jnius/src']


In [2]:
# Convenience wrapper around jnius.autoclass that publishes Java classes as globals.
def jimport(path):
    """
    Wrap a jnius import, exposing the Java class as a global variable.

    The class is loaded with ``jnius.autoclass`` and bound to a global
    variable carrying the unqualified name of the class:

    >>> jimport("java.lang.Math")
    >>> Math.sqrt(100)
    10.0

    As pyjnius does not seem to import static fields or nested classes
    automatically via an 'autoclass' invocation, one needs to import
    them manually, after their enclosing class. The nested class is
    attached as an attribute of the already imported enclosing class:

    >>> jimport("org.mypackage.myclass")
    >>> jimport("org.mypackage.myclass$nestedclass")
    >>> jimport("org.mypackage.myclass$nestedclass$deeper")

    It is not allowed to import a nested class alone.

    :param path: fully qualified Java class name, with ``$`` separating
        an enclosing class from its nested classes.
    :raises NameError: if a nested class is requested before its
        outermost enclosing class has been imported.
    :raises AttributeError: if an intermediate nested class has not been
        imported yet.
    """
    classname = path.split(".")[-1]
    # "Outer$Inner$Deep" -> outer="Outer", nested=["Inner", "Deep"]
    outer, *nested = classname.split("$")
    java_class = jnius.autoclass(path)
    namespace = globals()

    if not nested:
        namespace[outer] = java_class
        return

    if outer not in namespace:
        raise NameError(
            f"name '{outer}' is not defined; jimport the enclosing class before {path!r}"
        )

    # Walk down to the direct owner of the class being imported, then attach it.
    owner = namespace[outer]
    for intermediate in nested[:-1]:
        owner = getattr(owner, intermediate)
    setattr(owner, nested[-1], java_class)

In [3]:
# Java classes exposed as notebook globals through jimport. The order is kept
# as-is: an enclosing class (BabelNetQuery) has to be imported before its
# nested class (BabelNetQuery$Builder).
for java_class_path in (
    "it.uniroma1.lcl.babelnet.BabelNet",
    "it.uniroma1.lcl.babelnet.BabelNetQuery",
    "it.uniroma1.lcl.babelnet.BabelNetQuery$Builder",
    "it.uniroma1.lcl.babelnet.BabelSynset",
    "it.uniroma1.lcl.jlt.util.Language",
    "com.babelscape.util.UniversalPOS",
    "tools.BabelNetBridge",
    "tools.BabelNetLexeme",
):
    jimport(java_class_path)

In [None]:
import os

from pyspark import SparkContext
from pyspark.sql import SparkSession

# Location of the parquet-index jar, resolved against the current working directory.
parquet_pos = os.path.join(os.getcwd(), "data/wiktionary/parquet-index_2.11-0.4.1-SNAPSHOT.jar")

# Create (or reuse) the Spark session with Arrow-based pandas conversion enabled
# and the jar above registered through `spark.jars`.
spark = (
    SparkSession.builder
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .config("spark.jars", parquet_pos)
    .getOrCreate()
)

# The same jar is also registered as a Python file on the Spark context
# (presumably because it bundles Python bindings -- TODO confirm).
spark.sparkContext.addPyFile(parquet_pos)

In [None]:
# Read the `wiktionary_2_wdpg` parquet dataset from the (relative) spark-warehouse directory.
wiktionary_2_wdpg = spark.read.parquet("spark-warehouse/wiktionary_2_wdpg")

In [None]:
from tools.rdf_dumping import ExtractorGraph

In [None]:
graph = ExtractorGraph()

# Stream the rows to the driver one at a time and add each of them to the
# graph, except for rows whose head template is literally named 'head'.
for row in wiktionary_2_wdpg.rdd.toLocalIterator():
    if row['head']['template_name'] == 'head':
        continue
    graph.add_wiktionary(row)

In [None]:
from rdflib import Namespace
# TODO: check that the IRI is actually this one
# Base namespace for BabelNet synset nodes; it is indexed with a BabelNet id
# (`bn[babelnet_id]`) when synsets are linked into the graph.
bn = Namespace('https://babelnet.org/synset?word=bn:')

In [None]:
# Register the `bn` prefix for serialisation. The namespace object defined
# earlier is reused rather than repeating the IRI literal, so the prefix cannot
# drift from the IRIs actually minted through `bn[...]` if that IRI is corrected.
graph.g.bind('bn', bn)

In [None]:
from tools.rdf_dumping import rdf_type, kgl, kgl_prop
from tools.strings import strip_prefix
import re
# Match a sense, in the form of kgl:entity-Sx
# (raw string: "\d" inside a plain string literal is an invalid escape sequence
# that newer Python versions warn about; the compiled pattern is unchanged).
matcher = re.compile(r"^(.*)-S\d+")
# for simplicity, a sense only has one gloss
# The two lists are parallel: sense_glosses[i] is the gloss of sense_names[i].
sense_names = []
sense_glosses = []

for (sense, _, sense_description) in graph.g.triples((None, kgl_prop['definition'], None)):
    # Definitions attached to subjects that do not look like a sense are ignored.
    if not matcher.match(sense):
        continue
    # Only glosses tagged as English are kept.
    if sense_description.language == "en":
        sense_names.append(sense.toPython())
        sense_glosses.append(sense_description.toPython())

In [None]:
from sentence_transformers import SentenceTransformer
# Sentence encoder used to embed glosses (through `model.encode`) in the cells below.
model = SentenceTransformer('bert-base-nli-mean-tokens')

In [None]:
# This is potentially slow, beware!
import numpy as np

# Embeddings of `sense_glosses` are cached on disk so the encoder only has to
# run once. The cache is positional: row i belongs to sense_glosses[i].
sense_embeddings_path = "data/wiktionary/senses_encoded.npy"

sense_embeddings = None
if os.path.exists(sense_embeddings_path):
    sense_embeddings = np.load(sense_embeddings_path)
    # A cache built from a different set of glosses would be silently
    # misaligned (and truncated) when zipped with the glosses later on,
    # so it is discarded when the number of rows does not match.
    if sense_embeddings.shape[:1] != (len(sense_glosses),):
        print(f"Ignoring stale cache {sense_embeddings_path}: "
              f"shape {sense_embeddings.shape} does not match {len(sense_glosses)} glosses")
        sense_embeddings = None

if sense_embeddings is None:
    sense_embeddings = model.encode(sense_glosses)
    np.save(sense_embeddings_path, sense_embeddings)

# NOTE(review): the row-count check cannot detect glosses collected in a
# different order than when the cache was written -- consider storing the
# sense names next to the embeddings.

In [None]:
# Map every sense name to its (gloss text, gloss embedding) pair.
sense_gloss_dict = {
    name: (gloss, embedding)
    for name, gloss, embedding in zip(sense_names, sense_glosses, sense_embeddings)
}
# (word, pos) pairs that query_babelnet has already handled.
seen = set()

from tools.rdf_dumping import rdf_type, kgl, kgl_prop
from tools.strings import strip_prefix
#import re
#matcher = re.compile("^(.*)-S\d+")
related_to = dict()


def handle_pointer(lexeme, pointer):
    """Placeholder hook for the related words of a synset; currently does nothing.

    :param lexeme: the lexeme the pointer belongs to (unused).
    :param pointer: the related words handed over by the caller (unused).
    :return: always None.
    """
    return None

def query_babelnet(lexeme, word: str, pos: str, threshold: float = 0.53):
    """
    Link the BabelNet synsets of a (word, part of speech) pair to the senses of a lexeme.

    Every synset returned by ``BabelNetBridge.getSynsetsForLexeme`` has its
    gloss embedded with ``model`` and compared, by cosine similarity, against
    the embedded glosses of the lexeme's senses found in ``sense_gloss_dict``.
    When the best score reaches ``threshold`` the synset is attached to that
    sense; otherwise (or when the lexeme has no comparable sense at all) a new
    sense is created through ``graph.add_sense``. In both cases the synset is
    also attached to the lexeme itself.

    Each (word, pos) pair is handled at most once: pairs are recorded in the
    global ``seen`` set and repeated calls return immediately.

    :param lexeme: graph node of the lexeme whose senses are matched.
    :param word: label of the lexeme, used to query BabelNet.
    :param pos: part of speech; the "http://grill-lab.org/kg/entity/" prefix
        is stripped before BabelNet is queried.
    :param threshold: minimum cosine similarity for a synset to be considered
        related to an existing sense (defaults to the previous fixed 0.53).
    :return: None; results are added to ``graph.g`` and progress is printed.
    """
    if (word, pos) in seen:
        return

    seen.add((word, pos))

    pos = strip_prefix("http://grill-lab.org/kg/entity/", pos)
    synsets = BabelNetBridge.getSynsetsForLexeme(word, pos)

    senses = [triple[2] for triple in graph.g.triples((lexeme, kgl_prop['sense'], None))]
    # Only senses with an embedded gloss can be compared. Filtering here keeps
    # `candidates` index-aligned with `pertaining_senses`, and avoids a KeyError
    # for senses that never made it into sense_gloss_dict.
    candidates = [sense for sense in senses if sense.toPython() in sense_gloss_dict]

    print("For word ", word, pos)

    pertaining_senses = [sense_gloss_dict[sense.toPython()] for sense in candidates]
    print([gloss for gloss, embed in pertaining_senses])

    # Normalise the sense embeddings once: they do not change between synsets.
    # With no candidate there is nothing to normalise (and numpy would reject
    # the axis=1 norm on an empty 1-d array).
    wiktionary_senses_embed = None
    if pertaining_senses:
        wiktionary_senses_embed = np.array([embed for gloss, embed in pertaining_senses])
        wiktionary_senses_embed /= np.linalg.norm(wiktionary_senses_embed, axis=1)[:, np.newaxis]

    for synset in synsets:

        babelnet_id = synset.id().getID()
        glosses = synset.synsets()[0].getGloss()
        related = synset.relatedWords()

        best_match = None
        if wiktionary_senses_embed is not None:
            print(f"Determining embeddings for {babelnet_id} ({word})")
            glosses_embedded = np.array(model.encode([glosses]))
            glosses_embedded /= np.linalg.norm(glosses_embedded)

            # Both sides are unit vectors, so the dot product is the cosine similarity.
            cosine_similarity = glosses_embedded.flatten() @ wiktionary_senses_embed.T

            best_match = int(np.argmax(cosine_similarity))
            print(f"Given this BabelNet entry: {glosses}")
            print(f"And this Wiktionary gloss: {pertaining_senses[best_match][0]}")
            print(f"Best match score: {cosine_similarity[best_match]}")
            # TODO estimate ROC curve here
            if cosine_similarity[best_match] < threshold:
                best_match = None

        if best_match is None:
            print("Not related to any. Creating a new synset.")
            sense_id = graph.add_sense(graph.hash(word, pos), lexeme, glosses)
        else:
            print("Related")
            sense_id = candidates[best_match]
        graph.g.add((sense_id, kgl_prop['synset'], bn[babelnet_id]))
        graph.g.add((lexeme, kgl_prop['synset'], bn[babelnet_id]))

        handle_pointer(lexeme, related)

        print("=" * 80)
        
    
# Query BabelNet for every (label, part of speech) combination of the labelled
# lexemes. The run stops once the entry with index 20 has been handled, i.e.
# at most 21 labels are processed.
for idx, label_triple in enumerate(graph.g.triples((None, kgl_prop["label"], None))):
    lexeme, _, word = label_triple
    # Collected into a list before any query is issued for this lexeme.
    all_pos = [pos_triple[2] for pos_triple in graph.g.triples((lexeme, kgl_prop["pos"], None))]
    for pos in all_pos:
        query_babelnet(lexeme, word.toPython(), pos)
    if idx >= 20:
        break