Skip to content

Commit

Permalink
Fix SynsetMapper and add tests
Browse files Browse the repository at this point in the history
  • Loading branch information
frankier committed Aug 6, 2018
1 parent 8312cb0 commit 88257d0
Show file tree
Hide file tree
Showing 3 changed files with 104 additions and 9 deletions.
63 changes: 54 additions & 9 deletions finntk/emb/autoextend.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
from nltk.util import binary_search_file
import zipfile
import logging
from functools import total_ordering
from finntk.utils import ResourceMan, urlretrieve
from finntk.wordnet.reader import fiwn
from gensim.models import KeyedVectors
from shutil import copyfileobj
import os
Expand Down Expand Up @@ -56,6 +58,36 @@ def get_vecs(self):
vecs = AutoExtendNumberBatchFiWNWordVecs()


@total_ordering
class AsKey(object):
    """Wrap a search key so comparisons go through ``key_func``.

    The wrapped object is compared against raw values (for example lines
    read from a file) by applying ``key_func`` to the other operand and
    comparing the result with the pre-computed key of the wrapped object.
    ``__add__`` and ``__len__`` are provided so that an instance can stand
    in where a plain search key would otherwise be concatenated or measured.

    Args:
        obj: The raw key; ``__add__`` and ``__len__`` operate on it.
        key_func: Callable mapping a raw value to a comparable key.
    """

    def __init__(self, obj, key_func):
        self.obj = obj
        # Computed once up front; every comparison reuses it.
        self.obj_key = key_func(obj)
        self.key_func = key_func

    def _key_of(self, other):
        """Return the comparison key of ``other``.

        Another ``AsKey`` already carries its key; running ``key_func`` on
        the wrapper itself would fail, so its stored key is used instead.
        """
        if isinstance(other, AsKey):
            return other.obj_key
        return self.key_func(other)

    def __lt__(self, other):
        return self.obj_key < self._key_of(other)

    def __eq__(self, other):
        return self.obj_key == self._key_of(other)

    def __add__(self, other):
        """Return a new ``AsKey`` with ``other`` appended to the wrapped object.

        Text is encoded as UTF-8 before concatenation; bytes are appended
        unchanged.
        """
        if not isinstance(other, bytes):
            other = other.encode("utf-8")
        return AsKey(self.obj + other, self.key_func)

    def __len__(self):
        return len(self.obj)

    def __repr__(self):
        return "{}({!r}, {!r})".format(
            type(self).__name__, self.obj, self.key_func
        )


# Rank of each part-of-speech tag; the position of a tag in this byte string
# is the primary component of the key built by synset_map_key.
# NOTE(review): presumably mirrors the ordering of the synset mapping file.
POS_ORDER = b"nvar"


def synset_map_key(line):
    """Derive a comparable key from a line of the synset mapping file.

    Only the first space-separated field of ``line`` is inspected. After
    stripping whitespace it is split from the right on ``-`` into a prefix,
    an offset and a part-of-speech tag.

    Args:
        line: Bytes whose first field has the form ``<prefix>-<offset>-<pos>``.

    Returns:
        A ``(pos_rank, offset)`` tuple, where ``pos_rank`` is the index of
        the tag within ``POS_ORDER`` and ``offset`` is the raw offset bytes.

    Raises:
        ValueError: If the field has fewer than three ``-`` separated parts
            or the tag does not occur in ``POS_ORDER``.
    """
    synset_ref = line.partition(b" ")[0].strip()
    _prefix, offset, pos = synset_ref.rsplit(b"-", 2)
    return POS_ORDER.index(pos), offset


class SynsetMapper:

def __init__(self, res_man):
Expand All @@ -66,28 +98,41 @@ def __init__(self, res_man):
def map_file(self):
    """Return the synset mapping file, opening it on first use.

    The handle is cached on ``self._map_file`` so the resource is resolved
    and opened at most once per instance.
    """
    if self._map_file is None:
        synsets_fn = self.res_man.get_res("synsets")
        # Open exactly once, in binary mode: lookups compare and split the
        # lines as bytes, and a second open would leak the first handle.
        self._map_file = open(synsets_fn, "rb")
    return self._map_file

def __call__(self, synset_id):
    """Look up the mapped value for ``synset_id`` in the mapping file.

    Args:
        synset_id: Synset reference without the ``wn-fi-2.0-`` prefix
            (the prefix is prepended here before searching).

    Returns:
        The text following the first space on the matching line, decoded
        as UTF-8 with trailing whitespace removed, or ``None`` when no
        line matches or the matching line carries no value.
    """
    full_synset_id = "wn-fi-2.0-" + synset_id
    # The file is read as bytes, so the search key is encoded and wrapped
    # so that each candidate line is compared through synset_map_key.
    search_key = AsKey(full_synset_id.encode("utf-8"), synset_map_key)
    line = binary_search_file(self.map_file, search_key)
    if line is None:
        return None
    # partition() leaves the value empty when the line has no separator,
    # so a bare key maps to None instead of raising IndexError.
    _key, _sep, value = line.partition(b" ")
    return value.rstrip().decode("utf-8") or None


synset_map = SynsetMapper(vecs)


def get_lemma_id(lemma):
    """Build the lookup key of ``lemma`` in the AutoExtend vector space.

    The key is the lower-cased lemma name joined with ``-wn-fi-2.0-`` to
    the synset reference returned by ``fiwn.ss2of`` for the lemma's synset.

    Args:
        lemma: Object exposing ``synset()`` and ``name()``.

    Returns:
        The formatted key string.
    """
    owning_synset = lemma.synset()
    lowered_name = lemma.name().lower()
    synset_ref = fiwn.ss2of(owning_synset)
    return "{}-wn-fi-2.0-{}".format(lowered_name, synset_ref)


def mk_lemma_vec(lemma):
    """Return the vector stored for a single lemma.

    Args:
        lemma: Lemma object accepted by ``get_lemma_id``.

    Returns:
        The entry of the vector space indexed by the lemma's key.
        NOTE(review): a missing key presumably raises from the vector
        store's item lookup; confirm against the store in use.
    """
    fiwn_space = vecs.get_vecs()
    return fiwn_space[get_lemma_id(lemma)]

def mk_lemmas_mat(lemmas):
    """Return the vectors of several lemmas in one lookup.

    Args:
        lemmas: Iterable of lemma objects accepted by ``get_lemma_id``.

    Returns:
        The result of indexing the vector space with the list of lemma
        keys, in the order the lemmas were given.
    """
    fiwn_space = vecs.get_vecs()
    lemma_ids = [get_lemma_id(lemma) for lemma in lemmas]
    return fiwn_space[lemma_ids]


def mk_synset_vec(synset):
    """Return the vector stored for ``synset``, if it has a mapping.

    The synset reference from ``fiwn.ss2of`` is translated through
    ``synset_map`` into the key used by the vector space.

    Args:
        synset: Synset object accepted by ``fiwn.ss2of``.

    Returns:
        The vector-space entry for the mapped key, or ``None`` when
        ``synset_map`` has no mapping for the synset.
    """
    fiwn_space = vecs.get_vecs()
    synset_id = synset_map(fiwn.ss2of(synset))
    if synset_id is None:
        return None
    return fiwn_space[synset_id]
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
],
"dev": [
"pytest",
"hypothesis",
# Markdown descriptions
"twine>=1.11.0",
"wheel>=0.31.0",
Expand Down
49 changes: 49 additions & 0 deletions tests.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,17 @@
import pytest
from hypothesis import strategies as st, given

from finntk.omor.extract import extract_lemmas_combs, extract_lemmas_recurs
from finntk.wordnet.reader import fiwn
from scipy.spatial.distance import cosine
import heapq
import itertools


def intersect(*its):
    """Yield the values that occur in every one of the given iterables.

    The inputs are merged with ``heapq.merge``, so each must already be
    sorted in ascending order. Every value is tagged with the index of the
    iterable it came from and a value is yielded only when all sources
    contributed it; duplicates inside a single input therefore neither fake
    a match nor hide a real one.

    Args:
        *its: Sorted iterables of mutually comparable values.

    Yields:
        Each common value once, in ascending order. Nothing is yielded
        when no iterables are given.
    """

    def tagged(source_idx, iterable):
        # A helper function (not a nested generator expression) so that
        # source_idx is bound per input rather than late-bound.
        for item in iterable:
            yield item, source_idx

    merged = heapq.merge(*(tagged(idx, it) for idx, it in enumerate(its)))
    for key, group in itertools.groupby(merged, key=lambda pair: pair[0]):
        sources = {source_idx for _item, source_idx in group}
        if len(sources) == len(its):
            yield key


@pytest.mark.parametrize(
Expand Down Expand Up @@ -36,3 +47,41 @@ def test_lemmas_combs(compound, expected_lemmas):
def test_lemmas_recurs(compound, expected_lemmas):
actual_lemmas = extract_lemmas_recurs(compound)
assert actual_lemmas.issuperset(expected_lemmas)


def fiwn_conceptnet_common_lemmas():
    """Return lemma names found both in FiWN and among the ``/c/fi/`` vectors.

    The result is a list rather than a one-shot generator because it is
    handed to ``st.sampled_from``, which expects an ordered collection it
    can index into.

    Returns:
        List of names produced by intersecting ``fiwn.all_lemma_names()``
        with the Numberbatch entities that start with ``/c/fi/`` (prefix
        removed).
    """
    CONCEPTNET_FI = "/c/fi/"
    from finntk.emb.numberbatch import vecs as numberbatch_vecs

    vecs = numberbatch_vecs.get_vecs()

    def fi_lemmas():
        # Keep only Finnish concepts and strip the language prefix.
        for entity in vecs.index2entity:
            if entity.startswith(CONCEPTNET_FI):
                yield entity[len(CONCEPTNET_FI):]

    # NOTE(review): intersect() merges its inputs and therefore assumes
    # both sequences are sorted - confirm for all_lemma_names/index2entity.
    return list(intersect(fiwn.all_lemma_names(), fi_lemmas()))


@given(st.sampled_from(fiwn_conceptnet_common_lemmas()))
def test_get_lemma_vec(lemma_name):
    """Each lemma found under a sampled common name must yield a vector."""
    from finntk.emb.autoextend import mk_lemma_vec

    candidates = fiwn.lemmas(lemma_name)
    for candidate in candidates:
        vector = mk_lemma_vec(candidate)
        assert vector is not None


@given(st.sampled_from(fiwn_conceptnet_common_lemmas()))
def test_get_synset_vec(lemma_name):
    """The synset of the first lemma for a sampled name must yield a vector."""
    from finntk.emb.autoextend import mk_synset_vec

    first_lemma = fiwn.lemmas(lemma_name)[0]
    synset = first_lemma.synset()
    vector = mk_synset_vec(synset)
    assert vector is not None


@given(st.one_of(st.just("pitää"), st.just("saada")))
def test_surf_vec_matches(surf):
    """The AutoExtend vector of a surface form stays close to Numberbatch's.

    Closeness is measured as a cosine distance below 0.01 between the
    Numberbatch concept vector and the AutoExtend entry for the same form.
    """
    from finntk.emb.autoextend import vecs as autoextend_vecs
    from finntk.emb.numberbatch import mk_concept_vec

    numberbatch_vec = mk_concept_vec("fi", surf)
    autoextend_vec = autoextend_vecs.get_vecs()[surf]
    distance = cosine(numberbatch_vec, autoextend_vec)
    assert distance < 0.01

0 comments on commit 88257d0

Please sign in to comment.