Skip to content

Commit

Permalink
Formatted files
Browse files Browse the repository at this point in the history
Using the black formatter
  • Loading branch information
tomaarsen committed Sep 3, 2020
1 parent a9cd983 commit 7f6d5d2
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 20 deletions.
25 changes: 18 additions & 7 deletions word_forms/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,11 @@

try:
from nltk.corpus import wordnet as wn

raise_lookuperror_if_wordnet_data_absent = wn.synsets("python")
except LookupError:
import nltk

nltk.download("wordnet")
try:
from nltk.corpus import words
Expand All @@ -15,14 +17,16 @@

ALL_WORDNET_WORDS = set(words.words())


class Verb(object):
    """
    Container grouping the forms of a single verb.

    The stored collection is iterated elsewhere in this module to index
    every form back to its owning Verb object.
    """

    def __init__(self, verbs):
        # Kept exactly as passed in; presumably a set of conjugated
        # forms of one verb -- TODO confirm against the en-verbs.txt parser.
        self.verbs = verbs

    def __repr__(self):
        # Renders as the literal prefix "Verbs" followed by str() of the
        # stored collection, e.g. "Verbs{'run'}".
        return "Verbs{!s}".format(self.verbs)

# Read every line of the bundled verb list (en-verbs.txt, located next to
# this module) into memory. The context manager guarantees the file handle
# is closed even if reading raises, which the previous explicit
# open()/close() pair did not.
with open(Path(__file__).ancestor(1).child("en-verbs.txt")) as verbs_fh:
    lines = verbs_fh.readlines()
CONJUGATED_VERB_DICT = {}
Expand All @@ -32,14 +36,21 @@ def __repr__(self):
for verb in verb_obj.verbs:
CONJUGATED_VERB_DICT[verb] = verb_obj

# Maps an adjective to its adverb form. Seeded with the irregular pair
# "good" -> "well"; every other entry is derived from WordNet below.
ADJECTIVE_TO_ADVERB = {"good": "well"}
for ss in wn.all_synsets(pos="r"):
    for lemma in ss.lemmas():
        word = lemma.name()

        # Every adverb Lemma, across all adverb synsets of `word`, whose
        # name is exactly `word`.
        this_word_lemmas = []
        for adverb_synset in wn.synsets(word, pos=wn.ADV):
            for candidate in adverb_synset.lemmas():
                if candidate.name() == word:
                    this_word_lemmas.append(candidate)

        # Names of the words these adverb lemmas pertain to.
        pertainyms = set()
        for this_word_lemma in this_word_lemmas:
            for pertainym in this_word_lemma.pertainyms():
                pertainyms.add(pertainym.name())

        # Keep only pertainyms that are spelled similarly to the adverb,
        # and record the adverb as their adverbial form.
        matches = get_close_matches(word, pertainyms)
        for match in matches:
            ADJECTIVE_TO_ADVERB[match] = word
39 changes: 26 additions & 13 deletions word_forms/word_forms.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
try:
from nltk.corpus import wordnet as wn

raise_lookuperror_if_wordnet_data_absent = wn.synsets("python")
except LookupError:
import nltk

nltk.download("wordnet")
import inflect
from difflib import SequenceMatcher

from .constants import (ALL_WORDNET_WORDS, CONJUGATED_VERB_DICT,
ADJECTIVE_TO_ADVERB)
from .constants import ALL_WORDNET_WORDS, CONJUGATED_VERB_DICT, ADJECTIVE_TO_ADVERB


def belongs(lemma, lemma_list):
"""
Expand All @@ -26,10 +28,11 @@ def belongs(lemma, lemma_list):
behavior for the statement "lemma in lemma_list".
"""
return any(
element.name() == lemma.name() and element.synset() == lemma.synset()
element.name() == lemma.name() and element.synset() == lemma.synset()
for element in lemma_list
)


def get_related_lemmas(word):
"""
args
Expand All @@ -42,22 +45,31 @@ def get_related_lemmas(word):
"""
return get_related_lemmas_rec(word, [])


def get_related_lemmas_rec(word, known_lemmas):
    """
    args
        - word : a word e.g. "lovely"
        - known_lemmas : list of Lemma objects collected so far;
          it is extended in place
    returns
        - known_lemmas : the same list, now also holding the lemmas of
          `word` and of every sufficiently similar related word reached
          recursively
    """
    # Turn the string into Lemma objects: every lemma, over all synsets
    # of `word`, whose name is exactly `word`.
    word_lemmas = []
    for synset in wn.synsets(word):
        for lemma in synset.lemmas():
            if lemma.name() == word:
                word_lemmas.append(lemma)

    # Record the lemmas that have not been seen before.
    unseen = [lemma for lemma in word_lemmas if not belongs(lemma, known_lemmas)]
    known_lemmas.extend(unseen)

    # Follow derivational and pertainym links, recursing only into lemmas
    # that are new and whose name is similar enough to the current word.
    for lemma in word_lemmas:
        related = lemma.derivationally_related_forms() + lemma.pertainyms()
        for candidate in related:
            if belongs(candidate, known_lemmas):
                continue
            candidate_name = candidate.name()
            if SequenceMatcher(None, word, candidate_name).ratio() > 0.4:
                get_related_lemmas_rec(candidate_name, known_lemmas)

    return known_lemmas


def singularize(noun):
"""
args
Expand All @@ -71,6 +83,7 @@ def singularize(noun):
return singular
return noun


def get_word_forms(word):
"""
args
Expand All @@ -88,7 +101,7 @@ def get_word_forms(word):
"""
word = singularize(word)
related_lemmas = get_related_lemmas(word)
related_words_dict = {"n" : set(), "a" : set(), "v" : set(), "r" : set()}
related_words_dict = {"n": set(), "a": set(), "v": set(), "r": set()}
for lemma in related_lemmas:
pos = lemma.synset().pos()
if pos == "s":
Expand All @@ -97,11 +110,11 @@ def get_word_forms(word):
# TODO: This will add the plural of eg "politics", which according to inflect is "politicss"
for noun in related_words_dict["n"].copy():
related_words_dict["n"].add(inflect.engine().plural_noun(noun))

for verb in related_words_dict["v"].copy():
if verb in CONJUGATED_VERB_DICT:
related_words_dict["v"] |= CONJUGATED_VERB_DICT[verb].verbs

for adjective in related_words_dict["a"].copy():
if adjective in ADJECTIVE_TO_ADVERB:
related_words_dict["r"].add(ADJECTIVE_TO_ADVERB[adjective])
Expand Down

0 comments on commit 7f6d5d2

Please sign in to comment.