In [1]:
import spacy

from spacy_wordnet.wordnet_annotator import WordnetAnnotator 

# Load an spacy model (supported models are "es" and "en") 
nlp = spacy.load('../models/spacy/en_core_web_sm-2.3.0')

In [3]:
import nltk
nltk.data.path.append('../models/nltk')

In [None]:
nlp.add_pipe(WordnetAnnotator(nlp.lang), after='tagger')

In [36]:
token = nlp('prices')[0]

# wordnet object link spacy token with nltk wordnet interface by giving acces to
# synsets and lemmas 
synsets = token._.wordnet.synsets()
tlemmas = token._.wordnet.lemmas()
lemmas_for_synset = [lemma for s in synsets for lemma in s.lemma_names()]
print(lemmas_for_synset)

['monetary_value', 'price', 'cost', 'price', 'terms', 'damage', 'price', 'cost', 'toll', 'price', 'price', 'price', 'Price', 'Leontyne_Price', 'Mary_Leontyne_Price']


In [30]:
# And automatically tags with wordnet domains
t_domains = token._.wordnet.wordnet_domains()

In [59]:
economy_domains = ['finance', 'banking']
enriched_sentence = []
s = 'I want to withdraw 5,000 euros'
s = 'This Tesla car is awesome, but its price is high.'
s = 'Tell me somthing about Harry Potter, how about NBA'
s = 'economic tension between China and US'
sentence = nlp(s)

# For each token in the sentence
for token in sentence:
#     t_domains = token._.wordnet.wordnet_domains()
#     print(t_domains)
#     print('-----------------------------------------')
    # We get those synsets within the desired domains
    # synsets = token._.wordnet.wordnet_synsets_for_domain(economy_domains)
    synsets = token._.wordnet.synsets()
    if not synsets or token.is_stop or token.is_punct:
        enriched_sentence.append(token.text)
    else:
        lemmas_for_synset = [lemma for s in synsets for lemma in s.lemma_names()]
        lemmas_for_synset = [l.replace('_', ' ').replace('-', ' ') for l in lemmas_for_synset]
        lemmas_for_synset = [ps.stem(l) for l in lemmas_for_synset]
        print('lemmas_for_synset, ', token.text)
        print(list(set(lemmas_for_synset)))
        # If we found a synset in the economy domains
        # we get the variants and add them to the enriched sentence
        enriched_sentence.append('({})'.format('|'.join(set(lemmas_for_synset))))

# Let's see our enriched sentence
print(' '.join(enriched_sentence))

lemmas_for_synset,  economic
['econom']
lemmas_for_synset,  tension
['taut', 'stress', 'tension', 'latent hostil', 'tensiti', 'tens']
lemmas_for_synset,  China
['nationalist china', 'china', 'cathay', 'prc', 'chinawar', 'communist china', 'republic of china', 'mainland china', "people's republic of china", 'taiwan', 'red china']
(econom) (taut|stress|tension|latent hostil|tensiti|tens) between (nationalist china|china|cathay|prc|chinawar|communist china|republic of china|mainland china|people's republic of china|taiwan|red china) and US


In [55]:
enriched_sentence

['(economical|economic)',
 '(tautness|latent_hostility|stress|tensity|tension|tenseness)',
 'between',
 "(Taiwan|Republic_of_China|Cathay|china|China|Communist_China|Red_China|mainland_China|PRC|chinaware|Nationalist_China|People's_Republic_of_China)",
 'and',
 'US']

In [6]:
# experiment with stemmer
from nltk.stem import PorterStemmer

ps = PorterStemmer()

In [58]:
example_words = ["python","pythoner","Pythoning","pythoned","pythonly", 'Repulic of China']
for w in example_words:
    print(ps.stem(w))

python
python
python
python
pythonli
repulic of china


In [10]:
new_text = "It is important to by very pythonly while you are pythoning with python. All pythoners have pythoned poorly at least once."
new_doc = nlp(new_text)

for w in new_doc:
    print(ps.stem(w.text))

It
is
import
to
by
veri
pythonli
while
you
are
python
with
python
.
all
python
have
python
poorli
at
least
onc
.


In [11]:
# inspect index
from pyserini import analysis, index

index_reader = index.IndexReader('../indexes/msmarco-doc/')

In [22]:
import itertools
for term in itertools.islice(index_reader.terms(), 21100, 21120):
    print(f'{term.term} (df={term.df}, cf={term.cf})')

0.009sum (df=1, cf=1)
0.009svv (df=1, cf=1)
0.009swva (df=1, cf=1)
0.009ten (df=1, cf=1)
0.009the (df=1, cf=1)
0.009threonin (df=1, cf=1)
0.009w (df=1, cf=1)
0.009wax (df=1, cf=1)
0.009wife (df=1, cf=1)
0.009wilcoxon (df=1, cf=1)
0.009ωd (df=1, cf=1)
0.00;0.0 (df=1, cf=1)
0.00;0.00 (df=1, cf=1)
0.00;0.005 (df=1, cf=1)
0.00;347348 (df=1, cf=1)
0.00_ (df=15, cf=43)
0.00a (df=21, cf=29)
0.00ab (df=1, cf=1)
0.00abbybank (df=1, cf=1)
0.00abec (df=1, cf=1)


In [18]:
term = 'cities'

# Look up its document frequency (df) and collection frequency (cf).
# Note, we use the unanalyzed form:
df, cf = index_reader.get_term_counts(term)
print(f'term "{term}": df={df}, cf={cf}')

term "cities": df=495994, cf=2431480


In [24]:
term = 'cities'

# Analyze the term.
analyzed = index_reader.analyze(term)
print(f'The analyzed form of "{term}" is "{analyzed[0]}"')

# Skip term analysis:
df, cf = index_reader.get_term_counts(analyzed[0], analyzer=None)
print(f'term "{term}": df={df}, cf={cf}')

The analyzed form of "cities" is "citi"
term "cities": df=495994, cf=2431480


In [26]:
# Fetch and traverse postings for an unanalyzed term:
postings_list = index_reader.get_postings_list(term)
print('how many postings', len(postings_list))

how many postings 495994


In [27]:
for posting in postings_list[0:10]:
    print(f'docid={posting.docid}, tf={posting.tf}, pos={posting.positions}')

docid=3, tf=1, pos=[355]
docid=24, tf=2, pos=[41, 92]
docid=32, tf=1, pos=[311]
docid=41, tf=1, pos=[306]
docid=61, tf=9, pos=[158, 998, 1267, 1329, 1901, 1911, 1948, 2527, 3328]
docid=62, tf=1, pos=[53]
docid=68, tf=1, pos=[717]
docid=72, tf=5, pos=[2734, 4210, 6924, 7774, 8632]
docid=78, tf=45, pos=[1514, 1781, 3912, 3925, 3962, 3971, 4011, 4412, 5756, 8704, 8951, 10307, 10427, 10448, 10761, 12936, 13500, 13865, 13969, 14005, 14730, 16796, 16808, 17710, 18385, 18428, 18434, 18488, 18501, 18515, 18533, 18669, 18674, 18680, 18726, 18898, 18903, 19082, 19099, 19105, 19203, 19735, 20316, 20604, 21634]
docid=79, tf=1, pos=[22099]


In [None]:
# test out the nn_wordnet module
import os,sys,inspect
current_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parent_dir = os.path.dirname(current_dir)
sys.path.insert(0, parent_dir)
from nn_wordnet import NNWordNet

nn = NNWordNet()
nn.expand_query_version_1('Chinese dream versus American dream')