In [2]:
import numpy as np
import nltk
from nltk.corpus import semcor
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import math
nltk.download("semcor")
nltk.download("universal_tagset")
nltk.download("punkt")
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
# corpus = semcor._items(None, "token", True, True, True)

[nltk_data] Downloading package semcor to /root/nltk_data...
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [3]:
import gensim
import gensim.downloader
from gensim.models import Word2Vec, KeyedVectors

In [4]:
google_wv = gensim.downloader.load('word2vec-google-news-300')



In [5]:
wv_mean = 0
wv_std = 1
dim_wv = 300

In [6]:
np.set_printoptions(linewidth = 200)

In [7]:
corpus = semcor.tagged_sents(tag = 'both')

In [8]:
corpus[0]

[Tree('DT', ['The']),
 Tree(Lemma('group.n.01.group'), [Tree('NE', [Tree('NNP', ['Fulton', 'County', 'Grand', 'Jury'])])]),
 Tree(Lemma('state.v.01.say'), [Tree('VB', ['said'])]),
 Tree(Lemma('friday.n.01.Friday'), [Tree('NN', ['Friday'])]),
 Tree('DT', ['an']),
 Tree(Lemma('probe.n.01.investigation'), [Tree('NN', ['investigation'])]),
 Tree('IN', ['of']),
 Tree(Lemma('atlanta.n.01.Atlanta'), [Tree('NN', ['Atlanta'])]),
 Tree('POS', ["'s"]),
 Tree(Lemma('late.s.03.recent'), [Tree('JJ', ['recent'])]),
 Tree(Lemma('primary.n.01.primary_election'), [Tree('NN', ['primary', 'election'])]),
 Tree(Lemma('produce.v.04.produce'), [Tree('VB', ['produced'])]),
 Tree(None, ['``']),
 Tree('DT', ['no']),
 Tree(Lemma('evidence.n.01.evidence'), [Tree('NN', ['evidence'])]),
 Tree(None, ["''"]),
 Tree('IN', ['that']),
 Tree('DT', ['any']),
 Tree(Lemma('abnormality.n.04.irregularity'), [Tree('NN', ['irregularities'])]),
 Tree(Lemma('happen.v.01.take_place'), [Tree('VB', ['took', 'place'])]),
 Tree(None, 

In [10]:
lemmatizer = WordNetLemmatizer()
map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}

def preprocess_from_semcor(sentence):
  """
    input: a list ( an element of semcor.tagged_sents(tag='both') )
    output: a list of (word, pos_tag, synset) triplets. synset may be None. omit words (which would be punctuations) whose pos_tag is None.
  """
  context = []
  for word in sentence:
    
    lemma = word.label()
    
    if type(lemma) != nltk.corpus.reader.wordnet.Lemma: # meaning word.label() is either None or a string (which may be pos_tag or synset name)
      if lemma == None:
        continue
      if ('.' not in lemma):                            # label is pos tag
        pos = lemma
        if pos == 'NE':                                 # sometimes a named entity tree with no labelled synset/lemma is present. This is just to catch and ignore that case, NE is obviously not a pos tag
          continue
        ww = word[0].lower()
        context.append((ww,pos,'None'))
        continue
      else:                                             # label is synset name
        lemma_available = False
    else:
      lemma_available = True
    
    tree = word[0]
    if tree.label() == 'NE':
      tree = tree[0]
    pos = tree.label()
    
    if lemma_available:
      ww = lemma.name().lower()
      synset = lemma.synset().name()
    else:
      ww = lemmatizer.lemmatize(tree[0], map.get(pos[0], wordnet.NOUN)).lower()
      synset = lemma
    
    context.append((ww,pos,synset))

  return context

preprocess_from_semcor(semcor.tagged_sents(tag = 'both')[0])

[('the', 'DT', 'None'),
 ('group', 'NNP', 'group.n.01'),
 ('say', 'VB', 'state.v.01'),
 ('friday', 'NN', 'friday.n.01'),
 ('an', 'DT', 'None'),
 ('investigation', 'NN', 'probe.n.01'),
 ('of', 'IN', 'None'),
 ('atlanta', 'NN', 'atlanta.n.01'),
 ("'s", 'POS', 'None'),
 ('recent', 'JJ', 'late.s.03'),
 ('primary_election', 'NN', 'primary.n.01'),
 ('produce', 'VB', 'produce.v.04'),
 ('no', 'DT', 'None'),
 ('evidence', 'NN', 'evidence.n.01'),
 ('that', 'IN', 'None'),
 ('any', 'DT', 'None'),
 ('irregularity', 'NN', 'abnormality.n.04'),
 ('take_place', 'VB', 'happen.v.01')]

In [11]:
lemmatizer = WordNetLemmatizer()
map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}

def preprocess(sentence, no_pos = False):
  """
    input: sentence as a string
    output: list of (lemmatized_word, pos) or (lemmatized_word) according to the bool argument no_pos
  """
  tagged_sent = nltk.pos_tag(word_tokenize(sentence))
  lemmatized_sent = [(lemmatizer.lemmatize(word, map.get(pos[0], wordnet.NOUN)).lower(), pos) for word, pos in tagged_sent]
  if no_pos:
    return [w for (w,p) in lemmatized_sent]
  else:
    return lemmatized_sent

preprocess("The authorities said an investigation produced no evidence")

[('the', 'DT'),
 ('authority', 'NNS'),
 ('say', 'VBD'),
 ('an', 'DT'),
 ('investigation', 'NN'),
 ('produce', 'VBD'),
 ('no', 'DT'),
 ('evidence', 'NN')]

In [12]:
# for MFS

word_to_synset_freq = dict()
for i in range(10000):
  sentence = corpus[i]
  for (w,p,s) in preprocess_from_semcor(sentence):
    if s == None:
      continue
    if w not in word_to_synset_freq:
      word_to_synset_freq[w] = dict()
    if s not in word_to_synset_freq[w]:
      word_to_synset_freq[w][s] = 1
    else:
      word_to_synset_freq[w][s] += 1

# word_to_synset_freq

In [13]:
def extended_lesk(synsets, context):
  (maxsofar, best_ss) = (0, None)
  for ss in synsets:
    if ss.pos() != 'n':
      continue
    defn = preprocess(ss.definition(), no_pos = True)
    for hypernym in ss.hypernyms():
      defn += preprocess(hypernym.definition(), no_pos = True)
    for hyponym in ss.hyponyms():
      defn += preprocess(hyponym.definition(), no_pos = True)
    
    score = len(context.intersection(defn))
    if score > maxsofar:
      (maxsofar, best_ss) = (score, ss)
  
  if not best_ss: # no synset had any intersection with context
    best_ss = synsets[0]  #wfs
  
  return best_ss

In [14]:
def pagerank(nouns, use_wv = False):
  start, end = 0, 0
  num_synsets = []
  synset_lists = []
  idx = []
  for noun in nouns:
    _synsets = wordnet.synsets(noun)
    _synsets = [ss for ss in _synsets if ss.pos() == 'n']
    num_synsets.append(len(_synsets))
    synset_lists.append(_synsets)
    end = start + len(_synsets)
    idx.append((start,end))
    start = end
  W = np.zeros((end,end))

  index_order = np.argsort(num_synsets)
  for ii in range(len(index_order)-1):
    ss_list1 = synset_lists[index_order[ii]]
    start1, end1 = idx[index_order[ii]]
    ss_list2 = synset_lists[index_order[ii+1]]
    start2, end2 = idx[index_order[ii+1]]
    for i in range(len(ss_list1)):
      for j in range(len(ss_list2)):
        defn_i = preprocess(ss_list1[i].definition(), no_pos = True)
        defn_j = preprocess(ss_list2[j].definition(), no_pos = True)
        if not use_wv:  # lesk based similarity
          val = len(set(defn_j).intersection(defn_i))
          W[start1+i][start2+j] = val
          # val = len(set(defn_i).intersection(defn_j))
          # W[start2+j][start1+i] = val
        else:
          wv_i = np.zeros((len(defn_i), dim_wv))
          wv_j = np.zeros((len(defn_j), dim_wv))
          for _index in range(len(defn_i)):
            word = defn_i[_index]
            if word in google_wv:
              wv_i[_index] = google_wv[word]
            else:
              wv_i[_index] = np.random.normal(wv_mean,wv_std,dim_wv)
          for _index in range(len(defn_j)):
            word = defn_j[_index]
            if word in google_wv:
              wv_j[_index] = google_wv[word]
            else:
              wv_j[_index] = np.random.normal(wv_mean,wv_std,dim_wv)
          wv_i = wv_i.mean(axis = 0)
          wv_j = wv_j.mean(axis = 0)
          val = np.dot(wv_i,wv_j)/(np.linalg.norm(wv_i)*np.linalg.norm(wv_j))
          # W[start1+i][start2+j] = val
          W[start2+j][start1+i] = val

  
  col_sum = W.sum(axis = 0, keepdims = True)
  col_sum[col_sum == 0] = 1 # instead of dividing by zero, we leave it as is
  W = W/col_sum
  d = 0.85
  I = np.eye(end)
  R = np.linalg.inv(I - d*W)@(np.ones((end,1))*(1-d)/end)
  result = []
  for i in range(len(nouns)):
    noun = nouns[i]
    start,end = idx[i]
    if start == end:  # no synsets for the noun were available
      best_ss = 'None'
    else:
      best_ss = wordnet.synsets(noun)[np.argmax(R[start:end])]
      best_ss = best_ss.name()
    result.append(best_ss)
  return result

In [15]:
import pandas as pd

In [16]:
def tag(sentence):
  sentence = preprocess(sentence)
  
  MFS = []
  WFS = []
  ExtendedLesk = []

  nouns = []
  
  for (word,pos) in sentence:
    if pos[0] != 'N': # only do nouns
      continue
    
    nouns.append(word)  # for page rank

    mfs = 'None'
    if word in word_to_synset_freq:
      ss_freq = word_to_synset_freq[word]
      (maxsofar, best_ss) = (0, 'None')
      for ss in ss_freq:
        if ss_freq[ss] > maxsofar:
          maxsofar = ss_freq[ss]
          best_ss = ss
      mfs = best_ss
    MFS.append(mfs)
    

    synsets = wordnet.synsets(word)
    if len(synsets) == 0:
      WFS.append('None')
      ExtendedLesk.append('None')
      continue

    wfs = synsets[0]
    context = set([w for (w,p) in sentence if w != word])
    ex_lesk = extended_lesk(synsets, context)

    WFS.append(wfs.name())
    ExtendedLesk.append(ex_lesk.name())
  
  PageRank = pagerank(nouns, use_wv = False)

  return nouns, MFS, WFS, ExtendedLesk, PageRank

tagged_result = tag("The authorities said an investigation produced no evidence")
pd.DataFrame(tagged_result, index=['Noun', 'MFS', 'WFS', 'Ex-Lesk', 'Page-Rank'])

Unnamed: 0,0,1,2
Noun,authority,investigation,evidence
MFS,authority.n.01,probe.n.01,evidence.n.01
WFS,authority.n.01,probe.n.01,evidence.n.01
Ex-Lesk,agency.n.01,probe.n.01,evidence.n.03
Page-Rank,authority.n.01,probe.n.01,evidence.n.03


In [18]:
# The authorities said an investigation produced no evidence
# wordnet.synset('authority.n.01').definition()
for ss in tagged_result[3]:
  print(ss, wordnet.synset(ss).definition())

agency.n.01 an administrative unit of government
probe.n.01 an inquiry into unfamiliar or questionable activities
evidence.n.03 (law) all the means by which any alleged matter of fact whose truth is investigated at judicial trial is established or disproved


In [22]:
def tag_semcor(sentence):
  sentence = preprocess_from_semcor(sentence)
  
  MFS = []
  WFS = []
  ExtendedLesk = []
  correct_lesk = 0
  correct_pgrank = 0
  correct_wfs = 0
  correct_mfs = 0
  total = 0
  
  nouns = []
  true_ss = []

  for (word,pos, true_synset) in sentence:
    if pos[0] != 'N': # only do nouns
      continue
    
    total += 1

    mfs = 'None'
    if word in word_to_synset_freq:
      ss_freq = word_to_synset_freq[word]
      (maxsofar, best_ss) = (0, 'None')
      for ss in ss_freq:
        if ss_freq[ss] > maxsofar:
          maxsofar = ss_freq[ss]
          best_ss = ss
      mfs = best_ss
    MFS.append(mfs)
    correct_mfs += (mfs == true_synset)
    

    nouns.append(word)  # for page rank
    true_ss.append(true_synset)

    synsets = wordnet.synsets(word)
    if len(synsets) == 0:
      WFS.append('None')
      ExtendedLesk.append('None')
      continue

    wfs = synsets[0]
    context = set([w for (w,p,s) in sentence if w != word])
    ex_lesk = extended_lesk(synsets, context)

    WFS.append(wfs.name())
    ExtendedLesk.append(ex_lesk.name())
    correct_lesk += (true_synset == ex_lesk.name())
    correct_wfs += (true_synset == wfs.name())

  PageRank = pagerank(nouns, use_wv = True)
  for pgrank,true_synset in zip(PageRank, true_ss):
    correct_pgrank += (true_synset == pgrank)
  # print(correct_lesk,correct_wfs,total)
  return true_ss, MFS, WFS, ExtendedLesk, PageRank, (total, correct_mfs, correct_wfs, correct_lesk, correct_pgrank)

_ = tag_semcor(semcor.tagged_sents(tag = 'both')[1])
__ = list(zip(_[0], _[1], _[2], _[3], _[4]))
__.append(_[5])
pd.DataFrame(__, columns=['true', 'MFS', 'WFS', 'Ex-Lesk', 'Page-Rank'])


Unnamed: 0,true,MFS,WFS,Ex-Lesk,Page-Rank
0,jury.n.01,jury.n.01,jury.n.01,jury.n.01,jury.n.01
1,term.n.02,term.n.02,term.n.01,term.n.05,term.n.01
2,end.n.02,end.n.02,end.n.01,goal.n.01,end.n.02
3,presentment.n.01,presentment.n.01,presentment.n.01,presentment.n.01,presentment.n.01
4,group.n.01,group.n.01,group.n.01,group.n.01,group.n.03
5,mission.n.03,charge.n.02,charge.n.01,charge.n.03,charge.n.03
6,election.n.01,election.n.01,election.n.01,election.n.02,election.n.03
7,praise.n.01,praise.n.01,praise.n.01,praise.n.01,praise.n.02
8,thanks.n.01,thanks.n.01,thanks.n.01,thanks.n.01,thanks.n.01
9,location.n.01,location.n.01,location.n.01,location.n.01,location.n.04


In [20]:
import sklearn

In [25]:
total, mfs, wfs, lesk, pgrank = 0,0,0,0,0

true_list = []
mfs_list = []
wfs_list = []
lesk_list = []
pgrank_list = []
for i in range(10000,15000):
  result = tag_semcor(corpus[i])
  (t, m, w, l, p) = result[5]
  total += t
  mfs += m
  wfs += w
  lesk += l
  pgrank += p
  true_list += result[0]
  mfs_list += result[1]
  wfs_list += result[2]
  lesk_list += result[3]
  pgrank_list += result[4]
  if i%1000 == 0:
    print(i)
# print(total, mfs/total, wfs/total, lesk/total, pgrank/total)

_ = list()
_.append(sklearn.metrics.precision_recall_fscore_support(true_list, mfs_list, average = 'weighted', zero_division = 0))
_.append(sklearn.metrics.precision_recall_fscore_support(true_list, wfs_list, average = 'weighted', zero_division = 0))
_.append(sklearn.metrics.precision_recall_fscore_support(true_list, lesk_list, average = 'weighted', zero_division = 0))
_.append(sklearn.metrics.precision_recall_fscore_support(true_list, pgrank_list, average = 'weighted', zero_division = 0) )

df = pd.DataFrame(_, index=['MFS', 'WFS', 'Ex-Lesk', 'Page-Rank'], columns = ['precision', 'recall', 'fscore', 'Accuracy'])
df['Accuracy'] = [mfs/total, wfs/total, lesk/total, pgrank/total]
df

10000
11000
12000
13000
14000


Unnamed: 0,precision,recall,fscore,Accuracy
MFS,0.472077,0.535379,0.478419,0.535379
WFS,0.705983,0.760303,0.711647,0.748562
Ex-Lesk,0.676971,0.619534,0.614263,0.607793
Page-Rank,0.657321,0.434915,0.477281,0.434915


In [31]:
Labels = list(set(true_list).union(set(lesk_list)).union(set(pgrank_list)))

In [33]:
len(Labels)

10466

In [34]:
cm_lesk = sklearn.metrics.confusion_matrix(true_list, lesk_list, labels = Labels)
cm_pgrank = sklearn.metrics.confusion_matrix(true_list, pgrank_list, labels = Labels)

In [65]:
np.fill_diagonal(cm_lesk, 0)
np.fill_diagonal(cm_pgrank, 0)

In [75]:
conf_lesk = np.where(cm_lesk>25)
conf_pgrank = np.where(cm_pgrank>70)

In [76]:
for a,b in zip(conf_lesk[0], conf_lesk[1]):
  print(Labels[a], Labels[b])

day.n.01 day.n.03
united_states.n.01 united_states_government.n.01
time.n.01 time.n.05
time.n.03 time.n.05
None one.n.01
None fluorine.n.01
act.n.01 act.n.02


In [77]:
for a,b in zip(conf_pgrank[0], conf_pgrank[1]):
  print(Labels[a], Labels[b])

person.n.01 person.n.03
person.n.01 person.n.02
group.n.01 group.n.03
group.n.01 group.n.02
location.n.01 location.n.04


In [21]:
[(ss,ss.definition()) for ss in wordnet.synsets("bank")]

[(Synset('bank.n.01'),
  'sloping land (especially the slope beside a body of water)'),
 (Synset('depository_financial_institution.n.01'),
  'a financial institution that accepts deposits and channels the money into lending activities'),
 (Synset('bank.n.03'), 'a long ridge or pile'),
 (Synset('bank.n.04'),
  'an arrangement of similar objects in a row or in tiers'),
 (Synset('bank.n.05'),
  'a supply or stock held in reserve for future use (especially in emergencies)'),
 (Synset('bank.n.06'),
  'the funds held by a gambling house or the dealer in some gambling games'),
 (Synset('bank.n.07'),
  'a slope in the turn of a road or track; the outside is higher than the inside in order to reduce the effects of centrifugal force'),
 (Synset('savings_bank.n.02'),
  'a container (usually with a slot in the top) for keeping money at home'),
 (Synset('bank.n.09'),
  'a building in which the business of banking transacted'),
 (Synset('bank.n.10'),
  'a flight maneuver; aircraft tips laterally abo