In [1]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet
from nltk.wsd import lesk

# A bit of preprocessing 
def preprocess(text):
    mapping = {"NOUN": wordnet.NOUN, "VERB": wordnet.VERB, "ADJ": wordnet.ADJ, "ADV": wordnet.ADV}
    sw_list = ['on', 'the', 'of', 'a']
    
    lem = WordNetLemmatizer()
    # tokenize, if input is text
    tokens = nltk.word_tokenize(text) if type(text) is str else text
    # compute pos-tag
    tagged = nltk.pos_tag(tokens, tagset='universal') # nltk.pos_tag(, tagset="universal")
    # lowercase
    tagged = [(w.lower(), p) for w, p in tagged]
    # optional: remove all words that are not NOUN, VERB, ADJ, or ADV (i.e. no sense in WordNet)
    tagged = [(w, p) for w, p in tagged if p in mapping]
    # re-map tags to WordNet (return orignal if not in-mapping, if above is not used)
    tagged = [(w, mapping.get(p, p)) for w, p in tagged]
    # remove stopwords
    tagged = [(w, p) for w, p in tagged if w not in sw_list] #... not in stopword list]
    # lemmatize
    tagged = [(w, lem.lemmatize(w, pos=p), p) for w, p in tagged]
    # unique the list
    tagged = list(set(tagged))
    
    return tagged
def get_sense_definitions(context):
    # input is text or list of strings
    lemma_tags = preprocess(context)

    # let's get senses for each
    senses = [(w, wordnet.synsets(l, p)) for w, l, p in lemma_tags]

    # let's get their definitions
    definitions = []
    for raw_word, sense_list in senses:
        if len(sense_list) > 0:
            # let's tokenize, lowercase & remove stop words 
            def_list = []
            for s in sense_list:
                defn = s.definition()
                # let's use the same preprocessing
                tags = preprocess(defn)
                toks = [l for w, l, p in tags]
                def_list.append((s, toks))
            definitions.append((raw_word, def_list))
    return definitions
    
def get_top_sense(words, sense_list):
    # get top sense from the list of sense-definition tuples
    # assumes that words and definitions are preprocessed identically
    val, sense = max((len(set(words).intersection(set(defn))), ss) for ss, defn in sense_list)
    return val, sense

# Lesk simplified
def lesk_simplified(context_sentence, ambiguous_word, pos=None, synsets=None):

    context = set(context_sentence)
    
    if synsets is None:
        synsets = wordnet.synsets(ambiguous_word)
    # Filter by pos-tag
    if pos:
        synsets = [ss for ss in synsets if str(ss.pos()) == pos]

    if not synsets:
        return None
    
    _, sense = max((len(set(context).intersection(set(nltk.word_tokenize(ss.definition())))), ss) for ss in synsets)  # Don't forget to tokenize the definition
    
    return sense



[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\farih\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# getting pre-computed ic of the semcor corpus (large sense tagged corpus)
from nltk.corpus import wordnet_ic
nltk.download('wordnet_ic')
semcor_ic = wordnet_ic.ic('ic-semcor.dat')

def get_top_sense_sim(context_sense, sense_list, similarity):
    # get top sense from the list of sense-definition tuples
    # assumes that words and definitions are preprocessed identically
    scores = []
    for sense in sense_list:
        ss = sense[0]
        if similarity == "path":
            try:
                scores.append((context_sense.path_similarity(ss), ss))
            except:
                scores.append((0, ss))    
        elif similarity == "lch":
            try:
                scores.append((context_sense.lch_similarity(ss), ss))
            except:
                scores.append((0, ss))
        elif similarity == "wup":
            try:
                scores.append((context_sense.wup_similarity(ss), ss))
            except:
                scores.append((0, ss))
        elif similarity == "resnik":
            try:
                scores.append((context_sense.res_similarity(ss, semcor_ic), ss))
            except:
                scores.append((0, ss))
        elif similarity == "lin":
            try:
                scores.append((context_sense.lin_similarity(ss, semcor_ic), ss))
            except:
                scores.append((0, ss))
        elif similarity == "jiang":
            try:
                scores.append((context_sense.jcn_similarity(ss, semcor_ic), ss))
            except:
                scores.append((0, ss))
        else:
            print("Similarity metric not found")
            return None
    val, sense = max(scores)
    return val, sense


def lesk_similarity(context_sentence, ambiguous_word, similarity="resnik", pos=None, 
                    synsets=None, majority=True):
    context_senses = get_sense_definitions(set(context_sentence) - set([ambiguous_word]))
    
    if synsets is None:
        synsets = get_sense_definitions(ambiguous_word)[0][1]

    if pos:
        synsets = [ss for ss in synsets if str(ss[0].pos()) == pos]

    if not synsets:
        return None
    
    scores = []
    
    # Here you may have some room for improvement
    # For instance instead of using all the definitions from the context
    # you pick the most common one of each word (i.e. the first)
    for senses in context_senses:
        for sense in senses[1]:
            scores.append(get_top_sense_sim(sense[0], synsets, similarity))
            
    if len(scores) == 0:
        return synsets[0][0]
    
    if majority:
        filtered_scores = [x[1] for x in scores if x[0] != 0]
        if len(filtered_scores) > 0:
            best_sense = Counter(filtered_scores).most_common(1)[0][0]
        else:
            # Almost random selection
            best_sense = Counter([x[1] for x in scores]).most_common(1)[0][0]
    else:
        _, best_sense = max(scores)
    
    return best_sense

def original_lesk(context_sentence, ambiguous_word, pos=None, synsets=None, majority=False):

    context_senses = get_sense_definitions(set(context_sentence)-set([ambiguous_word]))
    if synsets is None:
        synsets = get_sense_definitions(ambiguous_word)[0][1]

    if pos:
        synsets = [ss for ss in synsets if str(ss[0].pos()) == pos]

    if not synsets:
        return None
    scores = []
    # print(synsets)
    for senses in context_senses:
        for sense in senses[1]:
            scores.append(get_top_sense(sense[1], synsets))
            
    if len(scores) == 0:
        return synsets[0][0]
    
    if majority:
        filtered_scores = [x[1] for x in scores if x[0] != 0]
        if len(filtered_scores) > 0:
            best_sense = Counter(filtered_scores).most_common(1)[0][0]
        else:
            # Almost random selection
            best_sense = Counter([x[1] for x in scores]).most_common(1)[0][0]
    else:
        _, best_sense = max(scores)
    return best_sense

[nltk_data] Downloading package wordnet_ic to
[nltk_data]     C:\Users\farih\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet_ic is already up-to-date!


In [3]:
from nltk.metrics.scores import precision, recall, f_measure, accuracy
nltk.download('senseval')
from nltk.corpus import senseval

# Let's create mapping from convenience
mapping = {
    'interest_1': 'interest.n.01',
    'interest_2': 'interest.n.03',
    'interest_3': 'pastime.n.01',
    'interest_4': 'sake.n.01',
    'interest_5': 'interest.n.05',
    'interest_6': 'interest.n.04',
}

refs = {k: set() for k in mapping.values()}
hyps = {k: set() for k in mapping.values()}
hyps2 = {k: set() for k in mapping.values()}
refs_list = []
hyps_list = []
hyps_list2 = []

# since WordNet defines more senses, let's restrict predictions

synsets = []
for ss in wordnet.synsets('interest', pos='n'):
    if ss.name() in mapping.values():
        # You need to preporecess the definitions
        # Give a look at the preprocessing function that we defined above 
        defn = ss.definition()
        # let's use the same preprocessing
        tags = preprocess(defn)
        toks = [l for w, l, p in tags]
        synsets.append((ss,toks))


[nltk_data] Downloading package senseval to
[nltk_data]     C:\Users\farih\AppData\Roaming\nltk_data...
[nltk_data]   Package senseval is already up-to-date!


In [4]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_validate
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold
from nltk.util import ngrams
import numpy as np

vectorizer = CountVectorizer()
classifier = MultinomialNB()
lblencoder = LabelEncoder()
data = [" ".join([t[0] for t in inst.context]) for inst in senseval.instances('interest.pos')]
lbls = [inst.senses[0] for inst in senseval.instances('interest.pos')]
stratified_split = StratifiedKFold(n_splits=5, shuffle=True)

vectors = vectorizer.fit_transform(data)
lblencoder.fit(lbls)
labels = lblencoder.transform(lbls)

def collocational_features(inst):
    p = inst.position
    return {
        "w-2_word": 'NULL' if p < 2 else inst.context[p-2][0],
        "w-1_word": 'NULL' if p < 1 else inst.context[p-1][0],
        "w+1_word": 'NULL' if len(inst.context) - 1 < p+1 else inst.context[p+1][0],
        "w+2_word": 'NULL' if len(inst.context) - 1 < p+2 else inst.context[p+2][0],
        "pos-tag for w-2_word": 'NULL' if p < 2 else nltk.pos_tag(inst.context[p-2], tagset="universal")[1][1],
        "pos-tag for w-1_word": 'NULL' if p < 1 else nltk.pos_tag(inst.context[p-1], tagset="universal")[1][1],
        "pos-tag for w+1_word": 'NULL' if len(inst.context) - 1 < p+1 else nltk.pos_tag(inst.context[p+1], tagset="universal")[1][1],
        "pos-tag for w+2_word": 'NULL' if len(inst.context) - 1 < p+2 else nltk.pos_tag(inst.context[p+2], tagset="universal")[1][1],
        "ngrams within w-2": 'NULL' if p < 2 else list(ngrams(inst.context[p-2], 2))[0],
        "ngrams within w-1": 'NULL' if p < 1 else list(ngrams(inst.context[p-1], 2))[0],
        "ngrams within w+1": 'NULL' if len(inst.context) - 1 < p+1 else list(ngrams(inst.context[p+1], 2))[0],
        "ngrams within w+2": 'NULL' if len(inst.context) - 1 < p+2 else list(ngrams(inst.context[p+2], 2))[0]
    }

data_col = [collocational_features(inst) for inst in senseval.instances('interest.pos')]
print(data_col[0])


dvectorizer = DictVectorizer(sparse=False)
dvectors = dvectorizer.fit_transform(data_col)

uvectors = np.concatenate((vectors.toarray(), dvectors), axis=1)
scores = cross_validate(classifier, uvectors, labels, cv=stratified_split, scoring=['f1_micro'])
print("\033[1mEvaluation score for Concatenated BAO and Extended Collocational Feature Vectors:\033[0m")
print("{:.3f}".format(sum(scores['test_f1_micro'])/len(scores['test_f1_micro'])))


for i, inst in enumerate(senseval.instances('interest.pos')):
    txt = [t[0] for t in inst.context]
    raw_ref = inst.senses[0] # let's get first sense
    hyp = original_lesk(txt, txt[inst.position], synsets=synsets, majority=True).name()
    hyp2 = lesk_similarity(txt, txt[inst.position], similarity="resnik", synsets=synsets, majority=True).name()
    ref = mapping.get(raw_ref)
    
    # for precision, recall, f-measure        
    refs[ref].add(i)
    hyps[hyp].add(i)
    hyps2[hyp2].add(i)
    
    
    # for accuracy
    refs_list.append(ref)
    hyps_list.append(hyp)
    hyps_list2.append(hyp2)

print("\033[1mFor Original Lesk:\033[0m Acc:", round(accuracy(refs_list, hyps_list), 3))
print("\033[1mFor Lesk Similarity:\033[0m Acc:", round(accuracy(refs_list, hyps_list2), 3))

for cls in hyps.keys():
    p = precision(refs[cls], hyps[cls])
    r = recall(refs[cls], hyps[cls])
    f = f_measure(refs[cls], hyps[cls], alpha=1)
print("\033[1mEvaluation score For Original Lesk:\033[0m{:15s}: p={:.3f}; r={:.3f}; f={:.3f}; s={}".format(cls, p, r, f, len(refs[cls])))

for cls in hyps2.keys():  
    p = precision(refs[cls], hyps2[cls])
    r = recall(refs[cls], hyps2[cls])
    f = f_measure(refs[cls], hyps2[cls], alpha=1)
print("\033[1mEvaluation score For Lesk Similarity:\033[0m{:15s}: p={:.3f}; r={:.3f}; f={:.3f}; s={}".format(cls, p, r, f, len(refs[cls])))

{'w-2_word': 'declines', 'w-1_word': 'in', 'w+1_word': 'rates', 'w+2_word': '.', 'pos-tag for w-2_word': 'NOUN', 'pos-tag for w-1_word': 'NOUN', 'pos-tag for w+1_word': 'VERB', 'pos-tag for w+2_word': '.', 'ngrams within w-2': ('declines', 'NNS'), 'ngrams within w-1': ('in', 'IN'), 'ngrams within w+1': ('rates', 'NNS'), 'ngrams within w+2': ('.', '.')}
[1mEvaluation score for Concatenated BAO and Extended Collocational Feature Vectors:[0m
0.906
[1mFor Original Lesk:[0m Acc: 0.046
[1mFor Lesk Similarity:[0m Acc: 0.065
[1mEvaluation score For Original Lesk:[0minterest.n.04  : p=0.989; r=0.070; f=0.989; s=1252
[1mEvaluation score For Lesk Similarity:[0minterest.n.04  : p=1.000; r=0.005; f=1.000; s=1252
