In [144]:
import math
import os
import re
import string
import matplotlib.pyplot as plt
from collections import OrderedDict
import numpy as np
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

In [145]:
# Corpus directory, plus the suffixes that tell sense-definition files
# apart from labelled test files.
data_path = "wsd_data/"
def_ext, tst_ext = ".definition", ".test"

# NLTK's English stop-word list; printed sorted so it is easy to inspect.
stop_words = stopwords.words("english")
print(sorted(stop_words))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', 'her', 'here', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is', 'isn', "isn't", 'it', "it's", 'its', 'itself', 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she', "she's", 'should', "should've", 'shouldn', "shouldn't", 'so', 'some',

In [146]:
# Store all file names necessary.
# os.listdir() returns entries in arbitrary order, so the names are sorted
# to make the processing order (and therefore the printed results)
# reproducible across platforms and runs.
def_files = []
test_files = []
for file_name in sorted(os.listdir(data_path)):
    file_path = os.path.join(data_path, file_name)
    if file_name.endswith(def_ext):
        def_files.append(file_path)
    elif file_name.endswith(tst_ext):
        test_files.append(file_path)

In [155]:
def normalize_text(text):
    """Turn a raw piece of text into a list of normalized tokens.

    Steps, in order:
      1. lower-case the text and split it on whitespace;
      2. drop tokens that are in the module-level ``stop_words``;
      3. stem the surviving tokens with the English Snowball stemmer;
      4. replace ASCII punctuation with spaces, delete '·', '´' and digits;
      5. split on whitespace again.

    Parameters
    ----------
    text : str
        Text to normalize.

    Returns
    -------
    list of str
        The normalized tokens (possibly empty; duplicates are kept).
    """
    # Build the stemmer once per call instead of once per token.
    stemmer = SnowballStemmer("english")
    stemmed = [
        stemmer.stem(word)
        for word in text.lower().split()
        if word not in stop_words
    ]
    text = " ".join(stemmed)

    # NOTE(review): punctuation is stripped only after stop-word removal and
    # stemming, so a token such as "the," is not recognised as a stop word and
    # punctuated tokens reach the stemmer with the punctuation attached.
    # Reordering would change the reported accuracies, so the order is kept.
    # re.escape keeps ']', '\\', '^' and '-' from being read as class syntax.
    text = re.sub('[' + re.escape(string.punctuation) + ']', ' ', text)
    text = re.sub('[·´]', '', text)
    text = re.sub('[0-9]', '', text)

    return text.split()

In [156]:
# Use an OrderedDict to remember the order of the keys
# The senses of each term is currently in order of their usage in WordNet
# So vital to keep the order maintained in the dictionary, as this will be used
# while tie-breaking
#
# def_dict maps (word, sense) -> set of normalized words from that sense's
# definition text, with the word itself removed.
def_dict = OrderedDict()
for dfile in def_files:
    print("Reading",dfile)
    # Reset per file so stray text at the top of a file can never be attached
    # to the last definition of the previous file.
    last_key = None
    with open(dfile,encoding='utf8') as f:
        for line in f:
            if line.startswith("#DEFINITION"):
                # Header is parsed as "#DEFINITION <word>%<sense>".
                def_w, def_m = line.split()[1].split('%')[:2]
                last_key = (def_w, def_m)
                def_dict[last_key] = set()
            # Skip blank / whitespace-only lines and text before any header
            elif line.strip() and last_key is not None:
                norm_line = set(normalize_text(line))
                # The word itself shouldn't be in the set of words defining it.
                # discard() does not fail when the word is absent from this line.
                norm_line.discard(last_key[0])
                # Accumulate, so a definition spanning several lines is kept
                # in full rather than overwritten by its last line.
                def_dict[last_key] |= norm_line
print("Generated definitions! Number of definitions stored is",len(def_dict))

Reading wsd_data/bass.definition
Reading wsd_data/crane.definition
Reading wsd_data/motion.definition
Reading wsd_data/palm.definition
Reading wsd_data/plant.definition
Reading wsd_data/tank.definition
Generated definitions! Number of definitions stored is 12


In [163]:
def sorensen_similarity(set_x, set_y):
    """Sorensen-Dice coefficient of two sets: 2*|X & Y| / (|X| + |Y|).

    Parameters
    ----------
    set_x, set_y : set
        The two sets to compare.

    Returns
    -------
    float
        A value in [0, 1]. Returns 0.0 when both sets are empty (there is no
        overlap to measure) instead of raising ZeroDivisionError.
    """
    total_size = len(set_x) + len(set_y)
    if total_size == 0:
        return 0.0
    return (2 * len(set_x.intersection(set_y))) / total_size

In [174]:
def jaccard_similarity(set_x, set_y):
    """Jaccard index of two sets: |X & Y| / |X | Y|.

    Parameters
    ----------
    set_x, set_y : set
        The two sets to compare.

    Returns
    -------
    float
        A value in [0, 1]. Returns 0.0 when both sets are empty (the union is
        empty, so there is no overlap to measure) instead of raising
        ZeroDivisionError.
    """
    union_size = len(set_x.union(set_y))
    if union_size == 0:
        return 0.0
    return len(set_x.intersection(set_y)) / union_size

In [175]:
def predict_word_sense(test_text, definitions,similarity_measure="sorensen"):
    """Choose the sense whose definition is most similar to ``test_text``.

    Parameters
    ----------
    test_text : set
        Normalized context words of the test case; must not contain the
        word being disambiguated.
    definitions : mapping
        Maps (word, sense) keys to the set of words defining that sense.
        Iteration order matters: on equal scores the earliest key wins.
    similarity_measure : str
        "sorensen" selects sorensen_similarity; any other value selects
        jaccard_similarity.

    Returns
    -------
    tuple
        (predicted sense, best score). With no definitions the result is
        (None, -100000000).

    Raises
    ------
    AssertionError
        If the word of any definition key is present in ``test_text``.
    """
    best_sense = None
    best_score = -100000000
    for key, sense_words in definitions.items():
        target_word, sense = key[0], key[1]
        # The word being disambiguated must already be stripped from the context.
        assert target_word not in test_text
        measure = sorensen_similarity if similarity_measure == "sorensen" else jaccard_similarity
        current = measure(test_text, sense_words)
        # Strict '>' means the first definition seen keeps the lead on ties.
        if current > best_score:
            best_sense, best_score = sense, current
    return (best_sense, best_score)

In [176]:
# Time to read the test files
true_labels = [] # List of lists... Each inner list holds (term, true sense) tuples, one per test case
test_texts = [] # List of lists... Each inner list holds one set of normalized words per test case

for tfile in test_files:
    true_label = []
    test_text = []
    lab = None         # label of the case currently being read
    case_words = None  # words gathered for that case; None until the first #LABEL
    print("Reading",tfile)
    with open(tfile,encoding='utf8') as f:
        for line in f:
            if line.startswith("#LABEL"):
                # A new label closes the previous case. The case is stored even
                # when its text normalized to nothing, so that labels and texts
                # stay aligned index by index.
                if case_words is not None:
                    test_text.append(case_words)
                # Header is parsed as "#LABEL <word>%<sense>".
                parts = line.split()[1].split('%')
                lab = (parts[0], parts[1])
                true_label.append(lab)
                case_words = set()
            # Skip blank / whitespace-only lines and text before any label
            elif line.strip() and lab is not None:
                norm_line = set(normalize_text(line))
                # The word itself shouldn't be in the set of words.
                # discard() does not fail when this line lacks the word.
                norm_line.discard(lab[0])
                case_words |= norm_line
    # This is to get the body of the last test text, as we will miss it above
    if case_words is not None:
        test_text.append(case_words)
    assert len(test_text) == len(true_label), "labels and texts out of step"
    test_texts.append(test_text)
    true_labels.append(true_label)
    print("Number of cases",len(true_labels[-1]))

Reading wsd_data/bass.test
Number of cases 107
Reading wsd_data/crane.test
Number of cases 95
Reading wsd_data/motion.test
Number of cases 201
Reading wsd_data/palm.test
Number of cases 201
Reading wsd_data/plant.test
Number of cases 188
Reading wsd_data/tank.test
Number of cases 201


In [178]:
# Score every test file with both similarity measures and compare against a
# baseline that always answers with the first-listed sense of the word.
for i in range(len(true_labels)):
    true_label = true_labels[i]
    test_text = test_texts[i]
    print("*************Testing for",test_files[i],"*************")
    if not true_label:
        # No labelled cases in this file: nothing to score, and the accuracy
        # divisions below would otherwise divide by zero.
        continue
    # Word this file is about, taken from its first label.
    target_word = true_label[0][0]
    correct_predictions_sorensen = 0
    correct_predictions_jaccard = 0
    # Get the corresponding definitions and store in an abriged dictionary.
    # Compare against the word component only; a bare `in key` test would
    # also match a sense name that happens to equal the word.
    abriged_dict = OrderedDict()
    for key in def_dict:
        if key[0] == target_word:
            abriged_dict[key] = def_dict[key]
    # If we guess, we guess every class to be the one with the highest WordNet frequency
    # (the first key, given the definitions were stored in that order).
    guess = next(iter(abriged_dict))[1]
    guess_predictions = 0
    for j in range(len(true_label)):
        true_lab = true_label[j]
        text = test_text[j]

        pred_sorensen = predict_word_sense(text,abriged_dict,similarity_measure="sorensen")
        pred_jaccard = predict_word_sense(text,abriged_dict,similarity_measure="jaccard")
        if pred_sorensen[0] == true_lab[1]:
            correct_predictions_sorensen += 1
        if pred_jaccard[0] == true_lab[1]:
            correct_predictions_jaccard += 1
        if guess == true_lab[1]:
            guess_predictions += 1
    accuracy_sorensen = (correct_predictions_sorensen / len(true_label))
    accuracy_jaccard = (correct_predictions_jaccard / len(true_label))
    guess_accuracy = (guess_predictions / len(true_label))
    # Report with target_word rather than the leaked inner-loop variable.
    print("\nAccuracy achieved for word",target_word,"is",accuracy_sorensen*100,"%\n")
    print("\nAccuracy achieved for word",target_word,"using Jaccard similarity is",accuracy_jaccard*100,"%\n")
    print("\nBy just guessing accuracy achieved for word",target_word,"is",guess_accuracy*100,"%\n")

*************Testing for wsd_data/bass.test *************

Accuracy achieved for word bass is 75.70093457943925 %


Accuracy achieved for word bass using Jaccard similarity is 75.70093457943925 %


By just guessing accuracy achieved for word bass is 90.65420560747664 %

*************Testing for wsd_data/crane.test *************

Accuracy achieved for word crane is 80.0 %


Accuracy achieved for word crane using Jaccard similarity is 80.0 %


By just guessing accuracy achieved for word crane is 75.78947368421053 %

*************Testing for wsd_data/motion.test *************

Accuracy achieved for word motion is 66.66666666666666 %


Accuracy achieved for word motion using Jaccard similarity is 66.66666666666666 %


By just guessing accuracy achieved for word motion is 70.64676616915423 %

*************Testing for wsd_data/palm.test *************

Accuracy achieved for word palm is 78.1094527363184 %


Accuracy achieved for word palm using Jaccard similarity is 78.1094527363184 %


By ju

## Question 4. How this algorithm differs from the original algorithm
<p>The original Lesk algorithm uses the dictionary definitions (glosses) of the context words together with the glosses of each sense of the target word to decide which sense is meant. In other words, it compares the gloss of every candidate sense of the target word against the glosses of the surrounding context words and picks the sense with the greatest overlap.</p>
<p>The simplified Lesk algorithm implemented here instead treats each sense definition as a bag of words: it only takes into account which words appear in the dictionary or thesaurus definition of each sense of the target word. That set of words is then compared directly with the words of the test text using a suitable set-similarity measure (Sørensen or Jaccard), and the sense with the highest score is chosen. Hence the definitions of the context words are never consulted.
</p>