In [None]:
# https://www.inf.ed.ac.uk/teaching/courses/fnlp/Tutorials/7_WSD/wsd_code.py

In [13]:
from __future__ import division
import nltk
import random
from nltk.corpus import senseval
from nltk.classify import accuracy, NaiveBayesClassifier, MaxentClassifier
from collections import defaultdict

The following shows how the senseval corpus consists of instances, where each instance
consists of a target word (and its tag), its position in the sentence it appeared in
within the corpus (that position being word position, minus punctuation), and the context,
which is the words in the sentence plus their tags. 

In [14]:
# List the files that make up the senseval corpus (stored under the
# corpora/senseval folder of the local nltk_data directory).
senseval.fileids()

['hard.pos', 'interest.pos', 'line.pos', 'serve.pos']

In [17]:
# Inspect one instance of "hard": target word, its position in the context,
# the tagged context tokens, and the sense label(s).
senseval.instances("hard.pos")[1000]

SensevalInstance(word='hard-a', position=3, context=[('he', 'PRP'), ('knows', 'VBZ'), ('how', 'WRB'), ('hard', 'JJ'), ('it', 'PRP'), ('can', 'MD'), ('be', 'VB'), (',', ','), ('having', 'VBG'), ('endured', 'VBN'), ('the', 'DT'), ('experience', 'NN'), ('at', 'IN'), ('roughly', 'RB'), ('the', 'DT'), ('same', 'JJ'), ('age', 'NN'), ('.', '.')], senses=('HARD1',))

In [18]:
# 1. Data prep
# Senseval file to work with (one file per ambiguous target word).
word = "hard.pos"

# Create the cache only if it does not exist yet, so that re-running this cell
# reuses already-loaded instances instead of wiping the cache every time.
if "_inst_cache" not in globals():
    _inst_cache = {}

# Load (instance, sense) pairs once per word; only the first sense of each
# instance is used as its label.
if word not in _inst_cache:
    _inst_cache[word] = [(inst, inst.senses[0]) for inst in senseval.instances(word)]

# Work on a shallow copy so later cells never modify the cached list itself.
all_instances_senses = _inst_cache[word][:]

# Distinct sense labels, sorted so the order is the same on every run
# (plain set iteration order is not stable between kernel sessions).
senses = sorted(set(label for (_inst, label) in all_instances_senses))

# The instances alone, in corpus order.
instances = [inst for (inst, _label) in all_instances_senses]

In [19]:
# _inst_cache['hard.pos']
# [(ins1, sense),(ins1, sense), (ins1, sense),... ]

In [27]:
# all_instances_senses = _inst_cache[word][:]
# all_instances_senses[0]

In [28]:
# senses = list(set(l for (i, l) in all_instances_senses))
# instances = [i for (i, l) in all_instances_senses]

In [19]:
# Distinct sense labels found among the loaded instances.
senses

['HARD1', 'HARD3', 'HARD2']

In [21]:
# instances

# 1. Create vocab

In [23]:
# Tokens that are ignored when building the context vocabulary: punctuation,
# a few number fragments, and common function words.
# NOTE: the entries '10' and 'I' used to be written as `'10,' 'I'` (comma inside
# the quotes), which Python silently concatenated into the single useless
# entry '10,I'.
stopwords = ['.', ',', '?', '"', '``', "''", "'", '--', '-', ':', ';', '(',
             ')', '$', '000', '1', '2', '10', 'I', 'i', 'a', 'about', 'after', 'all', 'also', 'an', 'any',
             'are', 'as', 'at', 'and', 'be', 'being', 'because', 'been', 'but', 'by',
             'can', "'d", 'did', 'do', "don'", 'don', 'for', 'from', 'had','has', 'have', 'he',
             'her','him', 'his', 'how', 'if', 'is', 'in', 'it', 'its', "'ll", "'m", 'me',
             'more', 'my', 'n', 'no', 'not', 'of', 'on', 'one', 'or', "'re", "'s", "s",
             'said', 'say', 'says', 'she', 'so', 'some', 'such', "'t", 'than', 'that', 'the',
             'them', 'they', 'their', 'there', 'this', 'to', 'up', 'us', "'ve", 'was', 'we', 'were',
             'what', 'when', 'where', 'which', 'who', 'will', 'with', 'years', 'you',
             'your']

# Set version for O(1) membership tests and set arithmetic.
STOPWORDS_SET = set(stopwords)

# Pass this instead of the stopword list to disable stopword filtering.
NO_STOPWORDS = []

In [25]:
# Look at the first loaded instance.
instances[0]

SensevalInstance(word='hard-a', position=20, context=[('``', '``'), ('he', 'PRP'), ('may', 'MD'), ('lose', 'VB'), ('all', 'DT'), ('popular', 'JJ'), ('support', 'NN'), (',', ','), ('but', 'CC'), ('someone', 'NN'), ('has', 'VBZ'), ('to', 'TO'), ('kill', 'VB'), ('him', 'PRP'), ('to', 'TO'), ('defeat', 'VB'), ('him', 'PRP'), ('and', 'CC'), ('that', 'DT'), ("'s", 'VBZ'), ('hard', 'JJ'), ('to', 'TO'), ('do', 'VB'), ('.', '.'), ("''", "''")], senses=('HARD1',))

In [31]:
def extract_vocab_frequency(instances, stopwords=STOPWORDS_SET, n=300):
    """
    Return the n most frequent context words of the given senseval instances.

    For every instance, each distinct word of its context (the sentence the
    target word appears in) is counted once, so the count of a word is the
    number of instances whose context contains it. The target word itself and
    every word in `stopwords` are ignored.

    instances -- iterable of senseval instances (need `.word` and `.context`)
    stopwords -- collection of words to exclude from the counts
    n         -- maximum number of entries to return

    Returns a list of at most n (word, count) pairs, most frequent first.
    """
    # Build the exclusion set once instead of once per instance.
    excluded = set(stopwords)
    fd = nltk.FreqDist()
    for inst in instances:
        # inst.word looks like 'hard-a'; drop only the trailing '-<tag>' suffix
        # so that a target containing other hyphens (or none) does not crash.
        target = inst.word.rsplit('-', 1)[0]
        context_words = {c[0] for c in inst.context if c[0] != target}
        for word in context_words - excluded:
            fd[word] += 1

    # most_common(n) yields exactly the n top entries (the previous slice
    # [:n+1] returned one entry too many).
    return fd.most_common(n)

In [34]:
# 2. Create vocab: frequency-ranked (word, count) pairs for n=300,
# then keep only the words themselves.
fd = extract_vocab_frequency(instances, stopwords, n=300)
vocab = [entry[0] for entry in fd]

In [36]:
# vocab

# create train and test set

In [37]:
# 3. Split the instances into a training set (80%) and a test set (20%).
# The shuffle is applied to a copy: shuffling all_instances_senses in place
# would make this cell non-idempotent, because re-running it would reshuffle
# already-shuffled data and give a different split despite the fixed seed.
n = len(all_instances_senses)
random.seed(5444522)
shuffled_instances_senses = all_instances_senses[:]
random.shuffle(shuffled_instances_senses)

split_at = int(0.8 * n)
training_data = shuffled_instances_senses[:split_at]
test_data = shuffled_instances_senses[split_at:]

In [38]:
# Sanity check: size of the training set, then size of the test set.
print(len(training_data))
print(len(test_data))

3466
867


In [58]:
# Each training item is an (instance, sense label) pair.
training_data[0]

(SensevalInstance(word='hard-a', position=6, context=[('a', 'DT'), ('call', 'NN'), ('for', 'IN'), ('reforms', 'NNS'), (';', ':'), ('but', 'CC'), ('hard', 'JJ'), ('work', 'NN'), ('does', 'VBZ'), ('n', 'NN'), ("'t", 'NN'), ('always', 'RB'), ('mean', 'VB'), ('pay', 'NN'), (',', ','), ('say', 'VB'), ('activists', 'NNS'), ('.', '.')], senses=('HARD2',)),
 'HARD2')

# 2. Extract features

In [39]:
# 3. Extract features
def wsd_context_features(instance, dist=3):
    """
    Build the feature dict for one instance from the words around its target.

    Up to `dist` context words on each side of the target position produce a
    boolean feature named 'left-context-word-<k>(<word>)' or
    'right-context-word-<k>(<word>)', where <k> is the distance from the
    target (1 = adjacent). Two further features hold the instance's word
    ('word') and the tag found at the target position ('pos').

    instance -- object with `.position`, `.context` and `.word`
    dist     -- number of context words to use on each side
    """
    center = instance.position
    tokens = instance.context

    # Clip the window to the boundaries of the sentence.
    first_left = max(0, center - dist)
    past_right = min(center + dist + 1, len(tokens))

    # Left neighbours, farthest first.
    features = {
        'left-context-word-%s(%s)' % (center - k, tokens[k][0]): True
        for k in range(first_left, center)
    }
    # Right neighbours, nearest first.
    features.update({
        'right-context-word-%s(%s)' % (k - center, tokens[k][0]): True
        for k in range(center + 1, past_right)
    })

    features['word'] = instance.word
    features['pos'] = tokens[center][1]
    return features

In [40]:
# Turn every (instance, sense) training pair into a (feature dict, sense) pair.
train_features = [
    (wsd_context_features(inst, dist=3), sense)
    for inst, sense in training_data
]

In [41]:
# One feature/label pair per training instance.
len(train_features)

3466

In [42]:
# Inspect the last training example (index 3465 of the 3466 pairs).
train_features[3465]

({'left-context-word-3(that)': True,
  "left-context-word-2('s)": True,
  'left-context-word-1(pretty)': True,
  'right-context-word-1(to)': True,
  'right-context-word-2(believe)': True,
  'right-context-word-3(,)': True,
  'word': 'hard-a',
  'pos': 'JJ'},
 'HARD1')

# Train the model

In [43]:
# Fit a Naive Bayes classifier on the (feature dict, sense) training pairs.
classifier = NaiveBayesClassifier.train(train_features)

# Test the model

In [44]:
# Featurise the held-out pairs exactly like the training pairs (same dist).
test_features = [
    (wsd_context_features(inst, dist=3), sense)
    for inst, sense in test_data
]

In [45]:
# Score the trained classifier on the held-out (feature dict, sense) pairs.
acc = accuracy(classifier, test_features)

In [46]:
# Accuracy of the classifier on the test set.
acc

0.8973471741637832