# Natural Language Processing

## Exercise Sheet 6

In [1]:
#imports for all exercises
import nltk
import re
import random
import matplotlib.pyplot as plt
import nltk.classify as classify
from string import ascii_lowercase
from itertools import cycle

from nltk.corpus import names
from nltk.corpus import senseval
from nltk.corpus import brown
from nltk.corpus import movie_reviews

### Exercise 1

Write a name gender classifier using the Names Corpus, the `apply_features` function, shuffling, and a test set of 500 instances. Use the following features:

a) first letter;  
b) last letter;  
c) last two letters;  
d) length;  
e) for each letter one feature, which is true if the name contains the letter.

Use the `NaiveBayesClassifier`, calculate the accuracy, and display the 10 most informative features.


In [2]:
def gender_features(name):
    feature_dict = {
        'word': name,
        'first_letter': name[0],
        'last_letter': name[-1:],
        'last_two_letters': name[-2:],
        'length': len(name)
    }
    
    for char in ascii_lowercase:
        feature_dict[char] = True if char in name else False
        
    return feature_dict

In [3]:
labeled_names = ([(name.lower(), 'male') for name in names.words('male.txt')] + [(name.lower(), 'female') for name in names.words('female.txt')])
random.shuffle(labeled_names)

train_set, test_set = classify.apply_features(gender_features, labeled_names[:-500]), \
                      classify.apply_features(gender_features, labeled_names[-500:])

In [4]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

#### Accuracy and informative features

In [5]:
print("Accuracy: ", classify.accuracy(classifier, test_set), end="\n\n")
classifier.show_most_informative_features(10)

Accuracy:  0.76

Most Informative Features
        last_two_letters = 'na'           female : male   =    167.1 : 1.0
        last_two_letters = 'la'           female : male   =     74.4 : 1.0
        last_two_letters = 'ia'           female : male   =     56.7 : 1.0
             last_letter = 'a'            female : male   =     38.4 : 1.0
        last_two_letters = 'sa'           female : male   =     34.3 : 1.0
             last_letter = 'k'              male : female =     31.6 : 1.0
        last_two_letters = 'rd'             male : female =     31.6 : 1.0
        last_two_letters = 'us'             male : female =     29.7 : 1.0
        last_two_letters = 'ra'           female : male   =     25.9 : 1.0
        last_two_letters = 'ta'           female : male   =     25.2 : 1.0


### Exercise 2

The Senseval 2 Corpus contains data intended to train word-sense disambiguation classifiers. Using this dataset, build a `NaiveBayesClassifier` that predicts the correct sense tag for a given instance for the word "hard":

In [6]:
instances = senseval.instances('hard.pos')
labeled_instances = [(inst, inst.senses) for inst in instances]

tags_set = set()
for instance in instances:
    for el in instance.context:
        tags_set.add(el[1])
        
instances[0]

SensevalInstance(word='hard-a', position=20, context=[('``', '``'), ('he', 'PRP'), ('may', 'MD'), ('lose', 'VB'), ('all', 'DT'), ('popular', 'JJ'), ('support', 'NN'), (',', ','), ('but', 'CC'), ('someone', 'NN'), ('has', 'VBZ'), ('to', 'TO'), ('kill', 'VB'), ('him', 'PRP'), ('to', 'TO'), ('defeat', 'VB'), ('him', 'PRP'), ('and', 'CC'), ('that', 'DT'), ("'s", 'VBZ'), ('hard', 'JJ'), ('to', 'TO'), ('do', 'VB'), ('.', '.'), ("''", "''")], senses=('HARD1',))

In [7]:
def count(tag, taglist):
    return sum([1 if t==tag else 0 for t in taglist])

def gen_senseval_features(instance):
    instance_tags = [c[1] for c in instance.context]
        
    feature_dict = {
        "position": instance.position,
        #"position_ratio": instance.position / len(instance.context),
        "context": " ".join(instance_tags)
    }
    
    if instance.position > 0:
        feature_dict["preceding"] = instance.context[instance.position - 1]
    else:
        feature_dict["preceding"] = None
    
    if instance.position < len(instance.context):
        feature_dict["succeeding"] = instance.context[instance.position + 1]
    else:
        feature_dict["succeeding"] = None
        
    """
    # Every possible tag count
    for tag in tags_set:
        feature_dict[tag] = count(tag, instance_tags)
    
    # Every possible tag True/False
    for tag in tags_set:
        feature_dict[tag] = True if tag in instance_tags else False
    """
    
    return feature_dict

In [8]:
def prepare_train_test(data, feature_func, test_ratio=0.1, shuffle=True):
    if shuffle:
        random.shuffle(data)
    test_size = int(len(data) * test_ratio)

    train_set = classify.apply_features(feature_func, data[test_size:])
    test_set = classify.apply_features(feature_func, data[:test_size])
    
    return train_set, test_set


train_set, test_set = prepare_train_test(labeled_instances, gen_senseval_features, shuffle=True)
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [9]:
print("Accuracy: ", classify.accuracy(classifier, test_set), end="\n\n")
classifier.show_most_informative_features(10)

Accuracy:  0.8637413394919169

Most Informative Features
              succeeding = ('to', 'TO')    HARD1 : HARD2  =    139.4 : 1.0
               preceding = ("'s", 'VBZ')   HARD1 : HARD3  =     71.6 : 1.0
              succeeding = ('work', 'NN')  HARD2 : HARD3  =     71.3 : 1.0
              succeeding = ('line', 'NN')  HARD2 : HARD1  =     52.4 : 1.0
              succeeding = ('place', 'NN')  HARD3 : HARD1  =     44.9 : 1.0
               preceding = ('no', 'DT')    HARD2 : HARD1  =     35.9 : 1.0
              succeeding = ('for', 'IN')   HARD1 : HARD2  =     34.9 : 1.0
              succeeding = ('evidence', 'NN')  HARD2 : HARD1  =     25.3 : 1.0
               preceding = ('other', 'JJ')  HARD3 : HARD1  =     20.7 : 1.0
              succeeding = ("''", "''")    HARD3 : HARD1  =     19.3 : 1.0


Use the preceding and following word as features. They can be calculated by retrieving the position of the word "hard" as `p=inst.position` and then accessing `inst.context[p-1]` and `inst.context[p+1]`.

Run 10 iterations by reshuffling the instances and printing the individual accuracies. Finally, print the average accuracy.

In [10]:
accuracies = []
for i in range(1, 11):
    train_set, test_set = prepare_train_test(labeled_instances, gen_senseval_features, shuffle=True)
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    
    accuracy = classify.accuracy(classifier, test_set)
    accuracies.append(accuracy)
    print("Accuracy on iteration %d: %f" % (i, accuracy))
    
print("\nAverage accuracy: ", sum(accuracies)/len(accuracies))

Accuracy on iteration 1: 0.909931
Accuracy on iteration 2: 0.898383
Accuracy on iteration 3: 0.928406
Accuracy on iteration 4: 0.868360
Accuracy on iteration 5: 0.882217
Accuracy on iteration 6: 0.882217
Accuracy on iteration 7: 0.898383
Accuracy on iteration 8: 0.879908
Accuracy on iteration 9: 0.893764
Accuracy on iteration 10: 0.898383

Average accuracy:  0.8939953810623557


### Exercise 3

The synonyms "strong" and "powerful" pattern differently. Use the tagged Brown corpus with the universal tagset to first list the nouns which follow "strong" vs. "powerful". Write for this a function `next_noun(word, tagged_text)` which returns the list of nouns that follow `word` in the `tagged_text`. Build then a `NaiveBayesClassifier` that predicts when each word should be used by using the function `apply_features` and the following noun as single feature.

Run 10 iterations by reshuffling the instances and printing the individual accuracies. Finally, print the average accuracy.


In [11]:
def next_noun(word: str, tagged_text: list[tuple[str, str]]):
    actively_looking = False # avoids second for loop
    nouns = []
    for w, t in tagged_text:
        if w == word:
            actively_looking = True
            
            # This word is not a noun
            continue
            
        if actively_looking:
            if t == 'NOUN':
                nouns.append(w)
                actively_looking = False
                
    return nouns

In [12]:
tagged_words = brown.tagged_words(tagset='universal')

In [13]:
labeled_adj = list(zip(next_noun("strong", tagged_words), cycle(["strong"]))) + list(zip(next_noun("powerful", tagged_words), cycle(["powerful"])))

In [14]:
def gen_noun_features(noun):
    return {'noun': noun}

In [15]:
best_classifier = (0.0, None)
accuracies = []
for i in range(1, 11):
    train_set, test_set = prepare_train_test(labeled_adj, gen_noun_features, shuffle=True)
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    
    accuracy = classify.accuracy(classifier, test_set)
    if accuracy > best_classifier[0]:
        best_classifier = (accuracy, classifier)
    
    accuracies.append(accuracy)
    print("Accuracy on iteration %d: %f" % (i, accuracy))
    
print("\nAverage accuracy: ", sum(accuracies)/len(accuracies))

Accuracy on iteration 1: 0.640000
Accuracy on iteration 2: 0.640000
Accuracy on iteration 3: 0.880000
Accuracy on iteration 4: 0.800000
Accuracy on iteration 5: 0.800000
Accuracy on iteration 6: 0.800000
Accuracy on iteration 7: 0.760000
Accuracy on iteration 8: 0.880000
Accuracy on iteration 9: 0.720000
Accuracy on iteration 10: 0.680000

Average accuracy:  0.7599999999999999


In [16]:
best_classifier[1].classify(gen_noun_features('nature'))

'powerful'

### Exercise 4

Based on the Movie Reviews document classifier discussed in this chapter, build a new `NaiveBayesClassifier`. Tag first the Movie Reviews Corpus using the combined tagger from the previous chapter stored in `t2.pkl`. Filter the tagged words to contain only words for the tags `['JJ', 'JJR', 'JJS', 'RB', 'NN', 'NNS', 'VB', 'VBN', 'VBG', 'VBZ', 'VBD', 'QL']` as well as only alphabetic tokens with at least three characters. Convert the words to lowercase. Use the most common 5000 words as `word_features` in the function `document_features`. 

Run 10 iterations by reshuffling the instances and printing the accuracy and 5 most informative features for each iteration. Finally, print the average accuracy.
    

In [17]:
# Copy from chapter 5
brown_tagged_sents = brown.tagged_sents(categories='news')
size = int(len(brown_tagged_sents) * 0.9)
train_sents = brown_tagged_sents[:size]
test_sents = brown_tagged_sents[size:]

t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(train_sents, backoff=t0)
t2 = nltk.BigramTagger(train_sents, backoff=t1)
t2.evaluate(test_sents)
# End copy

0.8452108043456593

In [18]:
reviews_tagged = t2.tag(movie_reviews.words())
reviews_tagged = [(word.lower(), tag) for (word, tag) in reviews_tagged \
                  if tag in ['JJ', 'JJR', 'JJS', 'RB', 'NN', 'NNS', 'VB', 'VBN', 'VBG', 'VBZ', 'VBD', 'QL'] \
                  and len(word) >= 3 and word.isalpha()]

In [19]:
fdst = nltk.FreqDist(reviews_tagged)
word_features = [tagged_word[0] for tagged_word, freq in fdst.most_common(5000)]

In [20]:
# Compare to nltk book
def document_features(document):
    document_words = set(document)
    
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features

In [21]:
documents = [(list(movie_reviews.words(fileid)), category) \
             for category in movie_reviews.categories() \
             for fileid in movie_reviews.fileids(category)]

In [22]:
accuracies = []
for i in range(1, 11):
    random.shuffle(documents)
    featuresets = [(document_features(d), c) for (d,c) in documents]
    test_size = int(len(featuresets) * 0.1)
    
    train_set, test_set = featuresets[test_size:], featuresets[:test_size]

    classifier = nltk.NaiveBayesClassifier.train(train_set)
    accuracy = nltk.classify.accuracy(classifier, test_set)
    accuracies.append(accuracy)
    
    print("Accuracy: ", accuracy)
    classifier.show_most_informative_features(5)
    print("\n\n")
    
print("Average accuracy: ", sum(accuracies)/len(accuracies))

Accuracy:  0.785
Most Informative Features
   contains(outstanding) = True              pos : neg    =     12.3 : 1.0
     contains(stupidity) = True              neg : pos    =     11.5 : 1.0
         contains(stark) = True              pos : neg    =     11.0 : 1.0
     contains(ludicrous) = True              neg : pos    =     10.5 : 1.0
     contains(insulting) = True              neg : pos    =     10.2 : 1.0



Accuracy:  0.81
Most Informative Features
     contains(ludicrous) = True              neg : pos    =     12.8 : 1.0
   contains(outstanding) = True              pos : neg    =     12.8 : 1.0
        contains(hudson) = True              neg : pos    =      9.2 : 1.0
     contains(insulting) = True              neg : pos    =      9.2 : 1.0
         contains(lousy) = True              neg : pos    =      9.2 : 1.0



Accuracy:  0.84
Most Informative Features
   contains(outstanding) = True              pos : neg    =     12.6 : 1.0
    contains(schumacher) = True           

### Exercise 5

The PP Attachment Corpus is a corpus describing prepositional phrase attachment decisions. Each instance in the training corpus is encoded as a `PPAttachment` object:

    from nltk.corpus import ppattach
    ppattach.attachments('training')
    
        [PPAttachment(sent='0', verb='join', noun1='board',
            prep='as', noun2='director', attachment='V'),
        PPAttachment(sent='1', verb='is', noun1='chairman',
            prep='of', noun2='N.V.', attachment='N'),
        ...]

    inst = ppattach.attachments('training')[1]
    (inst.noun1, inst.prep, inst.noun2)
    
        ('chairman', 'of', 'N.V.')

In the same way, `ppattach.attachments('test')` accesses the test instances. Select only the instances where `inst.attachment` is `'N'`:

In [23]:
from nltk.corpus import ppattach

Using this sub-corpus, build a `NaiveBayesClassifier` that attempts to predict which preposition is used to connect a given pair of nouns. For example, given the pair of nouns "team" and "researchers", the classifier should predict the preposition "of". 

Write for this purpose a function `prepare_featuresets(subcorpus)`, where `subcorpus` is either the string "training" or "test" to return the training set or the test set. 

Print the achieved accuracy as well as the result of `classifier.classify({ 'noun1': 'team', 'noun2': 'researchers' })`.

In [24]:
def prepare_featuresets(subcorpus: str):
    return [({'noun1': inst.noun1, 'noun2': inst.noun2}, inst.prep) \
            for inst in ppattach.attachments(subcorpus) \
            if inst.attachment == 'N']

In [25]:
classifier = nltk.NaiveBayesClassifier.train(prepare_featuresets("training"))
accuracy = nltk.classify.accuracy(classifier, prepare_featuresets("test"))

print("Accuracy: ", accuracy)
classifier.classify({ 'noun1': 'team', 'noun2': 'researchers' })

Accuracy:  0.5690032858707558


'of'

In [26]:
classifier.classify({ 'noun1': 'school', 'noun2': 'loosers' })

'in'