# Natural Language Processing

## Exercise Sheet 6

In [76]:
#imports for all exercises
import nltk
import random
import pickle
from nltk import classify
from nltk import NaiveBayesClassifier, classify, FreqDist
from nltk.classify import apply_features, accuracy
from nltk.corpus import names, senseval, brown, movie_reviews, ppattach
from nltk.tag import untag
from random import shuffle
nltk.download('ppattach')

[nltk_data] Downloading package ppattach to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package ppattach is already up-to-date!


True

### Exercise 1

Write a name gender classifier using the Names Corpus, the `apply_features` function, shuffling, and a test set of 500 instances. Use the following features:

a) first letter;  
b) last letter;  
c) last two letters;  
d) length;  
e) for each letter one feature, which is true if the name contains the letter.

Use the `NaiveBayesClassifier`, calculate the accuracy, and display the 10 most informative features.


In [59]:
# Feature extractor function for names
# Here we define a feature extractor function, gender_features, 
# that takes a name as input and creates a dictionary of features. 
# These features include the first and last letters of the name, 
# the last two letters, the length of the name, and whether each letter of the alphabet is in the name.

def gender_features(name):
    features = {}
    features["first_letter"] = name[0].lower()
    features["last_letter"] = name[-1].lower()
    features["last_two_letters"] = name[-2:].lower()
    features["length"] = len(name)
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features[f"contains({letter})"] = (letter in name.lower())
    return features

# Prepare the data
names_genders_list_of_pair = [(name, 'male') for name in names.words('male.txt')] + [(name, 'female') for name in names.words('female.txt')]
random.shuffle(names_genders_list_of_pair)

# Extract features
    # Iterates through each (name, gender) pair in the names list.
    # For each pair, it applies the gender_features function to the name to create a features dictionary and pairs it with the gender label.
featuresets = [(gender_features(name), gender) for (name, gender) in names_genders_list_of_pair]

# Split into training and test sets
train_set, test_set = featuresets[500:], featuresets[:500]


# Train the Naive Bayes classifier
    # Creates a Naive Bayes classifier.
    # The NaiveBayesClassifier.train method takes the train_set as input and creates a model that associates the features with the gender label.
classifier = NaiveBayesClassifier.train(train_set)

# Evaluate the classifier
print("Accuracy:", nltk.classify.accuracy(classifier, test_set))

# Show the most informative features
classifier.show_most_informative_features(10)



# Now you can use the classifier to predict a new name
name = "Neo"
print(f"The name {name} is predicted to be: {classifier.classify(gender_features(name))}")

Accuracy: 0.774
Most Informative Features
        last_two_letters = 'na'           female : male   =    166.6 : 1.0
        last_two_letters = 'la'           female : male   =     76.0 : 1.0
        last_two_letters = 'ia'           female : male   =     39.9 : 1.0
             last_letter = 'a'            female : male   =     36.6 : 1.0
        last_two_letters = 'sa'           female : male   =     34.3 : 1.0
        last_two_letters = 'ta'           female : male   =     32.4 : 1.0
             last_letter = 'k'              male : female =     32.0 : 1.0
        last_two_letters = 'us'             male : female =     28.1 : 1.0
        last_two_letters = 'rd'             male : female =     26.3 : 1.0
        last_two_letters = 'ra'           female : male   =     25.6 : 1.0
The name Neo is predicted to be: male


### Exercise 2

The Senseval 2 Corpus contains data intended to train word-sense disambiguation classifiers. Using this dataset, build a `NaiveBayesClassifier` that predicts the correct sense tag for a given instance for the word "hard":

In [60]:
from nltk.corpus import senseval
instances = senseval.instances('hard.pos')
labeled_instances = [(inst, inst.senses) for inst in instances] 
size = int(len(labeled_instances) * 0.1)
random.shuffle(labeled_instances)
train_set = apply_features(features, labeled_instances[size:])
test_set = apply_features(features, labeled_instances[:size])

Use the preceding and following word as features. They can be calculated by retrieving the position of the word "hard" as `p=inst.position` and then accessing `inst.context[p-1]` and `inst.context[p+1]`.

Run 10 iterations by reshuffling the instances and printing the individual accuracies. Finally, print the average accuracy.

#### Goal:
We want to determine in which sense (meaning) of a words is used in sentence, when a word has more than meaning

In [61]:
# 'senseval' is a corpus from NLTK specifically designed for word-sense disambiguation tasks
# Function to extract features from the instances
   # Extracts the context words around the target word "hard" in a given instance from the Senseval corpus.
def features(instance):
    p = instance.position
    # dictionary features where it conditionally adds the preceding and following words of "hard" 
    # if they exist (i.e., "hard" is not the first or last word).
    features = {}
    if p > 0:
        features['preceding_word'] = instance.context[p - 1]
    if p < len(instance.context) - 1:
        features['following_word'] = instance.context[p + 1]
    return features

# Load instances for the word "hard"
    # Loads all instances of the word "hard" from the Senseval corpus.
    # labeled_instances is a list of tuples where each tuple contains an instance and its correct sense labels.
instances = senseval.instances('hard.pos')
labeled_instances = [(inst, inst.senses) for inst in instances]

# Variable to store the sum of accuracies for calculating the average
accuracy_sum = 0

# Perform 10 iterations of shuffling and classification
    # a loop that will run 10 times, each time shuffling the data and testing the classifier.
for i in range(10):
    # Shuffle the labeled instances
    random.shuffle(labeled_instances)
    
    # Split the data into training and test sets (10% for test)
        # Determines the size of the test set to be 10% of the full dataset.
        # The data is split into training and test sets based on that size.
    size = int(len(labeled_instances) * 0.1)
    train_set, test_set = labeled_instances[size:], labeled_instances[:size]
    
    # Convert instances to feature sets
         # Converts the training and testing data into a format usable by the classifier, 
         # where each instance is a pair of the feature dictionary and the correct sense.
    train_set = [(features(n), sense) for (n, sense) in train_set]
    test_set = [(features(n), sense) for (n, sense) in test_set]
    
    # Train the Naive Bayes classifier
    classifier = NaiveBayesClassifier.train(train_set)
    
    # Calculate the accuracy of the classifier
    current_accuracy = accuracy(classifier, test_set)
    accuracy_sum += current_accuracy
    
    # Print the accuracy for the current iteration
    print(f"Iteration {i+1} Accuracy: {current_accuracy:.4f}")

# Print the average accuracy
print(f"Average Accuracy: {accuracy_sum/10:.4f}")

Iteration 1 Accuracy: 0.9122
Iteration 2 Accuracy: 0.8961
Iteration 3 Accuracy: 0.8961
Iteration 4 Accuracy: 0.8915
Iteration 5 Accuracy: 0.8891
Iteration 6 Accuracy: 0.8915
Iteration 7 Accuracy: 0.9376
Iteration 8 Accuracy: 0.9284
Iteration 9 Accuracy: 0.9400
Iteration 10 Accuracy: 0.8661
Average Accuracy: 0.9048


In [62]:
# Test the classifier
accuracy = classify.accuracy(classifier, test_set)
print("Accuracy on the test set:", accuracy)

# Optionally, you can print out the predictions for the instances in the test set
for test_instance in test_set:
    features, label = test_instance
    predicted_label = classifier.classify(features)
    print('Predicted sense:', predicted_label, 'Actual sense:', label)


Accuracy on the test set: 0.8660508083140878
Predicted sense: ('HARD1',) Actual sense: ('HARD1',)
Predicted sense: ('HARD1',) Actual sense: ('HARD1',)
Predicted sense: ('HARD1',) Actual sense: ('HARD1',)
Predicted sense: ('HARD1',) Actual sense: ('HARD1',)
Predicted sense: ('HARD2',) Actual sense: ('HARD2',)
Predicted sense: ('HARD2',) Actual sense: ('HARD1',)
Predicted sense: ('HARD1',) Actual sense: ('HARD1',)
Predicted sense: ('HARD1',) Actual sense: ('HARD1',)
Predicted sense: ('HARD1',) Actual sense: ('HARD1',)
Predicted sense: ('HARD1',) Actual sense: ('HARD1',)
Predicted sense: ('HARD1',) Actual sense: ('HARD1',)
Predicted sense: ('HARD3',) Actual sense: ('HARD3',)
Predicted sense: ('HARD1',) Actual sense: ('HARD1',)
Predicted sense: ('HARD1',) Actual sense: ('HARD1',)
Predicted sense: ('HARD1',) Actual sense: ('HARD1',)
Predicted sense: ('HARD1',) Actual sense: ('HARD1',)
Predicted sense: ('HARD1',) Actual sense: ('HARD1',)
Predicted sense: ('HARD1',) Actual sense: ('HARD1',)
P

### Exercise 3

The synonyms "strong" and "powerful" pattern differently. Use the tagged Brown corpus with the universal tagset to first list the nouns which follow "strong" vs. "powerful". Write for this a function `next_noun(word, tagged_text)` which returns the list of nouns that follow `word` in the `tagged_text`. Build then a `NaiveBayesClassifier` that predicts when each word should be used by using the function `apply_features` and the following noun as single feature.

Run 10 iterations by reshuffling the instances and printing the individual accuracies. Finally, print the average accuracy.


In [68]:

from nltk.classify import accuracy as nltk_accuracy  # renamed to avoid conflicts

# Define next_noun, a function that finds all instances of a specified word (converted to lowercase) 
# in the tagged_text and returns the noun that immediately follows it.
def next_noun(word, tagged_text):
    return [tagged_text[i+1][0] for i, (w, tag) in enumerate(tagged_text) 
            if w.lower() == word and i < len(tagged_text)-1 and tagged_text[i+1][1] == 'NOUN']

# Fetch all the tagged words from the Brown corpus using the 'universal' tagset, 
# which is a simplified tagset that has general tags like 'NOUN', 'VERB', etc.
tagged_words = brown.tagged_words(tagset='universal')

# Create a list of feature sets. Each feature set is a tuple 
    # where the first element is a dictionary with a single key-value pair: the key is 'next-word' 
        # and the value is the noun following either 'strong' or 'powerful'. 
# The second element of the tuple is the label (either 'strong' or 'powerful').
featuresets = [( {'next-word': noun}, word ) 
               for word in ('strong', 'powerful') 
               for noun in next_noun(word, tagged_words)]

# Set the number of iterations for the experiment to 10 
num_iterations = 10
# initialize a variable to sum the accuracies across all iterations.
sum_accuracy = 0

for iteration in range(num_iterations):
    random.shuffle(featuresets)  # Shuffle the featuresets to ensure randomness.
    size = int(len(featuresets) * 0.1) #Calculate 10% of the total number of feature sets for use as the test set size.
    train_set, test_set = featuresets[size:], featuresets[:size] # Split the data into a training set (90%) and a test set (10%).
    classifier = NaiveBayesClassifier.train(train_set) # Train the Naive Bayes classifier on the training set.
    # Measure and print the accuracy of the classifier on the test set.
    current_accuracy = nltk_accuracy(classifier, test_set)  # use the renamed function
    sum_accuracy += current_accuracy
    print(f"Iteration {iteration+1}: Accuracy = {current_accuracy:.2f}")

average_accuracy = sum_accuracy / num_iterations
print(f"Average Accuracy: {average_accuracy:.2f}")

Iteration 1: Accuracy = 0.50
Iteration 2: Accuracy = 0.86
Iteration 3: Accuracy = 0.93
Iteration 4: Accuracy = 0.57
Iteration 5: Accuracy = 0.57
Iteration 6: Accuracy = 0.71
Iteration 7: Accuracy = 0.93
Iteration 8: Accuracy = 0.64
Iteration 9: Accuracy = 0.57
Iteration 10: Accuracy = 0.79
Average Accuracy: 0.71


### Exercise 4

Based on the Movie Reviews document classifier discussed in this chapter, build a new `NaiveBayesClassifier`. Tag first the Movie Reviews Corpus using the combined tagger from the previous chapter stored in `t2.pkl`. Filter the tagged words to contain only words for the tags `['JJ', 'JJR', 'JJS', 'RB', 'NN', 'NNS', 'VB', 'VBN', 'VBG', 'VBZ', 'VBD', 'QL']` as well as only alphabetic tokens with at least three characters. Convert the words to lowercase. Use the most common 5000 words as `word_features` in the function `document_features`. 

Run 10 iterations by reshuffling the instances and printing the accuracy and 5 most informative features for each iteration. Finally, print the average accuracy.
    

In [72]:
from pickle import load
input = open('t2.pkl', 'rb')
tagger = load(input)
input.close()

In [73]:
# Load the combined tagger
with open('t2.pkl', 'rb') as f:
    tagger = pickle.load(f)

# Function to filter the tagged words and convert to lowercase
 # takes a list of tagged_words and a list of allowed_word_types. 
    # It filters the words by ensuring they are of the allowed POS tags, are purely alphabetical (no numbers or symbols), 
        # and are at least three characters long. The filtered words are then converted to lowercase.
def filter_words(tagged_words, allowed_word_types):
    return [
        word.lower() for word, tag in tagged_words
        if tag in allowed_word_types and word.isalpha() and len(word) >= 3
    ]

# Function to determine the set of contained word features in a document
    # creates a feature set for a given document. 
        # The features are boolean values indicating whether each word in a predetermined list (word_features) is present in the document. 
def document_features(document, word_features):
    document_words = set(document)
    features = {}
    for word in word_features:
        features[f'contains({word})'] = (word in document_words)
    return features

# Allowed word types
    # defines the list of POS tags that are permitted for inclusion in the analysis.
        # These tags correspond to adjectives, adverbs, nouns, and various verb forms.
allowed_word_types = ['JJ', 'JJR', 'JJS', 'RB', 'NN', 'NNS', 'VB', 'VBN', 'VBG', 'VBZ', 'VBD', 'QL']

# Tag the Movie Reviews Corpus and filter words
    # iterate over each file in the movie reviews corpus, tag each word in the reviews with the POS tagger, 
        #filter the tagged words using the filter_words function, 
            # and then pair the result with the corresponding category (positive or negative).
tagged_reviews = [(filter_words(tagger.tag(movie_reviews.words(fileid)), allowed_word_types), category)
                  for category in movie_reviews.categories()
                  for fileid in movie_reviews.fileids(category)]

# Obtain the 5000 most common words
    # A FreqDist (Frequency Distribution) is created for all words in the corpus that are alphabetical and at least three characters long. 
        # The 5000 most common words are extracted to be used as features.
all_words = FreqDist(word.lower() for word in movie_reviews.words() if word.isalpha() and len(word) >= 3)
word_features = list(all_words)[:5000]

# Shuffle and create the feature sets
random.shuffle(tagged_reviews)
featuresets = [(document_features(words, word_features), category) for (words, category) in tagged_reviews]

# Run 10 iterations
sum_accuracy = 0
#The list of tagged reviews is shuffled to ensure that we get a random distribution of 
    # data when we later split this into training and testing sets. 
        # Feature sets are created for each document using the document_features function.
for i in range(10):
    random.shuffle(featuresets)
    size = int(len(featuresets) * 0.1)
    train_set, test_set = featuresets[size:], featuresets[:size]
    classifier = NaiveBayesClassifier.train(train_set)
    current_accuracy = nltk_accuracy(classifier, test_set)
    sum_accuracy += current_accuracy
    print(f"Iteration {i+1}: Accuracy = {current_accuracy:.2f}")
    classifier.show_most_informative_features(5)

# Calculate and print the average accuracy
average_accuracy = sum_accuracy / 10
print(f"Average Accuracy: {average_accuracy:.2f}")

Iteration 1: Accuracy = 0.78
Most Informative Features
     contains(ludicrous) = True              neg : pos    =     14.4 : 1.0
   contains(outstanding) = True              pos : neg    =     13.1 : 1.0
        contains(elliot) = True              pos : neg    =     10.4 : 1.0
        contains(hudson) = True              neg : pos    =     10.2 : 1.0
     contains(insulting) = True              neg : pos    =     10.1 : 1.0
Iteration 2: Accuracy = 0.83
Most Informative Features
     contains(ludicrous) = True              neg : pos    =     15.1 : 1.0
     contains(stupidity) = True              neg : pos    =     10.7 : 1.0
     contains(insulting) = True              neg : pos    =      9.9 : 1.0
   contains(outstanding) = True              pos : neg    =      9.8 : 1.0
         contains(sucks) = True              neg : pos    =      9.1 : 1.0
Iteration 3: Accuracy = 0.83
Most Informative Features
     contains(insulting) = True              neg : pos    =     16.4 : 1.0
     conta

### Exercise 5

The PP Attachment Corpus is a corpus describing prepositional phrase attachment decisions. Each instance in the training corpus is encoded as a `PPAttachment` object:

    from nltk.corpus import ppattach
    ppattach.attachments('training')
    
        [PPAttachment(sent='0', verb='join', noun1='board',
            prep='as', noun2='director', attachment='V'),
        PPAttachment(sent='1', verb='is', noun1='chairman',
            prep='of', noun2='N.V.', attachment='N'),
        ...]

    inst = ppattach.attachments('training')[1]
    (inst.noun1, inst.prep, inst.noun2)
    
        ('chairman', 'of', 'N.V.')

In the same way, `ppattach.attachments('test')` accesses the test instances. Select only the instances where `inst.attachment` is `'N'`:

In [74]:
from nltk.corpus import ppattach
nattach = [inst for inst in ppattach.attachments('training')
               if inst.attachment == 'N']

Using this sub-corpus, build a `NaiveBayesClassifier` that attempts to predict which preposition is used to connect a given pair of nouns. For example, given the pair of nouns "team" and "researchers", the classifier should predict the preposition "of". 

Write for this purpose a function `prepare_featuresets(subcorpus)`, where `subcorpus` is either the string "training" or "test" to return the training set or the test set. 

Print the achieved accuracy as well as the result of `classifier.classify({ 'noun1': 'team', 'noun2': 'researchers' })`.

In [80]:
# Step 2: Function to filter instances where attachment is 'N'
    # creating a list comprehension that iterates over all instances in the training set of the PP Attachment Corpus. 
    # It filters the instances to include only those where the attachment is 'N', 
        # which indicates that the prepositional phrase is attached to a noun.
def filter_n_attachments(subcorpus_name):
    return [inst for inst in ppattach.attachments(subcorpus_name)
            if inst.attachment == 'N']

# Step 3: Function to prepare the feature sets
    # takes a subcorpus argument ('training' or 'test') and prepares the feature sets for the classifier. 
    # It first gets the relevant subset of the corpus, 
        # then creates feature dictionaries for each instance (only for those with 'N' attachment), 
            # mapping the pair of nouns to the preposition used.
def prepare_featuresets(subcorpus_name):
    attachments = filter_n_attachments(subcorpus_name)
    featuresets = []
    for inst in attachments:
        features = {'noun1': inst.noun1, 'noun2': inst.noun2}
        preposition = inst.prep
        featuresets.append((features, preposition))
    return featuresets

# Prepare the training and test sets
training_set = prepare_featuresets('training')
test_set = prepare_featuresets('test')

# Step 4: Train the Naive Bayes Classifier
classifier = NaiveBayesClassifier.train(training_set)

# Step 5: Evaluate the classifier's accuracy and test with a given noun pair
print(f"Accuracy: {nltk_accuracy(classifier, test_set):.2f}")
predicted_prep = classifier.classify({'noun1': 'team', 'noun2': 'researchers'})
print(f"Predicted preposition for ('team', 'researchers'): {predicted_prep}")

Accuracy: 0.57
Predicted preposition for ('team', 'researchers'): of


#### Reference:
- https://www.nltk.org/howto/corpus.html