# Natural Language Processing

## Exercise Sheet 5

In [11]:
#imports for all exercises
import nltk
from nltk.corpus import brown
from nltk import FreqDist
from collections import Counter, defaultdict


### Exercise 1

Produce a sorted list of tags used in the Brown corpus, removing duplicates. Do the same for the universal part-of-speech tagset.

In [5]:
first_sentence = brown.tagged_sents()[0]

print(first_sentence)

[('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL'), ('Grand', 'JJ-TL'), ('Jury', 'NN-TL'), ('said', 'VBD'), ('Friday', 'NR'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'IN'), ("Atlanta's", 'NP$'), ('recent', 'JJ'), ('primary', 'NN'), ('election', 'NN'), ('produced', 'VBD'), ('``', '``'), ('no', 'AT'), ('evidence', 'NN'), ("''", "''"), ('that', 'CS'), ('any', 'DTI'), ('irregularities', 'NNS'), ('took', 'VBD'), ('place', 'NN'), ('.', '.')]


#### Tags through an example

- I/PRP run/VB fast/.
- You/PRP run/VB slow/RB.

#### The tags are 
- /PRP :  pronoun,
- /VB : verb,
- /RB : adverb.

#### Universal Part-of-Speech Tagset
LTK provides a "universal" tagset, contains of 12 Tags, which is a simplified set of tags that can be used in many languages and corpora.

In [2]:
# Get the tags from Brown corpus
brown_tags = [tag for (word, tag) in brown.tagged_words()]

# Get unique tags and sort
unique_brown_tags = sorted(set(brown_tags))

# Get the universal tags from Brown corpus
universal_tags = [tag for (word, tag) in brown.tagged_words(tagset='universal')]

# Get unique universal tags and sort
unique_universal_tags = sorted(set(universal_tags))

print("Brown Corpus Tags:", unique_brown_tags)
print("\nUniversal Tags:", unique_universal_tags)

Brown Corpus Tags: ["'", "''", '(', '(-HL', ')', ')-HL', '*', '*-HL', '*-NC', '*-TL', ',', ',-HL', ',-NC', ',-TL', '--', '---HL', '.', '.-HL', '.-NC', '.-TL', ':', ':-HL', ':-TL', 'ABL', 'ABN', 'ABN-HL', 'ABN-NC', 'ABN-TL', 'ABX', 'AP', 'AP$', 'AP+AP-NC', 'AP-HL', 'AP-NC', 'AP-TL', 'AT', 'AT-HL', 'AT-NC', 'AT-TL', 'AT-TL-HL', 'BE', 'BE-HL', 'BE-TL', 'BED', 'BED*', 'BED-NC', 'BEDZ', 'BEDZ*', 'BEDZ-HL', 'BEDZ-NC', 'BEG', 'BEM', 'BEM*', 'BEM-NC', 'BEN', 'BEN-TL', 'BER', 'BER*', 'BER*-NC', 'BER-HL', 'BER-NC', 'BER-TL', 'BEZ', 'BEZ*', 'BEZ-HL', 'BEZ-NC', 'BEZ-TL', 'CC', 'CC-HL', 'CC-NC', 'CC-TL', 'CC-TL-HL', 'CD', 'CD$', 'CD-HL', 'CD-NC', 'CD-TL', 'CD-TL-HL', 'CS', 'CS-HL', 'CS-NC', 'CS-TL', 'DO', 'DO*', 'DO*-HL', 'DO+PPSS', 'DO-HL', 'DO-NC', 'DO-TL', 'DOD', 'DOD*', 'DOD*-TL', 'DOD-NC', 'DOZ', 'DOZ*', 'DOZ*-TL', 'DOZ-HL', 'DOZ-TL', 'DT', 'DT$', 'DT+BEZ', 'DT+BEZ-NC', 'DT+MD', 'DT-HL', 'DT-NC', 'DT-TL', 'DTI', 'DTI-HL', 'DTI-TL', 'DTS', 'DTS+BEZ', 'DTS-HL', 'DTX', 'EX', 'EX+BEZ', 'EX+HVD', '

### Exercise 2

Write a program to process the Brown Corpus using the universal part-of-speech tagset to find out which nouns are more common in their plural form than in their singular form. Only consider regular plurals formed with the "-s" suffix. Print an alphabetically sorted list of the nouns together with the frequencies for the singular and plural forms, one per line. 


#### Goal: 
 From the Brown corpus, identify nouns that are more commonly used in their plural forms (ending in "-s") than in their singular forms.

#### Approach:
- We will go through the words in the corpus and count occurrences of singular and plural nouns. 
- Once we have these counts, we'll identify and list the nouns that appear more times in their plural form than in singular.

In [4]:
# Getting noun frequencies
noun_freq = FreqDist(word.lower() for word, tag in brown.tagged_words(tagset='universal') if tag == "NOUN")

# Filtering nouns that are more common in plural form
more_plural = [(noun, noun_freq[noun], noun_freq[noun + "s"]) 
               for noun in noun_freq 
               if noun_freq[noun + "s"] > noun_freq[noun]]

# Sorting and printing
for singular, singular_freq, plural_freq in sorted(more_plural):
    print(f"{singular}: Singular({singular_freq}), Plural({plural_freq})")

2-year-old: Singular(2), Plural(3)
aberration: Singular(3), Plural(5)
abolitionist: Singular(1), Plural(4)
aborigine: Singular(7), Plural(8)
absolute: Singular(1), Plural(3)
abstract: Singular(1), Plural(4)
accommodation: Singular(1), Plural(8)
accomplishment: Singular(7), Plural(10)
acre: Singular(10), Plural(44)
active: Singular(6), Plural(8)
ad: Singular(5), Plural(10)
adapter: Singular(1), Plural(2)
addict: Singular(1), Plural(4)
additive: Singular(3), Plural(4)
adherent: Singular(1), Plural(5)
adjective: Singular(2), Plural(4)
admonition: Singular(1), Plural(3)
adventure: Singular(13), Plural(14)
adverb: Singular(1), Plural(2)
advertisement: Singular(2), Plural(3)
advertiser: Singular(1), Plural(5)
adviser: Singular(6), Plural(12)
advisor: Singular(1), Plural(5)
affair: Singular(33), Plural(84)
affiliation: Singular(4), Plural(5)
ailment: Singular(4), Plural(6)
airfield: Singular(5), Plural(6)
airline: Singular(2), Plural(5)
alien: Singular(2), Plural(3)
allowance: Singular(16), P

### Exercise 3

Find out which word has the greatest number of distinct tags in the Brown corpus using the original tagset. Without using the `most_common` function, print a list of the tags together with the frequencies for the word, sorted by frequency from highest to lowest, one per line.



#### Goal: 
- Identify the word in the Brown corpus that is tagged with the most number of unique tags. 
- Display the tags and their respective frequencies for this word.


In [20]:
# 1- Create a dictionary to track tags for each word
  #Initializing a dictionary (tags_dict) where keys are words and values are sets. Sets are used to store unique tags for each word without duplicates.
tags_dict = defaultdict(set)
 
# 2-  Iterating over each (word, tag) tuple in the Brown corpus.
for word, tag in brown.tagged_words():
    #Converting the word to lowercase to ensure uniformity.
    # Adding the tag to the set corresponding to the word in the tags_dict. 
    # If a word has the same tag multiple times, the set will store it only once.
    tags_dict[word.lower()].add(tag)

# 3- Find the word with the maximum number of distinct tags
max_word = max(tags_dict, key=lambda k: len(tags_dict[k])) # The max function is used with a custom key function to get the word associated with the largest set of tags.
max_tags = tags_dict[max_word] # Storing the set of unique tags associated with the max_word in the variable max_tags.

# 4- Get frequencies of tags for the max_word

# Creating a counter object (word_tags_original) to count occurrences of each (word, tag) tuple in the Brown corpus. This will help us find the frequency of each tag for the max_word.
word_tags_original = Counter([(word.lower(), tag) for (word, tag) in brown.tagged_words()])

# Creating a list of tuples (tags_frequencies), where each tuple contains a tag and its frequency for the max_word.
# This list is derived from the max_tags set and the word_tags_original counter.
tags_frequencies = [(tag, word_tags_original[(max_word, tag)]) for tag in max_tags]

# 5- Sort and print tags by frequency

# Sorting the tags_frequencies list based on the frequencies of the tags in descending order.
tags_frequencies.sort(key=lambda x: x[1], reverse=True)

print(f"The word '{max_word}' has  {len(max_tags)}  distinct tags.")


# Iterating over each (tag, frequency) tuple in the sorted tags_frequencies list and printing them.
for tag, freq in tags_frequencies:
    print(tag, freq)

The word 'that' has  15  distinct tags.
CS 6464
DT 2260
WPS 1654
WPO 135
QL 56
DT-NC 6
DT-TL 5
WPS-TL 3
WPS-NC 3
CS-NC 2
WPS-HL 2
NIL 1
WPO-NC 1
CS-HL 1
DT-HL 1


In [14]:
word_tags = defaultdict(set)

# Extracting word-tag combinations from Brown corpus
for word, tag in brown.tagged_words():
    word_tags[word].add(tag)

# Finding word with maximum distinct tags
max_word = max(word_tags, key=lambda x: len(word_tags[x]))

tags_frequency = {tag: sum(1 for _, t in brown.tagged_words() if t == tag and _ == max_word) 
                  for tag in word_tags[max_word]}

# Sorting by frequency and printing
for tag, freq in sorted(tags_frequency.items(), key=lambda x: x[1], reverse=True):
    print(f"{tag}: {freq}")


CS: 6419
DT: 1975
WPS: 1638
WPO: 135
QL: 54
DT-NC: 6
WPS-NC: 3
CS-NC: 2
WPS-HL: 2
NIL: 1
WPO-NC: 1
CS-HL: 1


### Exercise 4

Tabulate the frequencies of the universal tags that precede nouns in the Brown Corpus. 

#### Goal:
We want see which tags often come before nouns in the Brown Corpus

#### Approach:
- Go through the Brown Corpus word by word.
- Whenever you find a noun, note down the tag of the word that comes before it.
- Count the occurrences of each tag that precedes nouns.

In [21]:
# Define a default dictionary to hold frequencies of tags preceding nouns
preceding_tags = defaultdict(int)

# Using universal tagset for simplicity
# We use the brown.tagged_words function with the 'universal' tagset to get words along with their simplified tags.tagged_words = list(brown.tagged_words(tagset='universal'))

# Iterate over the tagged words with an index
for i, (word, tag) in enumerate(tagged_words):
    # Check if the word is a noun
    if tag == "NOUN" and i > 0:  # Whenever we find a noun, 
        preceding_tag = tagged_words[i-1][1] # we look at the word just before it, and note down its tag.
        preceding_tags[preceding_tag] += 1   # increment the count of the given tag by 1

# Display the frequencies
for tag, count in sorted(preceding_tags.items(), key=lambda x: x[1], reverse=True):
    print(tag, count)

DET 85845
ADJ 54653
NOUN 41309
ADP 37418
. 20084
VERB 17851
CONJ 9294
NUM 5668
ADV 1851
PRT 1068
PRON 440
X 77


### Exercise 5

Write a function `ambiguous(tagged_text)` that returns the number of ambiguous word types as well as the number of all word types in a tagged text. A word type is ambiguous if it is tagged with at least two different tags. Use the function to print both values as well as the percentage of ambiguous word types for the Brown Corpus both for the original and the universal tagset.

#### Goal:
- Create Funcation "ambiguous(tagged_text)", calculates how many words in tagged text have more than one tag
- We want to find words that have been labeled with more than one tag

#### Approach:
- For each word, we'll see how many different tags it has. If a word has more than one tag, it's ambiguous.

In [22]:

def ambiguous(tagged_text):
    # This dictionary will store each word as a key. 
    # The value will be a set containing all unique tags the word is associated with.
    word_tags = defaultdict(set)
    
    # Populating the word_tags dictionary
    for word, tag in tagged_text:
        word_tags[word.lower()].add(tag)  # We convert word to lowercase to ensure consistency

    # Counting the number of ambiguous words - those with more than 1 tag
    ambiguous_words = sum(1 for tags in word_tags.values() if len(tags) > 1)

    # Total unique words
    total_words = len(word_tags)

    # Percentage of ambiguous words
    percentage_ambiguous = (ambiguous_words / total_words) * 100

    # Printing the results
    print(f"Number of Ambiguous Word Types: {ambiguous_words}")
    print(f"Total Number of Word Types: {total_words}")
    print(f"Percentage of Ambiguous Word Types: {percentage_ambiguous:.2f}%")

    return ambiguous_words, total_words

# Using the function for the Brown Corpus with the original tagset
print("Original Tagset:")
ambiguous(brown.tagged_words())

# Using the function for the Brown Corpus with the universal tagset
print("\nUniversal Tagset:")
ambiguous(brown.tagged_words(tagset='universal'))

Original Tagset:
Number of Ambiguous Word Types: 9580
Total Number of Word Types: 49815
Percentage of Ambiguous Word Types: 19.23%

Universal Tagset:
Number of Ambiguous Word Types: 3408
Total Number of Word Types: 49815
Percentage of Ambiguous Word Types: 6.84%


(3408, 49815)

### Exercise 6

Write code to search the Brown Corpus to answer the following questions:

a) produce an alphabetically sorted list of the distinct words tagged as `MD`  

In [23]:
# Extracting words tagged as MD
md_words = {word.lower() for word, tag in brown.tagged_words() if tag == 'MD'}

# Sorting and printing the words
print(sorted(md_words))

["c'n", 'can', 'colde', 'could', 'dare', 'kin', 'maht', 'mai', 'may', 'maye', 'mayst', 'might', 'must', 'need', 'ought', 'shall', 'should', 'shuld', 'shulde', 'wil', 'will', 'wilt', 'wod', 'wold', 'wolde', 'would']


b) identify words that can be plural nouns or third person singular verbs  

In [28]:
# Initializing an empty set
word_tag_set = set()

# Iterating over each word, tag pair in the Brown corpus
for word, tag in brown.tagged_words():
    # If the tag is 'NNS' or 'VBZ'
    if tag in ['NNS', 'VBZ']:
        # Adding a tuple (word, tag) to the set
        word_tag_set.add((word.lower(), tag))

# Sorting the set primarily by word, then by tag
sorted_word_tags = sorted(word_tag_set, key=lambda x: (x[0], x[1]))

# Displaying the sorted list
for item in sorted_word_tags:
    print(item)

('$.027', 'NNS')
('$.03', 'NNS')
('$.054/mbf', 'NNS')
('$.07', 'NNS')
('$.07/cwt', 'NNS')
('$.076', 'NNS')
('$.09', 'NNS')
('$.105', 'NNS')
('$.12', 'NNS')
('$.30', 'NNS')
('$.30/mbf', 'NNS')
('$.50', 'NNS')
('$.65', 'NNS')
('$.75', 'NNS')
('$.80', 'NNS')
('$.86', 'NNS')
('$.90', 'NNS')
('$0.9', 'NNS')
('$1', 'NNS')
('$1,000', 'NNS')
('$1,000,000', 'NNS')
('$1,000,000,000', 'NNS')
('$1,200', 'NNS')
('$1,250,000', 'NNS')
('$1,276', 'NNS')
('$1,390', 'NNS')
('$1,450,000,000', 'NNS')
('$1,500', 'NNS')
('$1,500,000', 'NNS')
('$1,600', 'NNS')
('$1,750,000', 'NNS')
('$1,800', 'NNS')
('$1,961,000', 'NNS')
('$1.0', 'NNS')
('$1.1', 'NNS')
('$1.10', 'NNS')
('$1.26', 'NNS')
('$1.4', 'NNS')
('$1.5', 'NNS')
('$1.6', 'NNS')
('$1.60', 'NNS')
('$1.65', 'NNS')
('$1.7', 'NNS')
('$1.8', 'NNS')
('$1.80', 'NNS')
('$1.9', 'NNS')
('$10', 'NNS')
('$10,000', 'NNS')
('$10,000,000', 'NNS')
('$10,000-per-year', 'NNS')
('$10.00', 'NNS')
('$10.1', 'NNS')
('$10.3', 'NNS')
('$10.50', 'NNS')
('$10.8', 'NNS')
('$100', 

c) print an alphabetically sorted list of distinct three-word prepositional phrases of the form `IN+AT+NN`, separated by semicolons

#### Goal
we're looking for sequences of three words that match the pattern 'IN' (preposition), 'AT' (article), and 'NN' (noun).
#### Explanations

- IN: This is a tag for prepositions. Examples of words with this tag might be "in", "on", "at", "with", etc.

- AT: This tag is for the singular article, most commonly "the".

- NN: This is a tag for singular nouns. Examples could be "dog", "house", "book", etc.

##### - Examples:
- "in the house"
- "on the table"
- "with the pen"

In [30]:
# Get the tagged words from the Brown corpus
tagged_words = brown.tagged_words()
# Initialize an empty set to store our prepositional phrases.
# We're using a set to automatically ensure that each phrase is distinct.
distinct_phrases = set()

# Loop through the corpus while maintaining an index
for i in range(len(tagged_words) - 2):  # -2 is to avoid an "out of range" error on the last two words.
    # Check for the IN+AT+NN pattern
    if (tagged_words[i][1] == 'IN' and 
        tagged_words[i+1][1] == 'AT' and 
        tagged_words[i+2][1] == 'NN'):
        # If they do, form the phrase and add it to our set.
        phrase = " ".join([tagged_words[i][0], tagged_words[i+1][0], tagged_words[i+2][0]])
        distinct_phrases.add(phrase)

# Sort the phrases alphabetically and print them separated by semicolons
print("; ".join(sorted(distinct_phrases)))



### Exercise 7

Write a function `prec_adv(word, text)` that returns an alphabetically sorted list of distinct adverbs that precede `word` in `text`. Use this function to find out which adverbs precede the words "love", "like", and "prefer" in the Brown corpus. 

### Goal:
To implement funcation that give word and text and returns all list of all the adverbs the comes before that word, which is sorted in an alphabetically order.

In [33]:
def prec_adv(word, text):
    # 1. Initializing an empty set to store the adverbs that precede the given word.
    preceding_adverbs = set()

    # 2. Iterating over the text, which is assumed to be a list of word-tag pairs.
    for i, (current_word, tag) in enumerate(text):
        # Checking if we are at the beginning of the text to avoid an IndexError later.
        if i == 0:
            continue

        # 3. Checking if the current word matches the word we're looking for.
        if current_word == word:
            # Getting the word and tag of the preceding word.
            prev_word, prev_tag = text[i-1]

            # 4. Checking if the preceding word is an adverb (based on the tag "RB").
            if prev_tag == "RB":
                # If it is, we add it to our set of preceding adverbs.
                preceding_adverbs.add(prev_word)

    # 5. Finally, we convert the set to a list, sort it, and return it.
    return sorted(list(preceding_adverbs))

In [34]:
# Testing the function on the words "love", "like", and "prefer"
words_to_check = ["love", "like", "prefer"]
for word in words_to_check:
    print(f"Adverbs that precede '{word}': {prec_adv(word, brown.tagged_words())}")

Adverbs that precede 'love': ['always', 'dearly', 'just']
Adverbs that precede 'like': ['Jist', 'Just', 'Kinda', 'abreast', 'almost', 'alone', 'always', 'around', 'by', 'close', 'deceptively', 'even', 'exactly', 'gloriously', 'here', 'increasingly', 'jist', 'just', 'much', 'often', 'particularly', 'quick', 'rather', 'remarkably', 'roughly', 'simply', 'so', 'somewhat', 'sure', 'there', 'together', 'wildly', 'yet']
Adverbs that precede 'prefer': ['generally', 'much', 'spontaneously']
