# Natural Language Processing

## Exercise Sheet 5

In [1]:
#imports for all exercises
import nltk
import re
import matplotlib.pyplot as plt
import nltk.book as book
from nltk.corpus import brown

brown_tags = [tag for (word, tag) in brown.tagged_words()]
brown_universal_tags = [tag for (word, tag) in brown.tagged_words(tagset='universal')]

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


### Exercise 1

Produce a sorted list of tags used in the Brown corpus, removing duplicates. Do the same for the universal part-of-speech tagset.

In [2]:
list(sorted(set(brown_tags)))

["'",
 "''",
 '(',
 '(-HL',
 ')',
 ')-HL',
 '*',
 '*-HL',
 '*-NC',
 '*-TL',
 ',',
 ',-HL',
 ',-NC',
 ',-TL',
 '--',
 '---HL',
 '.',
 '.-HL',
 '.-NC',
 '.-TL',
 ':',
 ':-HL',
 ':-TL',
 'ABL',
 'ABN',
 'ABN-HL',
 'ABN-NC',
 'ABN-TL',
 'ABX',
 'AP',
 'AP$',
 'AP+AP-NC',
 'AP-HL',
 'AP-NC',
 'AP-TL',
 'AT',
 'AT-HL',
 'AT-NC',
 'AT-TL',
 'AT-TL-HL',
 'BE',
 'BE-HL',
 'BE-TL',
 'BED',
 'BED*',
 'BED-NC',
 'BEDZ',
 'BEDZ*',
 'BEDZ-HL',
 'BEDZ-NC',
 'BEG',
 'BEM',
 'BEM*',
 'BEM-NC',
 'BEN',
 'BEN-TL',
 'BER',
 'BER*',
 'BER*-NC',
 'BER-HL',
 'BER-NC',
 'BER-TL',
 'BEZ',
 'BEZ*',
 'BEZ-HL',
 'BEZ-NC',
 'BEZ-TL',
 'CC',
 'CC-HL',
 'CC-NC',
 'CC-TL',
 'CC-TL-HL',
 'CD',
 'CD$',
 'CD-HL',
 'CD-NC',
 'CD-TL',
 'CD-TL-HL',
 'CS',
 'CS-HL',
 'CS-NC',
 'CS-TL',
 'DO',
 'DO*',
 'DO*-HL',
 'DO+PPSS',
 'DO-HL',
 'DO-NC',
 'DO-TL',
 'DOD',
 'DOD*',
 'DOD*-TL',
 'DOD-NC',
 'DOZ',
 'DOZ*',
 'DOZ*-TL',
 'DOZ-HL',
 'DOZ-TL',
 'DT',
 'DT$',
 'DT+BEZ',
 'DT+BEZ-NC',
 'DT+MD',
 'DT-HL',
 'DT-NC',
 'DT-TL',
 'D

In [3]:
list(sorted(set(brown_universal_tags)))

['.',
 'ADJ',
 'ADP',
 'ADV',
 'CONJ',
 'DET',
 'NOUN',
 'NUM',
 'PRON',
 'PRT',
 'VERB',
 'X']

### Exercise 2

Write a program to process the Brown Corpus using the universal part-of-speech tagset to find out which nouns are more common in their plural form than in their singular form. Only consider regular plurals formed with the "-s" suffix. Print an alphabetically sorted list of the nouns together with the frequencies for the singular and plural forms, one per line. 


In [4]:
# Normalize nouns, so that report and Report is the same word
nouns = [word.lower() for (word, tag) in brown.tagged_words(tagset='universal') if tag == "NOUN"]

# only compare nouns that are words
reg = re.compile(r'^[a-z]')
nouns = list(filter(reg.search, nouns))

noun_fd = nltk.FreqDist(nouns)

In [5]:
# Select only singular nouns, excluding regular formed plurals with the -s suffix
singular_nouns = [singular_noun for singular_noun in nouns if singular_noun[-1:] != "s"]

common_dict = dict()
# Search each singular word once and compare frequency distributions
for noun in set(singular_nouns):
    singular_freq = noun_fd[noun]
    plural_freq = noun_fd[noun + "s"]
    
    if plural_freq > singular_freq:
        common_dict[noun] = (singular_freq, plural_freq)

In [6]:
print("Noun: singular, plural")
print("===")
for noun in sorted(common_dict.keys()):
    print("{}: {}, {}".format(noun, common_dict[noun][0], common_dict[noun][1]))


Noun: singular, plural
===
aberration: 3, 5
abolitionist: 1, 4
aborigine: 7, 8
absolute: 1, 3
abstract: 1, 4
accommodation: 1, 8
accomplishment: 7, 10
acre: 10, 44
active: 6, 8
ad: 5, 10
adapter: 1, 2
addict: 1, 4
additive: 3, 4
adherent: 1, 5
adjective: 2, 4
admonition: 1, 3
adventure: 13, 14
adverb: 1, 2
advertisement: 2, 3
advertiser: 1, 5
adviser: 6, 12
advisor: 1, 5
affair: 33, 84
affiliation: 4, 5
ailment: 4, 6
airfield: 5, 6
airline: 2, 5
alien: 2, 3
allowance: 16, 25
allusion: 3, 5
almond: 1, 3
american: 34, 93
angel: 18, 24
ant: 6, 7
antagonist: 3, 4
antecedent: 1, 2
anti-semite: 2, 3
apache: 1, 4
apologie: 1, 4
appliance: 5, 8
applicant: 8, 10
appointee: 2, 5
appropriation: 5, 9
arm: 93, 121
armament: 1, 4
arrangement: 34, 38
aspect: 47, 64
aspencade: 1, 2
aspiration: 3, 12
assessor: 2, 20
asset: 5, 13
associate: 7, 14
atom: 37, 40
attribute: 4, 11
auditor: 1, 4
authorization: 2, 6
axe: 6, 7
axiom: 1, 2
azalea: 2, 3
backbend: 1, 2
backyard: 2, 3
bang: 3, 5
banker: 5, 15
bansh

### Exercise 3

Find out which word has the greatest number of distinct tags in the Brown corpus using the original tagset. Without using the `most_common` function, print a list of the tags together with the frequencies for the word, sorted by frequency from highest to lowest, one per line.



In [7]:
tagged_words = brown.tagged_words()

word_tags = dict()
for word, tag in tagged_words:
    if word not in word_tags.keys():
        word_tags[word] = {tag}
    else:
        word_tags[word].add(tag)

In [8]:
word_tags_sorted = sorted(word_tags.items(), key=lambda item: len(item[1]), reverse=True)
most_distinct_tags_word = word_tags_sorted[0]
most_distinct_tags_word

('that',
 {'CS',
  'CS-HL',
  'CS-NC',
  'DT',
  'DT-NC',
  'NIL',
  'QL',
  'WPO',
  'WPO-NC',
  'WPS',
  'WPS-HL',
  'WPS-NC'})

The word with the greatest number of distinct tags in the Brown corpus is: ```that```

In [9]:
# Print a list of the tags of the word with the greatest number of distinct tags, sorted by frequency
tags_fd = nltk.FreqDist(tagged_words) 

most_distinct_tags_word_freq = sorted({tag: tags_fd[(most_distinct_tags_word[0], tag)] for tag in most_distinct_tags_word[1]}.items(), key=lambda item: item[1], reverse=True)

for tag, freq in most_distinct_tags_word_freq:
    print(tag, freq)

CS 6419
DT 1975
WPS 1638
WPO 135
QL 54
DT-NC 6
WPS-NC 3
WPS-HL 2
CS-NC 2
NIL 1
CS-HL 1
WPO-NC 1


In [10]:
nltk.help.upenn_tagset('CS')
nltk.corpus.brown.readme()

No matching tags found.


'BROWN CORPUS\n\nA Standard Corpus of Present-Day Edited American\nEnglish, for use with Digital Computers.\n\nby W. N. Francis and H. Kucera (1964)\nDepartment of Linguistics, Brown University\nProvidence, Rhode Island, USA\n\nRevised 1971, Revised and Amplified 1979\n\nhttp://www.hit.uib.no/icame/brown/bcm.html\n\nDistributed with the permission of the copyright holder,\nredistribution permitted.\n'

In [None]:
# Print a list of the tags for each word, sorted by frequency
"""
words_fd = nltk.FreqDist(brown.words())

for word, freq in sorted(words_fd.items(), key=lambda item: item[1], reverse=True):
    print("{}({}): ".format(word, freq), word_tags[word])
"""

### Exercise 4

Tabulate the frequencies of the universal tags that precede nouns in the Brown Corpus. 

In [12]:
word_tag_pairs = nltk.bigrams(brown.tagged_words(tagset='universal'))
noun_preceding_tags = [preceding_word[1] for (preceding_word, word) in word_tag_pairs if word[1] == 'NOUN']

fdist = nltk.FreqDist(noun_preceding_tags)
fdist.tabulate()

  DET   ADJ  NOUN   ADP     .  VERB  CONJ   NUM   ADV   PRT  PRON     X 
85845 54653 41309 37418 20084 17851  9294  5668  1851  1068   440    77 


### Exercise 5

Write a function `ambiguous(tagged_text)` that returns the number of ambiguous word types as well as the number of all word types in a tagged text. A word type is ambiguous if it is tagged with at least two different tags. Use the function to print both values as well as the percentage of ambiguous word types for the Brown Corpus both for the original and the universal tagset.

In [13]:
# A tagged_text consists of words with its assigned word types (=tags)
def ambiguous(tagged_text: list[tuple[str, str]]):
    tagged_words_cfd = nltk.ConditionalFreqDist(tagged_text)
    ambiguous_words = [word for word in tagged_words_cfd if len(tagged_words_cfd[word]) > 1]
    
    return len(ambiguous_words), len(tagged_words_cfd)

In [14]:
tagged_words = brown.tagged_words()
tagged_words_universal = brown.tagged_words(tagset='universal')

ambiguous_regular = ambiguous(tagged_words)
ambiguous_universal = ambiguous(tagged_words_universal)

In [15]:
print("name: ambiguous, all, ratio", end="\n===\n")
print("Brown regular tagset: %d, %d, " % ambiguous_regular, end="") 
print("{:.2%}".format(ambiguous_regular[0]/ambiguous_regular[1]))
print("Brown universal tagset: %d, %d, " % ambiguous_universal, end="")
print("{:.2%}".format(ambiguous_universal[0]/ambiguous_universal[1]))

name: ambiguous, all, ratio
===
Brown regular tagset: 8729, 56057, 15.57%
Brown universal tagset: 3596, 56057, 6.41%


### Exercise 6

Write code to search the Brown Corpus to answer the following questions:

a) produce an alphabetically sorted list of the distinct words tagged as `MD`  
b) identify words that can be plural nouns or third person singular verbs  
c) print an alphabetically sorted list of distinct three-word prepositional phrases of the form `IN+AT+NN`, separated by semicolons


In [16]:
# Solution to a)
print(sorted(set([word.lower() for (word, tag) in tagged_words if tag == 'MD'])))

["c'n", 'can', 'colde', 'could', 'dare', 'kin', 'maht', 'mai', 'may', 'maye', 'mayst', 'might', 'must', 'need', 'ought', 'shall', 'should', 'shuld', 'shulde', 'wil', 'will', 'wilt', 'wod', 'wold', 'wolde', 'would']


In [17]:
# Solution to b)
nltk.help.upenn_tagset('NNS')
nltk.help.upenn_tagset('VBZ')

NNS: noun, common, plural
    undergraduates scotches bric-a-brac products bodyguards facets coasts
    divestitures storehouses designs clubs fragrances averages
    subjectivists apprehensions muses factory-jobs ...
VBZ: verb, present tense, 3rd person singular
    bases reconstructs marks mixes displeases seals carps weaves snatches
    slumps stretches authorizes smolders pictures emerges stockpiles
    seduces fizzes uses bolsters slaps speaks pleads ...


In [18]:
tagged_words_norm = [word.lower() for (word, tag) in tagged_words]
tagged_words_cfd = nltk.ConditionalFreqDist(tagged_words)

print(set([word for word in set(tagged_words_norm) if 'NNS' in tagged_words_cfd[word] and 'VBZ' in tagged_words_cfd[word]]))

{'reports', 'compresses', 'traces', 'experiences', 'shares', 'guides', 'cracks', 'honors', 'challenges', 'crops', 'thrusts', 'mounts', 'starts', 'flies', 'transfers', 'names', 'attacks', 'answers', 'bullies', 'parallels', 'dogs', 'points', 'passes', 'bridges', 'shocks', 'exhibits', 'stands', 'excuses', 'traps', 'wants', 'contracts', 'presses', 'hides', 'snatches', 'kids', 'deals', 'clouds', 'figures', 'springs', 'wishes', 'lies', 'pushes', 'drains', 'costs', 'caps', 'lapses', 'bottles', 'blows', 'imports', 'respects', 'champions', 'shows', 'records', 'clucks', 'supplies', 'dies', 'conducts', 'banks', 'checks', 'sports', 'houses', 'stops', 'features', 'escapes', 'strikes', 'acts', 'deserts', 'practices', 'jokes', 'stains', 'guarantees', 'sums', 'commands', 'reserves', 'photographs', 'bristles', 'dislikes', 'conflicts', 'slips', 'smiles', 'subjects', 'influences', 'sets', 'embraces', 'concerns', 'laps', 'surveys', 'captures', 'concentrates', 'shifts', 'shakes', 'drinks', 'leases', 'likes

In [19]:
# Solution to c)
three_word_phrases = set([(w1, w2, w3) for sent in brown.tagged_sents() for (w1, t1), (w2, t2), (w3, t3) in nltk.trigrams(sent) if t1 == 'IN' and t2 == 'AT' and t3 == 'NN'])

In [20]:
for phrase in sorted(three_word_phrases):
    print("%s %s %s" % phrase, end=";")



### Exercise 7

Write a function `prec_adv(word, text)` that returns an alphabetically sorted list of distinct adverbs that precede `word` in `text`. Use this function to find out which adverbs precede the words "love", "like", and "prefer" in the Brown corpus. 

In [21]:
def prec_adv(word, text):
    if hasattr(text, "tagged_words"):
        tagged_text = text.tagged_words(tagset='universal')
    else:
        # Assuming that text is an untagged string
        text = nltk.word_tokenize(text)
        tagged_text = nltk.pos_tag(text, tagset='universal')

    word_tag_pairs = nltk.bigrams(tagged_text)
    preceding_adverbs = set([preceding[0] for preceding, succeeding in word_tag_pairs if succeeding[0] == word and preceding[1] == 'ADV'])
    
    return list(sorted(preceding_adverbs))
    
t = "They had been doing this extraordinarily fast."
w = "fast"

print("Preceding 'love': ", prec_adv("love", brown))
print("Preceding 'like': ", prec_adv("like", brown))
print("Preceding 'prefer': ", prec_adv("prefer", brown))

Preceding 'love':  ['always', 'dearly', 'just', 'not']
Preceding 'like':  ['How', 'Jist', 'Just', 'Kinda', 'More', 'Not', 'abreast', 'almost', 'alone', 'always', 'around', 'by', 'close', 'deceptively', 'even', 'exactly', 'gloriously', 'here', 'increasingly', 'jist', 'just', 'less', 'more', 'much', 'not', 'often', 'particularly', 'quick', 'quite', 'rather', 'remarkably', 'roughly', 'simply', 'so', 'somewhat', 'sure', 'there', 'together', 'wildly', 'yet']
Preceding 'prefer':  ['generally', 'much', 'not', 'spontaneously']
