# NLTK Book: Ch. 4. Writing Structured Programs

Code-only version of Chapter 4 of the [NLTK Book](https://www.nltk.org/book/ch04.html) for use in the TAHLR course.

NB: This notebook covers selected material only; you are encouraged to read through the full chapter at your own pace to rehearse programming concepts.


> The goal of this chapter is to answer the following questions:
> 1. How can you write well-structured, readable programs that you and others will be able to re-use easily?
> 2. How do the fundamental building blocks work, such as loops, functions and assignment?
> 3. What are some of the pitfalls with Python programming and how can you avoid them?

## 4.1 Back to the Basics

In [None]:
import nltk
import re
from pprint import pprint
from nltk import word_tokenize

In [None]:
# Booleans
# i.e. True vs. False
# bool() reports the truth value Python assigns to an object.

# An empty string is falsy.
empty_string = ""
print(empty_string)
print(bool(empty_string))

print()

# Any non-empty string is truthy.
full_string = "This is not an empty string"
print(full_string)
print(bool(full_string))

In [None]:
# An empty list is falsy.
empty_list = []
print(empty_list)
print(bool(empty_list))

print()

# A list with at least one item is truthy.
full_list = [1, 2, 3]
print(full_list)
print(bool(full_list))

In [None]:
# None is falsy.
none_value = None
print(none_value)
print(bool(none_value))

print()

# A non-zero number is truthy.
some_value = 1
print(some_value)
print(bool(some_value))

In [None]:
# Conditionals
# The branches are tested from top to bottom and only the first one whose
# condition holds is executed, so a grade of 89 prints "B".

grade = 89

if grade >= 90:
    print("A")
elif grade >= 80:
    print("B")
elif grade >= 70:
    print("C")
elif grade >= 60:
    print("D")
else:
    print("F")

In [None]:
# Only truthy elements pass the `if` test, so the empty string and the
# empty list are skipped.
mixed = ['cat', '', ['dog'], []]

for element in mixed:
    if element:
        print(element)

In [None]:
# Two independent `if` statements: each condition is checked on its own,
# so both 1 and 2 are printed.
animals = ['cat', 'dog']

if 'cat' in animals:
    print(1)

if 'dog' in animals:
    print(2)

In [None]:
# With `elif`, the second condition is only checked when the first one is
# false; since 'cat' is in `animals`, only 1 is printed.
if 'cat' in animals:
    print(1)
elif 'dog' in animals:
    print(2)    

In [None]:
# Conditions...
# all() is True only if the test holds for every item; short words such as
# 'No' make it False here.

sent = ['No', 'good', 'fish', 'goes', 'anywhere', 'without', 'a', 'porpoise', '.']
all(len(w) > 4 for w in sent)

In [None]:
# any() is True if at least one item passes the test, e.g. 'anywhere'.
any(len(w) > 4 for w in sent)

## 4.2 Sequences

In [None]:
# List, i.e. square brackets and comma-separated items

l = [1,2,3] # NEVER USE THE VARIABLE NAME `list` etc.
print(type(l))
print(l)

In [None]:
# Tuple, i.e. round brackets and comma-separated items

t = (1,2,3)
print(type(t))
print(t)

In [None]:
s = "123"
print(type(s))
print(s)

In [None]:
# These are all iterable, i.e. we can loop over them

print('\nIterating over a list')
for item in l:
    print(item)
print('\nIterating over a tuple')
for item in t:
    print(item)
print('\nIterating over a string')
for item in s:
    print(item)    

In [None]:
# Sets, i.e. curly brackets and comma-separated items
# Duplicates are discarded: the repeated 1 is stored only once.
s_ = {1,2,1}

print(type(s_))
print(s_)


In [None]:
for item in s_:
    print(item)

**Lazy evaluation**

> It is a widespread feature of Python 3 and NLTK 3 to only perform computation when required (a feature known as "lazy evaluation"). If you ever see a result like `<zip object at 0x10d005448>` when you expect to see a sequence, you can force the object to be evaluated just by putting it in a context that expects a sequence, like `list(x)`, or `for item in x`.

In [None]:
# This won't evaluate immediately

range(10)

In [None]:
# Wrapping a lazy sequence in `list` will evaluate it, but watch memory,
# e.g. list(range(1000000000)) would build a list with 1,000,000,000 items

list(range(10))

In [None]:
# Combining different sequence types
# e.g. sort a sentence by word length

words = 'I turned off the spectroroute'.split()
print(words)

In [None]:
wordlens = [(len(word), word) for word in words]
print(wordlens)

In [None]:
# Tuples are compared item by item, so the (length, word) pairs are ordered
# by length first and by the word itself when lengths are equal.
wordlens_sorted = sorted(wordlens)
print(wordlens_sorted)

In [None]:
# Drop the lengths (bound to the throwaway name `_`) and join the words
# back into a single space-separated string.
' '.join(w for (_, w) in wordlens_sorted)

In [None]:
# Generator expressions

text = '''"When I use a word," Humpty Dumpty said...'''
words = (word.lower() for word in word_tokenize(text)) # Note the parentheses; more memory efficient than list comprehension

In [None]:
print(words)

In [None]:
# Looping over the generator produces the words one at a time and, in doing
# so, uses the generator up.
for word in words:
    print(word)

In [None]:
print(words)

In [None]:
# If the generator has already been looped over, this prints nothing:
# a generator can only be consumed once.
for word in words:
    print(word)

In [None]:
# An exhausted generator yields no further items, so once it has been
# looped over this prints 0.
print(len(list(words)))

In [None]:
# Enumerate

logos = 'λόγος'

for i, char in enumerate(logos):
    print(f'Character {i}: {char}')

In [None]:
# The second argument to enumerate() sets the first index (here 1 instead
# of the default 0).
for i, char in enumerate(logos, 1):
    print(f'Character {i}: {char}')

In [None]:
# A study example with iteration, conditionals, list comprehension, enumerate, etc.; for in-class discussion
# Lists the most frequent Brown corpus words in rank order, stopping once
# their combined relative frequency exceeds 25%.

# NOTE(review): presumably requires the Brown corpus to be installed locally
# (e.g. via nltk.download('brown')) — confirm for the course setup.
fd = nltk.FreqDist(nltk.corpus.brown.words())

cumulative = 0.0  # running total of the frequencies seen so far

# Keep only the words, in the order returned by most_common().
most_common_words = [word for (word, count) in fd.most_common()]

for rank, word in enumerate(most_common_words, 1):  # ranks start at 1
    cumulative += fd.freq(word)
    # Tab-separated columns: rank, running total as a percentage, word.
    print(f'{rank}\t{cumulative:.2%}\t{word }')
    if cumulative > 0.25:
        break


## 4.4 Functions: The foundation of structured programming

In [None]:
import re
def get_text(file, encoding='utf-8'):
    """Read text from a file, normalizing whitespace and stripping HTML markup.

    Args:
        file: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            that the result does not depend on the platform's locale.

    Returns:
        The file contents as one string in which every ``<...>`` tag has been
        replaced by a space and every run of whitespace (including newlines)
        has been collapsed into a single space.
    """
    with open(file, 'r', encoding=encoding) as f:
        text = f.read()
    # Non-greedy match, so each tag is replaced on its own.
    text = re.sub(r'<.*?>', ' ', text)
    # Raw string: '\s' in a plain string literal is an invalid escape sequence.
    text = re.sub(r'\s+', ' ', text)
    return text

In [None]:
text = get_text('../data/texts/lyoc/homer-odyssey-1.txt')

In [None]:
print(text[:15])

In [None]:
help(get_text)

In [None]:
# Named arguments

def repeat(msg='<empty>', num=1):
    """Return `msg` multiplied by `num`, i.e. repeated `num` times for a string."""
    repeated = msg * num
    return repeated

repeat(num=3)

In [None]:
# A lone positional argument is bound to the first parameter, `msg`, so this
# evaluates 3 * 1 and returns the integer 3 rather than a repeated string.
repeat(3)

In [None]:
# repeat(msg='Alice', 3) # This won't work: a positional argument cannot follow a keyword argument (SyntaxError)

## 4.8 A Sample of Python Libraries

In [None]:
from numpy import arange
from matplotlib import pyplot

colors = 'rgbcmyk' # red, green, blue, cyan, magenta, yellow, black

def bar_chart(categories, words, counts,
              title='Frequency of Six Modal Verbs by Genre'):
    """Plot a grouped bar chart showing counts for each word by category.

    Args:
        categories: Sequence of category labels; one series of bars is drawn
            per category and each gets a legend entry.
        words: Sequence of words; one group of bars is drawn per word.
        counts: Mapping from each category to its sequence of counts, given
            in the same order as `words`.
        title: Title shown above the chart.
    """
    ind = arange(len(words))
    # Leave a gap one bar wide between neighbouring groups.
    width = 1 / (len(categories) + 1)
    bar_groups = []
    for c, category in enumerate(categories):
        # Shift each series right by one bar width; cycle through the colour
        # codes if there are more categories than colours.
        bars = pyplot.bar(ind + c * width, counts[category], width,
                          color=colors[c % len(colors)])
        bar_groups.append(bars)
    # Bars are centred on their x positions, so the middle of a group lies
    # halfway between its first and last bar.
    pyplot.xticks(ind + width * (len(categories) - 1) / 2, words)
    pyplot.legend([b[0] for b in bar_groups], categories, loc='upper left')
    pyplot.ylabel('Frequency')
    pyplot.title(title)
    pyplot.show()

In [None]:
genres = ['news', 'religion', 'hobbies', 'government', 'adventure']
modals = ['can', 'could', 'may', 'might', 'must', 'will']

# Count, per genre, how often each modal verb occurs in the Brown corpus.
cfdist = nltk.ConditionalFreqDist(
             (genre, word)
             for genre in genres
             for word in nltk.corpus.brown.words(categories=genre)
             if word in modals)

counts = {}

# One list of counts per genre, ordered like `modals`.
for genre in genres:
    counts[genre] = [cfdist[genre][word] for word in modals]

# The stray trailing "." after this call was a SyntaxError.
bar_chart(genres, modals, counts)

In [None]:
import networkx as nx
import matplotlib
from nltk.corpus import wordnet as wn

def traverse(graph, start, node):
    """Recursively add `node` and all of its hyponyms to `graph`.

    Args:
        graph: Graph being built. It must provide `add_edge(u, v)` and a
            `depth` dict, which is filled with the distance of every visited
            synset from `start`, keyed by synset name.
        start: Synset that distances are measured from.
        node: Synset to process in this call.
    """
    # `name` is a method in NLTK 3; call it so that node ids and depth keys
    # are plain strings rather than bound-method objects.
    name = node.name()
    graph.depth[name] = node.shortest_path_distance(start)
    for child in node.hyponyms():
        child_name = child.name()
        graph.add_edge(name, child_name)
        # A synset reachable through several parents only needs expanding
        # once; its edge to this parent has already been recorded above.
        if child_name not in graph.depth:
            traverse(graph, start, child)

def hyponym_graph(start):
    """Build an undirected graph of the hyponym hierarchy below `start`.

    The returned networkx graph carries an extra `depth` dict attribute,
    which `traverse` fills with each visited synset's distance from `start`.
    """
    graph = nx.Graph()
    graph.depth = {}  # populated by traverse()
    traverse(graph, start, start)
    return graph

def graph_draw(graph):
    """Draw `graph`, sizing each node by its degree and colouring it by depth.

    Args:
        graph: A networkx graph with a `depth` dict attribute that has an
            entry for every node.
    """
    # `import matplotlib` on its own does not load the pyplot submodule, so
    # import it explicitly instead of relying on another cell having done so.
    import matplotlib.pyplot as plt

    nx.draw_networkx(
        graph,
        node_size=[16 * graph.degree(n) for n in graph],
        node_color=[graph.depth[n] for n in graph],
        with_labels=False,
    )
    plt.show()

In [None]:
dog = wn.synset('dog.n.01')
graph = hyponym_graph(dog)
graph_draw(graph)