# 1. Language Processing and Python

Code-only version of Chapter 1 of the [NLTK Book](https://www.nltk.org/book/ch01.html) for use in the TAHLR course

## 1 Computing with language: texts and words

### 1.1 Getting started with Python

In [None]:
!pip install -U matplotlib

In [None]:
from pprint import pprint

In [None]:
1 + 5 * 2 - 3

### 1.2 Getting started with NLTK

In [None]:
import nltk

In [None]:
# Fetch only the "book" collection, which provides the data needed by
# `from nltk.book import *`. A bare nltk.download() opens an interactive
# chooser that stalls non-interactive or hosted notebook runs.
nltk.download('book')

In [None]:
from nltk.book import *

In [None]:
text1

In [None]:
text2

### 1.3 Searching text

In [None]:
text1.concordance("monstrous")

In [None]:
text1.similar("monstrous")

In [None]:
text2.similar("monstrous")

In [None]:
text2.common_contexts(["monstrous", "very"])

In [None]:
text4.dispersion_plot(["citizens", "democracy", "freedom", "duties", "America"])

In [None]:
text3.generate()

### 1.4 Counting vocabulary

In [None]:
len(text3)

In [None]:
print(sorted(set(text3)))

In [None]:
len(set(text3))

In [None]:
len(set(text3)) / len(text3)

In [None]:
text3.count("smote")

In [None]:
100 * text4.count('a') / len(text4)

In [None]:
def lexical_diversity(text):
    """Return the ratio of distinct tokens to total tokens in *text*.

    Args:
        text: A sized iterable of hashable tokens, e.g. a list of words.

    Returns:
        ``len(set(text)) / len(text)`` as a float, or ``0.0`` when *text*
        is empty.
    """
    total = len(text)
    # An empty text has no tokens; return 0.0 instead of dividing by zero.
    if total == 0:
        return 0.0
    return len(set(text)) / total

def percentage(count, total):
    """Express *count* as a percentage of *total*."""
    # Scale first, then divide, so the arithmetic order is unchanged.
    scaled = 100 * count
    return scaled / total

In [None]:
lexical_diversity(text3)

In [None]:
lexical_diversity(text5)

In [None]:
percentage(4, 5)

In [None]:
percentage(text4.count('a'), len(text4))

## 2 A closer look at Python: Texts as lists of words

### 2.1 Lists

In [None]:
# sent1 = ['Call', 'me', 'Ishmael', '.']
sent1

In [None]:
len(sent1)

In [None]:
lexical_diversity(sent1)

In [None]:
print(sent3)

In [None]:
['Monty', 'Python'] + ['and', 'the', 'Holy', 'Grail']

In [None]:
print(sent4 + sent1)

In [None]:
sent1.append("Some")
sent1

### 2.2 Indexing lists

In [None]:
text4[173]

In [None]:
text4.index('awaken')

In [None]:
print(text5[16715:16735])

In [None]:
print(text6[1600:1625])

In [None]:
sent = ['word1', 'word2', 'word3', 'word4', 'word5', 'word6', 'word7', 'word8', 'word9', 'word10']

In [None]:
sent[0]

In [None]:
sent[9]

In [None]:
# sent[10] # error! IndexError: list index out of range (valid indexes are 0-9)

In [None]:
sent[5:8]

In [None]:
sent[:3]

In [None]:
print(text2[141525:])

In [None]:
sent[0] = 'First'
sent[9] = 'Last'
len(sent)

In [None]:
sent[1:9] = ['Second', 'Third']

In [None]:
len(sent)

In [None]:
# sent[9] # error! IndexError: the list now has only 4 elements

### 2.3 Variables

In [None]:
my_sent = ['Bravely', 'bold', 'Sir', 'Robin', ',', 'rode', 'forth', 'from', 'Camelot', '.']
noun_phrase = my_sent[1:4]
noun_phrase

In [None]:
wOrDs = sorted(noun_phrase)
wOrDs

In [None]:
# not = 'Camelot' # error!

In [None]:
vocab = set(text1)
vocab_size = len(vocab)
vocab_size

### 2.4 Strings

In [None]:
name = 'Monty'

In [None]:
name[0]

In [None]:
name[:4]

In [None]:
name * 2

In [None]:
name + '!'

In [None]:
' '.join(['Monty', 'Python'])

In [None]:
'Monty Python'.split()

## 3 Computing with Language: Simple Statistics

In [None]:
saying = ['After', 'all', 'is', 'said', 'and', 'done', 'more', 'is', 'said', 'than', 'done']
# Drop duplicates and order the distinct tokens in one step.
tokens = sorted(set(saying))
# Show the last two tokens in sorted order.
tokens[-2:]

### 3.1 Frequency distributions

In [None]:
fdist1 = FreqDist(text1)
print(fdist1)

In [None]:
print(fdist1.most_common(50))

In [None]:
fdist1['whale']

In [None]:
fdist1.plot(50, cumulative=True)

In [None]:
fdist1.plot(50, cumulative=True);

### 3.2 Fine-grained selection of words

In [None]:
V = set(text1)
long_words = [w for w in V if len(w) > 15]
print(sorted(long_words))

In [None]:
# Count frequencies in the same text whose vocabulary is filtered below, so
# each looked-up count belongs to the word being tested.
fdist4 = FreqDist(text4)
# Long words (more than 7 characters) that also occur more than 7 times.
print(sorted(w for w in set(text4) if len(w) > 7 and fdist4[w] > 7))

### 3.3 Collocations and bigrams

In [None]:
list(bigrams(['more', 'is', 'said', 'than', 'done']))

In [None]:
text4.collocations()

In [None]:
text1.collocations()

### 3.4 Counting other things

In [None]:
print([len(w) for w in text1][:25])

In [None]:
fdist = FreqDist(len(w) for w in text1) 
fdist.most_common()

In [None]:
print(fdist.most_common())

In [None]:
fdist.max()

In [None]:
fdist[3]

In [None]:
fdist.freq(3)

In [None]:
print(round(fdist.freq(3), 3))

## 4 Back to Python: Making decisions and taking control

In [None]:
print(sent7)

In [None]:
[w for w in sent7 if len(w) < 4]

In [None]:
[w for w in sent7 if len(w) <= 4]

In [None]:
print([w for w in sent7 if len(w) != 4])

In [None]:
sorted(w for w in set(text1) if w.endswith('ableness'))

In [None]:
sorted(term for term in set(text4) if 'gnt' in term)

In [None]:
# sorted() returns a list and slicing it yields a list, so no list() wrapper
# is needed around the first 25 title-cased tokens.
print(sorted(item for item in set(text6) if item.istitle())[:25])

In [None]:
sorted(item for item in set(sent7) if item.isdigit())

In [None]:
sorted(w for w in set(text7) if '-' in w and 'index' in w)

In [None]:
sorted(wd for wd in set(text3) if wd.istitle() and len(wd) > 10)

In [None]:
sorted(w for w in set(sent7) if not w.islower())

In [None]:
print(sorted(t for t in set(text2) if 'cie' in t or 'cei' in t))

### 4.2 Operating on every element

In [None]:
print([len(w) for w in text1][:25])

In [None]:
print([w.upper() for w in text1][:25])

In [None]:
len(text1)

In [None]:
len(set(text1))

In [None]:
len(set(word.lower() for word in text1))

In [None]:
len(set(word.lower() for word in text1 if word.isalpha()))

### 4.3 Nested code blocks

In [None]:
word = 'cat'
if len(word) < 5:
    print('word length is less than 5')

In [None]:
if len(word) >= 5:
    print('word length is greater than or equal to 5')

In [None]:
for word in ['Call', 'me', 'Ishmael', '.']:
    print(word)

### 4.4 Looping with conditions

In [None]:
sent1 = ['Call', 'me', 'Ishmael', '.']

for xyzzy in sent1:
    if xyzzy.endswith('l'):
        print(xyzzy)

In [None]:
for token in sent1:
    if token.islower():
        print(token, 'is a lowercase word')
    elif token.istitle():
        print(token, 'is a titlecase word')
    else:
        print(token, 'is punctuation')

In [None]:
tricky = sorted(w for w in set(text2) if 'cie' in w or 'cei' in w)

for word in tricky:
    print(word, end=' ')