# NLTK 
- natural language toolkit 
- very popular, easy to use
- excellent example of integrating functionality with Python
- there is an excellent tutorial [book](http://www.nltk.org/book/) about NLTK
- you can learn quite a bit about NLP by reading the book and playing around

In [None]:
import nltk

In [None]:
# Opens the NLTK downloader GUI. It comes up in a separate window,
# possibly hidden behind others, so you may have to search for it.
# You can grab just the files for the book,
# or everything if you want - several gigabytes.

nltk.download()

In [None]:
from nltk.book import *

In [None]:
# short, descriptive aliases for some of the numbered texts
# made available by `from nltk.book import *`
sense = text2
inaug = text4
chat = text5
monty = text6
wsj = text7
personals = text8

In [None]:
# show every match of the word together with some surrounding context

inaug.concordance('greatness')

In [None]:
# show each match of 'affection' in `sense` with some context
sense.concordance('affection')

In [None]:
# how often 'affection' appears in `sense`
sense.count('affection')

In [None]:
%matplotlib inline

inaug.dispersion_plot(\
        ["citizens", "democracy", 
         "freedom", "duties", "America"])

In [None]:
# total word count
# NLTK defines __len__ on the text to be its number of words;
# type() shows what kind of object wsj is

[len(wsj), type(wsj)]

In [None]:
# unique word count - a set keeps only one copy of each word

len(set(wsj))

In [None]:
# first 20 words of the set, in arbitrary order (because sets are unordered)

list(set(wsj))[:20]

In [None]:
# sort the distinct words, then look at a ten-word slice
# (sorted() already returns a list)

sorted(set(wsj))[5000:5010]

In [None]:
# ask how often each of a few chosen words appears in the text

list(map(wsj.count, ('business', 'debt', 'inflation')))

In [None]:
# get a word frequency distribution for the text,
# then list its 30 most common entries

fd=FreqDist(wsj)
fd.most_common(30)


In [None]:
# can query the frequency distribution like a dictionary;
# the last two entries look up 'any' both ways (fd[...] and wsj.count)
# so the two results can be compared

[fd['as'], fd['apartment'], fd['apart'],
 fd['any'], wsj.count('any')]

In [None]:
# average number of uses per distinct word
# (total words / unique words) - about 8 for this text

len(wsj)/len(set(wsj)) 

In [None]:
# find long words used as many times as the number of
# characters in the word. print a count and the words
# that work at each length

# the vocabulary is the same for every length, so build the set once
# instead of once per loop iteration
vocab = set(wsj)

for wlen in range(8, 15):
    # list comprehension over the vocabulary: keep the words that are
    # wlen characters long and also occur exactly wlen times
    words = [w for w in vocab if len(w) == wlen == fd[w]]
    print(wlen, len(words), words)

In [None]:
# collocations are word pairs that occur together more often than
# the individual word frequencies would suggest
# singles text...

print(len(personals))
# collocation_list() gives 'w1 w2' strings in older NLTK releases and
# (w1, w2) tuples in newer ones; joining tuples directly raises
# TypeError, so turn every entry into a string first
print('; '.join(c if isinstance(c, str) else ' '.join(c)
                for c in personals.collocation_list()))

In [None]:
print(len(wsj))
# collocation_list() gives 'w1 w2' strings in older NLTK releases and
# (w1, w2) tuples in newer ones; joining tuples directly raises
# TypeError, so turn every entry into a string first
print('; '.join(c if isinstance(c, str) else ' '.join(c)
                for c in wsj.collocation_list()))

In [None]:
# split a sample sentence into tokens, then display the token list
jj = nltk.word_tokenize('''Jack always gets very nervous and stressed 
during the last three weeks of the Columbia term, 
but Jill actually enjoys the tension and deadlines!''')
jj


In [None]:
# tag each token with its part of speech; the result is a
# sequence of (word, tag) pairs
out = nltk.pos_tag(jj)
out

In [None]:
# can never remember what the tags mean:
# collect the distinct tags (a set drops the duplicates), sort them,
# and print the help entry for each one

for tag in sorted({pos for _token, pos in out}):
    nltk.help.upenn_tagset(tag)

# find anagrams and palindromes in a corpus

In [None]:
import collections
import random

In [None]:
# make sorted version of word to use as a key:
# words that are anagrams of each other get the same key

''.join(sorted('sdf'))

In [None]:
# use the brown corpus:
# The Brown Corpus was the first million-word electronic corpus of English,
# created in 1961 at Brown University. This corpus contains text from 500 sources,
# and the sources have been categorized by genre, such as news, editorial, and so on

brownwords = nltk.corpus.brown.words()
brownset = set(brownwords)

# anagram dict: group the distinct words under their sorted letters,
# so words that are anagrams of each other share one key
anas = collections.defaultdict(list)
for word in brownset:
    signature = ''.join(sorted(word))
    anas[signature].append(word)

# palindromes read the same forwards and backwards
pals = [word for word in brownset if word == word[::-1]]

# get rid of all-digit words and of words with no anagrams;
# collect the keys first so the dict is not changed while iterating it
unwanted = [sig for sig, group in anas.items()
            if len(group) == 1 or sig.isdigit()]
for sig in unwanted:
    del anas[sig]

# random.choice needs a sequence, so keep the remaining keys in a list
keys = list(anas)


def find_anagram():
    """Return the list of words for one randomly chosen anagram key."""
    return anas[random.choice(keys)]


[len(brownwords), len(brownset), len(pals), len(keys)]

In [None]:
# palindromes longer than three characters
[p for p in pals if len(p) > 3]

In [None]:
# print 15 randomly chosen anagram groups
for j in range(15):
    print(find_anagram())

In [None]:
# anagram groups with more than 3 words, biggest groups first

more = [group for group in anas.values() if len(group) > 3]
sorted(more, key=len, reverse=True)