# Computing with Language: Simple Statistics 

In [1]:
import nltk
from nltk.book import *

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


In [2]:
# Recap of the previous topic: from a list of tokens to its sorted vocabulary.
saying = 'After all is said and done more is said than done'.split()
print(saying)

# Drop the duplicates with set(), then put the distinct words in sorted order.
tokens = sorted(set(saying))
print(tokens)

# Negative slicing: the last two entries of the sorted vocabulary.
print(tokens[-2:])

['After', 'all', 'is', 'said', 'and', 'done', 'more', 'is', 'said', 'than', 'done']
['After', 'all', 'and', 'done', 'is', 'more', 'said', 'than']
['said', 'than']


## Frequency Distributions

In [3]:
# Which words of a text say the most about its topic and genre?
# Start from a frequency distribution: a count of every token in text1.
fdist1 = FreqDist(text1)
vocabulary1 = list(fdist1)  # the distinct tokens, i.e. the distribution's keys

print(fdist1)
print('\n')

# Show 50 distinct tokens. NOTE: the recorded output lists them in order of
# first appearance in the book, not ranked by count -- to get the 50 most
# frequent tokens use fdist1.most_common(50) instead.
print("printing first 50 words from text1 ::")
print(vocabulary1[:50])
print('\n')

# Count of the lowercase token 'whale'. The label below says "Whale", but the
# key that is actually looked up is the lowercase form.
print('Frequency of "Whale" in the text1 ::')
print(fdist1['whale'])

<FreqDist with 19317 samples and 260819 outcomes>


printing first 50 words from text1 ::
['[', 'Moby', 'Dick', 'by', 'Herman', 'Melville', '1851', ']', 'ETYMOLOGY', '.', '(', 'Supplied', 'a', 'Late', 'Consumptive', 'Usher', 'to', 'Grammar', 'School', ')', 'The', 'pale', '--', 'threadbare', 'in', 'coat', ',', 'heart', 'body', 'and', 'brain', ';', 'I', 'see', 'him', 'now', 'He', 'was', 'ever', 'dusting', 'his', 'old', 'lexicons', 'grammars', 'with', 'queer', 'handkerchief', 'mockingly', 'embellished', 'all']


Frequency of "Whale" in the text1 ::
906


In [4]:
# Cumulative frequency plot limited to 50 samples of fdist1
# (per the NLTK documentation these are the most frequent ones).
# The trailing semicolon keeps the notebook from echoing the returned Axes
# object's repr (which contains a memory address) below the figure.
fdist1.plot(50, cumulative=True);


<Figure size 640x480 with 1 Axes>

<matplotlib.axes._subplots.AxesSubplot at 0x7f9698a31be0>

In [5]:
# If the most frequent words don't help, what about the words that occur
# only once -- the so-called hapaxes?
# Compute the hapax list a single time and reuse it for both the count and the
# preview, instead of asking the distribution to rebuild it twice.
hapaxes = fdist1.hapaxes()

print("total no of words occured only once in text document :: ")
print(len(hapaxes))
print("\n")
# Preview: the first 50 hapaxes.
print(hapaxes[:50])

total no of words occured only once in text document :: 
9002


['Herman', 'Melville', ']', 'ETYMOLOGY', 'Late', 'Consumptive', 'School', 'threadbare', 'lexicons', 'mockingly', 'flags', 'mortality', 'signification', 'HACKLUYT', 'Sw', 'HVAL', 'roundness', 'Dut', 'Ger', 'WALLEN', 'WALW', 'IAN', 'RICHARDSON', 'KETOS', 'GREEK', 'CETUS', 'LATIN', 'WHOEL', 'ANGLO', 'SAXON', 'WAL', 'HWAL', 'SWEDISH', 'ICELANDIC', 'BALEINE', 'BALLENA', 'FEGEE', 'ERROMANGOAN', 'Librarian', 'painstaking', 'burrower', 'grub', 'Vaticans', 'stalls', 'higgledy', 'piggledy', 'gospel', 'promiscuously', 'commentator', 'belongest']


## Fine-Grained Selection of Words

In [6]:
# Find the long words in the corpus.
# Set-builder notation   {w | w ∈ V and P(w)}
# maps directly onto the list comprehension   [w for w in V if P(w)].
# Here V is the vocabulary of text1 and P(w) is "w has more than 15 characters".
v = set(text1)
long_words = [word for word in v if len(word) > 15]

# Set iteration order is arbitrary, so sort before printing.
long_words_sorted = sorted(long_words)
print(long_words_sorted)

['CIRCUMNAVIGATION', 'Physiognomically', 'apprehensiveness', 'cannibalistically', 'characteristically', 'circumnavigating', 'circumnavigation', 'circumnavigations', 'comprehensiveness', 'hermaphroditical', 'indiscriminately', 'indispensableness', 'irresistibleness', 'physiognomically', 'preternaturalness', 'responsibilities', 'simultaneousness', 'subterraneousness', 'supernaturalness', 'superstitiousness', 'uncomfortableness', 'uncompromisedness', 'undiscriminating', 'uninterpenetratingly']


In [9]:
# Words in text5 that are both long (more than 7 characters) and frequent
# (occurring more than 7 times).
fdist5 = FreqDist(text5)
frequent_long_words = [
    word
    for word in set(text5)
    if len(word) > 7 and fdist5[word] > 7
]
frequent_long_words.sort()
print(frequent_long_words)


['#14-19teens', '#talkcity_adults', '((((((((((', '........', 'Question', 'actually', 'anything', 'computer', 'cute.-ass', 'everyone', 'football', 'innocent', 'listening', 'remember', 'seriously', 'something', 'together', 'tomorrow', 'watching']
