<h1>2 Conditional Frequency Distributions</h1>

<h1>2.1 Conditions and Events</h1>

In [2]:
text = ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said']

# Pairs are of the form: (condition, event)
pairs = [('news', 'The'), ('news', 'Fulton'), ('news', 'County')]

<h1>2.2 Counting Words by Genre</h1>

In [4]:
import nltk
from nltk.corpus import brown
cfd = nltk.ConditionalFreqDist(
            (genre, word)
            for genre in brown.categories()
            for word in brown.words(categories=genre))

In [7]:
# For each genre, loop over every word in the genre
# producing pairs consisting of the genre and the word
genres = ["news", "romance"]
genre_word_tups = [(genre, word)
              for genre in genres
              for word in brown.words(categories=genre)]
len(genre_word_tups)
genre_word_tups

[('news', 'The'),
 ('news', 'Fulton'),
 ('news', 'County'),
 ('news', 'Grand'),
 ('news', 'Jury'),
 ('news', 'said'),
 ('news', 'Friday'),
 ('news', 'an'),
 ('news', 'investigation'),
 ('news', 'of'),
 ('news', "Atlanta's"),
 ('news', 'recent'),
 ('news', 'primary'),
 ('news', 'election'),
 ('news', 'produced'),
 ('news', '``'),
 ('news', 'no'),
 ('news', 'evidence'),
 ('news', "''"),
 ('news', 'that'),
 ('news', 'any'),
 ('news', 'irregularities'),
 ('news', 'took'),
 ('news', 'place'),
 ('news', '.'),
 ('news', 'The'),
 ('news', 'jury'),
 ('news', 'further'),
 ('news', 'said'),
 ('news', 'in'),
 ('news', 'term-end'),
 ('news', 'presentments'),
 ('news', 'that'),
 ('news', 'the'),
 ('news', 'City'),
 ('news', 'Executive'),
 ('news', 'Committee'),
 ('news', ','),
 ('news', 'which'),
 ('news', 'had'),
 ('news', 'over-all'),
 ('news', 'charge'),
 ('news', 'of'),
 ('news', 'the'),
 ('news', 'election'),
 ('news', ','),
 ('news', '``'),
 ('news', 'deserves'),
 ('news', 'the'),
 ('news', 'p

In [11]:
cfd = nltk.ConditionalFreqDist(genre_word_tups)
print(cfd)
print(cfd.conditions())

<ConditionalFreqDist with 2 conditions>
['news', 'romance']


In [15]:
print(cfd["news"])
print(cfd["romance"])

# Find the 20 most common words in the text
print(cfd["romance"].most_common(20))

# Find how many times "romance" occurs
print(cfd["romance"]["could"])

<FreqDist with 14394 samples and 100554 outcomes>
<FreqDist with 8452 samples and 70022 outcomes>


193

<h1>2.3 Plotting and Tabulating Distributions</h1>

In [20]:
# This generates the pair EX: ("america", 1865)
from nltk.corpus import inaugural
cfd = nltk.ConditionalFreqDist(
            (target, fileid[:4])
            for fileid in inaugural.fileids()
            for w in inaugural.words(fileid)
            for target in ["america", "citizen"]
            if w.lower().startswith(target))


In [21]:
from nltk.corpus import udhr
languages = ['Chickasaw', 'English', 'German_Deutsch',
             'Greenlandic_Inuktikut', 'Hungarian_Magyar', 'Ibibio_Efik']
cfd = nltk.ConditionalFreqDist(
            (lang, len(word))
            for lang in languages
            for word in udhr.words(lang + "-Latin1"))

In [22]:
cfd.tabulate(conditions=["English", "German_Deutsch"],
             samples=range(10), cumulative=True)

                  0    1    2    3    4    5    6    7    8    9 
       English    0  185  525  883  997 1166 1283 1440 1558 1638 
German_Deutsch    0  171  263  614  717  894 1013 1110 1213 1275 


<h1>2.4 Generating Random Text with Bigrams</h1>

In [24]:
sent = ['In', 'the', 'beginning', 'God', 'created', 'the', 'heaven',
        'and', 'the', 'earth', '.']
list(nltk.bigrams(sent))

[('In', 'the'),
 ('the', 'beginning'),
 ('beginning', 'God'),
 ('God', 'created'),
 ('created', 'the'),
 ('the', 'heaven'),
 ('heaven', 'and'),
 ('and', 'the'),
 ('the', 'earth'),
 ('earth', '.')]

In [30]:
def generate_model(cfdist, word, num=15):
    for i in range(num):
        print(word, end=" ")
        # Reset word to be the most likely word to occur near
        # the previous word
        word = cfdist[word].max()

# Import a text
text = nltk.corpus.genesis.words("english-kjv.txt")
# Make bigrams of the text
bigrams = nltk.bigrams(text)
# Create a cfd from the bigrams
# This tracks which words are most likely to follow a given word
cfd = nltk.ConditionalFreqDist(bigrams)

print(cfd["living"])
print(generate_model(cfd, "living"))

<FreqDist with 6 samples and 16 outcomes>
living creature that he said , and the land of the land of the land None
