<h1>3.8 Segmentation</h1>

<h3>Sentence Segmentation</h3>

In [1]:
# NLTK is used throughout the notebook; `brown` is a short handle on the
# Brown corpus reader.
import nltk
from nltk.corpus import brown


In [2]:
# Average sentence length in the Brown corpus: total tokens / total sentences.
n_brown_words = len(brown.words())
n_brown_sents = len(brown.sents())
avg_sent_len = n_brown_words / n_brown_sents
print(avg_sent_len)

20.250994070456922


In [3]:
import pprint
# Read one Gutenberg text as a single raw string, then split it into sentences.
text = nltk.corpus.gutenberg.raw("chesterton-thursday.txt")
sents = nltk.sent_tokenize(text)
# Pretty-print ten of the detected sentences (indices 79 through 88).
pprint.pprint(sents[79:89])

['"Nonsense!"',
 'said Gregory, who was very rational when anyone else\nattempted paradox.',
 '"Why do all the clerks and navvies in the\n'
 'railway trains look so sad and tired, so very sad and tired?',
 'I will\ntell you.',
 'It is because they know that the train is going right.',
 'It\n'
 'is because they know that whatever place they have taken a ticket\n'
 'for that place they will reach.',
 'It is because after they have\n'
 'passed Sloane Square they know that the next station must be\n'
 'Victoria, and nothing but Victoria.',
 'Oh, their wild rapture!',
 'oh,\n'
 'their eyes like stars and their souls again in Eden, if the next\n'
 'station were unaccountably Baker Street!"',
 '"It is you who are unpoetical," replied the poet Syme.']


<h3>Word Segmentation</h3>

In [4]:
# NOTE(review): _mypath is presumably a local helper that makes language_mod
# importable -- confirm; it is not otherwise referenced here.
import _mypath
import language_mod as lm

# Unsegmented text: words run together with no spaces.
text = "doyouseethekittyseethedoggydoyoulikethekittylikethedoggy"

# Each segmentation string is one character shorter than `text`; going by the
# lm.segment output shown for the next cell, a '1' at position i puts a
# boundary after character i of `text`.

# Segment by clusters of words
seg1 = "0000000000000001000000000010000000000000000100000000000"

# Segment by single words
seg2 = "0100100100100001001001000010100100010010000100010010000"

In [5]:
# Apply both boundary strings to the same text: seg1 yields the multi-word
# clusters, seg2 the individual words.
clust, words = (lm.segment(text, boundaries) for boundaries in (seg1, seg2))

print(clust, words, sep="\n\n")

['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']

['do', 'you', 'see', 'the', 'kitty', 'see', 'the', 'doggy', 'do', 'you', 'like', 'the', 'kitty', 'like', 'the', 'doggy']


<img src="../Images/brent.png">

In [6]:
# A third candidate segmentation, shown first as the words it produces.
seg3 = "0000100100000011001000000110000100010000001100010000001"
print(lm.segment(text, seg3))

# Score each candidate with lm.evaluate, in the order seg3, seg2, seg1.
# NOTE(review): presumably a lower score is better -- confirm in language_mod.
for candidate in (seg3, seg2, seg1):
    print(lm.evaluate(text, candidate))

['doyou', 'see', 'thekitt', 'y', 'see', 'thedogg', 'y', 'doyou', 'like', 'thekitt', 'y', 'like', 'thedogg', 'y']
47
48
64


In [7]:
# Search for a better segmentation of `text`, starting from seg1. The call
# prints a score next to the current segmentation as it goes, and the value
# echoed by the cell is a segmentation string.
# NOTE(review): 5000 and 1.2 are presumably the iteration count and cooling
# rate -- confirm against language_mod.anneal.
lm.anneal(text, seg1, 5000, 1.2)

64 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']
64 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']
64 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']
64 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']
64 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']
64 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']
64 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']
64 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']
62 ['doyousee', 'theki', 'ttyseet', 'hedoggy', 'doyoulikethekit', 'tyliket', 'hedoggy']
62 ['doyousee', 'theki', 'ttyseet', 'hedoggy', 'doyoulikethekit', 'tyliket', 'hedoggy']
62 ['doyousee', 'theki', 'ttyseet', 'hedoggy', 'doyoulikethekit', 'tyliket', 'hedoggy']
60 ['doyouseethekittyseet', 'hedoggy', 'do', 'youlikethek', 'ittyliket', 'hedoggy']
56 ['doyou', 'seethekittys', 'eet', 'hedoggy

'0000100010000001000100000010000100001000000100001000000'

<h1>3.9 Formatting: From Lists to Strings</h1>

<h3>From Lists to Strings</h3>

In [8]:
silly = ['We', 'called', 'him', 'Tortoise', 'because', 'he', 'taught', 'us', '.']

# str.join() is called on the separator: a space, a semicolon, and the empty
# string give the three variants below.
silly_sp, silly_sc, silly_nsp = (sep.join(silly) for sep in (' ', ';', ''))

print(silly_sp, silly_sc, silly_nsp, sep="\n")


We called him Tortoise because he taught us .
We;called;him;Tortoise;because;he;taught;us;.
WecalledhimTortoisebecausehetaughtus.


<h3>Strings and Formats</h3>

In [9]:
# print() writes the characters of a string; a bare name on the last line of a
# cell is echoed as its repr instead (note the quotes in the output).
word = "cat"
sentence = """Hello
world."""
print(word, sentence, sep="\n")
word


cat
Hello
world.


'cat'

In [10]:
# Echoing the bare name shows the repr, so the line break appears as \n.
sentence

'Hello\nworld.'

In [11]:
# Count how often each animal occurs, then list the counts in alphabetical
# order on a single line (end='; ' replaces the newline after each entry).
observations = ['dog', 'cat', 'dog', 'cat', 'dog',
                'snake', 'dog', 'cat']
fdist = nltk.FreqDist(observations)
for word in sorted(fdist):
    freq = fdist[word]
    print(word, '->', freq, end='; ')

cat -> 3; dog -> 4; snake -> 1; 

<h3>String Formatting</h3>

In [12]:
# "{} -> {};" is a format string: str.format() fills each {} placeholder with
# the corresponding argument, in order.
pair_fmt = "{} -> {};"
for word in sorted(fdist):
    entry = pair_fmt.format(word, fdist[word])
    print(entry, end=' ')

cat -> 3; dog -> 4; snake -> 1; 

In [13]:
# Extra args are ignored: there is only one {} placeholder, so only the first
# argument ("Carson") is used and "Pandas" is silently dropped.
"{} is a cool guy".format("Carson", "Pandas")


'Carson is a cool guy'

In [14]:
# Swap ordering: a number inside the braces selects the positional argument
# by index, so {1} takes "B" and {0} takes "A".
"from {1} to {0}".format("A", "B")


'from B to A'

In [15]:
# One template string, filled in once per item on the menu.
menu = ["sandwich", "bagle", "eagle"]
template = "Lee wants a {} right now"
for snack in menu:
    line = template.format(snack)
    print(line)


Lee wants a sandwich right now
Lee wants a bagle right now
Lee wants a eagle right now


In [16]:
# Numbers are right justified by default when a field width is given.
# Both fields below are 6 characters wide.

# Right Justified (default for numbers): padding goes on the left
print("{:6}".format(7))

# Left Justified ("<"): padding goes on the right
print("{:<6}".format(7))

     7
7     


In [17]:
# Strings are left justified by default: "dog" is padded on the right to width 6.
print("{:6}".format("dog"))

dog   


In [18]:
# ">" forces right justification: "dog" is padded on the left to width 6.
print("{:>6}".format("dog"))

   dog


In [19]:
# Specify Precision: ".4f" formats a float with four digits after the
# decimal point.
import math
print("{:.4f}".format(math.pi))

3.1416


In [20]:
# The "%" presentation type multiplies the ratio by 100 and appends a percent
# sign; ".4" keeps four digits after the decimal point.
count = 3205
total = 9375
"accuracy for {} words: {:.4%}".format(total, count / total)

'accuracy for 9375 words: 34.1867%'

In [37]:
from nltk import ConditionalFreqDist as cfd
from nltk.corpus import brown
import text_analysis as ta

# Condition on genre: feed the distribution one (genre, word) pair for every
# word of every Brown category.
genre_word_pairs = (
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre)
)
brown_cfd = cfd(genre_word_pairs)

In [38]:
# All Brown corpus category names; these become the table rows below.
genres = brown.categories()
print(genres)
# The modal verbs to count per genre; these become the table columns below.
modals = ["can", "could", "may", "might", "must", "will"]

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']


In [40]:
# Print a table from brown_cfd with one row per genre and one column per
# modal (see the output below); tabulate comes from the local text_analysis
# module.
ta.tabulate(brown_cfd, modals, genres)

Category            can  could    may  might   must   will 
adventure            46    151      5     58     27     50 
belles_lettres      246    213    207    113    170    236 
editorial           121     56     74     39     53    233 
fiction              37    166      8     44     55     52 
government          117     38    153     13    102    244 
hobbies             268     58    131     22     83    264 
humor                16     30      8      8      9     13 
learned             365    159    324    128    202    340 
lore                170    141    165     49     96    175 
mystery              42    141     13     57     30     20 
news                 93     86     66     38     50    389 
religion             82     59     78     12     54     71 
reviews              45     40     45     26     19     58 
romance              74    193     11     51     45     43 
science_fiction      16     49      4     12      8     16 


In [108]:
# Pad every genre name to one shared column width: the longest name plus 4.
# The nested {max_width} field takes the width from a keyword argument, so it
# is computed at run time rather than hard-coded in the format string.
m_width = len(max(genres, key=len)) + 4
row_template = "{:{max_width}} {}"
for genre in genres:
    print(row_template.format(genre, len(genre), max_width=m_width))

adventure           9
belles_lettres      14
editorial           9
fiction             7
government          10
hobbies             7
humor               5
learned             7
lore                4
mystery             7
news                4
religion            8
reviews             7
romance             7
science_fiction     15


<h3>Writing Results to a File</h3>

In [176]:
# Distinct tokens of the King James Genesis text.
words = set(nltk.corpus.genesis.words("english-kjv.txt"))

# The `with` block closes output.txt on exit (even if an error occurs), which
# flushes everything written; the original left the handle open.
with open("output.txt", "w") as output_file:
    # One alphanumeric word per line, in sorted order.
    for word in sorted(words):
        if word.isalnum():
            print(word, file=output_file)

    # The vocabulary size goes on the last line of the file. print() converts
    # its arguments to text itself, so no explicit str() is needed here; an
    # explicit conversion is only required with output_file.write().
    print(len(words), file=output_file)

# Show the same count in the notebook.
print(len(words))

2789


<h3>Text Wrapping</h3>

In [177]:
saying = ['After', 'all', 'is', 'said', 'and', 'done', ',',
          'more', 'is', 'said', 'than', 'done', '.']
# Print every token followed by its length in parentheses, all on one line.
for word in saying:
    length_tag = "({})".format(len(word))
    print(word, length_tag, end=' ')


After (5) all (3) is (2) said (4) and (3) done (4) , (1) more (4) is (2) said (4) than (4) done (4) . (1) 

In [181]:
from textwrap import fill

# Named piece_template rather than `format`, so the built-in format() is not
# shadowed for the rest of the kernel session. Uses str.format like the
# earlier formatting cells; the resulting text is unchanged.
piece_template = "{} ({}),"
pieces = [piece_template.format(word, len(word)) for word in saying]
output = ' '.join(pieces)

# fill() re-wraps the single long line at textwrap's default width
# (70 characters) and returns one string with embedded newlines.
wrapped = fill(output)
print(wrapped)

After (5), all (3), is (2), said (4), and (3), done (4), , (1), more
(4), is (2), said (4), than (4), done (4), . (1),
