<h1>4.1 Back to the Basics</h1>

<h3>Shallow Copy</h3>

In [45]:
i = [1, 2, 3]
j = [4, 5, 6]
k = [7, 8, 9]

a = [i, j, k]
# Copy the object references from a
b = a[:]
b[0].append(-7)

print("A: {}\n\nB: {}".format(a, b,))

A: [[1, 2, 3, -7], [4, 5, 6], [7, 8, 9]]

B: [[1, 2, 3, -7], [4, 5, 6], [7, 8, 9]]


<h3>Deep Copy</h3>

In [46]:
from copy import deepcopy as dc

c = dc(a)
c[0].append(-999)
print("A: {}\n\nB: {}\n\nC: {}".format(a, b, c))

A: [[1, 2, 3, -7], [4, 5, 6], [7, 8, 9]]

B: [[1, 2, 3, -7], [4, 5, 6], [7, 8, 9]]

C: [[1, 2, 3, -7, -999], [4, 5, 6], [7, 8, 9]]


<h3>Equality</h3>

In [47]:
size = 5
python = ["Python"]
snake_nest = [python] * size

def test(nest):
    print(nest)
    # ==: same values
    if nest[0] == nest[1] == nest[2] == nest[3] == nest[4]:
        print("KITTY CAT")
    # is: same identity
    if nest[0] is nest[1] is nest[2] is nest[3] is nest[4]:
        print("PUPPY DOG")

def check_ids(nest):
    for snake in nest:
        print(id(snake))
            
test(snake_nest)
check_ids(snake_nest)

print("\n\nround 2")
import random
pos = random.choice(range(size))
snake_nest[pos] = ["Python"]

test(snake_nest)
check_ids(snake_nest)

[['Python'], ['Python'], ['Python'], ['Python'], ['Python']]
KITTY CAT
PUPPY DOG
4412257992
4412257992
4412257992
4412257992
4412257992


round 2
[['Python'], ['Python'], ['Python'], ['Python'], ['Python']]
KITTY CAT
4412257992
4412257992
4412087240
4412257992
4412257992


In [48]:
sent = ['No', 'good', 'fish', 'goes', 'anywhere', 'without', 'a', 'porpoise', '.']
print(all(len(w) > 4 for w in sent))

print(any(len(w) > 4 for w in sent))


False
True


<h1>4.2 Sequences</h1>

In [49]:
faves = ["Chocolate", "Racecars"]
tup = "David", 234, faves
tup

('David', 234, ['Chocolate', 'Racecars'])

In [50]:
import nltk
from nltk import word_tokenize
raw = 'Red lorry, yellow lorry, red lorry, yellow lorry.'
text = word_tokenize(raw)
fdist = nltk.FreqDist(text)
print(sorted(fdist))
for key in fdist:
    print("{}: {}; ".format(key, fdist[key]), end=' ')

[',', '.', 'Red', 'lorry', 'red', 'yellow']
Red: 1;  lorry: 4;  ,: 3;  yellow: 2;  red: 1;  .: 1;  

In [51]:
words = ['I', 'turned', 'off', 'the', 'spectroroute']
words[2], words[3], words[4] = words[3], words[4], words[2]
print(words)

['I', 'turned', 'the', 'spectroroute', 'off']


In [52]:
words = ['I', 'turned', 'off', 'the', 'spectroroute']
tags = ['noun', 'verb', 'prep', 'det', 'noun']

print(zip(words, tags))
print(list(zip(words, tags)))
print(list(enumerate(words)))


<zip object at 0x10b67f648>
[('I', 'noun'), ('turned', 'verb'), ('off', 'prep'), ('the', 'det'), ('spectroroute', 'noun')]
[(0, 'I'), (1, 'turned'), (2, 'off'), (3, 'the'), (4, 'spectroroute')]


<h3>Divide Up Training/Test Data</h3>

In [53]:
text = nltk.corpus.nps_chat.words()
cut = int(0.9 * len(text))

In [54]:
training_data, test_data = text[:cut], text[cut:]

# Verify that no data was lost
print(text == training_data + test_data)

# Verify ratio of sizes is correct
print(len(training_data) / len(test_data))

True
9.0


<h3>Combining Different Sequence Types</h3>

In [55]:
words = 'I turned off the spectroroute'.split()
wordlens = [(len(word), word) for word in words]

print("Before Sorting: {}".format(wordlens))
wordlens.sort()
print('\n\n')
print("After Sorting: {}".format(wordlens))

' '.join(w for (_, w) in wordlens)

Before Sorting: [(1, 'I'), (6, 'turned'), (3, 'off'), (3, 'the'), (12, 'spectroroute')]



After Sorting: [(1, 'I'), (3, 'off'), (3, 'the'), (6, 'turned'), (12, 'spectroroute')]


'I off the turned spectroroute'

<h3>Generator Expressions</h3>

In [56]:
text = ('''"When I use a word," Humpty Dumpty said in rather a scornful tone, \
"it means just what I choose it to mean - neither more nor less."''')
gen_tokens = (w.lower() for w in word_tokenize(text) if w.isalnum())
print(gen_tokens)

# Can call sequence function on tokens
print(max(gen_tokens))


<generator object <genexpr> at 0x10b061af0>
word


In [57]:
fd = nltk.FreqDist(nltk.corpus.brown.words())

In [58]:
cumulative = 0.0
most_common_words = [word for (word, count) in fd.most_common()
                     if word.isalnum()]
print(most_common_words[:10])

for rank, word in enumerate(most_common_words):
    cumulative += fd.freq(word)
    print("{:3} {:8.2%} {:>6}".format(rank + 1, cumulative, word))
    if cumulative > 0.25:
        break


['the', 'of', 'and', 'to', 'a', 'in', 'that', 'is', 'was', 'for']
  1    5.40%    the
  2    8.51%     of
  3   10.91%    and
  4   13.13%     to
  5   15.01%      a
  6   16.69%     in
  7   17.58%   that
  8   18.44%     is
  9   19.28%    was
 10   20.04%    for
 11   20.67%    The
 12   21.27%   with
 13   21.85%     it
 14   22.43%     as
 15   22.99%     he
 16   23.55%    his
 17   24.10%     on
 18   24.65%     be
 19   25.09%      I


In [59]:
import _mypath
import text_analysis as ta
sent = ['The', 'dog', 'gave', 'John', 'the', 'newspaper']
n = 3

# With my function
my_ngrams = ta.extract_ngrams(n, sent)
for ng in my_ngrams:
    print(ng)

print('\n\n')
    
# With nltk's
n_grams = list(nltk.ngrams(sent, 3))
for ng in n_grams:
    print(ng, )

[('The', 'dog', 'gave'), ('dog', 'gave', 'John'), ('gave', 'John', 'the'), ('John', 'the', 'newspaper')]



('The', 'dog', 'gave')
('dog', 'gave', 'John')
('gave', 'John', 'the')
('John', 'the', 'newspaper')


In [60]:
# Build a matrix
m, n = 9, 5

matrix = [[(i + 1, j + 1) for j in range(n)] for i in range(m)]
for row in matrix:
    print(row)

[(1, 1), (1, 2), (1, 3), (1, 4), (1, 5)]
[(2, 1), (2, 2), (2, 3), (2, 4), (2, 5)]
[(3, 1), (3, 2), (3, 3), (3, 4), (3, 5)]
[(4, 1), (4, 2), (4, 3), (4, 4), (4, 5)]
[(5, 1), (5, 2), (5, 3), (5, 4), (5, 5)]
[(6, 1), (6, 2), (6, 3), (6, 4), (6, 5)]
[(7, 1), (7, 2), (7, 3), (7, 4), (7, 5)]
[(8, 1), (8, 2), (8, 3), (8, 4), (8, 5)]
[(9, 1), (9, 2), (9, 3), (9, 4), (9, 5)]


<h1>4.4 Functions</h1>

<h3>Defensive Programming</h3>

In [61]:
def tag(word):
    error = "Argument to tag() must be a string"
    assert isinstance(word, str), error
    if word in ['a', 'the', 'all']:
        return 'det'
    else:
        return 'noun'

print(tag("the"))
print(tag("cat"))
# This will throw an error
# print(tag(["a", "the"]))



det
noun


In [62]:
from urllib import request
from bs4 import BeautifulSoup
constitution = "http://www.archives.gov/exhibits/charters/constitution_transcript.html"

def freq_words(url, n):
    html = request.urlopen(url).read().decode("utf8")
    text = BeautifulSoup(html, "lxml").get_text()
    freqdist = nltk.FreqDist(word.lower() for word in
                             word_tokenize(text))
    return [word for (word, _) in fd.most_common(n)]
    
freq_words(constitution, 10)

['the', ',', '.', 'of', 'and', 'to', 'a', 'in', 'that', 'is']

<h3>A Beautifully Documented Function</h3>

In [63]:
def accuracy(reference, test):
    """
    Calculate the fraction of test items that equal the corresponding reference items.

    Given a list of reference values and a corresponding list of test values,
    return the fraction of corresponding values that are equal.
    In particular, return the fraction of indexes
    {0<i<=len(test)} such that C{test[i] == reference[i]}.

        >>> accuracy(['ADJ', 'N', 'V', 'N'], ['N', 'N', 'V', 'ADJ'])
        0.5

    :param reference: An ordered list of reference values
    :type reference: list
    :param test: A list of values to compare against the corresponding
        reference values
    :type test: list
    :return: the accuracy score
    :rtype: float
    :raises ValueError: If reference and length do not have the same length
    """

    if len(reference) != len(test):
        raise ValueError("Lists must have the same length.")
    num_correct = 0
    for x, y in zip(reference, test):
        if x == y:
            num_correct += 1
    return float(num_correct) / len(reference)

<h1>4.5 Doing More with Functions</h1>

<h3>Functions as Arguments</h3>

In [64]:
sent = ['Take', 'care', 'of', 'the', 'sense', ',', 'and', 'the',
        'sounds', 'will', 'take', 'care', 'of', 'themselves', '.']

# ()'s ommitted when treating the function as an object
def extract_property(prop):
    return [prop(word) for word in sent]

extract_property(len)

# This same thing can be done with map
mapped = map(len, sent)
print(list(mapped))


[4, 4, 2, 3, 5, 1, 3, 3, 6, 4, 4, 4, 2, 10, 1]


<h3>Lambda</h3>

In [65]:
extract_property(lambda w: w[-1])

['e', 'e', 'f', 'e', 'e', ',', 'd', 'e', 's', 'l', 'e', 'e', 'f', 's', '.']

In [66]:
print(sorted(sent))

# Sort by longest length to shortest length
print(sorted(sent, key = lambda x: (- len(x))))

[',', '.', 'Take', 'and', 'care', 'care', 'of', 'of', 'sense', 'sounds', 'take', 'the', 'the', 'themselves', 'will']
['themselves', 'sounds', 'sense', 'Take', 'care', 'will', 'take', 'care', 'the', 'and', 'the', 'of', 'of', ',', '.']


<h3>Accumulative Functions</h3>

In [69]:
def find_ss(substring, words):
    result = []
    for word in words:
        if substring in word:
            results.append(word)
    return result

def find_ss_gen(substring, words):
    for word in words:
        if substring in word:
            yield word

In [70]:
words = nltk.corpus.brown.words()
for item in find_ss_gen("zz", words):
    print(item, end=" ")
    

Grizzlies' fizzled Rizzuto huzzahs dazzler jazz Pezza Pezza Pezza embezzling embezzlement pizza jazz Ozzie nozzle drizzly puzzle puzzle dazzling Sizzling guzzle puzzles dazzling jazz jazz Jazz jazz Jazz jazz jazz Jazz jazz jazz jazz Jazz jazz dizzy jazz Jazz puzzler jazz jazzmen jazz jazz Jazz Jazz Jazz jazz Jazz jazz jazz jazz Jazz jazz jazz jazz jazz jazz jazz jazz jazz jazz Jazz Jazz jazz jazz nozzles nozzle puzzle buzz puzzle blizzard blizzard sizzling puzzled puzzle puzzle muzzle muzzle muezzin blizzard Neo-Jazz jazz muzzle piazzas puzzles puzzles embezzle buzzed snazzy buzzes puzzled puzzled muzzle whizzing jazz Belshazzar Lizzie Lizzie Lizzie Lizzie Lizzie Lizzie Lizzie Lizzie Lizzie's Lizzie Lizzie Lizzie Lizzie Lizzie Lizzie Lizzie Lizzie Lizzie blizzard blizzards blizzard blizzard fuzzy Lazzeri Piazza piazza palazzi Piazza Piazza Palazzo Palazzo Palazzo Piazza Piazza Palazzo palazzo palazzo Palazzo Palazzo Piazza piazza piazza piazza Piazza Piazza Palazzo palazzo Piazza piazz

In [75]:
def permutations(seq):
    if len(seq) <= 1:
        yield seq
    else:
        for perm in permutations(seq[1:]):
            for i in range(len(perm) + 1):
                yield perm[:i] + seq[0:1] + perm[i:]
                
list(permutations(["police", "fish", "buffalo"]))

[['police', 'fish', 'buffalo'],
 ['fish', 'police', 'buffalo'],
 ['fish', 'buffalo', 'police'],
 ['police', 'buffalo', 'fish'],
 ['buffalo', 'police', 'fish'],
 ['buffalo', 'fish', 'police']]

<h3>Named Arguments</h3>

In [80]:
def repeat(msg="<empty>", num=1):
    return msg * num

repeat(num=3)
repeat(msg="Jiggles", num=10)


'JigglesJigglesJigglesJigglesJigglesJigglesJigglesJigglesJigglesJiggles'

In [86]:
def generic(*args, **kwargs):
    print("args: ", args)
    print("kwargs: ", kwargs)
    
generic(1, "African swallow", monty="python")

args:  (1, 'African swallow')
kwargs:  {'monty': 'python'}


<h3>Star</h3>

In [87]:
song = [["four", "calling", "birds"],
        ["three", "french", "hens"],
        ["two", "turtle", "doves"]]

list(zip(song[0], song[1], song[2]))

[('four', 'three', 'two'),
 ('calling', 'french', 'turtle'),
 ('birds', 'hens', 'doves')]

In [88]:
list(zip(*song))

[('four', 'three', 'two'),
 ('calling', 'french', 'turtle'),
 ('birds', 'hens', 'doves')]

In [95]:
def f(val1="a", val2="b"):
    print(val1, val2)