# Python Data Structure Demo

## Text Analysis and Generation

In [None]:
import urllib.request

md_url = 'https://www.gutenberg.org/files/2701/2701-0.txt'
# read the body inside a `with` block so the HTTP response (and its
# underlying connection) is closed as soon as the text has been fetched
with urllib.request.urlopen(md_url) as response:
    md_text = response.read().decode()

In [None]:
idx = md_text.index('Call me Ishmael')

In [None]:
idx

In [None]:
md_text[idx:idx+100]

In [None]:
md_text[idx:idx+100].split()

In [None]:
md_words = md_text.lower().split()

In [None]:
len(md_words)

In [None]:
md_words_uniq = set(md_words)
len(md_words_uniq)

In [None]:
# compute the frequency of each word in the text
# (stored as md_word_counts, the name the following cells read from)
md_word_counts = {}
for w in md_words:
    # unseen words start from 0; then count this occurrence
    md_word_counts[w] = md_word_counts.get(w, 0) + 1

In [None]:
md_word_counts['the']

In [None]:
len(md_word_counts)

In [None]:
list(md_word_counts.items())[:10]

In [None]:
md_word_counts_sorted = sorted(md_word_counts.items())

In [None]:
md_word_counts_sorted[:10]

In [None]:
import urllib.request

sw_url = 'https://moss.cs.iit.edu/stopwords.txt'
# read the body inside a `with` block so the HTTP response is closed
# once the stopword list has been downloaded
with urllib.request.urlopen(sw_url) as response:
    sw_text = response.read().decode()
# the list is whitespace-separated: one entry per stopword
stopwords = sw_text.split()

In [None]:
# remove stopwords
# (rebuild md_word_counts keeping only the non-stopwords; the set makes
# each membership test O(1) instead of a scan of the stopwords list)
stopword_set = set(stopwords)
md_word_counts = {w: c for w, c in md_word_counts.items()
                  if w not in stopword_set}

In [None]:
md_word_counts_sorted = sorted(md_word_counts.items(), key=lambda t: t[1], reverse=True)

In [None]:
md_word_counts_sorted[:10]

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

# need lists of the first n words/counts to plot
n = 10
top = md_word_counts_sorted[:n]  # (word, count) pairs; may be fewer than n
words = [w for w, _ in top]
counts = [c for _, c in top]

plt.rcParams['figure.figsize'] = [12, 5]
# derive the x positions from the data itself so the bar heights and the
# tick labels always have matching lengths
plt.bar(range(len(counts)), counts)
plt.xticks(range(len(words)), words, rotation=60, fontsize=12)
plt.show()

In [None]:
# collect all two-word phrases as tuples
# (pair every word with its successor; zip stops at the shorter sequence,
# so the final word does not start a phrase)
phrases = list(zip(md_words, md_words[1:]))

In [None]:
phrases[:10]

In [None]:
# map each word to a list of all the words that
# follow it in the text
phrase_dict = {}
for (w1, w2) in phrases:
    # create w1's follower list on first sight, then record w2; repeated
    # followers are kept, so the list reflects how often each one occurs
    phrase_dict.setdefault(w1, []).append(w2)

In [None]:
phrase_dict['starboard']

In [None]:
# generate a sentence based on two-word phrase statistics
# from Moby Dick
import random

# seed the sentence, then repeatedly append a random follower of the
# most recently generated word
gen_words = ['whale']
for _ in range(10):
    followers = phrase_dict.get(gen_words[-1])
    if not followers:
        # no recorded follower for this word, so the sentence ends early
        break
    gen_words.append(random.choice(followers))

' '.join(gen_words)

## Solving Sudoku

In [None]:
puzzle = '''..3 .2. 6..
            9.. 3.5 ..1
            ..1 8.6 4..
            
            ..8 1.2 9..
            7.. ... ..8
            ..6 7.8 2..
            
            ..2 6.9 5..
            8.. 2.3 ..9
            ..5 .1. 3..
            '''

In [None]:
# name all the squares in the game
rows = 'ABCDEFGHI'
cols = '123456789'
# row letter + column digit, in row-major order ('A1', 'A2', ..., 'I9');
# parse_puzzle pairs squares[i] with the i-th puzzle character, and the
# puzzle strings are written row by row, so this order matters
squares = [r + c for r in rows for c in cols]

In [None]:
def parse_puzzle(puz_str):
    '''Turn a puzzle string into a dict mapping each square name to
    its given digit (as a one-character string), or to None when the
    square is blank. Spaces and newlines in the string are ignored; any
    other non-digit character marks a blank square.'''
    cells = []
    for ch in puz_str:
        if ch in ' \n':
            continue  # layout only, not a cell
        cells.append(ch if ch in '123456789' else None)
    # the i-th cell read belongs to the i-th square name
    return {sq: cells[i] for i, sq in enumerate(squares)}

In [None]:
parse_puzzle(puzzle)

In [None]:
# what squares fall in the same columns? (list of lists)
# (one unit per column digit, holding that column's square in every row)
vert_units = [[r + c for r in rows] for c in cols]

In [None]:
# what squares fall in the same rows? (list of lists)
# (one unit per row letter, holding that row's square in every column)
horiz_units = [[r + c for c in cols] for r in rows]

In [None]:
# what squares fall in the same "boxes"? (list of lists)
# (one unit per 3x3 box: every combination of a group of three rows
# with a group of three columns)
box_units = [[r + c for r in row_group for c in col_group]
             for row_group in ('ABC', 'DEF', 'GHI')
             for col_group in ('123', '456', '789')]

In [None]:
all_units = vert_units + horiz_units + box_units

In [None]:
# associate each square with a list of all the units it belongs to

units = {s: [u for u in all_units if s in u] for s in squares}

In [None]:
units['A1']

In [None]:
# associate each square with the set of all its "peers" (i.e., all
# other squares that fall into one of its units)

# a set, so a square shared by two of s's units is only listed once
peers = {s: {p for u in units[s] for p in u if p != s} for s in squares}

In [None]:
peers['A1']

In [None]:
# what is a "catch-all" solution?
# (one that rules nothing out yet: every square may still hold any digit;
# this is the same starting state solve_puzzle builds before assigning)
sol = {s: '123456789' for s in squares}

In [None]:
sol

In [None]:
# solve sudoku via constraint satisfaction, adapted from Norvig (https://norvig.com/sudoku.html)
#
# constraints:
#   - if a square is assigned a (single) value, eliminate that value from its peers
#   - if a unit has only one square that can hold a given value, assign the value there

def assign(sol, sq, val):
    '''Settle square `sq` on `val` by eliminating every other value
    the solution still allows for that square.'''
    # take the values to drop up front: eliminate() rewrites sol[sq]
    rejected = sol[sq].replace(val, '')
    for digit in rejected:
        eliminate(sol, sq, digit)

def eliminate(sol, sq, val):
    '''Remove `val` from the values allowed in square `sq`, then
    propagate the consequences:

    - if `sq` is left with a single value, eliminate that value
      from all of its peers
    - if, in any unit containing `sq`, only one square can still
      hold `val`, assign `val` to that square

    Does nothing when `val` was already eliminated from `sq`.'''
    if val not in sol[sq]:
        return

    remaining = sol[sq].replace(val, '')
    sol[sq] = remaining

    # the square is now decided, so none of its peers may hold its value
    if len(remaining) == 1:
        decided = remaining[0]
        for peer in peers[sq]:
            eliminate(sol, peer, decided)

    # val has left this square; check each unit for a single place
    # where val can still go
    for unit in units[sq]:
        homes = [s for s in unit if val in sol[s]]
        if len(homes) == 1:
            assign(sol, homes[0], val)

In [None]:
def solve_puzzle(puzz_str):
    '''Parse the puzzle string and assign each of its given digits,
    starting from a solution in which every square allows all nine.
    Returns a dict mapping each square name to a string of the values
    still allowed there.'''
    givens = parse_puzzle(puzz_str)
    sol = dict.fromkeys(squares, '123456789')

    for sq, val in givens.items():
        if not val:
            continue  # blank square: nothing to assign
        assign(sol, sq, val)
    return sol

In [None]:
solve_puzzle(puzzle)

In [None]:
def print_sol(sol):
    '''Print the solution as a grid: one line per row, with each
    square's value centered in a 6-character field followed by a space.'''
    cell_fmt = '{:^6} '
    for row in rows:
        for col in cols:
            print(cell_fmt.format(sol[row + col]), end='')
        print()  # end the current grid line

In [None]:
print_sol(solve_puzzle(puzzle))

In [None]:
# NYTimes "Medium" puzzle
print_sol(solve_puzzle('''
.51 8.. 3.6
.3. ... ...
..9 .42 .15

..4 .75 ...
3.. ... ...
.8. 9.. ...

... ... 8..
.1. ..6 .9.
..7 ... ..4
'''))

## California Housing Data

In [None]:
# NB: makes use of housing data included in Google Colab sample datasets

csv_file = '/content/sample_data/california_housing_train.csv'
with open(csv_file) as infile:
    print(infile.readline())

In [None]:
# load CSV file contents into a dictionary
# (maps each column name, exactly as written in the header line, to the
# list of that column's values converted to float)
housing_data = {}

with open(csv_file) as infile:
    # split the header by hand so the names keep their surrounding quotes,
    # which is how later cells look them up (housing_data['"population"'])
    col_names = infile.readline().strip().split(',')
    for name in col_names:
        housing_data[name] = []

    for line in infile:
        line = line.strip()
        if not line:
            continue  # ignore blank lines
        # assumes every data field is numeric (the later cells sum a column)
        for name, field in zip(col_names, line.split(',')):
            housing_data[name].append(float(field))

In [None]:
housing_data['"population"'][:10]

In [None]:
sum(housing_data['"population"']) / len(housing_data['"population"'])

## Specialized Data Structure: Pandas DataFrame

In [None]:
import pandas

df = pandas.read_csv('/content/sample_data/california_housing_train.csv')

In [None]:
df['population'][:10]

In [None]:
df['population'].mean()