# Human behind the Curtain, or Ghost in the Machine?
#### Exploring the foundations of AI with Python

### Example: Drawing marbles from a jar

In [None]:
# Contents of the jar: marble color -> number of marbles of that color
marbles_sample = dict(red=10, green=15, blue=15, yellow=10)

In [None]:
from random import choices
from collections import Counter

In [None]:
# Number of draws to simulate
num_selections = 1000

# Draw colors at random, each weighted by how many marbles of that color
# are in the jar (random.choices samples with replacement).
marbles_sim = choices(
    list(marbles_sample),
    weights=list(marbles_sample.values()),
    k=num_selections,
)

In [None]:
# Tally how many times each color was drawn. Counter consumes an iterable
# directly, so no explicit counting loop is needed.
marbles_sim_counter = Counter(marbles_sim)

In [None]:
# Compare how often each color was drawn with its true share of the jar.
# The jar size is computed from the data instead of being hard-coded, so the
# comparison stays correct if `marbles_sample` is edited.
# NOTE: despite the "percent" labels, both printed values are fractions in [0, 1].
total_marbles = sum(marbles_sample.values())
for marble, count in marbles_sim_counter.items():
    print('color: {}, percent_seen: {}, percent_in_sample: {}'.format(
        marble,
        count / num_selections,
        marbles_sample[marble] / total_marbles))

## Text tables 

### Building a probability distribution (1): single letters

In [None]:
# Download NLTK's 'gutenberg' resource, then import the corpus object
# that the following cells read from.
import nltk
nltk.download('gutenberg')
from nltk.corpus import gutenberg

In [None]:
# Show the identifiers of the files available in the corpus
gutenberg.fileids()

In [None]:
# Load the corpus's raw text; the cells below treat `texts` as a single string
texts = gutenberg.raw()

In [None]:
# Every distinct character in the corpus that is not a letter
# (digits, punctuation, whitespace, ...)
non_alpha = {c for c in set(texts) if not c.isalpha()}

In [None]:
# Delete every non-letter character in a single pass (str.translate drops any
# character mapped to None) instead of one full-corpus replace() per character,
# then lowercase the result once.
texts_cleaned = texts.translate(dict.fromkeys(map(ord, non_alpha))).lower()

In [None]:
# Preview the first 1000 characters of the cleaned text
texts_cleaned[:1000]

In [None]:
def create_dist(text):
    '''
    Given a Python string, compute the distribution of characters.

    Returns a Counter mapping each character in `text` to the number of
    times it occurs. An empty string yields an empty Counter.
    '''
    # Counter tallies the elements of any iterable directly.
    return Counter(text)

In [None]:
# Character frequencies of the cleaned text
dist = create_dist(texts_cleaned)

In [None]:
def generate_text(dist, length=500):
    '''
    Build a random string of `length` characters from a character distribution.

    `dist` maps each character to its weight (e.g. an observed frequency);
    every output character is sampled independently in proportion to it.
    '''
    symbols = list(dist)
    weights = list(dist.values())
    sampled = choices(symbols, weights=weights, k=length)
    return ''.join(sampled)

In [None]:
# Sample a text of the default length (500) from the character frequencies
generate_text(dist)

In [None]:
# Same as the earlier non-letter filter, but whitespace characters are NOT
# included here, so they survive the cleaning step that uses this set.
non_alpha_spc = {c for c in set(texts) if not c.isalpha() and not c.isspace()}

In [None]:
# Delete every character that is neither a letter nor whitespace in a single
# pass (str.translate drops any character mapped to None), then lowercase once.
# Lowercasing outside the removal step avoids re-lowercasing the whole corpus
# for each removed character, and still happens when there is nothing to remove.
texts_cleaned = texts.translate(dict.fromkeys(map(ord, non_alpha_spc))).lower()

In [None]:
import re
# Collapse each run of whitespace (spaces, newlines, tabs, ...) into one space.
# The pattern is a raw string so that `\s` reaches the regex engine as written
# instead of being parsed as an (invalid) string escape sequence.
texts_cleaned = re.sub(r'\s+', ' ', texts_cleaned)

In [None]:
# Preview the first 1000 characters of the re-cleaned text
texts_cleaned[:1000]

In [None]:
# Recompute the character frequencies from the current cleaned text,
# then sample a new text from them
dist = create_dist(texts_cleaned)
generate_text(dist)

In [None]:
# For comparison: 500 characters drawn uniformly (no weights) from the
# distinct characters of the cleaned text
''.join(choices(list(set(texts_cleaned)), k=500))

## Markov chains

### Building a probability distribution (2): transitions from 1 letter

In [None]:
from collections import defaultdict
def create_dist_pairs(text):
    '''
    Build a transition table from a Python string: for every character, count
    how often each other character comes immediately after it.

    The result maps character -> Counter of {following character: count}.
    '''
    table = defaultdict(Counter)  # missing characters start with an empty Counter
    # Pair each character with its successor; the last character has none.
    for current, following in zip(text, text[1:]):
        table[current][following] += 1
    return table

In [None]:
def generate_text_from_pairs(dist, length=500):
    '''
    Given a transition table, create a text by random sampling.

    `dist` maps each character to a mapping {next character: frequency}.
    The text starts with a character picked uniformly from the table's keys,
    followed by up to `length` further characters, each sampled according to
    the transitions recorded for the character before it. The result therefore
    has at most length + 1 characters.

    Generation stops early when the current character has no recorded
    successor (for example a character that only occurs at the very end of
    the training text). An empty table yields an empty string. The table
    itself is never modified.
    '''
    if not dist:
        return ''
    first_char = choices(list(dist.keys()))[0] # Using a uniform distribution: any character equally likely
    chars = [first_char] # Collected in a list and joined once at the end
    for _ in range(length):
        # .get() instead of dist[...]: indexing a defaultdict would insert an
        # empty entry into the caller's table for every unseen character.
        transitions = dist.get(first_char)
        if not transitions:
            break # Dead end: nothing was ever observed after this character
        next_char = choices(list(transitions.keys()), weights=list(transitions.values()))[0]
        chars.append(next_char)
        first_char = next_char # Reset for the next time through
    return ''.join(chars)

In [None]:
# Character-to-next-character transition counts for the cleaned text
pair_dist = create_dist_pairs(texts_cleaned)

In [None]:
# Sample a text in which each character depends on the one before it
generate_text_from_pairs(pair_dist)

### Building a probability distribution (3): transitions with n-grams

In [None]:
def create_dist_n(sequence, n=2):
    '''Build an n-gram transition table from a sliceable sequence
    (a string of characters, a list of words, ...).

    Outer keys are tuples of n consecutive elements; each maps to a Counter
    whose keys are the elements seen right after that n-gram and whose values
    are how many times that transition occurred.'''
    table = defaultdict(Counter)
    # Only the positions that still have an element after their n-gram count.
    window_count = len(sequence[:-n])
    for start in range(window_count):
        stop = start + n
        context = tuple(sequence[start:stop])  # tuples are hashable, so usable as dict keys
        table[context][sequence[stop]] += 1
    return table

In [None]:
def generate_text_from_ngrams(dist, length=500, sep=''):
    '''
    Given an n-gram transition table, create a text by random sampling.

    `dist` maps each n-gram (a tuple of string elements) to a mapping
    {next element: frequency}. The output starts with an n-gram picked
    uniformly from the table's keys, followed by up to `length` further
    elements, each sampled according to the transitions recorded for the
    n most recent elements. All elements are joined with `sep`
    (use '' for characters, ' ' for words).

    Generation stops early when the current n-gram has no recorded successor
    (for example an n-gram that only occurs at the very end of the training
    sequence). An empty table yields an empty string. The table itself is
    never modified.
    '''
    if not dist:
        return ''
    first_elem = choices(list(dist.keys()))[0] # Uniform choice of the starting n-gram
    size = len(first_elem)
    elements = list(first_elem) # The text to be generated starts with this n-gram
    for _ in range(length):
        # .get() instead of dist[...]: indexing a defaultdict would insert an
        # empty entry into the caller's table for every unseen n-gram.
        transitions = dist.get(first_elem)
        if not transitions:
            break # Dead end: nothing was ever observed after this n-gram
        next_elem = choices(list(transitions.keys()), weights=list(transitions.values()))[0]
        elements.append(next_elem)
        first_elem = tuple(elements[-size:]) # Slide the window forward by one element
    return sep.join(elements)

In [None]:
# n=2: each key is a tuple of two consecutive characters, mapped to the
# counts of the character that follows them
dist_3 = create_dist_n(texts_cleaned, n=2)

In [None]:
# Sample a text in which each character depends on the two before it
generate_text_from_ngrams(dist_3)

In [None]:
# n=3: a longer context -- each character now depends on the three before it
dist_4 = create_dist_n(texts_cleaned, n=3)
generate_text_from_ngrams(dist_4)

In [None]:
# Word-level table: split the text on whitespace and key the transitions
# on pairs of consecutive words
dist_words = create_dist_n(texts_cleaned.split(), n=2)

In [None]:
# Sample from the word-level table, joining the generated words with spaces
generate_text_from_ngrams(dist_words, length=100, sep=' ')