In [None]:
# Based on https://nbviewer.org/gist/yoavg/d76121dfde2618422139

In [None]:
# Import standard-library helpers, plus tqdm (third-party) for progress bars

import json
from collections import defaultdict, Counter
from tqdm import tqdm
from random import random


In [None]:
# Get a corpus of Shakespearean text

! [ -f shakespeare_input.txt ] || curl -O https://norvig.com/ngrams/shakespeare_input.txt
! wc shakespeare_input.txt

In [None]:
# Read in the data.
# The encoding is given explicitly so the result does not depend on the
# platform's default locale encoding.
with open('shakespeare_input.txt', encoding='utf-8') as f:
    text = f.read()

# Show a sample (first 100 characters)
print(text[:100])

In [None]:
# Define a function to build a simple character language model

def train_char_lm(data, order=4):
    """Estimate a character-level n-gram model from ``data``.

    Every window of ``order`` consecutive characters that is followed by at
    least one more character is treated as a context; the character right
    after it is counted as a continuation of that context.

    Args:
        data: the training text.
        order: number of characters of context used to predict the next one.

    Returns:
        A pair ``(model, span_probs)`` where ``model`` maps each context to a
        list of ``(character, probability)`` pairs and ``span_probs`` is a
        list of ``(context, probability)`` pairs giving how often each context
        occurred. Both lists are ordered from most to least frequent.
    """
    # context -> Counter of the characters seen right after it
    next_char_counts = defaultdict(Counter)

    # how often each context itself occurs (used later to pick a seed)
    context_counts = Counter()

    for start in tqdm(range(len(data) - order)):
        end = start + order
        context = data[start:end]
        next_char_counts[context][data[end]] += 1
        context_counts[context] += 1

    def normalize(counter):
        # Turn raw counts into relative frequencies, most common first
        total = float(sum(counter.values()))
        return [(item, count / total) for item, count in counter.most_common()]

    model = {}
    for context, followers in next_char_counts.items():
        model[context] = normalize(followers)

    return model, normalize(context_counts)


In [None]:
# Try training an order-4 language model.
# lm maps each 4-character history to its (next character, probability) list;
# span_probs holds the relative frequency of each 4-character span and is used
# later to pick a starting seed.
lm, span_probs = train_char_lm(text, order=4)


In [None]:
# Look at the (next character, probability) pairs recorded after the
# history "Rome", ordered from most to least frequent
lm['Rome']

In [None]:
# Look at the (next character, probability) pairs recorded after the
# history "Juli", ordered from most to least frequent
lm['Juli']

In [None]:
# Define a function to generate a single letter from the language model

def generate_letter(lm, history, order):
    """Sample the next character given the last ``order`` characters of ``history``.

    Args:
        lm: mapping from a context string to a list of
            ``(character, probability)`` pairs, as built by ``train_char_lm``.
        history: the text so far; only its last ``order`` characters are used.
        order: length of the context the model was trained with.

    Returns:
        One character drawn from the distribution stored for the context.

    Raises:
        KeyError: if the context does not appear in ``lm``.
        ValueError: if the distribution stored for the context is empty.
    """
    # Take the end of the text that is given. The start index is computed
    # explicitly because history[-order:] returns the WHOLE string when
    # order == 0 (since -0 == 0).
    context = history[max(len(history) - order, 0):]

    # Look up the LM probabilities
    dist = lm[context]
    if not dist:
        raise ValueError("empty distribution for context %r" % (context,))

    # Sample a random number between 0 and 1
    x = random()

    # Use that to choose a letter from the distribution
    for c, v in dist:
        x -= v
        if x <= 0:
            return c

    # Floating-point rounding can make the probabilities sum to slightly less
    # than 1, so a draw very close to 1 may fall past the last entry. Assign
    # that leftover mass to the last entry instead of returning None.
    return dist[-1][0]


In [None]:
# Test it out: generate a letter after "Juli" 10 times.
# Each call draws a fresh random number, so the printed letters can vary
# between calls and between runs.
for i in range(10):
    print(generate_letter(lm, 'Juli', 4))

In [None]:
# Define a function to randomly sample a seed span 

def generate_seed(span_probs):
    """Randomly pick a starting span according to its probability.

    Args:
        span_probs: list of ``(span, probability)`` pairs, as returned in the
            second position by ``train_char_lm``.

    Returns:
        One span drawn from ``span_probs``.

    Raises:
        ValueError: if ``span_probs`` is empty.
    """
    if not span_probs:
        raise ValueError("span_probs is empty; cannot sample a seed")

    x = random()
    for c, v in span_probs:
        x -= v
        if x <= 0:
            return c

    # Floating-point rounding can make the probabilities sum to slightly less
    # than 1, so a draw very close to 1 may fall past the last entry. Assign
    # that leftover mass to the last entry instead of returning None.
    return span_probs[-1][0]


In [None]:
# Define a function to generate text, one letter at a time

def generate_text(lm, seeds, order, nletters=1000):
    """Generate ``nletters`` characters from the language model.

    Generation starts from a randomly sampled seed span and then repeatedly
    samples the next character from the model, using the last ``order``
    characters as the context. The seed itself is not part of the result.

    If the current context has no entry in ``lm`` (for example the very last
    span of the training text, which has no following character), a fresh seed
    is sampled and generation continues from it rather than raising KeyError.

    Args:
        lm: mapping from context string to ``(character, probability)`` list.
        seeds: list of ``(span, probability)`` pairs to draw the seed from.
        order: context length the model was trained with.
        nletters: number of characters to generate.

    Returns:
        A string of ``nletters`` generated characters.
    """
    # Get a starting seed (a span of length order)
    history = generate_seed(seeds)

    # Collect the sampled letters here; joined once at the end
    out = []

    for _ in range(nletters):
        # Keep only the last `order` characters. The start index is explicit
        # because history[-order:] would return the whole string for order == 0.
        context = history[max(len(history) - order, 0):]

        # Dead end: this context was never followed by anything in training
        if context not in lm:
            context = generate_seed(seeds)

        # Sample one letter based on the context
        c = generate_letter(lm, context, order)
        out.append(c)

        # Only the context plus the new letter is needed for the next step
        history = context + c

    return ''.join(out)

In [None]:
# Estimate an order-1 character LM (each character is predicted from only
# the one before it).
# NOTE: this rebinds lm and span_probs, replacing the order-4 model trained earlier.
lm, span_probs = train_char_lm(text, order=1)

In [None]:
# Try generating some text (nletters defaults to 1000 characters).
# The order passed here (1) has to match the order lm was trained with,
# because lookups use the last `order` characters as the key.
print(generate_text(lm, span_probs, 1))

In [None]:
# Estimate an order-4 character LM (4 characters of context).
# NOTE: this rebinds lm and span_probs, replacing the order-1 model.
lm, span_probs = train_char_lm(text, order=4)

In [None]:
# Try generating some text (nletters defaults to 1000 characters).
# The order passed here (4) has to match the order lm was trained with,
# because lookups use the last `order` characters as the key.
print(generate_text(lm, span_probs, 4))

In [None]:
# Estimate an order-7 character LM (7 characters of context).
# NOTE: this rebinds lm and span_probs, replacing the order-4 model.
lm, span_probs = train_char_lm(text, order=7)

In [None]:
# Try generating some text (nletters defaults to 1000 characters).
# The order passed here (7) has to match the order lm was trained with,
# because lookups use the last `order` characters as the key.
print(generate_text(lm, span_probs, 7))