In [None]:
from nltk import word_tokenize, sent_tokenize
from collections import Counter
import copy
import numpy as np

In [None]:
def read_file(filename):
    """Load a text file and split it into tokenized sentences.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list with one entry per sentence found by ``sent_tokenize``; each
        entry is the list of tokens ``word_tokenize`` produced for that
        sentence.
    """
    # NOTE(review): the file is opened with the platform default encoding --
    # confirm this matches the encoding of the data files.
    with open(filename) as handle:
        text=handle.read()

    # Sentence-split the whole document first, then word-tokenize each sentence.
    return [word_tokenize(sentence) for sentence in sent_tokenize(text)]

In [None]:
# Read one corpus from disk and tokenize it into sentences, each a list of word tokens.
# Exactly one read_file(...) line below should be active; uncomment a different one
# (and comment out the current one) to train on another corpus.

# 2020 Democratic Party platform
# sequences=read_file("../data/democrat_platform_2020.txt")

# 2020 Republican Party platform
# sequences=read_file("../data/republican_platform_2020.txt")

# Pride and Prejudice (Jane Austen)
sequences=read_file("../data/stylometry/1342_pride_and_prejudice.txt")

# All of Shakespeare's plays
# sequences=read_file("../data/pg100_plays.txt")

# Upper bound on how many sentences are used for training: the model cells
# below slice the corpus as sequences[:max_sequences].
max_sequences=10000

In [None]:
class NgramModel():
    """Count-based n-gram language model stored as a sparse dictionary.

    Attributes:
        counts: dict mapping a conditioning context to a ``Counter`` of the
            words observed right after it. A context is the previous ``order``
            tokens joined by single spaces (the empty string when ``order`` is 0).
        order: Markov order (order 1 = conditioning on the previous 1 word;
            order 2 = previous 2 words, etc.).
        vocab: dict mapping every observed word (plus "[END]") to an integer
            id, assigned in order of first appearance.
    """

    def __init__(self, sequences, order):
        """Estimate n-gram counts from tokenized sentences.

        As a side effect, prints every (context, next word) pair of the first
        sentence so the conditioning scheme can be inspected.

        Args:
            sequences: Iterable of sentences, each an iterable of string tokens.
                The sentences themselves are never modified.
            order: Non-negative Markov order.

        Raises:
            ValueError: If ``order`` is negative.
        """
        if order < 0:
            # A negative order would silently build nonsense contexts through
            # negative list indexing, so reject it up front.
            raise ValueError("order must be non-negative, got %r" % (order,))

        # For this exercise we're going to encode the LM as a sparse dictionary (trading less storage for more compute)
        # We'll store the LM as a dictionary with the conditioning context as keys; each value is a
        # Counter object that keeps track of the number of times we see a word following that context.
        self.counts = {}

        # Markov order (order 1 = conditioning on previous 1 word; order 2 = previous 2 words, etc.)
        self.order = order

        # Word -> integer id; "[END]" is always id 0.
        self.vocab = {"[END]": 0}

        for s_idx, tokens in enumerate(sequences):
            # Pad with `order` [START] tokens and a single [END] token to encode the
            # beginning/end of the sentence. Building a new list leaves the caller's
            # token list untouched.
            padded = ["[START]"] * order + list(tokens) + ["[END]"]

            for i in range(order, len(padded)):
                context = " ".join(padded[i - order:i])
                word = padded[i]

                if word not in self.vocab:
                    self.vocab[word] = len(self.vocab)

                # For just the first sentence, print the conditioning context + word
                if s_idx == 0:
                    print("Context: %s Next: %s" % (context.ljust(50), word))

                if context not in self.counts:
                    self.counts[context] = Counter()
                self.counts[context][word] += 1

    def sample(self, context):
        """Draw one next word given a conditioning context.

        The word is drawn from the relative frequencies of the words observed
        after ``context`` during training, using NumPy's global random state.

        Args:
            context: Space-joined string of the previous ``order`` tokens
                ("" for an order-0 model).

        Returns:
            The sampled word (possibly "[END]").

        Raises:
            KeyError: If ``context`` was never observed during training.
        """
        if context not in self.counts:
            raise KeyError("context %r was never observed during training" % (context,))

        followers = self.counts[context]
        total = sum(followers.values())

        # Probability distribution over just the words seen after this context.
        words = list(followers)
        dist = [followers[word] / total for word in words]

        # A single multinomial trial gives a one-hot vector; argmax recovers the drawn index.
        index = np.argmax(np.random.multinomial(1, pvals=dist))
        return words[index]

    def generate_sequence(self):
        """Sample one sentence, printing each word as it is drawn.

        Generation starts from ``order`` [START] tokens and stops once "[END]"
        has been drawn (and printed). Nothing is returned.
        """
        generated = ["[START]"] * self.order
        word = None
        while word != "[END]":
            # generated[-0:] would be the whole history, so an order-0 model needs
            # the explicit empty context.
            if self.order > 0:
                context = " ".join(generated[-self.order:])
            else:
                context = ""
            word = self.sample(context)
            print(word)
            generated.append(word)
    
    

In [None]:
# Order-1 model: each word is conditioned on the single previous word.
# Trained on at most the first max_sequences sentences.
ngram1=NgramModel(sequences[:max_sequences], order=1)

In [None]:
# Sample one sentence from the order-1 model; words are printed one per line until [END].
ngram1.generate_sequence()

In [None]:
# Order-0 model: no conditioning context, so every word is counted under the empty context "".
ngram0=NgramModel(sequences[:max_sequences], order=0)

In [None]:
# Sample one sentence from the order-0 model; words are printed one per line until [END].
ngram0.generate_sequence()

In [None]:
# Order-2 model: each word is conditioned on the two previous words.
ngram2=NgramModel(sequences[:max_sequences], order=2)

In [None]:
# Sample one sentence from the order-2 model; words are printed one per line until [END].
ngram2.generate_sequence()

A1. Explore sampling sequences from LMs of different orders above. What do you notice about the structure of the generated text, and how does it differ across orders? Explore LMs trained on different datasets as well.

A2. In a second-order LM estimated from `1342_pride_and_prejudice.txt` above, what's $P(\textrm{are} | \textrm{Lady Lucas})$?

A3. Keep increasing the order of LMs (well past 3); compare the text that's generated to the original dataset (in the files above); are the LMs simply memorizing the source material?