In [28]:
import string
import random
from typing import List
import time

# N-gram model practice, written with the help of the tutorial linked below:

# https://towardsdatascience.com/text-generation-using-n-gram-model-8d12d9802aa0



# Creating a function to tokenize the text. 
def tokenize(text) -> List[str]:
    """Split ``text`` into word and punctuation tokens.

    Every character listed in ``string.punctuation`` is padded with a space on
    each side so that it becomes a token of its own; the padded text is then
    split on whitespace.
    """
    # One translation pass is equivalent to replacing the marks one by one,
    # because the inserted text (spaces and the mark itself) never contains a
    # different punctuation character.
    padded_marks = {ord(mark): ' ' + mark + ' ' for mark in string.punctuation}
    return text.translate(padded_marks).split()

In [29]:
# Function that splits text into ngrams and adds starting strings to beginning. 

def get_ngrams(n: int, tokens: list) -> list:
    """Pair every token with the ``n - 1`` tokens that precede it.

    The token list is first left-padded with ``n - 1`` ``'<START>'`` markers so
    that the first real token also gets a full-length context.

    Returns a list of ``(context, token)`` pairs, where ``context`` is a tuple
    holding the ``n - 1`` preceding tokens in their original order.
    """
    padded = ['<START>'] * (n - 1) + tokens
    pairs = []
    for position in range(n - 1, len(padded)):
        # Offsets run from the farthest predecessor (n - 1 back) to the
        # nearest (1 back), which keeps the context in left-to-right order.
        history = tuple(padded[position - back] for back in range(n - 1, 0, -1))
        pairs.append((history, padded[position]))
    return pairs


In [82]:
# Final ngram class

class NgramModel:
    def __init__(self, n):
        self.n = n
        
        self.context = {}
        
        self.ngram_counter = {}
    
    # Updates model after each ngram is analyzed.
    def update(self, sentence: str) -> None:

        n = self.n
        ngrams = get_ngrams(n, tokenize(sentence))
        for ngram in ngrams:
            if ngram in self.ngram_counter:
                self.ngram_counter[ngram] += 1.0
            else:
                self.ngram_counter[ngram] = 1.0

            prev_words, target_word = ngram
            if prev_words in self.context:
                self.context[prev_words].append(target_word)
            else:
                self.context[prev_words] = [target_word]
                
    # Sets up conditional probability for text generation.
    def cond_prob(self, context, token):
        
        try:
            count_of_token = self.ngram_counter[(context, token)]
            count_of_context = float(len(self.context[context]))
            result = count_of_token / count_of_context

        except KeyError:
            result = 0.0
        return result

    
    def random_token(self, context):
        nur = random.random()
        map_to_probs = {}
        token_of_interest = self.context[context]
        for token in token_of_interest:
            map_to_probs[token] = self.cond_prob(context, token)

        summ = 0
        for token in sorted(map_to_probs):
            summ += map_to_probs[token]
            if summ > nur:
                return token
            
    # Text generation function
    def generate_text(self, token_count: int):
        
        n = self.n
        context_queue = (n - 1) * ['<START>']
        result = []
        for _ in range(token_count):
            obj = self.random_token(tuple(context_queue))
            result.append(obj)
            if n > 1:
                context_queue.pop(0)
                if obj == '.':
                    context_queue = (n - 1) * ['<START>']
                else:
                    context_queue.append(obj)
        return ' '.join(result)

In [83]:
def create_ngram_model(n, path, encoding='utf-8'):
    """Build an ``NgramModel`` of order ``n`` from the text file at ``path``.

    The file contents are split on ``'.'``; each non-blank fragment gets its
    period restored and is passed to ``NgramModel.update`` as one sentence.

    Args:
        n: n-gram order handed to ``NgramModel``.
        path: location of the text file to read.
        encoding: text encoding used to decode the file. Given explicitly so
            the result does not depend on the platform's default encoding.

    Returns:
        The populated ``NgramModel``.
    """
    m = NgramModel(n)
    with open(path, 'r', encoding=encoding) as f:
        text = f.read()

    for fragment in text.split('.'):
        # Splitting leaves blank fragments after the final period and inside
        # runs such as "..."; skip them so the model does not learn a sentence
        # that consists of a lone '.'.
        if not fragment.strip():
            continue
        m.update(fragment + '.')
    return m

In [87]:
# Train a 6-gram model on the contents of moby.txt.
model = create_ngram_model(n=6, path='moby.txt')

In [88]:
# Generate 200 tokens (words and punctuation marks) from the model trained on Moby Dick.

print(model.generate_text(token_count=200))

A short , stout , ruddy young fellow , very pugnacious concerning whales , who somehow seemed to think that the great leviathans had personally and hereditarily affronted him ; and therefore it was a sort of point of honour with him , to destroy them whenever encountered . My father , in old Tolland county , cut down a pine tree once , and found a silver ring grown over in it ; some old darkey’s wedding ring . Come hither ! bury thyself in a life which , to your now equally abhorred and abhorring , landed world , is more oblivious than death . Born in throes , ‘tis fit that man should live in pains and die in pangs ! So be it , then . ” “With what ? ” shouted I . It had a careless look , as if it were meant for the uses of the public ; so , entering , the first thing I did was to stumble over an ash - box in the porch . Rat - tat ! So man’s seconds tick ! Oh ! how immaterial are all materials ! What things real are there ,
