# n-Grams and Grammars
__MATH 3480__ - Dr. Michael Olson

Reading:
* Grus, Chapter 21
  * Sections: n-Gram Language Models, Grammars

-----
## bigram

In [1]:
def fix_unicode(text: str) -> str:
    """Return *text* with each right single quotation mark (U+2019) turned into "'"."""
    curly_apostrophe = "\u2019"
    plain_apostrophe = "'"
    return text.replace(curly_apostrophe, plain_apostrophe)

In [None]:
import re
from numpy import random

# Read the whole corpus, then break it into paragraphs (one per line).
# The `with` block closes the file handle as soon as the text has been read;
# the original left it open for the rest of the session.
# NOTE(review): no encoding is passed, so the platform default is used --
# confirm the file's encoding if curly quotes show up garbled.
with open('WhatIsDataScience.txt') as file:
    content = file.read()
content = content.split('\n')

content

In [None]:
# A token is either a run of word characters / apostrophes, or a single period.
regex = r"[\w']+|[\.]"

# Flat list of every token in the corpus, in reading order.
document = []

for paragraph in content:
    # Blank lines contribute nothing.
    if paragraph == '':
        continue
    words = re.findall(regex, fix_unicode(paragraph))
    document += words

document

In [None]:
from collections import defaultdict

# Map each token to the list of tokens observed immediately after it.
# Duplicates are kept, so a successor appears once per occurrence.
transitions = defaultdict(list)
for prev, current in zip(document[:-1], document[1:]):
    transitions[prev].append(current)

In [None]:
def generate_using_bigrams() -> str:
    """Generate one random sentence by walking the module-level bigram table.

    The walk starts from "." (so the first word drawn is one that followed a
    period in the corpus) and repeatedly draws a successor of the current
    token from ``transitions`` until a "." is drawn.

    Returns:
        The drawn tokens joined by single spaces. The result normally ends
        with "."; if the walk reaches a token that has no recorded successor
        (for example the very last token of the corpus, or an empty corpus),
        the tokens drawn so far are returned instead of raising.
    """
    current = "."   # this means the next word will start a sentence
    result = []
    while True:
        # .get() rather than [...]: indexing a defaultdict with an unseen
        # token would silently insert an empty list into the shared table.
        next_word_candidates = transitions.get(current)   # bigrams (current, _)
        if not next_word_candidates:
            # Dead end: nothing ever followed this token, so stop here
            # rather than asking random.choice to pick from nothing.
            return " ".join(result)
        current = random.choice(next_word_candidates)     # choose one at random
        result.append(current)
        if current == ".":                                # if "." we're done
            return " ".join(result)

In [None]:
# Draw and display one random sentence from the bigram model.
generate_using_bigrams()

## trigram

In [None]:
# For every pair of consecutive tokens, collect the tokens seen right after
# that pair; also record every token that directly follows a period.
trigram_transitions = defaultdict(list)
starts = []

# The third loop variable is `next_word`, not `next`: it is a module-level
# name, and calling it `next` would shadow the builtin for every later cell.
for prev, current, next_word in zip(document, document[1:], document[2:]):

    if prev == ".":              # if the previous "word" was a period
        starts.append(current)   # then this is a start word

    trigram_transitions[(prev, current)].append(next_word)

In [None]:
def generate_using_trigrams() -> str:
    """Generate one random sentence by walking the module-level trigram table.

    A first word is drawn from ``starts`` and treated as if it followed a
    ".". Each further word is drawn from the successors recorded in
    ``trigram_transitions`` for the (previous, current) pair, until a "." is
    drawn.

    Returns:
        The drawn tokens joined by single spaces. The result normally ends
        with "."; if ``starts`` is empty the result is "", and if the walk
        reaches a pair with no recorded successor the tokens drawn so far
        are returned instead of raising.
    """
    if not starts:
        # No sentence-opening words were collected, so there is nothing to say.
        return ""

    current = random.choice(starts)   # choose a random starting word
    prev = "."                        # and precede it with a '.'
    result = [current]
    while True:
        # .get() rather than [...]: indexing a defaultdict with an unseen
        # pair would silently insert an empty list into the shared table.
        next_word_candidates = trigram_transitions.get((prev, current))
        if not next_word_candidates:
            # Dead end: this pair was never followed by anything.
            return " ".join(result)
        next_word = random.choice(next_word_candidates)

        prev, current = current, next_word
        result.append(current)

        if current == ".":
            return " ".join(result)

In [None]:
# Draw and display one random sentence from the trigram model.
generate_using_trigrams()

## n-gram

In [None]:
from nltk import ngrams

# Build every run of n consecutive tokens from the document and print each.
n = 6
sixgrams = ngrams(document, n)

for grams in sixgrams:
    print(grams)

-----
## Grammars

In [None]:
from typing import List, Dict

# Alias for the shape of a grammar: rule name -> list of possible productions.
Grammar = Dict[str, List[str]]

# Names beginning with "_" are rules that still need expanding; everything
# else is literal output text. A production is a space-separated string.
grammar = {
    "_S": ["_NP _VP"],
    "_NP": ["_N", "_A _NP _P _A _N"],
    "_VP": ["_V", "_V _NP"],
    "_N": ["data science", "Python", "regression"],
    "_A": ["big", "linear", "logistic"],
    "_P": ["about", "near"],
    "_V": ["learns", "trains", "tests", "is"],
}

def is_terminal(token: str) -> bool:
    """Return True when *token* is a terminal, i.e. does not begin with "_".

    An empty token is reported as a terminal (it cannot name a rule) rather
    than raising IndexError, which indexing its first character would do.
    """
    return not token.startswith("_")

In [None]:
def expand(grammar: Grammar, tokens: List[str]) -> List[str]:
    """Rewrite *tokens* until only terminals remain and return the result.

    The leftmost nonterminal is always expanded first: one of its productions
    in *grammar* is drawn at random and put in its place, and the scan
    resumes at that same position.

    Args:
        grammar: Maps each nonterminal to its list of productions; each
            production is a space-separated string of symbols.
        tokens: The symbols to expand. This list is not modified.

    Returns:
        A new list containing only terminals. A production made up solely of
        terminals (e.g. "data science") is kept as a single token.

    Raises:
        KeyError: If a nonterminal has no entry in *grammar*.

    Note:
        The loop ends only when every nonterminal eventually rewrites to
        terminals; a rule that can only ever produce itself never finishes.
    """
    # Work on a copy so the caller's list is never changed in place.
    tokens = list(tokens)

    i = 0
    while i < len(tokens):
        token = tokens[i]

        # Terminals are already final; move on.
        if is_terminal(token):
            i += 1
            continue

        # Otherwise, it's a nonterminal token,
        # so we need to choose a replacement at random.
        replacement = random.choice(grammar[token])
        parts = replacement.split()

        if all(is_terminal(part) for part in parts):
            # Every symbol is final: keep the production as one token.
            tokens[i] = replacement
            i += 1
        else:
            # Replacement could be, e.g., "_NP _VP" (or "big _N"), so splice
            # its symbols in and look at position i again -- the first
            # spliced symbol may itself need expanding.
            tokens[i:i + 1] = parts

    # If we get here, we had all terminals and are done.
    return tokens

In [None]:
def generate_sentence(grammar: Grammar) -> List[str]:
    """Expand the start symbol "_S" of *grammar* and return the resulting tokens."""
    start_tokens = ["_S"]
    return expand(grammar, start_tokens)

In [None]:
# Produce and display one random sentence from the grammar defined above.
generate_sentence(grammar)