# N-gram Language Models

## Setup

In [117]:
import random

# How many sentences each generation cell below produces; the generators count
# one finished sentence every time they sample a "." token.
NumberOfOutputSentences = 5

## Training data

In [125]:
# Toy training corpus. The tokenization cell lowercases this text and treats
# "." as a separate token, so periods are the only sentence boundary markers.
trainingData = """
    A cat is an animal. 
    A dog is also an animal.
    Both a cat and a dog are animals.
    Every cat is an animal. 
    Every animal is not a cat. 
    A cat is never a dog.
    The cat sat on the mat.
    The dog barks at the cat.
    The cat runs away.
"""

## Tokenization

In [126]:
# Lowercase the corpus, put a space in front of every period so that it becomes
# a token of its own, then split on any run of whitespace.
tokenizedData = trainingData.lower().replace(".", " .").split()

print(f"{tokenizedData = }")

tokenizedData = ['a', 'cat', 'is', 'an', 'animal', '.', 'a', 'dog', 'is', 'also', 'an', 'animal', '.', 'both', 'a', 'cat', 'and', 'a', 'dog', 'are', 'animals', '.', 'every', 'cat', 'is', 'an', 'animal', '.', 'every', 'animal', 'is', 'not', 'a', 'cat', '.', 'a', 'cat', 'is', 'never', 'a', 'dog', '.', 'the', 'cat', 'sat', 'on', 'the', 'mat', '.', 'the', 'dog', 'barks', 'at', 'the', 'cat', '.', 'the', 'cat', 'runs', 'away', '.']


## Calculate token frequencies
That is, count the number of times each token occurs in the training data.

In [127]:
# Map each distinct token to the number of times it occurs in the corpus.
# Keys end up in first-seen order because a plain dict preserves insertion order.
tokenFrequencies = {}

for token in tokenizedData:
    # .get() yields 0 the first time a token is encountered.
    tokenFrequencies[token] = tokenFrequencies.get(token, 0) + 1

print(f"{tokenFrequencies = }")

tokenFrequencies = {'a': 7, 'cat': 8, 'is': 5, 'an': 3, 'animal': 4, '.': 9, 'dog': 4, 'also': 1, 'both': 1, 'and': 1, 'are': 1, 'animals': 1, 'every': 2, 'not': 1, 'never': 1, 'the': 5, 'sat': 1, 'on': 1, 'mat': 1, 'barks': 1, 'at': 1, 'runs': 1, 'away': 1}


## Print unigram probabilities

In [128]:
# Most frequent tokens first. Sorting on the negated count gives descending
# order, and because the sort is stable, tied tokens stay in first-seen order.
for token, count in sorted(tokenFrequencies.items(), key=lambda pair: -pair[1]):
    # Share of the whole corpus, as a percentage with two decimals.
    print(f'{token:15} {count / len(tokenizedData) * 100:-5,.2f} %')

.               14.75 %
cat             13.11 %
a               11.48 %
is               8.20 %
the              8.20 %
animal           6.56 %
dog              6.56 %
an               4.92 %
every            3.28 %
also             1.64 %
both             1.64 %
and              1.64 %
are              1.64 %
animals          1.64 %
not              1.64 %
never            1.64 %
sat              1.64 %
on               1.64 %
mat              1.64 %
barks            1.64 %
at               1.64 %
runs             1.64 %
away             1.64 %


## Unigram model output

In [195]:
def prettyPrint(words):
    """Print a token sequence as readable text.

    Tokens are joined with single spaces, except that no space is placed
    before a "." token or before the very first piece of text. The first
    token, and every token that directly follows a ".", is capitalized with
    ``str.capitalize``.

    Args:
        words: Iterable of string tokens, where "." marks the end of a sentence.

    Returns:
        None. The assembled text is written to standard output.
    """
    sentence = ''
    startsSentence = True

    for token in words:
        # A period attaches to the preceding word; nothing precedes the first token.
        separator = ' ' if (token != '.' and sentence != '') else ''
        rendered = token.capitalize() if startsSentence else token
        sentence += separator + rendered

        # The token after a period opens a new sentence.
        startsSentence = (token == '.')

    print(sentence)

# Unigram sampling: every token is drawn independently of the ones before it,
# weighted by its corpus frequency, until enough "." tokens have been drawn.
words = []
sentenceCounter = 0

while sentenceCounter < NumberOfOutputSentences:
    # random.choices returns a one-element list here; unpack the single draw.
    (currentWord,) = random.choices(list(tokenFrequencies), weights=tokenFrequencies.values())
    words.append(currentWord)

    if currentWord == ".":
        sentenceCounter += 1

prettyPrint(words)

Away animal never. Barks animal.. Cat an the is an every animal an the animal. A an both the.


## Building a bigram model

In [136]:
# bigrams[w][v] is the number of times token v directly follows token w.
bigrams = {}

# Pair every token with its successor; the last token has none and is skipped.
for currentWord, nextWord in zip(tokenizedData, tokenizedData[1:]):
    followerCounts = bigrams.setdefault(currentWord, {})
    followerCounts[nextWord] = followerCounts.get(nextWord, 0) + 1

print(f"{bigrams = }")

bigrams = {'a': {'cat': 4, 'dog': 3}, 'cat': {'is': 3, 'and': 1, '.': 2, 'sat': 1, 'runs': 1}, 'is': {'an': 2, 'also': 1, 'not': 1, 'never': 1}, 'an': {'animal': 3}, 'animal': {'.': 3, 'is': 1}, '.': {'a': 2, 'both': 1, 'every': 2, 'the': 3}, 'dog': {'is': 1, 'are': 1, '.': 1, 'barks': 1}, 'also': {'an': 1}, 'both': {'a': 1}, 'and': {'a': 1}, 'are': {'animals': 1}, 'animals': {'.': 1}, 'every': {'cat': 1, 'animal': 1}, 'not': {'a': 1}, 'never': {'a': 1}, 'the': {'cat': 3, 'mat': 1, 'dog': 1}, 'sat': {'on': 1}, 'on': {'the': 1}, 'mat': {'.': 1}, 'barks': {'at': 1}, 'at': {'the': 1}, 'runs': {'away': 1}, 'away': {'.': 1}}


## Print bigram probabilities

In [147]:
# ANSI escape sequences that switch bold text on and off.
BOLD = "\033[1m"
RESET = "\033[0m"

# One section per head word, in alphabetical order.
for head in sorted(bigrams):
    print(f"{BOLD}{head}{RESET}")  # head word

    followers = bigrams[head]
    total = sum(followers.values())  # total number of following words

    # Followers are listed alphabetically, each with its share of the total.
    for word, count in sorted(followers.items()):
        percent = (count / total) * 100
        print(f"    {word:<10} {percent:.2f}%")  # percentage with two decimals
    print()


[1m.[0m
    a          25.00%
    both       12.50%
    every      25.00%
    the        37.50%

[1ma[0m
    cat        57.14%
    dog        42.86%

[1malso[0m
    an         100.00%

[1man[0m
    animal     100.00%

[1mand[0m
    a          100.00%

[1manimal[0m
    .          75.00%
    is         25.00%

[1manimals[0m
    .          100.00%

[1mare[0m
    animals    100.00%

[1mat[0m
    the        100.00%

[1maway[0m
    .          100.00%

[1mbarks[0m
    at         100.00%

[1mboth[0m
    a          100.00%

[1mcat[0m
    .          25.00%
    and        12.50%
    is         37.50%
    runs       12.50%
    sat        12.50%

[1mdog[0m
    .          25.00%
    are        25.00%
    barks      25.00%
    is         25.00%

[1mevery[0m
    animal     50.00%
    cat        50.00%

[1mis[0m
    also       20.00%
    an         40.00%
    never      20.00%
    not        20.00%

[1mmat[0m
    .          100.00%

[1mnever[0m
    a          100.00%

## Bigram model output

The following code generates a set number of sentences using a bigram model trained on the provided data.

In [199]:
# Bigram sampling: each new token is drawn from the followers of the previous
# one, weighted by how often that pair occurred in the corpus.
words = []
sentenceCounter = 0

while sentenceCounter < NumberOfOutputSentences:
    # Condition on the last emitted token; "." stands in before the first one
    # so that the text opens with a sentence-initial word.
    currentWord = words[-1] if words else '.'

    # Stop if the previous token was never seen with a follower.
    if currentWord not in bigrams:
        break

    (currentWord,) = random.choices(list(bigrams[currentWord]), weights=bigrams[currentWord].values())
    words.append(currentWord)

    if currentWord == ".":
        sentenceCounter += 1

prettyPrint(words)

Every cat runs away. The cat sat on the cat is never a dog. A dog are animals. Both a dog barks at the mat. A dog.
