# N-gram Language Models

## Setup

In [None]:
import random

# How many sentences each text generator in this notebook produces.
# A sentence is counted as finished every time a "." token is emitted.
NumberOfOutputSentences = 5

## Training data

In [None]:
# Small hand-written toy corpus. Note that the next cell reassigns
# `trainingData` to the contents of monteCristo.txt, so this text is only
# used if that reassignment is skipped or changed.
trainingData = """
    A cat is an animal. 
    A dog is also an animal.
    Both a cat and a dog are animals.
    Every cat is an animal. 
    Every animal is not a cat. 
    A cat is never a dog.
    The cat sat on the mat.
    The dog barks at the cat.
    The cat runs away.
"""

In [None]:
def _readCorpus(path):
    """Return the entire contents of the UTF-8 encoded text file at *path*."""
    with open(path, encoding="utf-8") as file:
        return file.read()


# Load every corpus up front so switching between them is a one-line change.
alice = _readCorpus("alice.txt")
monteCristo = _readCorpus("monteCristo.txt")
cSharp = _readCorpus("cSharp.txt")

# The corpus that all following cells train on.
trainingData = monteCristo

## Tokenization

In [None]:
# Tokenize in three steps:
#   1. lowercase everything so "The" and "the" become the same token,
#   2. put a space in front of every "." so it becomes a token of its own,
#   3. split on whitespace.
lowercasedText = trainingData.lower()
periodsSeparated = lowercasedText.replace(".", " .")
tokenizedData = periodsSeparated.split()

print(f"{tokenizedData = }")

## Calculate token frequencies
A.K.A. counting the number of times each token occurs in the training data

In [None]:
# Map every distinct token to the number of times it occurs in tokenizedData.
# Keys appear in order of first occurrence.
tokenFrequencies = {}

for token in tokenizedData:
    # A token that has not been seen yet starts from a count of zero.
    tokenFrequencies[token] = tokenFrequencies.get(token, 0) + 1

print(f"{tokenFrequencies = }")

## Print probabilities

In [None]:
# List every token with its share of the corpus in percent, most frequent first.
totalTokens = len(tokenizedData)
mostFrequentFirst = sorted(
    tokenFrequencies.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

for token, count in mostFrequentFirst:
    share = count / totalTokens * 100
    print(f'{token:15} {share:-5,.2f} %')

## Unigram model output

In [None]:
def prettyPrint(words):
    """Print a token sequence as readable text on a single line.

    Tokens are separated by one space, except that no space is placed in
    front of a "." token. The first token, and every token that directly
    follows a ".", is passed through ``str.capitalize()``.

    Args:
        words: Iterable of string tokens.
    """
    pieces = []
    anythingEmitted = False   # True once at least one character was produced
    startOfSentence = True

    for word in words:
        separator = ' ' if (anythingEmitted and word != '.') else ''
        shown = word.capitalize() if startOfSentence else word

        pieces.append(separator + shown)
        anythingEmitted = anythingEmitted or bool(shown)

        # The token after a period opens a new sentence.
        startOfSentence = word == '.'

    print(''.join(pieces))

# Generate NumberOfOutputSentences sentences with the unigram model: every
# token is drawn independently, weighted by how often it occurs in the corpus.
words = []
sentenceCounter = 0

# A sentence only ends when "." is drawn. Without that token in the
# vocabulary the loop below could never terminate, so fail early instead.
if NumberOfOutputSentences > 0 and "." not in tokenFrequencies:
    raise ValueError('the training data contains no "." token, so no sentence can end')

# The vocabulary and its weights do not change while sampling; build them once
# instead of on every iteration.
vocabulary = list(tokenFrequencies)
occurrenceCounts = list(tokenFrequencies.values())

while sentenceCounter < NumberOfOutputSentences:
    currentWord = random.choices(vocabulary, weights=occurrenceCounts)[0]
    if currentWord == ".":
        sentenceCounter += 1
    words.append(currentWord)

prettyPrint(words)

## Building a bigram model

In [None]:
# Bigram table: maps each token to a dict that counts which tokens were
# observed immediately after it, e.g. bigrams["a"]["cat"] is the number of
# times "cat" directly follows "a".
bigrams = {}

# Pair every token with its successor; the last token has none and is skipped.
for currentWord, nextWord in zip(tokenizedData, tokenizedData[1:]):
    followerCounts = bigrams.setdefault(currentWord, {})
    followerCounts[nextWord] = followerCounts.get(nextWord, 0) + 1

print(f"{bigrams = }")

## Print probabilities

In [None]:
def print_ngram_probabilities(ngrams):
    """Print, for every context, the percentage of each token that follows it.

    Contexts are listed in sorted order with the context shown in bold
    (ANSI escape codes). Under each context its followers are listed in
    sorted order, followed by a blank line.

    Args:
        ngrams: Mapping from a context (head) to a dict of
            {following token: count}.
    """
    BOLD = "\033[1m"
    RESET = "\033[0m"

    for head in sorted(ngrams):
        followers = ngrams[head]
        total = sum(followers.values())  # total number of following tokens

        print(f"{BOLD}{head}{RESET}")  # the context (head word)

        # Follower tokens are unique, so sorting the keys gives the same
        # order as sorting the (token, count) pairs.
        for word in sorted(followers):
            percent = (followers[word] / total) * 100
            print(f"    {word:<10} {percent:.2f}%")  # percentage with two decimals
        print()

# Show, for every token, the percentage distribution of the tokens following it.
print_ngram_probabilities(bigrams)

## Bigram model output

The following code generates a set number of sentences using a bigram model trained on the provided data.

In [None]:
# Generate NumberOfOutputSentences sentences with the bigram model: each new
# token is drawn from the followers of the previous token, weighted by count.
words = []
sentenceCounter = 0

while sentenceCounter < NumberOfOutputSentences:
    # Before anything has been generated, behave as if a sentence just ended.
    previousWord = words[-1] if words else '.'

    followerCounts = bigrams.get(previousWord)
    if followerCounts is None:
        # Dead end: this token was never seen with a successor.
        break

    currentWord = random.choices(list(followerCounts), weights=followerCounts.values())[0]
    words.append(currentWord)

    if currentWord == ".":
        sentenceCounter += 1

prettyPrint(words)

# Generic code for n-gram models
The code below can be reused for n-gram models with different values of n.

The output below shows a trigram data structure.

In [None]:
def generate_ngrams(n, tokens=None):
    """Build an n-gram count table from a token sequence.

    Every window of ``n`` consecutive tokens contributes one count: the first
    ``n - 1`` tokens form the context (key) and the last token is the follower.

    Args:
        n: Order of the model (2 = bigram, 3 = trigram, ...). Must be >= 1.
            With ``n == 1`` the only context is the empty tuple ``()``.
        tokens: List (or other sliceable sequence) of tokens to count.
            Defaults to the module-level ``tokenizedData``.

    Returns:
        A dict mapping each context tuple of ``n - 1`` tokens to a dict of
        ``{following token: count}``. The dict is empty when the sequence is
        shorter than ``n``.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError(f"n must be at least 1, got {n}")

    if tokens is None:
        tokens = tokenizedData

    ngrams = {}

    for i in range(len(tokens) - n + 1):
        key = tuple(tokens[i:i + n - 1])
        nextWord = tokens[i + n - 1]

        followers = ngrams.setdefault(key, {})
        followers[nextWord] = followers.get(nextWord, 0) + 1

    return ngrams

# Build a trigram table (n = 3): every key is a tuple of two consecutive
# tokens, every value counts the tokens that followed that pair.
ngrams = generate_ngrams(3)
print(f"{ngrams = }")


## Print probabilities

In [None]:
# Show the follower percentages for every context in the `ngrams` table.
print_ngram_probabilities(ngrams)

## Compare outputs for different n-values

In [None]:
def generate_text(ngrams):
    """Print a bold heading and randomly generated sentences for an n-gram table.

    Generation starts from a randomly chosen context whose first token is "."
    (that leading "." is not printed). Each following token is drawn from the
    followers of the most recent context, weighted by count, until
    ``NumberOfOutputSentences`` "." tokens have been produced or a context
    with no known followers is reached. A table whose only context is the
    empty tuple (a unigram table) is sampled without any seed context.

    Args:
        ngrams: Mapping from a context tuple to a dict of
            ``{following token: count}``. The context length is taken from
            the first key; all keys are assumed to have that same length.

    Raises:
        ValueError: If ``ngrams`` is empty, or if it is a unigram table that
            has no "." follower (generation could never finish a sentence).
        IndexError: If no context starts with ".".
    """
    if not ngrams:
        # next(iter({})) would otherwise leak a bare StopIteration.
        raise ValueError("ngrams is empty; there is nothing to generate text from")

    contextLength = len(next(iter(ngrams)))
    title = ['Unigram', 'Bigram', 'Trigram', 'Four-gram', 'Five-gram']

    BOLD = "\033[1m"
    RESET = "\033[0m"

    if contextLength < len(title):
        print(f'{BOLD}{title[contextLength]}{RESET}')
    else:
        print(f'{BOLD}{contextLength + 1}-gram{RESET}')

    if contextLength == 0:
        # Unigram table: the only context is (), so there is no seed to pick.
        if NumberOfOutputSentences > 0 and '.' not in ngrams[()]:
            raise ValueError('the unigram table has no "." token, so no sentence can end')
        words = []
    else:
        startKeys = [key for key in ngrams if key[0] == '.']
        if not startKeys:
            raise IndexError('no n-gram context starts with "."; cannot begin a sentence')
        words = list(random.choice(startKeys))

    sentenceCounter = 0

    while sentenceCounter < NumberOfOutputSentences:
        # Slice from an explicit start index: words[-0:] would be the whole
        # list rather than the empty context when contextLength is 0.
        previousWords = tuple(words[len(words) - contextLength:])

        followers = ngrams.get(previousWords)
        if followers is None:
            # Dead end: this context was never seen with a successor.
            break

        currentWord = random.choices(list(followers), weights=followers.values())[0]
        if currentWord == ".":
            sentenceCounter += 1

        words.append(currentWord)

    # Drop the "." that opened the seed context; it belongs to the
    # (unprinted) previous sentence.
    if contextLength > 0:
        words = words[1:]

    prettyPrint(words)
    print()

# Build models of increasing order from the same token stream.
# NOTE(review): this rebinds `bigrams` to a dict keyed by 1-tuples, whereas the
# "Building a bigram model" cell keys it by plain strings. Re-running the
# "Bigram model output" cell after this one would not find the "." key and
# stop immediately with empty output — confirm whether reusing the name is intended.
bigrams = generate_ngrams(2)
trigrams = generate_ngrams(3)
fourgrams = generate_ngrams(4)
fivegrams = generate_ngrams(5)

# Generate and print sample text from each model for side-by-side comparison.
generate_text(bigrams)
generate_text(trigrams)
generate_text(fourgrams)
generate_text(fivegrams)
