In [10]:
import numpy as np
import pandas as pd

np.random.seed(42)

In [11]:
class TrigramWordGenerator:
    def __init__(self, file_path):
        # Read words from the .txt file, add special token '.'
        with open(file_path, 'r') as f:
            self.words = ['.' + word.strip().lower() + '.' for word in f.readlines()]
        self.char_to_idx = {}
        self.idx_to_char = {}
        self._create_trigram_probs()

    def _create_trigram_probs(self):
        # Create a set of all characters
        characters = set(''.join(self.words))

        # Create character to index mappings, and vice versa
        for i, char in enumerate(characters):
            self.char_to_idx[char] = i
            self.idx_to_char[i] = char

        # Count trigrams
        trigram_counts = np.zeros((len(characters), len(characters), len(characters)))
        # add tiny value to avoid zero probabilities
        trigram_counts += 1e-10
        for word in self.words:
            for i in range(len(word) - 2):
                c1, c2, c3 = word[i], word[i + 1], word[i + 2]
                trigram_counts[self.char_to_idx[c1], self.char_to_idx[c2], self.char_to_idx[c3]] += 1

        # Normalize the counts to get probabilities
        self.trigram_probs = trigram_counts / trigram_counts.sum(axis=(1, 2), keepdims=True)

    def get_neg_log_likelihood(self, word):
        result = 0
        word = '.' + word.strip().lower() + '.'
        for i in range(len(word) - 2):
            c1, c2, c3 = word[i], word[i + 1], word[i + 2]
            result += np.log(self.trigram_probs[self.char_to_idx[c1], self.char_to_idx[c2], self.char_to_idx[c3]])
        return -round(result, 3)

    def generate_word(self, max_word_length):
        # Randomly choose the first character based on the unigram distribution
        first_char_probs = self.trigram_probs[self.char_to_idx['.']].sum(axis=1)
        first_char_probs /= first_char_probs.sum()  # Normalize the probabilities
        first_char_idx = np.random.multinomial(1, first_char_probs).argmax()
        first_char = self.idx_to_char[first_char_idx]

        # Randomly choose the second character based on the bigram distribution
        second_char_probs = self.trigram_probs[self.char_to_idx['.'], first_char_idx]
        second_char_probs /= second_char_probs.sum()  # Normalize the probabilities
        second_char_idx = np.random.multinomial(1, second_char_probs).argmax()
        second_char = self.idx_to_char[second_char_idx]

        generated_word = [first_char, second_char]

        # Generate the rest of the characters based on trigram probabilities
        for _ in range(max_word_length - 2):
            next_char_probs = self.trigram_probs[self.char_to_idx[generated_word[-2]], self.char_to_idx[generated_word[-1]]]
            next_char_probs /= next_char_probs.sum()  # Normalize the probabilities
            next_char_idx = np.random.multinomial(1, next_char_probs).argmax()
            next_char = self.idx_to_char[next_char_idx]

            # Check if the next character is the end character
            if next_char == '.':
                break
            else:
                generated_word.append(next_char)

        generated_word = ''.join(generated_word)
        return generated_word, self.get_neg_log_likelihood(generated_word)

In [12]:
trigram_word_generator = TrigramWordGenerator('data/names.txt')

In [13]:
max_word_length = 30
num_words = 40
generated_words = [trigram_word_generator.generate_word(max_word_length) for _ in range(num_words)]
df = pd.DataFrame(generated_words, columns=['word', 'neg_log_likelihood'])
df.sort_values(by='neg_log_likelihood', inplace=True)
df.reset_index(drop=True, inplace=True)
print(df)
print(f'Average negative log likelihood: {df.neg_log_likelihood.mean():.2f}')

                         word  neg_log_likelihood
0                          za               2.884
1                          ka               2.969
2                         ken               3.650
3                          se               3.779
4                         zah               3.864
5                         jan               4.293
6                         bel               4.320
7                         san               4.543
8                        jaya               5.354
9                         zey               5.592
10                       bros               6.574
11                       keni               6.646
12                      jayce               7.419
13                      jamir               7.681
14                        jod               7.908
15                       zart               8.109
16                      jalah               8.346
17                      stonn               8.472
18                      jeste               8.549
