In [90]:
# take the core n-gram functionality and
# put it in a class

In [91]:
import numpy as np
from numpy.random import multinomial
import pandas as pd
from tqdm import tqdm

In [217]:
class NGramModel:
    
    def __init__(self, prepped_data: np.ndarray, n: int):
        
        self.data = prepped_data # data already padded with start/end token(s)
        self.n = n
        
        self.unique_chars = sorted(list(set(''.join(self.data))))
        self.stoi = { s:i for i,s in enumerate(self.unique_chars) }
        self.itos = { i:s for s,i in self.stoi.items() }
        
        # create ngram matrix
        mat_shape = (len(self.unique_chars),) * (self.n)
        self.ngrams = np.zeros(mat_shape)
        #print(self.ngrams.shape, mat_shape)
        
    def learn(self):
        self.count_ngrams()
        self.calculate_probabilities()
    
    def count_ngrams(self):
        for sample in tqdm(self.data,
                          desc=f"Counting {self.n}-Grams"):
            l = len(sample)
            nn = self.n - 1
            for i in range(l-nn):
                x = sample[i:i+nn]
                y = sample[i+nn]
                
                x_idx = tuple([self.stoi[c] for c in x])
                y_idx = self.stoi[y]
                
                self.ngrams[x_idx][y_idx] += 1

    def calculate_probabilities(self):
        total_its = np.prod(self.ngrams.shape[:-1])
        
        for idx in tqdm(np.ndindex(*self.ngrams.shape[:-1]),
                        total=total_its,
                        desc="    Calculating probabilities"):
            s = self.ngrams[idx].sum()
            if s != 0.0:
                self.ngrams[idx] /= s
    
    def forward(self):
        out = ''
        out += '<' * (self.n-1)
        
        while True:
            last = tuple([self.stoi[c] for c in out[-(self.n-1):]])
            probs = self.ngrams[last]
            pred_idx = multinomial(1, probs).argmax()
            pred_char = self.itos[pred_idx]
            if pred_char == '>': break
            
            out += pred_char
            
        return out.replace('<','')

In [218]:
# load dataset using pandas
dataset = pd.read_csv("datasets/names.txt").to_numpy()

def prep_dataset(data: np.ndarray, n: int):
    start_token = '<' * (n-1)
    end_token = '>'
    samples = []
    for sample in data:
        s = start_token + sample.item() + end_token
        samples.append(s)
    return samples

In [227]:
# create a model for n, starting from 2, to 6
models = [
    NGramModel(prep_dataset(dataset, i), i)
    for i in range(2, 7)
]

# call learn method for each model
for model in models:
    model.learn()

Counting 2-Grams: 100%|███████████████████████████████████| 32032/32032 [00:00<00:00, 324213.06it/s]
    Calculating probabilities: 100%|████████████████████████████| 28/28 [00:00<00:00, 349525.33it/s]
Counting 3-Grams: 100%|███████████████████████████████████| 32032/32032 [00:00<00:00, 293788.11it/s]
    Calculating probabilities: 100%|██████████████████████████| 784/784 [00:00<00:00, 670678.02it/s]
Counting 4-Grams: 100%|███████████████████████████████████| 32032/32032 [00:00<00:00, 281175.13it/s]
    Calculating probabilities: 100%|██████████████████████| 21952/21952 [00:00<00:00, 902449.98it/s]
Counting 5-Grams: 100%|███████████████████████████████████| 32032/32032 [00:00<00:00, 240888.93it/s]
    Calculating probabilities: 100%|███████████████████| 614656/614656 [00:00<00:00, 1049095.72it/s]
Counting 6-Grams: 100%|███████████████████████████████████| 32032/32032 [00:00<00:00, 176816.80it/s]
    Calculating probabilities: 100%|███████████████| 17210368/17210368 [00:15<00:00, 107895

In [230]:
# create 20 names for each model

n_preds = 50

results = {}
for n, model in enumerate(models):
    
    y = []
    for _ in range(n_preds):
        y.append(model.forward())
        
    results[f"{n+2}-Gram"] = y

In [231]:
df = pd.DataFrame(results)

df

Unnamed: 0,2-Gram,3-Gram,4-Gram,5-Gram,6-Gram
0,beryrmilelilarona,jenzia,jiaan,ward,termaine
1,roslarararalea,khapospyn,nah,anye,oz
2,khanethigase,tayl,anh,iknoor,maddalynn
3,jee,simiyane,brie,nayelli,fadil
4,br,ariany,yamaris,jet,anvith
5,eteti,amer,cecileannah,eriya,senaida
6,be,everen,mily,yar,perceus
7,zee,kaylyani,josses,sauloralina,delsin
8,jaclish,via,ell,jason,madiha
9,keyl,jrundonandynluwa,zey,edine,chanie
