In [375]:
import numpy as np
from collections import defaultdict
from numpy.random import multinomial

import pandas as pd

In [390]:
class NgramGenerator:
    """Markov-chain text generator driven by n-gram statistics.

    Every sample is wrapped as ``start_token + sample + end_token`` and turned
    into tokens (whitespace-separated words when ``use_words`` is true,
    single characters otherwise).  For each run of ``n`` consecutive tokens
    the generator records how often each token follows it, then converts
    those counts to probabilities.  Calling the instance samples a new text
    by starting from the start padding and repeatedly drawing the next token
    until the end marker is drawn.

    Parameters
    ----------
    data : sequence of str
        Training samples.
    n : int, default 4
        Number of preceding tokens used as context.  ``0`` gives a
        context-free (unigram) model.
    use_words : bool, default False
        Tokenise on whitespace instead of per character.

    Attributes
    ----------
    ngrams : defaultdict(dict)
        Maps a context (tokens joined by ``' '`` in word mode, ``''`` in
        character mode) to ``{next_token: probability}``.
    start_token, end_token : str
        Padding placed before / after every sample.

    Raises
    ------
    ValueError
        If ``n`` is negative.

    Notes
    -----
    Sampling uses ``numpy.random.multinomial``, i.e. NumPy's global legacy
    random state; call ``np.random.seed(...)`` beforehand for repeatable
    output.

    NOTE(review): the end marker is the plain token ``'>'``.  A literal
    ``'>'`` token inside the training data is indistinguishable from it and
    ends generation early.
    """

    def __init__(self, data: list, n: int = 4, use_words: bool = False):
        if n < 0:
            raise ValueError(f"n must be non-negative, got {n}")

        self.data = data
        self.n = n
        self.use_words = use_words
        self.ngrams = defaultdict(dict)

        self.start_token = ('< ' * self.n)
        self.end_token = ' >'

        self.count_ngrams()
        self.calculate_ngram_probabilities()

    def _tokenize(self, text):
        """Split ``text`` into words (word mode) or characters (char mode)."""
        return text.split() if self.use_words else list(text)

    def _key(self, tokens):
        """Join tokens back into the string form used for ``ngrams`` keys."""
        return (' ' if self.use_words else '').join(tokens)

    def count_ngrams(self):
        """Fill ``self.ngrams`` with raw follower counts for every context."""
        # The start padding may be longer than the context (in character mode
        # it is 2*n characters).  Begin at the last n padding tokens so that
        # transitions *inside* the padding are not learned; otherwise the seed
        # context would mostly predict more padding instead of the first real
        # token of a sample.
        first = len(self._tokenize(self.start_token)) - self.n

        for sample in self.data:
            tokens = self._tokenize(self.start_token + sample + self.end_token)
            for i in range(first, len(tokens) - self.n):
                followers = self.ngrams[self._key(tokens[i:i + self.n])]
                nxt = tokens[i + self.n]
                followers[nxt] = followers.get(nxt, 0) + 1

    def calculate_ngram_probabilities(self):
        """Normalise each context's follower counts so they sum to 1."""
        for followers in self.ngrams.values():
            total = sum(followers.values())
            for token in followers:
                followers[token] /= total

    def __call__(self, max_tokens: int = None):
        """Sample one text from the model.

        Parameters
        ----------
        max_tokens : int, optional
            Upper bound on the number of generated tokens.  ``None``
            (default) generates until the end marker is drawn.

        Returns
        -------
        str
            The generated tokens joined back together (without the start
            padding or end marker), stripped of surrounding whitespace.

        Raises
        ------
        ValueError
            If the current context was never seen in training, e.g. when the
            model was built from empty ``data``.
        """
        seed = self._tokenize(self.start_token)
        out = list(seed)
        stop = self.end_token.strip()

        while max_tokens is None or len(out) - len(seed) < max_tokens:
            # ``len(out) - n`` rather than ``-n`` so that n == 0 yields an
            # empty context instead of the whole sequence.
            context = self._key(out[len(out) - self.n:])

            # ``.get`` avoids the defaultdict inserting an empty entry into
            # the trained model for an unknown context.
            followers = self.ngrams.get(context)
            if not followers:
                raise ValueError(
                    f"no continuation known for context {context!r}; "
                    "was the generator trained on any data?"
                )

            candidates = list(followers)
            idx = multinomial(1, list(followers.values())).argmax()
            pred = candidates[idx]

            if pred == stop:
                break

            out.append(pred)

        # Drop only the seed padding; '<' characters that are part of the
        # generated text itself are kept.
        return self._key(out[len(seed):]).strip()

In [442]:
# Alternative corpus: the McDonald's reviews file, using its "review" column.
#X = pd.read_csv(open("datasets/McDonald_s_Reviews.csv", errors='ignore'))["review"].to_numpy().astype(str)

# Open the file in a `with` block so the handle is closed as soon as pandas has
# read it. errors='ignore' skips bytes that cannot be decoded instead of raising.
with open("datasets/Emusk_2021_tweets.csv", errors='ignore') as tweets_file:
    X = pd.read_csv(tweets_file)["Text"].to_numpy().astype(str)
X[1]

'@comma_ai Tesla Full Self-Driving will work at a safety level well above that of the average driver this year, of that I am confident. Can’t speak for regulators though.'

In [449]:
# Word-level model: each word is predicted from the 2 words that precede it.
g = NgramGenerator(
    data=X,
    n=2,
    use_words=True,
)

In [462]:
# Seed NumPy's global random state (the one `numpy.random.multinomial` draws
# from) so re-running this cell reproduces the same sample. Change the seed
# value to get a different generated text.
np.random.seed(0)
g()

'@Model3Owners I was in the first 5 years or so of Tesla, I thought we had &lt;10% chance of success will be landing Starships on Mars'

In [463]:
# Print every training tweet containing the phrase, to see where the generated
# text above borrowed it from.
phrase = 'first 5 years'
for tweet in X:
    if phrase not in tweet:
        continue
    print(tweet)

@teslaownersSV @Kristennetten @itsALLrisky @SenSanders I had majority control for the first 5 years or so of Tesla, but car companies need lots of capital, so now I’m around 20% or so, despite putting in as much as I could along the way
