<a href="https://colab.research.google.com/github/faezesarlakifar/SBU-NLP-Lab-summer-school/blob/main/N_gram_text_generation_(add_laplace_smoothing).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<img src="https://upload.wikimedia.org/wikipedia/commons/thumb/6/6a/Sbu-logo.svg/1200px-Sbu-logo.svg.png" alt="Shahid Beheshti University logo" width="150" height="150">

<h1 align=center><font size = 7>NLP Summer school</font></h1>
<h1 align=center><font size = 6>NLP Research Lab</font></h1>
<h1 align=center><font size = 5>Shahid Beheshti University</font></h1>
<h1 align=center><font size = 4> July 2022 </font></h1>

## Original Repo
https://github.com/olegborisovv/NGram_LanguageModel

In [1]:
import string
import random
import time
from typing import List
import nltk

#### Initialize parameters

In [2]:
# n-gram size used when counting n-grams over the sampled tokens
n = 6
# text file the corpus is read from (opened with UTF-8 encoding)
path = 'Frankenstein.txt'

In [3]:
# ideally we would use some smart text tokenizer, but for simplicity use this one
def tokenize(text: str) -> List[str]:
    """
    Split a sentence into tokens, treating every ASCII punctuation
    character as a token of its own.

    :param text: input sentence
    :return: list of word and punctuation tokens, in order of appearance
    """
    # Pad each punctuation mark with spaces so that a plain whitespace split
    # separates it from the neighbouring words.
    padding = {ord(mark): f' {mark} ' for mark in string.punctuation}
    return text.translate(padding).split()

In [4]:
def get_ngrams(n: int, tokens: list) -> list:
    """
    Build (context, target) pairs for every token of a sentence.

    The sentence is left-padded with n-1 '<START>' markers so that the
    first real token also has a full-length context.

    :param n: n-gram size
    :param tokens: tokenized sentence (not modified)
    :return: list of ngrams, each of the form
             ((previous n-1 words as a tuple), target word)
    """
    context_len = n - 1
    padded = ['<START>'] * context_len + tokens

    pairs = []
    for pos in range(context_len, len(padded)):
        # walk back from the farthest context word to the one just before `pos`
        history = tuple(padded[pos - back] for back in range(context_len, 0, -1))
        pairs.append((history, padded[pos]))
    return pairs

#### Get all tokens and the size of the vocabulary

In [6]:
    # Read the corpus, dropping lines that start with '#', and join the rest
    # into one long string. The file is closed as soon as it has been read.
    with open(path, 'r', encoding='utf-8') as f:
        text = ' '.join([line.strip() for line in f if not line.startswith('#')])

    # Use full stops as (rough) sentence boundaries.
    text = text.split('.')
    # random.sample raises ValueError when asked for more items than the
    # population holds, so never request more fragments than exist.
    text = random.sample(text, min(2000, len(text)))
    tokens = []
    for sentence in text:
        # add back the fullstop
        sentence += '.'
        tokens.extend(tokenize(sentence))

    # Preview of the sampled tokens (indices 1..99).
    print(tokens[1:100])
    # Frequency of every distinct token; its length is the vocabulary size.
    vocab  = nltk.FreqDist(tokens)
    vocab_size = len(vocab)
    print(vocab_size)

['was', 'often', 'tempted', ',', 'when', 'all', 'was', 'at', 'peace', 'around', 'me', ',', 'and', 'I', 'the', 'only', 'unquiet', 'thing', 'that', 'wandered', 'restless', 'in', 'a', 'scene', 'so', 'beautiful', 'and', 'heavenly—if', 'I', 'except', 'some', 'bat', ',', 'or', 'the', 'frogs', ',', 'whose', 'harsh', 'and', 'interrupted', 'croaking', 'was', 'heard', 'only', 'when', 'I', 'approached', 'the', 'shore—often', ',', 'I', 'say', ',', 'I', 'was', 'tempted', 'to', 'plunge', 'into', 'the', 'silent', 'lake', ',', 'that', 'the', 'waters', 'might', 'close', 'over', 'me', 'and', 'my', 'calamities', 'for', 'ever', '.', 'I', 'learned', 'from', 'your', 'papers', 'that', 'you', 'were', 'my', 'father', ',', 'my', 'creator', ';', 'and', 'to', 'whom', 'could', 'I', 'apply', 'with', 'more']
6408


In [7]:
# Frequency distribution over the n-grams that nltk.ngrams builds from `tokens`.
n_grams = nltk.ngrams(tokens, n)
n_vocab = nltk.FreqDist(n_grams)

# Same again for sequences that are one token shorter, i.e. (n-1)-grams.
m = n - 1
m_grams = nltk.ngrams(tokens, m)
m_vocab = nltk.FreqDist(m_grams)

# Number of distinct (n-1)-grams in the distribution.
print(len(m_vocab))

57807


In [8]:
class NgramModel(object):
    """
    Word-level n-gram language model with additive (Laplace) smoothing.

    The model is trained incrementally with ``update`` and sampled with
    ``generate_text``. All statistics used for smoothing live on the
    instance, so the model does not depend on any notebook-level variable.
    """

    def __init__(self, n, laplace=1):
        """
        :param n: n-gram size; a context holds the previous n-1 tokens
        :param laplace: constant added to every n-gram count when smoothing
        """
        self.n = n

        # maps a context tuple to the list of tokens seen right after it;
        # duplicates are kept, so the list length is the context count
        self.context = {}

        # maps (context, token) to how many times that n-gram has been seen
        self.ngram_counter = {}

        # additive constant for Laplace smoothing of the n-gram counts
        self.laplace = laplace

        # distinct target tokens seen so far; vocab_size mirrors its size and
        # is refreshed by every call to update()
        self._vocab = set()
        self.vocab_size = 0

    def update(self, sentence: str) -> None:
        """
        Updates Language Model
        :param sentence: input text
        """
        for ngram in get_ngrams(self.n, tokenize(sentence)):
            self.ngram_counter[ngram] = self.ngram_counter.get(ngram, 0.0) + 1.0

            prev_words, target_word = ngram
            self.context.setdefault(prev_words, []).append(target_word)
            self._vocab.add(target_word)

        # keep the smoothing denominator in sync with the observed vocabulary
        self.vocab_size = len(self._vocab)

    def smooth(self, token, count_of_token, count_of_context):
        """
        Laplace-smoothed estimate of P(token | context):

            (count_of_token + laplace) / (count_of_context + laplace * vocab_size)

        :param token: candidate token; kept for interface compatibility, the
                      estimate depends only on the two counts
        :param count_of_token: number of times (context, token) was seen
        :param count_of_context: number of times the context was seen
        :return: smoothed conditional probability (0.0 if nothing is known)
        """
        denominator = count_of_context + self.laplace * self.vocab_size
        if denominator == 0:
            # empty model with no smoothing mass: avoid dividing by zero
            return 0.0
        return (count_of_token + self.laplace) / denominator

    def prob(self, context, token):
        """
        Calculates probability of a candidate token to be generated given a context
        :param context: tuple with the previous n-1 tokens
        :param token: candidate next token
        :return: conditional probability; n-grams that were never seen get
                 the smoothed estimate for a zero count
        """
        count_of_token = self.ngram_counter.get((context, token), 0.0)
        count_of_context = float(len(self.context.get(context, ())))
        return self.smooth(token, count_of_token, count_of_context)

    def random_token(self, context):
        """
        Given a context we "semi-randomly" select the next word to append in a sequence
        :param context: tuple with the previous n-1 tokens
        :return: a token seen after ``context``, drawn proportionally to its
                 smoothed probability, or None if the context was never seen
        """
        candidates = self.context.get(context)
        if not candidates:
            return None

        map_to_probs = {token: self.prob(context, token) for token in set(candidates)}

        # The candidates are only part of the vocabulary, so their smoothed
        # probabilities do not sum to one: scale the random draw by their
        # total instead of comparing against a raw number in [0, 1).
        threshold = random.random() * sum(map_to_probs.values())

        summ = 0.0
        chosen = None
        for token in sorted(map_to_probs):
            chosen = token
            summ += map_to_probs[token]
            if summ > threshold:
                break
        # if rounding kept the running sum below the threshold, the last
        # candidate is returned rather than None
        return chosen

    def generate_text(self, token_count: int):
        """
        :param token_count: number of words to be produced
        :return: generated text; shorter than requested only when the model
                 cannot produce any token (e.g. it has not been trained)
        """
        n = self.n
        start_context = (n - 1) * ['<START>']
        context_queue = list(start_context)
        result = []
        for _ in range(token_count):
            obj = self.random_token(tuple(context_queue))
            if obj is None and context_queue != start_context:
                # dead end: restart from the sentence-initial context
                context_queue = list(start_context)
                obj = self.random_token(tuple(context_queue))
            if obj is None:
                break
            result.append(obj)
            if n > 1:
                if obj == '.':
                    # a full stop ends the sentence, so the next one starts fresh
                    context_queue = list(start_context)
                else:
                    context_queue.pop(0)
                    context_queue.append(obj)
        return ' '.join(result)

How can we add smoothing functionality?

Why didn't we return the most probable token? (stay tuned for the rest of the materials)






In [13]:
def create_ngram_model(n, path, sample_size=2000):
    """
    Train an NgramModel on sentences read from a text file.

    Lines starting with '#' are skipped. The remaining text is split on
    full stops and each fragment is fed to the model with its full stop
    restored.

    :param n: n-gram size passed to NgramModel
    :param path: path of the UTF-8 text file to read
    :param sample_size: number of fragments to draw at random for training;
                        clamped to the number available, and None means
                        "use every fragment"
    :return: the trained NgramModel
    """
    model = NgramModel(n)
    with open(path, 'r', encoding='utf-8') as f:
        text = ' '.join(line.strip() for line in f if not line.startswith('#'))

    sentences = text.split('.')
    if sample_size is not None:
        # random.sample raises ValueError if asked for more items than exist
        sentences = random.sample(sentences, min(sample_size, len(sentences)))

    for sentence in sentences:
        # add back the fullstop
        model.update(sentence + '.')
    return model

In [14]:
if __name__ == "__main__":
    # Build the model from the notebook's configuration (`n`, `path`) so that
    # changing the parameters cell actually changes the model being trained.
    start = time.time()
    m = create_ngram_model(n, path)

    print (f'Language Model creating time: {time.time() - start}')
    # For reproducible output, seed the generator first, e.g. random.seed(44).
    print(f'{"="*120}\nGenerated text:')
    print(m.generate_text(100))
    print(f'{"="*120}')





Language Model creating time: 0.33425021171569824
Generated text:
It is impossible to communicate to you a conception of the trembling sensation , half pleasurable and half fearful , with which I am preparing to depart . Often , when all was dry , the heavens cloudless , and I was parched by thirst , a slight cloud would bedim the sky , shed the few drops that revived me , and vanish . Oppressed by the recollection of my various misfortunes , I now swallowed double my usual quantity and soon slept profoundly . Will you smile at the enthusiasm I express concerning this divine wanderer ? You
