# Statistical Language Modeling: Solutions

- Natural Language Understanding
- Evgeny A. Stepanov
- stepanov.evgeny.a@gmail.com

Dan Jurafsky and James H. Martin's __Speech and Language Processing__ ([3rd ed. draft](https://web.stanford.edu/~jurafsky/slp3/)) is recommended reading.

__Requirements__

- [NL2SparQL4NLU](https://github.com/esrel/NL2SparQL4NLU) dataset

    - run `git clone https://github.com/esrel/NL2SparQL4NLU.git`
    

## Provided functions and classes (from the Lab Notebook)

### Get nbest from dict w.r.t. value

In [1]:
def nbest(d, n=1):
    """
    select the n entries of a dict that have the largest values
    :param d: input dict (values are numbers)
    :param n: number of entries to keep (int)
    :return: dict with the top n key-value pairs, largest value first
    """
    # rank all items by value, highest first
    ranked = sorted(d.items(), key=lambda pair: pair[1], reverse=True)
    best = {}
    for key, value in ranked[:n]:
        best[key] = value
    return best

### Data structure to store ngram counts and probabilities

In [2]:
class Node(object):
    """
    trie node: one word of an ngram together with its frequency
    :param word: word stored at this node (None for a placeholder node)
    """

    def __init__(self, word=None):
        self.word = word
        self.children = {}  # next word -> child Node
        self.count = 0      # number of added sequences that pass through this node

    # Descriptor hooks, kept unchanged for backward compatibility. Python only
    # invokes them when a Node is stored as a class attribute; the code shown
    # here only stores nodes as instance attributes and dict values.
    def __set__(self, instance, value):
        self.instance = value

    def __get__(self, instance, owner):
        return self.instance


class Trie(object):
    """
    prefix tree over word sequences, used to store ngram counts

    `root` counts every added sequence; `error` is a shared placeholder node
    (count 0, no children) returned by `get` for sequences that are not stored.
    """

    def __init__(self):
        self.root = Node('*')
        self.error = Node()

    # Descriptor hooks, kept unchanged for backward compatibility (see Node).
    def __set__(self, instance, value):
        self.instance = value

    def __get__(self, instance, owner):
        return self.instance

    def add(self, sequence):
        """
        add a sequence to the trie, incrementing the count of every node on its path
        :param sequence: iterable of words (e.g. an ngram as a list)
        """
        node = self.root
        node.count += 1  # total count
        for word in sequence:
            # create the child only when it is missing
            if word not in node.children:
                node.children[word] = Node(word)
            node = node.children[word]
            node.count += 1

    def get(self, sequence):
        """
        look up the node reached by following a sequence from the root
        :param sequence: iterable of words
        :return: the matching node, or the `error` placeholder node if the path does not exist
        """
        node = self.root
        for word in sequence:
            # the error node has no children, so a miss stays a miss
            node = node.children.get(word, self.error)
        return node

    def traverse(self, node=None, sequence=None):
        """
        yield every root-to-leaf word sequence below a node
        :param node: node to start from (default: the root)
        :param sequence: words that lead to `node`; used as prefix of every result
        :return: generator of word lists; each one is a fresh list owned by the caller
        """
        start = self.root if node is None else node
        # work on a private copy: neither the caller's list nor a shared
        # default argument is ever mutated
        prefix = [] if sequence is None else list(sequence)
        yield from self._walk(start, prefix)

    def _walk(self, node, path):
        """depth-first helper for traverse; `path` is the mutable working prefix"""
        if not node.children:
            # copy, so results stay valid after the walk moves on
            yield list(path)

        for word, child in node.children.items():
            path.append(word)
            yield from self._walk(child, path)
            path.pop()

### Counting Ngrams and Probabilities

#### Extract ngrams from a sequence (list)

In [3]:
def ngrams(sequence, n=2):
    """
    slide a window of size n over a sequence and collect every window
    :param sequence: list of elements
    :param n: ngram size to extract
    :return: list of ngrams (each one a slice of the input sequence)
    """
    window_count = len(sequence) - n + 1
    windows = []
    for start in range(window_count):
        windows.append(sequence[start:start + n])
    return windows

#### Count ngrams in corpus (list-of-lists)

In [4]:
def ngramcount(corpus, n=2):
    """
    count the ngrams of a corpus and store the counts in a Trie
    :param corpus: list-of-lists (one list of tokens per sentence)
    :param n: ngram size to count
    :return: Trie of ngram frequencies, with the ngram size stored in its `size` attribute
    """
    trie = Trie()
    trie.size = n  # meta-info: ngram-size
    for sentence in corpus:
        sentence_ngrams = ngrams(sentence, n=n)
        for gram in sentence_ngrams:
            trie.add(gram)
    return trie

#### Compute probabilities from ngram counts
- flags:
    - `logs`: use log probabilities (default: True)
    - `smoothing`: use add-one smoothing (default: False)

In [5]:
def ngramprobs(counts, logs=True, smoothing=False):
    """
    compute ngram probabilities from frequency counts
    the trie is updated in place: every ngram yielded by counts.traverse() gets a
    `probability` attribute on its node, and so does the trie's `error` node
    :param counts: counts trie (must carry the `size` attribute set by ngramcount)
    :param logs: if to compute log-probabilities
    :param smoothing: if to use add 1 smoothing
    :return: the same trie, augmented with probabilities
    """
    from math import log

    # set meta-information
    counts.logs = logs  # meta-info: log probabilities are used
    counts.smoothing = smoothing  # meta-info: smoothing true|false

    # add 1 smoothing: a is added to every count, v to every denominator
    if smoothing:
        # a unigram model has no history (size - 1 == 0), which would always
        # give v == 1; use the distinct unigrams themselves in that case
        history_size = max(counts.size - 1, 1)
        v = compute_ngram_vocabulary_size(counts, n=history_size)
        a = 1
    else:
        v = 0
        a = 0

    # probability reported for ngrams that are not in the trie
    if smoothing:
        unseen = a / v
        counts.error.probability = log(unseen) if logs else unseen
    else:
        # NOTE(review): 0.0 is kept for the unsmoothed case in both modes; in
        # log space it is a sentinel (logp2p maps it back to probability 0.0),
        # so score() adds nothing for an unseen ngram
        counts.error.probability = 0.0

    for ngram in counts.traverse():
        n = counts.get(ngram)       # get ngram node
        p = counts.get(ngram[:-1])  # get parent node
        denominator = p.count + v
        if not denominator:
            # only reachable for an empty trie without smoothing
            continue
        prob = (n.count + a) / denominator
        n.probability = log(prob) if logs else prob
    return counts

#### Function to compute ngram vocabulary size for add one smoothing

In [6]:
def compute_ngram_vocabulary_size(counts, n=1):
    """
    count the distinct prefixes of length n among the ngrams stored in a counts trie
    (ngramprobs uses this number as the vocabulary size for add 1 smoothing)
    :param counts: counts trie; only its traverse() method is used
    :param n: prefix length the ngrams are compared on
    :return: number of distinct prefixes (int)
    """
    # tuples instead of "+"-joined strings: tokens that contain "+" cannot
    # collide, and tokens do not have to be strings
    return len({tuple(ngram[:n]) for ngram in counts.traverse()})

#### Function to convert log probabilities to raw probabilities

In [7]:
def logp2p(value):
    """
    convert a log probability to a raw probability
    :param value: log probability; any falsy value (0, 0.0, None) is mapped to 0.0
    :return: exp(value), or 0.0 for a falsy value
    """
    from math import exp
    if not value:
        return 0.0
    return exp(value)

### Corpus Pre-Processing
- Pre-process training and test sets using [corpus and lexicon preprocessing functions](corpus_pp_python.ipynb) to:
    - add sentence begin & end tags
    - handle unknown words (e.g. frequency cut-off)

In [8]:
def read_corpus(corpus_file):
    """
    read corpus into a list-of-lists, splitting sentences into tokens on whitespace
    :param corpus_file: corpus file in sentence-per-line format (tokenized)
    :return: corpus as list of lists (a blank line becomes an empty list)
    """
    # the with-block closes the file even when reading fails
    with open(corpus_file, 'r') as handle:
        return [line.strip().split() for line in handle]

In [9]:
# corpus files in sentence-per-line format, read from the working directory
trn = 'NL2SparQL4NLU.trn.data'
tst = 'NL2SparQL4NLU.tst.data'

# load training and test corpora as lists of token lists
trn_data, tst_data = (read_corpus(path) for path in (trn, tst))

### Ngram Modeling

#### Counting ngrams & Reporting most frequent

In [10]:
# bigram counts over the training corpus
counts = ngramcount(trn_data, n=2)

# frequency of every counted bigram, keyed by its "+"-joined words
frequencies = {"+".join(ngram): counts.get(ngram).count for ngram in counts.traverse()}
print(nbest(frequencies, n=5))

{'<s>+what': 511, '<s>+show': 450, 'show+me': 377, 'movies+</s>': 333, '<unk>+</s>': 284}


#### Querying Ngrams & Reporting Probabilities

In [11]:
# raw (non-log) probabilities; ngramprobs returns the trie it was given,
# so `probs` and `counts` are the same object
probs = ngramprobs(counts, logs=False)

queries = (
    ['of', 'the'],
    ['is', 'the'],
    ['the', 'play'],  # not in training data
)
for query in queries:
    print(probs.get(query).probability)

# every stored ngram that starts with 'italy'
italy_node = probs.get(['italy'])
for ngram in probs.traverse(node=italy_node, sequence=['italy']):
    print(ngram, probs.get(ngram).probability)

0.30642504118616143
0.3582089552238806
0.0
['italy', '</s>'] 0.6
['italy', 'make'] 0.2
['italy', 'in'] 0.2


### Ngram Sequence Scoring

In [12]:
def score(model, sentence):
    """
    score a sentence given ngram model
    :param model: trie ngram model with probabilities (uses its `size` and `logs` attributes)
    :param sentence: sentence as a list of tokens
    :return: sum of ngram log probabilities if model.logs is set, product of raw probabilities otherwise
    """
    from numpy import prod
    ngram_scores = []
    for ngram in ngrams(sentence, model.size):
        ngram_scores.append(model.get(ngram).probability)
    if model.logs:
        return sum(ngram_scores)
    return prod(ngram_scores)

In [13]:
sent1 = ['<s>', 'star', 'of', 'twilight', '</s>']
sent2 = ['<s>', 'star', 'of', 'thor', '</s>']  ## not in training data
sent3 = ['<s>', 'star', 'of', '<unk>', '</s>']

# score each example sentence with the bigram model
for candidate in (sent1, sent2, sent3):
    print(score(probs, candidate))

1.60719558392048e-07
0.0
1.0876422746511928e-06


### Ngram Sequence Generation

In [14]:
def generate(model, bos='<s>', eos='</s>'):
    """
    generate a random sequence from ngram model
    the next word is drawn uniformly from the words seen after the current
    history; the stored probabilities are not used for sampling
    :param model: trie ngram model (uses its `size` attribute and `get` method)
    :param bos: beginning-of-sentence tag
    :param eos: end-of-sentence tag
    :return: sentence as list of words, starting with bos; it ends with eos
             unless the model offers no continuation for the last history
    """
    import random
    history_size = model.size - 1
    word = bos
    sent = [bos]
    while word != eos:
        # sent[-0:] is the whole sentence, so a unigram model needs an
        # explicitly empty history
        history = sent[-history_size:] if history_size > 0 else []
        candidates = list(model.get(history).children)
        if not candidates:
            # dead end: nothing was ever seen after this history
            break
        word = random.choice(candidates)
        sent.append(word)
    return sent

In [15]:
# print five random sentences, each with its score under the model
for i in range(5):
    s = generate(probs)
    sentence_score = score(probs, s)
    sentence_text = " ".join(s)
    print("({}): {}".format(sentence_score, sentence_text))

(4.613597629256453e-17): <s> g movies steven spielberg start making how popular japanese in france </s>
(1.582749916339163e-18): <s> bring in country the best films by directors in china </s>
(7.439710837198544e-15): <s> type of recent pg rating did my kid </s>
(4.743777307478727e-09): <s> person who made over street </s>
(1.5130332685755095e-05): <s> rated </s>


### Scoring Test Set

In [16]:
# let's use all-on
lm = ngramprobs(counts, logs=True, smoothing=True)

for sent in tst_data[:10]:
    print("({}): {}".format(score(lm, sent), " ".join(sent)))

(-17.936125948586973): <s> star of <unk> </s>
(-27.42784311979704): <s> who is in the movie the campaign </s>
(-34.34157079825078): <s> list the cast of the movie the campaign </s>
(-21.79617068101572): <s> who was in twilight </s>
(-16.453029986807024): <s> who is in <unk> </s>
(-24.4942619985087): <s> actor from lost </s>
(-29.630631881867696): <s> who played in the movie rocky </s>
(-32.10468394242209): <s> who played in the movie captain america </s>
(-29.33155184726785): <s> cast and crew for in july </s>
(-27.049305109495606): <s> who is in movie in july </s>
