In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from IPython.display import display

## Language Model
- A model that assigns a probability to a sequence of words (a sentence)
    + Unigram
    + Bigram
    + Trigram
    + Ngram

## Bigram
- 2 consecutive words in a sentence
    - Eg.: "The quick brown fox jumps over the lazy dog." -> Bigram:
        + The quick
        + Quick brown
        + Brown fox
        + Fox jumps
        + ...
- Bigram model: the probability of a word given the word that immediately precedes it
$$p(w_t|w_{t-1})$$
    - Eg:
        + p(brown|quick) = 0.5
        + p(the|the) = 0
- How to build a bigram model, e.g. p(brown|quick):
    - Count how many times 'quick' -> 'brown' appears
    - Count how many times 'quick' appears
$$p(brown|quick) = \frac{count(quick \rightarrow brown)}{count(quick)}$$

## Language model: chain rule (repeated application of Bayes' rule)
$$p(ABC) = p(C|AB)p(AB) = p(C|AB)p(B|A)p(A)$$
- Trigram
$$p(C|AB) = \frac{count(ABC)}{count(AB)}$$
- Bigram
$$p(B|A) = \frac{count(AB)}{count(A)}$$
- Unigram
$$p(A) = \frac{count(A)}{corpus\ length}$$

#### Longer sentence
- Longer sentence
$$p(ABCDE) = p(E|ABCD)p(D|ABC)p(C|AB)p(B|A)p(A)$$
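- Long sentence using only bigrams (Markov assumption: each word depends only on the previous word, so the sentence breaks down into short phrases that are far more likely to appear in the corpus)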
$$p(ABCDE) = p(E|D)p(D|C)p(C|B)p(B|A)p(A)$$

#### Problems with Language model
- p(dog|the quick brown fox jumps over the) > 0: because this word sequence appears in the corpus
- p(turtle|the quick brown fox jumps over the) = 0: because this word sequence never appears in the corpus, even though 'turtle' is a perfectly valid continuation
- Solution: Smoothing
$$p_{smooth}(B|A) = \frac{count(AB) + 1}{count(A) + V}$$
    + V: vocab size = number of distinct words

## Dataset - Brown Corpus

In [2]:
from nltk.corpus import brown
import operator

KEEP_WORDS = set([
    'king', 'man', 'queen', 'woman',
    'italy', 'rome', 'france', 'paris',
    'london', 'britain', 'england',
])

def get_sentences_with_word2idx_limit_vocab(n_vocab=2000, keep_words=KEEP_WORDS):
    # returns 57340 of the Brown corpus
    # each sentence is represented as a list of individual string tokens
    sentences = brown.sents()
    indexed_sentences = []

    i = 2
    word2idx = {'START': 0, 'END': 1}
    idx2word = ['START', 'END']

    word_idx_count = {
        0: float('inf'),
        1: float('inf'),
    }

    for sentence in sentences:
        indexed_sentence = []
        for token in sentence:
            token = token.lower()
            if token not in word2idx:
                idx2word.append(token)
                word2idx[token] = i
                i += 1

            # keep track of counts for later sorting
            idx = word2idx[token]
            word_idx_count[idx] = word_idx_count.get(idx, 0) + 1

            indexed_sentence.append(idx)
        indexed_sentences.append(indexed_sentence)

    # restrict vocab size

    # set all the words I want to keep to infinity
    # so that they are included when I pick the most
    # common words
    for word in keep_words:
        word_idx_count[word2idx[word]] = float('inf')

    sorted_word_idx_count = sorted(
        word_idx_count.items(),
        key=operator.itemgetter(1),
        reverse=True)
    word2idx_small = {}
    new_idx = 0
    idx_new_idx_map = {}
    for idx, count in sorted_word_idx_count[:n_vocab]:
        word = idx2word[idx]

        word2idx_small[word] = new_idx
        idx_new_idx_map[idx] = new_idx
        new_idx += 1

    # let 'unknown' be the last token
    word2idx_small['UNKNOWN'] = new_idx 
    unknown = new_idx

    assert('START' in word2idx_small)
    assert('END' in word2idx_small)
    for word in keep_words:
        assert(word in word2idx_small)

    # map old idx to new idx
    sentences_small = []
    for sentence in indexed_sentences:
        if len(sentence) > 1:
            new_sentence = [
                idx_new_idx_map[idx] if idx in idx_new_idx_map 
                else unknown 
                    for idx in sentence]
            sentences_small.append(new_sentence)

    return sentences_small, word2idx_small

#### Load dataset

In [3]:
# Load the indexed corpus, keeping 10000 vocabulary entries
# (the loader adds one extra 'UNKNOWN' entry on top of that).
sentences, word2idx = get_sentences_with_word2idx_limit_vocab(10000)

# vocab size (counts START, END and UNKNOWN as well)
V = len(word2idx)

# Sanity check: one indexed sentence plus the corpus and vocabulary sizes
display(sentences[10])
print('Number of sentences:', len(sentences))
print('Vocab size:', V)

[28, 3111, 21, 13, 249, 26, 172, 893, 18, 6629, 27, 38, 315, 15]

Number of sentences: 57013
Vocab size: 10001


#### Rebuild

In [4]:
# Reverse index: map each integer id back to its word
idx2word = dict((v, k) for k, v in word2idx.items())

In [5]:
def rebuild_sentence(sentence):
    '''
    Turn a sequence of word ids back into a space-separated string.

    Each id is looked up in the notebook-level `idx2word` mapping, so an id
    that is not present there makes the lookup fail.
    '''
    words = [idx2word[word_id] for word_id in sentence]
    return ' '.join(words)

In [6]:
## Rebuild sentences[0]: decode the ids back to words to sanity-check the mapping
rebuild_sentence(sentences[0])

"the fulton county grand jury said friday an investigation of UNKNOWN recent primary election produced `` no evidence '' that any irregularities took place ."

In [7]:
## Rebuild sentences[1]: words outside the kept vocabulary show up as UNKNOWN
rebuild_sentence(sentences[1])

"the jury further said in UNKNOWN UNKNOWN that the city executive committee , which had over-all charge of the election , `` deserves the praise and thanks of the city of atlanta '' for the manner in which the election was conducted ."

## Bigram Model

## $$p_{smooth}(B|A) = \frac{count(AB) + smoothing}{count(A) + smoothing \cdot V}$$

In [8]:
def get_bigram_probs(sentences, V, start_idx, end_idx, smoothing=1):
    '''
    Build the smoothed bigram probability matrix.

    structure of the returned (V, V) matrix:
        [previous word, current word] --> p(current word | previous word)

    Every cell starts at `smoothing` (additive smoothing; 1 gives add-one)
    and is incremented once per observed transition. Each sentence is
    counted as START -> w_1 -> ... -> w_T -> END; an empty sentence adds
    nothing. Rows are then normalized so that each one sums to 1.

    note: no transition is counted out of the END token (unless END itself
    occurs inside a sentence), so its row is left uniform.

    sentences : iterable of sequences of word ids in [0, V)
    V         : vocabulary size
    start_idx : id of the START token
    end_idx   : id of the END token
    smoothing : pseudo-count added to every (previous, current) pair
    '''
    counts = np.ones((V, V)) * smoothing

    for sentence in sentences:
        # the word before the first token is the START marker
        prev = start_idx
        for word in sentence:
            counts[prev, word] += 1
            prev = word

        # close the sentence with last word -> END (skipped when empty)
        if len(sentence) > 0:
            counts[prev, end_idx] += 1

    # normalize the counts along the rows to get probabilities
    counts /= counts.sum(axis=1, keepdims=True)
    return counts

In [9]:
# a matrix where:
# row = last word
# col = current word
# value at [row, col] = p(current word | last word)
# NOTE: this is a dense V x V array, so its memory use grows with V squared
bigram_probs = get_bigram_probs(
    sentences, V, 
    start_idx=word2idx['START'],
    end_idx=word2idx['END'],
    smoothing=0.1)

# one row and one column per vocabulary entry
bigram_probs.shape

(10001, 10001)

## Evaluation

- Probability score
    + p in range [0,1]
## $$p(w_1, w_2, ..., w_T) = p(w_1)\prod_{t=2}^{T}p(w_t|w_{t-1})$$

- Log Probability score
    + always non-positive (<= 0)
    + shorter sentences tend to have a higher log-prob score (fewer negative terms are summed)
## $$log(p(w_1, w_2, ..., w_T)) = log(p(w_1)) + \sum_{t=2}^{T}log(p(w_t|w_{t-1}))$$

- Normalized Log Probability score
    + always non-positive (<= 0)
    + long and short sentences get comparable scores
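## $$\frac{1}{T}log(p(w_1, w_2, ..., w_T)) = \frac{1}{T}[log(p(w_1)) + \sum_{t=2}^{T}log(p(w_t|w_{t-1}))]$$

- In the code below the sentence is padded with START and END tokens: $p(w_1)$ becomes $p(w_1|START)$, an extra $p(END|w_T)$ term is added, and the sum is therefore divided by $T+1$ instead of $T$.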

In [10]:
# Ids of the sentence-boundary markers, read by get_score as globals
start_idx = word2idx['START']
end_idx = word2idx['END']

## Normalized Log Probability
def get_score(sentence, probs=None, start=None, end=None):
    '''
    a function to calculate normalized log prob score for a sentence

    The sentence is scored along START -> w_1 -> ... -> w_T -> END and the
    summed log-probability is divided by the number of transitions (T + 1),
    so sentences of different lengths are comparable.

    An empty sentence is scored as the single transition START -> END
    instead of raising IndexError.

    sentence : sequence of word ids (list, tuple or 1-D integer array)
    probs    : bigram matrix, [previous, current] -> p(current | previous);
               defaults to the notebook-level `bigram_probs`
    start    : id of the START token; defaults to the notebook-level `start_idx`
    end      : id of the END token; defaults to the notebook-level `end_idx`

    returns the average log-probability per transition (always <= 0)
    '''
    # fall back to the notebook-level model when no explicit one is given
    if probs is None:
        probs = bigram_probs
    if start is None:
        start = start_idx
    if end is None:
        end = end_idx

    # pad with the boundary markers so every transition is handled uniformly
    path = [start, *sentence, end]

    score = 0
    for prev, curr in zip(path[:-1], path[1:]):
        score += np.log(probs[prev, curr])

    # normalize by the number of transitions, i.e. len(sentence) + 1
    return score / (len(path) - 1)

### Evaluate real sentences (from Brown corpus dataset)

In [11]:
# Pick one corpus sentence uniformly at random and show its text and score.
# No seed is set in this cell, so the chosen sentence changes between runs.
real_idx = np.random.choice(len(sentences))
real = sentences[real_idx]

display(rebuild_sentence(real))
display(get_score(real))

"you are still in france '' ."

-3.921470607899102

In [12]:
# Second random corpus sentence (unseeded draw, so it changes between runs)
real_idx = np.random.choice(len(sentences))
real = sentences[real_idx]

display(rebuild_sentence(real))
display(get_score(real))

'in fact , the whole generation of the founding fathers of UNKNOWN -- UNKNOWN , monk , davis , UNKNOWN , and the rest -- are just now at a considerable discount .'

-4.8694435452005305

In [13]:
# Third random corpus sentence (unseeded draw, so it changes between runs)
real_idx = np.random.choice(len(sentences))
real = sentences[real_idx]

display(rebuild_sentence(real))
display(get_score(real))

"the UNKNOWN screw machine , now known as the brown & sharpe hand screw machine , takes its ancestry directly from mr. brown's efforts to introduce equipment to simplify the manufacture of the sewing machine ."

-5.667113926377538

### Evaluate fake sentences (random words)

In [14]:
# Build a 5-word "sentence" from ids drawn at random, with replacement, from
# the whole vocabulary -- so START, END and UNKNOWN can be drawn too.
fake = np.random.choice(V, size=5)

display(rebuild_sentence(fake))
display(get_score(fake))

'probable dug roar lyrics coming'

-9.12586696592133

In [15]:
# Same experiment with 10 random word ids (drawn with replacement)
fake = np.random.choice(V, size=10)

display(rebuild_sentence(fake))
display(get_score(fake))

"coins grace 1954 leather simplify suffered victor plato's fibers authenticity"

-9.600882899836927

In [16]:
# Same experiment with 15 random word ids (drawn with replacement)
fake = np.random.choice(V, size=15)

display(rebuild_sentence(fake))
display(get_score(fake))

'uncle vivid carla cups fortunately deeply resources amusing concede manufacture saved liquor oh book consonantal'

-9.248811743297562

## Evaluate custom sentences

In [17]:
def evaluate_custom_sentence(custom):
    '''
    Print the normalized log-prob score of a user-typed sentence.

    The text is lower-cased and split on whitespace; every resulting token
    must be a key of the notebook-level `word2idx`. The tokens are then
    converted to ids and scored with `get_score`.

    Nothing is returned; the outcome is printed:
      - empty / whitespace-only input -> an "empty sentence" message
        (previously this reached get_score and raised IndexError)
      - any token missing from the vocabulary -> a "not in the vocabulary"
        message
      - otherwise -> "SCORE: <value>"

    custom : str, the raw sentence
    '''
    tokens = custom.lower().split()

    # guard: with no tokens there is nothing to score
    if not tokens:
        print("Sorry, you entered an empty sentence")
        return

    # any() stops at the first out-of-vocabulary token
    if any(token not in word2idx for token in tokens):
        print("Sorry, you entered words that are not in the vocabulary")
        return

    # convert sentence into list of indexes
    indexed = [word2idx[token] for token in tokens]
    print("SCORE:", get_score(indexed))

In [18]:
# Mixed-case input is fine: the helper lower-cases before the vocabulary lookup
my_sentence = 'my name is Peter'
evaluate_custom_sentence(my_sentence)

SCORE: -7.512908229477057


In [19]:
# A short imperative sentence
my_sentence = 'Go to school'
evaluate_custom_sentence(my_sentence)

SCORE: -7.130362607114305


In [20]:
# A longer sentence; the score is normalized by length, so it stays comparable
my_sentence = 'I love you so much do you know'
evaluate_custom_sentence(my_sentence)

SCORE: -5.73929340860894
