# Lecture 3: Language Modeling

### Underflow issues

In [1]:
# Six very small probabilities. Their exact product is 5e-400, far below the
# smallest positive value a 64-bit float can hold (roughly 5e-324).
a = 1e-10
b = 1e-90
c = 1e-30
d = 5e-130
e = 1e-40
f = 1e-100
# Multiplying them directly therefore underflows to exactly 0.0.
a*b*c*d*e*f

0.0

In [2]:
import numpy as np 

In [3]:
# log(a*b*c*d*e*f) == log(a) + ... + log(f): adding natural logs instead of
# multiplying the raw values keeps the result comfortably representable.
np.log(a) + np.log(b) + np.log(c) + np.log(d) + np.log(e) + np.log(f)

-919.4245992851843

### Review: Defaultdict and Counter

In [4]:
from collections import defaultdict, Counter 

In [5]:
# Below, we'll implement our language model as a defaultdict of Counters
lm = defaultdict(Counter)
# keys: tuple of the previous N-1 words
# values: Counter
    # keys: w_n
    # values: number of times w_n was seen right after those previous N-1 words

# trigram language model (N=3, so each key is a pair of words)
lm[('the', 'dog')]['ate'] += 1
lm[('the', 'dog')]['slept'] += 1
lm[('the', 'dog')]['ate'] += 1  # second sighting of 'ate' -> its count becomes 2

In [6]:
lm[('the', 'dog')]['ate']

2

In [7]:
#helpful because you don't get errors with keys that don't exist 
lm[('the', 'dog')]['drank']

0

### LM from Shakespeare

#### 1. Load and preprocess

In [8]:
import re 
import random 

In [9]:
# Downloading the complete works of William Shakespeare 
# Then Katie manually removed the preamble 

#https://www.kaggle.com/datasets/kewagbln/shakespeareonline

In [11]:
# Read the whole corpus into a single string. The `with` block closes the file
# handle as soon as the read finishes instead of leaving it open until the
# object happens to be garbage-collected.
with open("data/t8.shakespeare-remove-preamble.txt") as shakespeare_file:
    shakespeare_text = shakespeare_file.read()

In [12]:
type(shakespeare_text)

str

In [13]:
shakespeare_text[0:100] #first 100 characters

'1609\n\nTHE SONNETS\n\nby William Shakespeare\n\n\n\n                     1\n  From fairest creatures we desi'

In [14]:
#look at some of the text  
print(shakespeare_text[20000:21000])

xpense of many a vanished sight.
  Then can I grieve at grievances foregone,
  And heavily from woe to woe tell o'er
  The sad account of fore-bemoaned moan,
  Which I new pay as if not paid before.
    But if the while I think on thee (dear friend)
    All losses are restored, and sorrows end.


                     31  
  Thy bosom is endeared with all hearts,
  Which I by lacking have supposed dead,
  And there reigns love and all love's loving parts,
  And all those friends which I thought buried.
  How many a holy and obsequious tear
  Hath dear religious love stol'n from mine eye,
  As interest of the dead, which now appear,
  But things removed that hidden in thee lie.
  Thou art the grave where buried love doth live,
  Hung with the trophies of my lovers gone,
  Who all their parts of me to thee did give,
  That due of many, now is thine alone.
    Their images I loved, I view in thee,
    And thou (all they) hast all the all of me.


                     32
  If thou survive m

In [15]:
# Simple tokenization: lowercase the text, then split on runs of non-word characters.
# \W matches anything that is not a letter, digit or underscore, so apostrophes
# break contractions and possessives apart (e.g. "love's" -> 'love', 's').
# NOTE(review): re.split returns an empty string '' as the first/last token when the
# text starts/ends with a non-word character -- check tokens[0] and tokens[-1].
tokens = re.split(r'\W+', shakespeare_text.lower())

In [16]:
# Tokens = every item in the token list; word types = distinct token strings.
print('Total number of tokens=', len(tokens))
print('Total number of word types=', len(set(tokens)))

Total number of tokens= 927705
Total number of word types= 23724


In [17]:
#look at some of the tokens
tokens[105:115]

['else', 'this', 'glutton', 'be', 'to', 'eat', 'the', 'world', 's', 'due']

In [18]:
from typing import List, Tuple
# List and Tuple are generic types from the typing module that allow you to specify what type 
# of elements should be inside these collections

#### 2. Create n-grams

In [19]:
def create_ngrams(toks: list, N: int)-> List[tuple]:
    """
    Slide a window of width N over `toks`, in the order the tokens appear in
    the corpus, and collect every n-gram.

    Args:
        toks: sequence of tokens
        N: size of each n-gram (2 = bigrams, 3 = trigrams, ...)

    Returns: list of tuples of n-grams, one per window position
        (len(toks)-N+1 of them); empty when `toks` has fewer than N tokens.

    Example bigram (N=2) output:
        [('else', 'this'),
         ('this', 'glutton'),
         ('glutton', 'be'),
         ('be', 'to'),
         ('to', 'eat'),]
    """
    # The window count must come from the `toks` argument itself, not from a
    # notebook-level variable, so the function works on any token list.
    return [tuple(toks[i:i+N]) for i in range(len(toks)-N+1)]

In [20]:
bigrams = create_ngrams(tokens, 2)
bigrams[105:115]

[('else', 'this'),
 ('this', 'glutton'),
 ('glutton', 'be'),
 ('be', 'to'),
 ('to', 'eat'),
 ('eat', 'the'),
 ('the', 'world'),
 ('world', 's'),
 ('s', 'due'),
 ('due', 'by')]

In [21]:
trigrams = create_ngrams(tokens, 3)
trigrams[105:115]

[('else', 'this', 'glutton'),
 ('this', 'glutton', 'be'),
 ('glutton', 'be', 'to'),
 ('be', 'to', 'eat'),
 ('to', 'eat', 'the'),
 ('eat', 'the', 'world'),
 ('the', 'world', 's'),
 ('world', 's', 'due'),
 ('s', 'due', 'by'),
 ('due', 'by', 'the')]

In [22]:
bigram_counts = Counter(bigrams)

In [23]:
# ranked list: the `topk` most frequent bigrams, highest count first
topk = 10
bigram_counts.most_common(topk)

[(('i', 'am'), 1855),
 (('i', 'll'), 1745),
 (('of', 'the'), 1715),
 (('my', 'lord'), 1666),
 (('in', 'the'), 1643),
 (('i', 'have'), 1620),
 (('i', 'will'), 1566),
 (('to', 'the'), 1430),
 (('it', 'is'), 1078),
 (('to', 'be'), 973)]

In [24]:
trigram_counts = Counter(trigrams)

In [25]:
# ranked list: the `topk` most frequent trigrams, highest count first
topk = 10
trigram_counts.most_common(topk)

[(('i', 'pray', 'you'), 242),
 (('so', 'long', 'as'), 234),
 (('of', 'the', 'complete'), 219),
 (('the', 'complete', 'works'), 219),
 (('complete', 'works', 'of'), 219),
 (('works', 'of', 'william'), 219),
 (('of', 'william', 'shakespeare'), 219),
 (('this', 'electronic', 'version'), 218),
 (('electronic', 'version', 'of'), 218),
 (('version', 'of', 'the'), 218)]

In [26]:
def probabilities(word2counts: Counter)-> List[tuple]:
    """
    Helper function: turn raw counts into relative frequencies.

    Input: a Counter (or any dict-like mapping) with
        keys = items that were counted (words / n-grams)
        values = count of each item

    Returns: List of (item, probability) in the mapping's iteration order,
        where probability = count / sum of all counts

    Example:
        >>> word2counts = {'too': 1, 'dost': 1, 'resemble': 1, 'grow': 1}
        >>> probabilities(word2counts)
        [('too', 0.25), ('dost', 0.25), ('resemble', 0.25), ('grow', 0.25)]
    """
    # Shared denominator for every entry, as a float
    denominator = float(sum(word2counts.values()))
    return [(item, tally/denominator) for item, tally in word2counts.items()]

In [27]:
word2counts = {'too': 1, 'dost': 1, 'resemble': 1, 'grow': 1, 'prove': 1, 'was': 1, 'put': 1, 'are': 1}
probabilities(word2counts)

[('too', 0.125),
 ('dost', 0.125),
 ('resemble', 0.125),
 ('grow', 0.125),
 ('prove', 0.125),
 ('was', 0.125),
 ('put', 0.125),
 ('are', 0.125)]

In [28]:
def train_lm(tokens: List, N: int) -> dict:
    """
    Train an N-gram language model by counting which word follows each
    context of N-1 words, then normalizing the counts per context.

    Args:
        tokens: the corpus as a list of tokens, in order
        N: n-gram order, must be >= 1 (N=1 uses the empty tuple () as the
           only context)

    Returns: Language model dict
        keys = previous N-1 words (as a tuple)
        values = List of tuples
            first item = word w_n
            second item = probability w_n given previous N-1 words (keys)
        Empty dict when `tokens` has fewer than N tokens.

    Raises:
        ValueError: if N < 1

    Example (N=3):
        {('sweet', 'self'):
              [('too', 0.25),
               ('dost', 0.25),
               ('resemble', 0.25),
               ('grow', 0.25)]
        }
    """
    # With N < 1 the slice/index arithmetic below goes negative and silently
    # pairs unrelated tokens, so reject it up front.
    if N < 1:
        raise ValueError(f"N must be at least 1, got {N}")

    context_size = N - 1
    lm = defaultdict(Counter)

    # get the counts: for each window, context -> next word
    for i in range(len(tokens) - context_size):
        context = tuple(tokens[i:i+context_size])
        next_word = tokens[i+context_size]
        lm[context][next_word] += 1

    # turn counts into probabilities, one distribution per context
    return {context: probabilities(next_counts) for context, next_counts in lm.items()}

In [29]:
outlm = train_lm(tokens, 4)

In [30]:
type(outlm)

dict

In [31]:
#just visualize some of the outputs
list(outlm.items())[200:230]

[(('of', 'mine', 'shall'),
  [('sum', 0.3333333333333333),
   ('be', 0.3333333333333333),
   ('give', 0.3333333333333333)]),
 (('mine', 'shall', 'sum'), [('my', 1.0)]),
 (('shall', 'sum', 'my'), [('count', 1.0)]),
 (('sum', 'my', 'count'), [('and', 1.0)]),
 (('my', 'count', 'and'), [('make', 1.0)]),
 (('count', 'and', 'make'), [('my', 1.0)]),
 (('and', 'make', 'my'),
  [('old', 0.14285714285714285),
   ('wars', 0.14285714285714285),
   ('misery', 0.14285714285714285),
   ('image', 0.14285714285714285),
   ('vouch', 0.14285714285714285),
   ('challenge', 0.14285714285714285),
   ('seated', 0.14285714285714285)]),
 (('make', 'my', 'old'), [('excuse', 1.0)]),
 (('my', 'old', 'excuse'), [('proving', 1.0)]),
 (('old', 'excuse', 'proving'), [('his', 1.0)]),
 (('excuse', 'proving', 'his'), [('beauty', 1.0)]),
 (('proving', 'his', 'beauty'), [('by', 1.0)]),
 (('his', 'beauty', 'by'), [('succession', 1.0)]),
 (('beauty', 'by', 'succession'), [('thine', 1.0)]),
 (('by', 'succession', 'thine'), [

The generation code is covered in Lecture 4.

Acknowledgements: 
- Some of this code was adapted from Yoav Goldberg's [character-level language model](https://nbviewer.org/gist/yoavg/d76121dfde2618422139)