# Lecture 4: Wrapping up Language Modeling

### Rolling a weighted die

In [1]:
import numpy as np 
from collections import Counter

In [2]:
# Roll a fair six-sided die: with no `p` argument, every face is equally likely.
sample = np.random.choice([1, 2, 3, 4, 5, 6])
sample

6

In [3]:
# Rolling a "weighted die": face 1 comes up with probability 0.5,
# each of the other five faces with probability 0.1.
sample = np.random.choice([1, 2, 3, 4, 5, 6], p=[0.5, 0.1, 0.1, 0.1, 0.1, 0.1])
sample

4

In [4]:
# Roll the weighted die 1000 times and tally how often each face comes up;
# the tallies should be roughly proportional to the weights.
die_faces = [1, 2, 3, 4, 5, 6]
die_weights = [0.5, 0.1, 0.1, 0.1, 0.1, 0.1]

counts = Counter()
for _ in range(1000):
    sample = np.random.choice(die_faces, p=die_weights)
    counts.update([sample])
print(counts)

Counter({1: 527, 2: 106, 3: 102, 4: 93, 5: 87, 6: 85})


In [5]:
# Generating a word works just like rolling the weighted die: each candidate
# word is paired with the probability of drawing it.
distribution = [('world', 0.7), ('skirts', 0.1), ('tables', 0.2)]
words = [token for token, _prob in distribution]
probs = [prob for _token, prob in distribution]

# Simulation with 10,000 trials: the tallies should come out roughly
# proportional to the probabilities above.
count_chosen = Counter()
for _ in range(10000):
    word = np.random.choice(words, p=probs)
    count_chosen.update([word])

print(count_chosen)

Counter({'world': 5023, 'tables': 2984, 'skirts': 1993})


### LM from Shakespeare

In [6]:
import numpy as np 
from collections import defaultdict, Counter 
import re 
import random 
from typing import List, Tuple

In [7]:
# Read the whole corpus into memory. The context manager closes the file handle
# as soon as the read finishes instead of leaving it open.
with open("t8.shakespeare-remove-preamble.txt") as corpus_file:
    shakespeare_text = corpus_file.read()

In [8]:
# Simple tokenization: lowercase the text, then keep every maximal run of word
# characters (letters, digits, underscore). Unlike re.split(r'\W+', ...), findall
# never produces empty-string tokens when the text starts or ends with
# punctuation or whitespace, so no '' "word" leaks into the language model.
tokens = re.findall(r'\w+', shakespeare_text.lower())

In [11]:
def probabilities(word2counts: Counter)-> List[tuple]:
    """
    Helper: normalize raw counts into relative frequencies.

    Input: a Counter with
        keys = words (the observed continuations of some context)
        values = how many times each word was observed

    Returns: List of (word, probability) tuples in the Counter's iteration
        order, where probability = count / sum of all counts

    Example:
        >>> word2counts = {'too': 1, 'dost': 1, 'resemble': 1, 'grow': 1}
        >>> probabilities(word2counts)
        [('too', 0.25), ('dost', 0.25), ('resemble', 0.25), ('grow', 0.25)]
    """
    denominator = float(sum(word2counts.values()))
    return [(item, n / denominator) for item, n in word2counts.items()]

In [12]:
def train_lm(tokens: List, N: int) -> dict:
    """
    Builds an N-gram language model from a token sequence.

    Input:
        tokens = the corpus as a list of words, in order
        N = n-gram order (each word is conditioned on the N-1 words before it)

    Returns: Language model dict
        keys = tuple of the previous N-1 words
        values = List of tuples
            first item = word w_n
            second item = probability of w_n given the previous N-1 words (key)

    Example (N=3):
        {('sweet', 'self'):
              [('too', 0.25),
               ('dost', 0.25),
               ('resemble', 0.25),
               ('grow', 0.25)]
        }
    """
    context_len = N - 1
    ngram_counts = defaultdict(Counter)

    # Slide a window of N tokens over the corpus: the first N-1 tokens form the
    # context, the last one is the word that followed it.
    for start in range(len(tokens) - context_len):
        split = start + context_len
        context = tuple(tokens[start:split])
        ngram_counts[context][tokens[split]] += 1

    # Turn each context's follower counts into probabilities.
    model = {}
    for context, followers in ngram_counts.items():
        model[context] = probabilities(followers)
    return model

In [13]:
# Train a 4-gram model: each key is a tuple of the 3 preceding words.
outlm = train_lm(tokens, 4)

In [14]:
# The trained model is a plain dict, not the defaultdict used while counting.
type(outlm)

dict

In [15]:
# Peek at a small slice of the model's (context, next-word distribution) entries.
list(outlm.items())[220:230]

[(('be', 'new', 'made'), [('when', 1.0)]),
 (('new', 'made', 'when'), [('thou', 1.0)]),
 (('made', 'when', 'thou'), [('art', 1.0)]),
 (('when', 'thou', 'art'),
  [('old', 0.14285714285714285),
   ('all', 0.07142857142857142),
   ('gone', 0.07142857142857142),
   ('a', 0.07142857142857142),
   ('king', 0.35714285714285715),
   ('dead', 0.14285714285714285),
   ('timon', 0.07142857142857142),
   ('forth', 0.07142857142857142)]),
 (('thou', 'art', 'old'), [('and', 1.0)]),
 (('art', 'old', 'and'), [('see', 0.5), ('rich', 0.5)]),
 (('old', 'and', 'see'), [('thy', 1.0)]),
 (('and', 'see', 'thy'), [('blood', 0.5), ('master', 0.5)]),
 (('see', 'thy', 'blood'), [('warm', 1.0)]),
 (('thy', 'blood', 'warm'), [('when', 1.0)])]

#### 3. Generate

In [16]:
def generate_word(lm: dict, N: int, previous_words: list):
    """
    Samples a single next word from the learned language model.

    Input:
        lm = language model dict
            keys = tuple of context words
            values = List of (word, probability) tuples
        N = n-gram order (not used for the lookup; the context is taken
            directly from previous_words)
        previous_words = the context words to condition on

    Returns: one word, drawn according to the probabilities stored for this
        context, returned exactly as it is stored in lm

    Raises: KeyError if the context is not a key of lm
    """
    context = tuple(previous_words)
    try:
        distribution = lm[context]
    except KeyError:
        # Same exception type as a plain dict lookup, but with a message that
        # explains what went wrong.
        raise KeyError(
            "context {!r} is not in the language model; cannot generate a next word".format(context)
        ) from None
    words = [x[0] for x in distribution]
    probs = [x[1] for x in distribution]
    # Sample an index rather than the word itself: np.random.choice(words, ...)
    # converts the words to a numpy array and hands back a numpy string scalar
    # instead of the original Python object.
    idx = np.random.choice(len(words), p=probs)
    return words[idx]

In [17]:
# Only three next-word options were learned for the context ("hath", "in", "the").
outlm[("hath", "in", "the")]

[('world', 0.3333333333333333),
 ('skirts', 0.3333333333333333),
 ('tables', 0.3333333333333333)]

In [18]:
# Re-run this cell a few times: the sampled word varies because generation is random.
generate_word(outlm, 4, ["hath", "in", "the"])

'tables'

In [19]:
def generate_text(lm: dict, N: int, start_grams: list, nwords=100):
    """
    Generates nwords of text given:
        - the trained language model (lm)
        - the n-gram order (N)
        - the start grams: the N-1 words used as the initial context

    Returns: the generated words joined by single spaces (the start grams
        themselves are not included)

    Raises: AssertionError if start_grams does not contain exactly N-1 words
    """
    assert len(start_grams) == N-1, "start_grams must contain exactly N-1 words"
    # Work on a list copy so any sequence type (list or tuple) is accepted and
    # the caller's object is never aliased.
    previous_words = list(start_grams)
    out = []
    for _ in range(nwords):
        generated_word = generate_word(lm, N, previous_words)
        out.append(generated_word)
        # Slide the context window: append the new word, then drop the oldest.
        # Appending before slicing keeps the context at exactly N-1 words even
        # for N == 1, where the context must stay empty.
        previous_words = (previous_words + [generated_word])[1:]
    return " ".join(out)

In [22]:
# Generate 20 words from the 4-gram model, starting from a 3-word context.
generate_text(outlm, 4, ["hath", "in", "the"], nwords=20)

'tables of their thoughts to every ticklish reader set them down on two low stools and sew volumnia i pray'

Acknowledgements: 
- Some of this code was adapted from Yoav Goldberg's [character-level language model](https://nbviewer.org/gist/yoavg/d76121dfde2618422139)