# Lecture 2: word vector representation 

In [None]:
import nltk
import random
import string
import operator
import itertools
import collections
import numpy as np
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.svm import LinearSVC
from tqdm import tqdm

%matplotlib inline

## Exercise 1: tokenization

In this exercise you define a couple of tokenizers and use them on a toy sentence

In [None]:
test_sentence_1 = 'The quick brown fox jumps over the lazy dog.'

### 1.1 - delimiter tokenization

In [None]:
def tokenize_by_split(text):
    """Tokenizes a given string of text by splitting words by whitespace.

    Exercise stub: the code you add must assign the resulting list to
    ``tokens``. Punctuation is not treated specially here, so it stays
    attached to the neighbouring word.

    Args:
        text: the string to tokenize.

    Returns:
        A list of string tokens, in their original order.
    """
    # your code goes here
    return tokens

In [None]:
assert tokenize_by_split(test_sentence_1) == ['The', 'quick', 'brown', 'fox', 
                                              'jumps', 'over', 'the', 'lazy', 'dog.']
print('done')

### 1.2 - removing punctuation

In [None]:
def remove_punkt_and_tokenize_by_split(text):
    """Replaces punctuation in the given string of text with whitespace, then
    tokenizes it by splitting words by whitespace.

    Exercise stub: the code you add must assign the resulting list to
    ``tokens``.

    Args:
        text: the string to tokenize.

    Returns:
        A list of string tokens, in their original order.
    """
    # the characters that have to be replaced with whitespace
    punkt_symbols = string.punctuation
    # your code goes here
    return tokens

In [None]:
assert remove_punkt_and_tokenize_by_split(test_sentence_1) == ['The', 'quick', 'brown', 'fox', 
                                                               'jumps', 'over', 'the', 'lazy', 'dog']
print('done')

### 1.3 - using regular expression

In [None]:
test_sentence_2 = "This is a test that isn't so simple: 1.23"

In [None]:
def tokenize_by_regex(text):
    """Tokenizes a given string of text by applying the 'tokenize' method
    of the provided 'tokenizer' object.

    Exercise stub: the code you add must assign the resulting list to
    ``tokens``.

    Args:
        text: the string to tokenize.

    Returns:
        A list of string tokens, each one a run of word characters.
    """
    # raw string literal: backslash-w is not a valid Python string escape, so a
    # plain literal triggers an "invalid escape sequence" warning on import
    tokenizer = nltk.RegexpTokenizer(r'\w+')
    # your code goes here
    return tokens

In [None]:
assert tokenize_by_regex(test_sentence_2) == ['This', 'is', 'a', 'test', 'that', 
                                              'isn', 't', 'so', 'simple', '1', '23']
print('done')

### 1.4 - using an advanced tokenizer model

In [None]:
# download the PUNKT tokenizer model
nltk.download('punkt')

In [None]:
def tokenize_by_punkt_model(text):
    """Tokenizes a given string of text by applying the NLTK Punkt tokenizer model.
    Uses nltk.word_tokenize method.

    Exercise stub: the code you add must assign the resulting list to
    ``tokens``. The check cell below expects punctuation marks to become
    separate tokens, while contractions (``n't``) and decimal numbers
    (``1.23``) are kept in one piece.

    Args:
        text: the string to tokenize.

    Returns:
        A list of string tokens, in their original order.
    """
    # your code goes here
    return tokens

In [None]:
assert tokenize_by_punkt_model(test_sentence_2) == ['This', 'is', 'a', 'test', 'that', 
                                                    'is', "n't", 'so', 'simple', ':', '1.23']
print('done')

## Exercise 2: n-grams and stopwords

The Brown Corpus was the first million-word electronic corpus of English, created in 1961 at Brown University. First of all, let's get it!

In [None]:
nltk.download('brown')

The Brown corpus contains texts from different categories

In [None]:
nltk.corpus.brown.categories()

Sentences from each category can be accessed as follows

In [None]:
adv_sents = list(nltk.corpus.brown.sents(categories='adventure'))
print(len(adv_sents))

In [None]:
adv_sents[:2]

Let's see what the most frequent unigrams in the 'adventure' category are

In [None]:
# joins a list of lists of tokens into a one large string of text
adventure_text = ' '.join(list(itertools.chain.from_iterable(adv_sents)))

In [None]:
# uses the tokenizer function we've just written to tokenize text
adventure_tokens = tokenize_by_regex(adventure_text)

In [None]:
# turns each token to lowercase (simple normalization technique)
lowered_tokens = [token.lower() for token in adventure_tokens]
print(len(lowered_tokens))

In [None]:
# counts the number of occurrences of each unigram
word_counter = collections.Counter(lowered_tokens)

In [None]:
# number of most frequent unigrams shown on the plot
TOP_UNIGRAMS = 35

# query the counter once, so that labels and values come from the same ranking
top_unigrams = word_counter.most_common(TOP_UNIGRAMS)
labels = [unigram for unigram, _ in top_unigrams]
values = [count for _, count in top_unigrams]
indexes = np.arange(len(labels))

plt.figure(figsize=(20,10))
plt.bar(indexes, values)
plt.xticks(indexes, labels)
# a figure should be readable on its own, without the surrounding cells
plt.xlabel('unigram')
plt.ylabel('number of occurrences')
plt.title('Most frequent unigrams')
plt.show()

We notice that some of the most common words above are not very interesting!

### stop-words

In this exercise you remove stopwords, find the most frequent bigrams, then display them on a bar plot

In [None]:
# get the list of english stopwords
nltk.download('stopwords')

In [None]:
stopwords = set(nltk.corpus.stopwords.words('english'))
print(len(stopwords))

### 2.1 - filtering stop-words

In [None]:
# remove stopwords from the list of 'lowered_tokens'
# ('stopwords' is a set, so membership tests against it are cheap)
# keep the result a list in the original token order: the bigrams are built from it
# your code goes here
stopword_filtered_tokens = 

### 2.2 - getting the bigrams

In [None]:
# turn your filtered list of unigrams into a list of bigrams, joined by whitespace
# (every bigram becomes a single string, e.g. 'mary jane')
# to achieve that, use the function nltk.ngrams(your_tokens, 2)
# your code goes here
filtered_bigrams = 

### 2.3 - counting occurrences

In [None]:
# now count the occurrences of bigrams using a new Counter instance
# your code goes here
bigram_counter = 

# sanity check: these bigrams have to be among the 15 most frequent ones
assert {'miss langford', 'mary jane', 'billy tilghman'}.issubset(set(map(operator.itemgetter(0), 
                                                                         bigram_counter.most_common(15))))
print('done')

In [None]:
# number of most frequent bigrams shown on the plot
TOP_BIGRAMS = 15

# query the counter once, so that labels and values come from the same ranking
top_bigrams = bigram_counter.most_common(TOP_BIGRAMS)
labels = [bigram for bigram, _ in top_bigrams]
values = [count for _, count in top_bigrams]
indexes = np.arange(len(labels))

plt.figure(figsize=(20,10))
plt.bar(indexes, values)
plt.xticks(indexes, labels)
# a figure should be readable on its own, without the surrounding cells
plt.xlabel('bigram')
plt.ylabel('number of occurrences')
plt.title('Most frequent bigrams after stopword removal')
plt.show()

## Exercise 3: vocabularies

In this exercise you write a function that builds a vocabulary from the provided text corpus. Then you use it to encode tokens into numeric form

### 3.1 - building a vocabulary

In [None]:
def build_vocabulary(tokens, max_size):
    """
    Builds a dictionary of at most max_size most frequent tokens from the supplied list of tokens.
    More frequent tokens should have a lower id, but that is not strictly required.
    Two special symbols 'NULL':0 and 'UNKN':1 should also be added to the dictionary.

    NOTE: the check cell that follows asserts len(vocabulary) == max_size, so the
    two special symbols are expected to count towards max_size.

    Args:
        tokens: list of string tokens to build the vocabulary from.
        max_size: upper bound on the number of entries in the dictionary.

    Returns:
        dict mapping every kept token (str) to its integer id.
    
    EXAMPLE:
    {
        'NULL': 0,
        'UNKN': 1,
        'the': 2,
        'and': 3,
        'a': 4,
        ...
    }
    """
    vocabulary = {}
    # special symbols; per the docstring they take the ids 0 and 1
    reserved_symbols = ["NULL", "UNKN"]
    
    # your code goes here

    return vocabulary

In [None]:
VOC_SIZE = 5000

my_vocabulary = build_vocabulary(lowered_tokens, VOC_SIZE)

assert len(my_vocabulary) == VOC_SIZE
assert {'NULL', 'UNKN'}.issubset(set(my_vocabulary.keys()))
assert set([w[0] for w in word_counter.most_common(VOC_SIZE-10)]).issubset(set(my_vocabulary.keys()))
print('done')

### 3.2 - encoding tokens

In [None]:
def vectorize_tokens(sentence, tokenizer, token_to_id, max_len):
    """
    Converts a sentence to a list of token ids using the supplied tokenizer and dictionary.
    Pads resulting list with NULL identifiers up to max_len length.

    Args:
        sentence: the string of text to encode.
        tokenizer: callable turning a string into a list of tokens
            (the check cell passes tokenize_by_regex).
        token_to_id: dict mapping tokens to integer ids; it is expected to
            contain the special symbols 'NULL' and 'UNKN'.
        max_len: length the resulting list is padded to.

    Returns:
        list of integer token ids.

    NOTE(review): the behaviour for sentences with more than max_len tokens
    is not specified here - presumably they should be truncated; confirm.
    """
    ids = []
    
    # STEP 1: convert sentence to a list of tokens
    # your code goes here
    
    # STEP 2: replace tokens with their identifiers from the vocabulary
    # If the token is not present in the vocabulary, replace it with UNKN identifier

    # STEP 3: pad the sequence of ids with NULL identifiers so that its length is equal to max_len

    return ids

In [None]:
MAX_LEN = 16
test_sentence = 'The animals thundered away into the moonlight , heading for the ridges .'
vectorized = vectorize_tokens(test_sentence,
                              tokenize_by_regex, my_vocabulary, MAX_LEN)

assert len(vectorized) == MAX_LEN
assert [my_vocabulary.get(t, my_vocabulary['UNKN']) 
        for t in tokenize_by_regex(test_sentence)] + [0]*(MAX_LEN-len(tokenize_by_regex(test_sentence))) == vectorized
print('done')

## Exercise 4: one-hot encoding

In this exercise you create a function to compute sentence similarity, then build a simple Information Retrieval system

In [None]:
VOC_SIZE = 5000

adv_brown_sents = [' '.join(sent) for sent in nltk.corpus.brown.sents(categories='adventure')]
print(len(adv_brown_sents))

In [None]:
# initialize the TfidfVectorizer instance
# stop_words has to be a list: recent scikit-learn releases validate the
# parameter type and reject a set; sorting keeps the word order deterministic
tfidf_vectorizer = TfidfVectorizer(max_features=VOC_SIZE, stop_words=sorted(stopwords), lowercase=True)

In [None]:
# builds the vocabulary from the data
tfidf_vectorizer.fit(adv_brown_sents)

In [None]:
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on older releases
feature_names = (tfidf_vectorizer.get_feature_names_out()
                 if hasattr(tfidf_vectorizer, 'get_feature_names_out')
                 else tfidf_vectorizer.get_feature_names())

# maps a feature (column) index of the tf-idf matrix back to its word
tfidf_vectorizer_vocab = dict(enumerate(feature_names))

In [None]:
# encodes the provided data with tf-idf weights, transforming sentences into sparse vectors
vectorized_sents = tfidf_vectorizer.transform(adv_brown_sents)

# the resulting matrix has shape (N_SAMPLES x number of features), with at most VOC_SIZE features
vectorized_sents.shape

Let's have a look at the sentence vector

In [None]:
# sentence
print(adv_brown_sents[0])

In [None]:
# sentence vector is almost all zeroes
print(vectorized_sents[0].toarray())

In [None]:
# nonzero elements of the sentence vector
print(vectorized_sents[0].nonzero()[1])

In [None]:
# the words are the same, but the word order is lost, and stopwords are removed
[tfidf_vectorizer_vocab[wid] for wid in vectorized_sents[0].nonzero()[1]]

In [None]:
# we can now compute the similarity between sentences like so:
sent1vector = vectorized_sents[0]
sent10vector = vectorized_sents[10]
similarity = cosine_similarity(sent1vector, sent10vector)
print(similarity)

### 4.1 - sentence similarity

In [None]:
def onehot_sentence_similarity(sent1, sent2, vectorizer):
    """Encodes provided sentences using the 'vectorizer' object,
    then computes the cosine similarity between sentence vectors.

    Exercise stub: the code you add must assign the result to ``similarity``.

    Args:
        sent1: the first sentence, a string.
        sent2: the second sentence, a string.
        vectorizer: an already fitted vectorizer object
            (the check cell passes tfidf_vectorizer).

    Returns:
        The similarity of the two sentences, a value in the range [0, 1].
    """
    
    # the vectorizer expects a list of sentences as input, not a bare string
    sent1 = [sent1]
    sent2 = [sent2]
    
    # your code goes here
    
    return similarity

In [None]:
test_sentence_1 = 'I like building robots'
test_sentence_2 = 'I also like building pillow fortresses'

In [None]:
assert onehot_sentence_similarity(test_sentence_1, test_sentence_2, tfidf_vectorizer) > 0.5
print('done')

### 4.2 - information retrieval

In [None]:
class SearchEngine(object):
    """Tf-Idf based search over a fixed list of documents."""

    def __init__(self, knowledge_base, voc_size=5000):
        """
        Implements a simple information retrieval system based on Tf-Idf text representation.

        Args:
            knowledge_base: list of documents (strings) to search in.
            voc_size: passed to the vectorizer as max_features.
        """
        
        # presumably kept as an array so that documents can be picked by an array of indices
        self.kbase = np.array(knowledge_base)
        self.vectorizer = TfidfVectorizer(max_features=voc_size)
        # fits the vectorizer on the documents and stores their Tf-Idf vectors
        self.vectorized_kbase = self.vectorizer.fit_transform(knowledge_base)
        
    def search(self, query, top_k=3):
        """
        Retrieves the top-k documents from the knowledge_base most similar to given query

        Exercise stub: the code you add must assign the retrieved documents
        to ``results``.

        Args:
            query: the search query, a string.
            top_k: number of documents to retrieve.

        Returns:
            The top_k documents from self.kbase, the most similar one first
            (the check cell takes element [0] as the best match).
        """
        
        # the vectorizer expects a list of documents, hence the one-element list
        vectorized_query = self.vectorizer.transform([query])
        
        # your code goes here
        # STEP 1: compute similarities between query and all documents in knowledge base

        # STEP 2: sort the similarities to find most similar document indices
        # HINT: use np.argsort to do that
        
        # STEP 3: gets top-k most similar documents from self.kbase, returns them
        
        return results
        

In [None]:
se = SearchEngine(adv_brown_sents)

In [None]:
query1 = 'take it easy'
result1 = se.search(query1, top_k=1)[0]
assert query1 in result1

query2 = 'uneasy feeling'
result2 = se.search(query2, top_k=1)[0]
assert query2 in result2

print('done')

## Exercise 5: language models

In this exercise you study a trigram language model, then build a 4-gram language model by analogy and use it to generate grammatically plausible text

In [None]:
# model[(w1, w2)][w3] holds how often w3 follows the pair (w1, w2);
# after the normalization below it holds the probability of w3 given that pair
model = collections.defaultdict(lambda: collections.defaultdict(int))

for sentence in tqdm(nltk.corpus.brown.sents()):
    # padding on both sides lets the model learn sentence starts and ends as well
    padded_trigrams = nltk.trigrams(sentence, pad_right=True, pad_left=True)
    for first, second, third in padded_trigrams:
        # accumulate co-occurrence counts over all trigrams
        model[(first, second)][third] += 1

for followers in model.values():
    total_count = float(sum(followers.values()))
    for third in followers:
        # normalize counts to produce a valid probability distribution
        followers[third] /= total_count

In [None]:
# generation starts from the context of two None markers
text = [None, None]

sentence_finished = False

while not sentence_finished:
    # introduce a stochastic variable
    r = random.random()
    accumulator = .0

    # distribution over the words that may follow the last two generated ones
    candidates = model[tuple(text[-2:])]

    for word, pr in candidates.items():
        accumulator += pr

        # frequent trigrams are more likely to overflow accumulator:
        if accumulator >= r:
            text.append(word)
            break

    # two None markers in a row at the end terminate the sentence
    sentence_finished = text[-2:] == [None, None]

print(' '.join(t for t in text if t))

### 5.1 By analogy, implement a 4-gram language model