# Assignment 1 - Word2Vec
1.  Try a real corpus (instead of banana apple, try something real... on the internet....) - not so big!  

Just you have a good taste of real stuff....like 50 documents, each have 50 words....

2. Try window size of 2

3. Try CBOW (instead of skipgrams)

4. Compare normal version of skipgrams vs. negative sampling version of skipgrams in terms of time (using real corpus)

In [27]:
import re
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import spacy
import pandas as pd
spacy.__version__

'3.4.2'

## 1. Load data
A real corpus

In [43]:
nlp = spacy.load('./en_core_web_sm/')
text = open('./dataset/alchemist.txt',mode='r') #change later
df = pd.DataFrame(text.readlines())

### Clean Data

In [44]:
def clean_data(df_col):
    corpus = []
    for item in df_col:
        item = re.sub('[^A-Za-z0-9]+', ' ', str(item)) # remove special characters
        item = item.lower() # lower all characters
        item = item.split() # split data
        corpus.append(' '.join(str(x) for x in item))
    return corpus

In [45]:
#1. tokenize
#data cleaned
corpus = clean_data(df[0])
#data tokenized
corpus_tokenized = [sent.split(" ") for sent in corpus]

In [46]:
#2. numericalize (vocab)
#2.1 get all the unique words
#we want to flatten unit (basically merge all list)
flatten = lambda l: [item for sublist in l for item in sublist]
vocabs = list(set(flatten(corpus_tokenized)))

#2.2 assign id to all these vocabs
word2index = {v: idx for idx, v in enumerate(vocabs)}

#adding unknown word
vocabs.append('<UNK>')
word2index['<UNK>'] = 1233

voc_size = len(vocabs)
voc_size

1233

In [104]:
skipgrams = []
cbow = []
for sent in corpus_tokenized:
    #for each sent ('apple', 'banana', 'fruit')
    for i in range(2,len(sent)-2): #start from 1 to second last
        center_word = sent[i]
        outside_word = [sent[i-2],sent[i-1],sent[i+1],sent[i+2]] #window size = 2
        cbow.append((outside_word, center_word))
        for o in outside_word: 
            skipgrams.append((center_word,o))

In [107]:
skipgrams[:5]

[('crowning', 'high'),
 ('crowning', 'up'),
 ('crowning', 'the'),
 ('crowning', 'grassy'),
 ('the', 'up')]

In [106]:
cbow[:4]

[(['high', 'up', 'the', 'grassy'], 'crowning'),
 (['up', 'crowning', 'grassy', 'summit'], 'the'),
 (['crowning', 'the', 'summit', 'of'], 'grassy'),
 (['the', 'grassy', 'of', 'a'], 'summit')]

## Continuous Bag-of-Words (CBOW)

## Compare normal version of skipgrams vs. negative sampling version of skipgrams in terms of time (using real corpus)

In [110]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [111]:
def random_batch(batch_size, corpus):
    skipgrams = []
    #for each corpus
    for sent in corpus_tokenized:
        #for each sent ('apple', 'banana', 'fruit')
        for i in range(2,len(sent)-2): #start from 1 to second last
            # print(sent[i])
            center_word = word2index[sent[i]]
            outside_word = [word2index[sent[i-2]],word2index[sent[i-1]],word2index[sent[i+1]],word2index[sent[i+2]]] #window_size = 2
            #here we want to create (banana, apple), (banana, fruit) append to some list
            for o in outside_word:
                skipgrams.append([center_word,o])
    #only get a batch, mot the entire lsit
    random_index = np.random.choice(range(len(skipgrams)),batch_size,replace=False)
    
    #appending some list of inputs and labels
    random_inputs, random_labels = [] , []
    for index in random_index:
        random_inputs.append([skipgrams[index][0]]) #center words, this will be as shape of (1,) -> (1,1) for modeling
        random_labels.append([skipgrams[index][1]])

    return np.array(random_inputs),np.array(random_labels)

### Skipgram

In [146]:
#preparing all_vocabs
batch_size = 2

def prepare_seqeunce(seq, word2index):
    #map(fucntion, list of something)
    #map will look at each of element in this list, and apply this function
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return torch.LongTensor(idxs)

all_vocabs = prepare_seqeunce(list(vocabs),word2index).expand(batch_size, voc_size)
all_vocabs.shape

torch.Size([2, 1233])

#### Model

In [147]:
class Skipgram(nn.Module):
    def __init__(self,voc_size, emb_size):
        super(Skipgram,self).__init__()
        self.embedding_center_word = nn.Embedding(voc_size, emb_size) #is a lookup table mapping all ids in voc_size, into some vector of size emb_size
        self.embedding_outside_word = nn.Embedding(voc_size, emb_size)
    
    def forward(self, center_word, outside_word, all_vocabs):
        #center_word, outside_word: (batch_size,1)
        #all_vocabs : (batch_size, voc_size)
        #convert them into embedding
        center_word_embed = self.embedding_center_word(center_word)     #v_c (batch_size,1, emb_size)
        outside_word_embed = self.embedding_outside_word(outside_word)  #u_o (batch_size,1, emb_size)
        all_vocabs_embed = self.embedding_outside_word(all_vocabs)      #u_w (batch_size,voc_size, emb_size)
        print(center_word_embed.shape,outside_word_embed.shape,all_vocabs_embed.shape)
        #bmm is basically @ or .dot but across batches (ie., ignore the batch dimension)
        top_term = outside_word_embed.bmm(center_word_embed.transpose(1,2)).squeeze(2)
        #(batch_size,1, emb_size) @ (batch_size, emb_size, 1) = (batch_size, 1, 1) ===> (batch_size, 1)
        top_term_exp = torch.exp(top_term) #exp(uo vc)
        #(batch_size, 1)
        lower_term = all_vocabs_embed.bmm(center_word_embed.transpose(1,2)).squeeze(2)
        #(batch_size, voc_size, emb_size) @ (batch_size, emb_size, 1) = (batch_size, voc_size, 1) ===> (batch_size, voc_size)
        lower_term_sum = torch.sum(torch.exp(lower_term)) #sum exp(uw, vc)
        #(batch_size, 1)
        loss_fn = -torch.mean(torch.log(top_term_exp/lower_term_sum))
        #(batc_size,1) / (batch_size,1) ==mena==> scalar
        return loss_fn

In [148]:
#preparing all_vocabs
batch_size = 2

def prepare_seqeunce(seq, word2index):
    #map(fucntion, list of something)
    #map will look at each of element in this list, and apply this function
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return torch.LongTensor(idxs)

all_vocabs = prepare_seqeunce(list(vocabs),word2index).expand(batch_size, voc_size)
all_vocabs.shape

torch.Size([2, 1233])

In [149]:
input, label = random_batch(2, corpus_tokenized)
input_tensor = torch.LongTensor(input)
label_tensor  = torch.LongTensor(label)
input_tensor.shape,label_tensor.shape

(torch.Size([2, 1]), torch.Size([2, 1]))

In [150]:
batch_size = 2 #why? no reason
emb_size = 2 #why? no reason; usually 50,100, 300 but 2 so we can plot (50 can also plot, but need PCA)
model = Skipgram(voc_size,emb_size)

criterion = nn.CrossEntropyLoss() #-log
optimizer = optim.Adam(model.parameters(), lr=0.001)

#### training

In [154]:
import time
num_epochs = 5000
#for epoch
for epoch in range(num_epochs):

    start = time.time()
    
    #get random batch
    input_batch, label_batch = random_batch(batch_size,corpus)
    input_batch = torch.LongTensor(input_batch)
    label_batch = torch.LongTensor(label_batch)

    print(input_batch.shape,label_batch.shape,all_vocabs.shape)
    # break

    #loss = model
    loss = model(input_batch,label_batch,all_vocabs)
    #backpropagate
    loss.backward()
    #update alpha
    optimizer.step()

    end = time.time()

    #print epoch loss
    epoch_mins, epoch_secs = epoch_time(start, end)

    # if (epoch + 1) % 1000 == 0:
    print(f"Epoch: {epoch + 1} | cost: {loss:.6f} | time: {epoch_mins}m {epoch_secs}s")

    break

torch.Size([2, 1]) torch.Size([2, 1]) torch.Size([2, 1233])


IndexError: index out of range in self

### Negative Sampling

In [152]:
##Unigram Distribution
z = 0.0001
#count all the occurence of vocabs
from collections import Counter

word_count = Counter(flatten(corpus_tokenized))
word_count

num_total_words = sum([c for w, c in word_count.items()])
num_total_words

unigram_table = []

for v in vocabs:
    uw = word_count[v]/num_total_words
    uw_alpha = uw ** 0.75
    uw_alpha_dividebyz = int(uw_alpha/z)
    # print('Vocab :',v)
    # print('distribution :', uw_alpha_dividebyz)
    unigram_table.extend([v] * uw_alpha_dividebyz)

Counter(unigram_table)

Counter({'digressed': 21,
         'primeval': 21,
         'trapdoor': 21,
         'with': 263,
         'disliking': 21,
         'knowledge': 21,
         'beard': 21,
         'branch': 21,
         'works': 21,
         'high': 35,
         'pierre': 70,
         'rous': 21,
         'nearby': 21,
         'humanity': 21,
         'education': 21,
         'spirit': 21,
         'sinister': 48,
         'snatched': 21,
         'forest': 35,
         'dwelled': 21,
         'race': 21,
         'spell': 35,
         'gold': 21,
         'familiar': 21,
         'directly': 21,
         'seeking': 21,
         'days': 48,
         'woods': 35,
         'lips': 21,
         'my': 542,
         'speaking': 21,
         'feudalism': 21,
         'language': 21,
         'proclaiming': 21,
         'particularly': 21,
         'rational': 21,
         'custom': 21,
         'discuss': 21,
         'line': 90,
         'murd': 21,
         'steadily': 21,
         'studied': 21,
      

In [145]:
def prepare_sequence(seq, word2index):
    #map(fucntion, list of something)
    #map will look at each of element in this list, and apply this function
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return torch.LongTensor(idxs)
    
import random
#you don't want to pick samples = targets, basically negative samples
#k = number of negative samples - how many? they found 10 is the best
#will be run during training
#after random_batch, 
def negative_sampling(targets, unigram_table, k):
    #targets is already in id.....
    #but the unigram_table is in word....
    #1. get the batch size of this targets
    batch_size = targets.shape[0]
    neg_samples = []
    #2. for each batch
    for i in range(batch_size):
        #randomly pick k negative words from unigram_table
        target_index = targets[i].item()  #looping each of the batch....
        nsample = []
        while len(nsample) < k:
            neg = random.choice(unigram_table)
            #if this word == target, skip this word
            if word2index[neg] == target_index:
                continue
            nsample.append(neg)
        #append this word to some list
        neg_samples.append(prepare_sequence(nsample, word2index).reshape(1, -1))  #tensor[], tensor[]
    return torch.cat(neg_samples)  #tensor[[], []]

In [135]:
num_neg = 2 # in the real code, we gonna use 10 (like in the paper)
neg_samples = negative_sampling(label_batch, unigram_table, num_neg)
# neg_samples[0].shape
neg_samples.shape 

torch.Size([2, 2])

#### Model

In [136]:
#the model will accept three vectors - u_o, v_c, u_k
#u_o - vectos for outside words
#v_C - vector for center word
#u_k - vectors for negative word

class SkipgramNeg(nn.Module):
    def __init__(self,voc_size, emb_size):
        super(SkipgramNeg,self).__init__()
        self.embedding_center_word = nn.Embedding(voc_size, emb_size)
        self.embedding_outside_word = nn.Embedding(voc_size, emb_size)
        self.logsigmoid = nn.LogSigmoid()

    def forward(self, center_words, outside_words, negative_words):
        #center_words, outside_words  (batch_size,1)
        #negative_words (batch_size,k) 
        center_embed    = self.embedding_center_word(center_words)      #(batch_size,1, emb_size)
        outside_embed   = self.embedding_outside_word(outside_words)   #(batch_size,1, emb_size)
        neg_embed       = self.embedding_outside_word(negative_words)      #(batch_size,k, emb_size)
        
        uovc            = outside_embed.bmm(center_embed.transpose(1,2)).squeeze(2)
        ukvc            = -neg_embed.bmm(center_embed.transpose(1,2)).squeeze(2)
        ukvc_sum        =  torch.sum(ukvc, 1).view(-1, 1) #(batch_size, 1)
        loss = self.logsigmoid(uovc) + self.logsigmoid(ukvc_sum) #(batch_size,1)+(batch_size,1)
        
        return -torch.mean(loss) #scalar, loss should be scalar, to call backward()

In [137]:
voc_size = len(vocabs)
voc_size

batch_size = 2 #why? no reason
emb_size = 2 #why? no reason; usually 50,100, 300 but 2 so we can plot (50 can also plot, but need PCA)
model = SkipgramNeg(voc_size,emb_size)

criterion = nn.CrossEntropyLoss() #-log
optimizer = optim.Adam(model.parameters(), lr=0.001)

#### training

In [140]:
import time

# Training
num_epochs = 5000
for epoch in range(num_epochs):
    start = time.time()
    
    input_batch, target_batch = random_batch(batch_size, corpus)
    
    #input_batch: [batch_size, 1]
    input_batch = torch.LongTensor(input_batch)
    #target_batch: [batch_size, 1]
    target_batch = torch.LongTensor(target_batch)
    
    #negs_batch:   [batch_size, num_neg]
    negs_batch = negative_sampling(target_batch, unigram_table, num_neg)
    
    optimizer.zero_grad()
        
    loss = model(input_batch, target_batch, negs_batch)
    
    end = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start, end)
    
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 1000 == 0:
        print(f"Epoch: {epoch + 1} | cost: {loss:.6f} | time: {epoch_mins}m {epoch_secs}s")

Epoch: 1000 | cost: 1.553973 | time: 0m 0s
Epoch: 2000 | cost: 2.089709 | time: 0m 0s
Epoch: 3000 | cost: 1.233664 | time: 0m 0s
Epoch: 4000 | cost: 1.328872 | time: 0m 0s
Epoch: 5000 | cost: 1.754867 | time: 0m 0s


## Appendix

In [65]:
# def dataBags(corpus_tokenized):
#     # data - [(context), target]
#     data = []
#     for sent in corpus_tokenized:
#         for i in range(2, len(sent) - 2):
#             context = [sent[i - 2], sent[i - 1], sent[i + 1], sent[i + 2]]
#             target = sent[i]
#             data.append((context, target))

#     return data

# cbow = dataBags(corpus_tokenized)
# cbow[:5]

In [76]:
#Example to import db file
import pandas as pd
import sqlite3
def ReadSQL(filename):
    connection = sqlite3.connect(filename)
    data = pd.read_sql("SELECT * from city_table",connection)
    return data

ReadSQL('city.db')

Unnamed: 0,City,Country,Population
0,athens,greece,1368
1,bangkok,thailand,1178
2,barcelona,spain,1280
3,berlin,east_germany,3481
4,birmingham,united_kingdom,1112
...,...,...,...
66,tokyo,japan,8535
67,toronto,canada,668
68,vienna,austria,1766
69,warsaw,poland,965
