In [None]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import nltk
import random
import numpy as np
from collections import Counter
def flatten(l):
    """Concatenate the items of every sub-iterable in ``l`` into one flat list.

    Only one level of nesting is removed.
    """
    return [item for sublist in l for item in sublist]


# Seed every random number generator the notebook imports, not just Python's:
# batch shuffling uses `random`, while the embedding initialisation draws from
# torch's generator, so seeding `random` alone does not make a run repeatable.
SEED = 1994
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

In [None]:
# Record the library versions this notebook was executed with.
for library in (torch, nltk):
    print(library.__version__)

In [None]:
USE_CUDA = torch.cuda.is_available()
print(USE_CUDA)
# To pin a specific GPU, call torch.cuda.set_device(<index>) here.

# Alias the tensor constructors once, so code that builds tensors through
# these names gets CUDA tensors when a GPU is available and CPU tensors otherwise.
if USE_CUDA:
    FloatTensor = torch.cuda.FloatTensor
    LongTensor = torch.cuda.LongTensor
    ByteTensor = torch.cuda.ByteTensor
else:
    FloatTensor = torch.FloatTensor
    LongTensor = torch.LongTensor
    ByteTensor = torch.ByteTensor

In [None]:
def getBatch(batch_size, train_data):
    """Shuffle ``train_data`` and yield it in consecutive mini-batches.

    Args:
        batch_size: maximum number of examples per batch; must be positive.
        train_data: list of examples. It is shuffled IN PLACE, so the
            caller's list is reordered on every call.

    Yields:
        Slices of ``train_data`` holding ``batch_size`` examples each; the
        last slice holds the remainder and may be shorter. Nothing is yielded
        for empty input.

    Raises:
        ValueError: if ``batch_size`` is not positive. Because this is a
            generator, the error surfaces on the first iteration.
    """
    if batch_size <= 0:
        # A non-positive step would never advance through the data.
        raise ValueError("batch_size must be a positive integer")

    random.shuffle(train_data)
    for start in range(0, len(train_data), batch_size):
        yield train_data[start:start + batch_size]

In [None]:
def prepare_sequence(seq, word2index):
    """Convert a sequence of words into a 1-D LongTensor of vocabulary indices.

    Args:
        seq: iterable of word strings.
        word2index: mapping from word to integer index. It must contain the
            key "<UNK>" if ``seq`` can hold words missing from the mapping.

    Returns:
        A ``Variable`` wrapping a ``LongTensor`` with one index per word;
        words absent from ``word2index`` are mapped to the "<UNK>" index.
    """
    idxs = []
    for w in seq:
        idx = word2index.get(w)
        if idx is None:
            # Subscript the dict here: calling it (`word2index("<UNK>")`)
            # raises TypeError on the first out-of-vocabulary word.
            idx = word2index["<UNK>"]
        idxs.append(idx)
    return Variable(LongTensor(idxs))

def prepare_word(word, word2index):
    """Return a one-element LongTensor (wrapped in a Variable) with the index of ``word``.

    Words that are missing from ``word2index`` are mapped to the "<UNK>" index.
    """
    index = word2index.get(word)
    if index is None:
        index = word2index["<UNK>"]
    return Variable(LongTensor([index]))

In [None]:
# NOTE(review): absolute, machine-specific path; point this at your local copy of the text.
CORPUS_PATH = "/Users/farshad/Desktop/CS224n/Harry Potter txt/Harry Potter 1 - Sorcerer's Stone.txt"

# `with` guarantees the file handle is closed even if reading fails.
with open(CORPUS_PATH, 'r', encoding='cp1252') as f:
    raw = f.read()

# Split the text into sentences, then each sentence into lower-cased word tokens.
tokens = nltk.sent_tokenize(raw)
corpus = [[word.lower() for word in nltk.word_tokenize(sentence)] for sentence in tokens]

In [None]:
word_count = Counter(flatten(corpus))
# Number of distinct words to cut from each end of the frequency ranking (0.2%).
border = int(len(word_count) * 0.002)

# Rank once, then take `border` words from the most frequent end and
# `border` words from the least frequent end.
ranked = word_count.most_common()
stopwords = [word for word, _ in ranked[:border] + ranked[::-1][:border]]

In [None]:
all_words = set(flatten(corpus))
# Sort the surviving words: set iteration order for strings changes between
# interpreter runs, which would assign different indices to the same word on
# every kernel restart and defeat the fixed random seed.
vocab = sorted(all_words - set(stopwords))
vocab.append('<UNK>')
print(len(all_words), len(vocab))

In [None]:
# Index 0 is reserved for the unknown-word token; every other vocabulary
# entry gets the next free index in iteration order.
word2index = {'<UNK>': 0}

for token in vocab:
    if token not in word2index:
        word2index[token] = len(word2index)

# Reverse lookup: index -> word.
index2word = dict(zip(word2index.values(), word2index.keys()))

In [None]:
WINDOW_SIZE = 3

# Pad each sentence with WINDOW_SIZE dummy tokens on both sides, then slide a
# window of 2 * WINDOW_SIZE + 1 tokens across it so every real word appears
# once as the middle element of a window.
windows = []
for sentence in corpus:
    padded = ['<DUMMY>'] * WINDOW_SIZE + sentence + ['<DUMMY>'] * WINDOW_SIZE
    windows.extend(nltk.ngrams(padded, WINDOW_SIZE * 2 + 1))

In [None]:
# Display the first five windows as a quick sanity check.
windows[:5]

In [None]:
# Build (center word, context word) pairs: the center is the middle element
# of each window, and every other non-padding position contributes one pair.
train_data = [
    (window[WINDOW_SIZE], window[pos])
    for window in windows
    for pos in range(WINDOW_SIZE * 2 + 1)
    if pos != WINDOW_SIZE and window[pos] != '<DUMMY>'
]

print(train_data[:10])

In [None]:
# Convert every (center, context) word pair into a pair of 1x1 index tensors.
# NOTE: this rebinds `train_data` from word pairs to tensor pairs, so the cell
# is not idempotent; rebuild the word pairs before running it a second time.
x_p = [prepare_word(center, word2index).view(1, -1) for center, _ in train_data]
y_p = [prepare_word(context, word2index).view(1, -1) for _, context in train_data]
train_data = list(zip(x_p, y_p))

In [None]:
# Display the number of training pairs.
len(train_data)

In [None]:
class Skipgram(nn.Module):
    """Skip-gram model scored with a softmax over a supplied set of candidate words.

    Two embedding tables are kept: ``embedding_v`` is looked up for center
    words and ``embedding_u`` for target/candidate words.
    """

    def __init__(self, vocab_size, embedding_dim):
        """Create both embedding tables.

        Args:
            vocab_size: number of rows in each embedding table.
            embedding_dim: length of each embedding vector.
        """
        super(Skipgram, self).__init__()
        self.embedding_v = nn.Embedding(vocab_size, embedding_dim)
        self.embedding_u = nn.Embedding(vocab_size, embedding_dim)

        # Center vectors start uniformly random in [-1, 1]; the other table starts at zero.
        self.embedding_v.weight.data.uniform_(-1, 1)
        self.embedding_u.weight.data.zero_()

    def forward(self, center_words, target_words, outer_words):
        """Return the mean negative log-likelihood of the targets given the centers.

        Args:
            center_words: LongTensor of center-word indices; the training
                loop passes shape (batch, 1).
            target_words: LongTensor of observed context-word indices; the
                training loop passes shape (batch, 1).
            outer_words: LongTensor of candidate indices forming the softmax
                denominator; the training loop passes shape (batch, vocab).

        Returns:
            Scalar tensor: the batch mean of ``-(score - logsumexp(candidate scores))``.
        """
        center_embeds = self.embedding_v(center_words)
        target_embeds = self.embedding_u(target_words)
        outer_embeds = self.embedding_u(outer_words)

        # Dot product of each target / candidate vector with its center vector.
        center_t = center_embeds.transpose(1, 2)
        scores = target_embeds.bmm(center_t).squeeze(2)
        norm_scores = outer_embeds.bmm(center_t).squeeze(2)

        # log(exp(s) / sum(exp(n))) == s - logsumexp(n). The log-sum-exp form
        # avoids the overflow (inf / inf -> nan) that exponentiating large
        # scores directly produces.
        log_partition = torch.logsumexp(norm_scores, dim=1, keepdim=True)
        nll = -torch.mean(scores - log_partition)

        return nll

    def prediction(self, inputs):
        """Return the ``embedding_v`` vectors for the given word indices."""
        embeds = self.embedding_v(inputs)

        return embeds

In [None]:
# Embedding dimension passed to Skipgram.
EMBEDDING_SIZE = 30
# Number of training pairs per mini-batch passed to getBatch.
BATCH_SIZE = 256
# Intended number of training epochs.
EPOCH = 100

In [None]:
# One accumulated-loss entry is appended per epoch by the training loop.
losses = []
model = Skipgram(len(word2index), EMBEDDING_SIZE)
# The tensor aliases create CUDA tensors when a GPU is available, so the model
# parameters must live on the GPU as well; otherwise the forward pass fails
# with a device mismatch.
if USE_CUDA:
    model = model.cuda()
# Build the optimizer after the move so it tracks the parameters used in training.
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [None]:
# The index row covering the whole vocabulary is the same for every batch, so
# build it once rather than once per batch.
vocab_indices = prepare_sequence(list(vocab), word2index)

for epoch in range(EPOCH):
    total_loss = torch.Tensor([0])
    for batch in getBatch(BATCH_SIZE, train_data):
        if not batch:
            # Nothing to unpack; zip(*batch) below would fail on an empty batch.
            continue
        inputs, targets = zip(*batch)

        inputs = torch.cat(inputs)
        targets = torch.cat(targets)
        # One copy of the vocabulary row per example in the batch (a view, no data copy).
        vocabs = vocab_indices.expand(inputs.size(0), len(vocab))
        model.zero_grad()

        loss = model(inputs, targets, vocabs)

        loss.backward()
        optimizer.step()

        # Move the loss to the CPU before accumulating: the accumulator is a
        # CPU tensor and the loss may live on the GPU.
        total_loss += loss.data.cpu()
    losses.append(total_loss)
    # Report only this epoch instead of re-printing the whole history each time.
    print("Epoch %d/%d - total loss: %.4f" % (epoch + 1, EPOCH, total_loss.item()))