# 2. Skip-gram with negative sampling
- Implementation Skip-gram model with negative sampling.
- Explaination of each codes are written by korean.
- https://nbviewer.jupyter.org/github/DSKSD/DeepNLP-models-Pytorch/blob/master/notebooks/02.Skip-gram-Negative-Sampling.ipynb

## 1. Import

In [1]:
import torch
import torch.nn as nn
from torch.autograd  import Variable
import torch.optim as optim
import torch.nn.functional as F
import nltk
import random
import numpy as np
from collections import Counter
flatten = lambda l : [item for sublist in l for item in sublist]
random.seed(1024)

## 2. Setting Torch

In [2]:
print(torch.__version__)
print(nltk.__version__)

0.4.1
3.3


In [None]:
USE_CUDA = torch.cuda.is_available()

FloatTensor = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if USE_CUDA else torch.LongTensor
ByteTensor = torch.cuda.ByteTensor if USE_CUDA else torch.ByteTensor

## 3. Get Batch
- Get Batch 함수를 정의하여 Batch를 생성해주는 원시적인 방법도 좋지만, <br>
Pytorch에는 자동으로 Batch를 생성해주는 DataLoader라는 함수가 있다.
- 따라서 이 함수보다 DataLoader를 이용하여 Batch를 생성한다.(**7.Prepare Data에서 실시한다.**)
- torch.utils.data.DataLoader(data, batch_size = , shuffle = )
- https://pytorch.org/docs/stable/data.html

In [None]:
def getBatch(batch_size, train_data):
    random.shuffle(train_data)
    sindex = 0
    eindex = batch_size
    while eindex < len(train_data):
        batch = train_data[sindex: eindex]
        temp = eindex
        eindex = eindex + batch_size
        sindex = temp
        yield batch
    
    if eindex >= len(train_data):
        batch = train_data[sindex:]
        yield batch

## 4. Training word sequence

In [None]:
def prepare_sequence(seq, word):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return Variable(LongTensor(idxs))
    
def prepare_word(word, word2index):
    return Variable(LongTensor([word2index[word]]) if word2index.get(word) is not None else LongTensor([word2index["<UNK>"]]))

## 5. Data
- Load corpus : **Gutenberg corpus**
- gutenberg corpus가 없다면 **nltk.download()**를 통해 다운받을 수 있다.

In [None]:
nltk.corpus.gutenberg.fileids()

In [None]:
%%time
# 참고자료에 있는 코드
corpus = list(nltk.corpus.gutenberg.sents('melville-moby_dick.txt'))[:500]
corpus = [[word.lower() for word in sent] for sent in corpus]
word_count = Counter(flatten(corpus))

In [None]:
## 빈도가 희박한 단어 제거
min_count = 3
exclude = []

for w, c in word_count.items(): # dictionary.items() = [(key1, val1), (key2, val2)...]
    if c < min_count:
        exclude.append(w)

#vocab = list(set(corpus) - set(exclude))
ㅍocab = list(set(flatten(corpus)) - set(exclude))

## 6. Build Voca

In [None]:
word2index = {'<UNK>' : 0}
for vo in vocab:
    if word2index.get(vo) == None:
        word2index[vo] = len(word2index)

index2word = {i:w for w, i in word2index.items()}

In [None]:
window_size = 5
windows =  flatten([list(nltk.ngrams(['<DUMMY>'] * window_size + c + ['<DUMMY>'] * window_size, window_size * 2 + 1)) for c in corpus])

In [None]:
train_data = []

for window in windows:
    for i in range(window_size * 2 + 1):
        if window[i] in exclude or window[window_size] in exclude: 
            continue # min_count
        if i == window_size or window[i] == '<DUMMY>': 
            continue
        train_data.append((window[window_size], window[i]))

X_p = []
y_p = []

for tr in train_data:
    X_p.append(prepare_word(tr[0], word2index).view(1, -1))
    y_p.append(prepare_word(tr[1], word2index).view(1, -1))
    
train_data = list(zip(X_p, y_p))

## 7. Unigram Distribution

In [5]:
USE_CUDA = torch.cuda.is_available()
gpus = [0]
torch.cuda.set_device(0)

In [None]:
USE_CUDA = torch.cuda.is_available()

In [7]:
FloatTensor = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if USE_CUDA else torch.LongTensor
ByteTensor = torch.cuda.ByteTensor if USE_CUDA else torch.ByteTensor

In [8]:
def getBatch(batch_size, train_data):
    random.shuffle(train_data)
    sindex = 0
    eindex = batch_size
    while eindex < len(train_data):
        batch = train_data[sindex: eindex]
        temp = eindex
        eindex = eindex + batch_size
        sindex = temp
        yield batch
    
    if eindex >= len(train_data):
        batch = train_data[sindex:]
        yield batch

In [9]:
def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return Variable(LongTensor(idxs))

def prepare_word(word, word2index):
    return Variable(LongTensor([word2index[word]]) if word2index.get(word) is not None else LongTensor([word2index["<UNK>"]]))

corpus = list(nltk.corpus.gutenberg.sents('melville-moby_dick.txt'))[:500]
corpus = [[word.lower() for word in sent] for sent in corpus]

word_count = Counter(flatten(corpus))

MIN_COUNT = 3
exclude = []

for w, c in word_count.items():
    if c < MIN_COUNT:
        exclude.append(w)
        

vocab = list(set(flatten(corpus)) - set(exclude))

word2index = {}
for vo in vocab:
    if word2index.get(vo) is None:
        word2index[vo] = len(word2index)
        
index2word = {v:k for k, v in word2index.items()}

WINDOW_SIZE = 5
windows =  flatten([list(nltk.ngrams(['<DUMMY>'] * WINDOW_SIZE + c + ['<DUMMY>'] * WINDOW_SIZE, WINDOW_SIZE * 2 + 1)) for c in corpus])

train_data = []

for window in windows:
    for i in range(WINDOW_SIZE * 2 + 1):
        if window[i] in exclude or window[WINDOW_SIZE] in exclude: 
            continue # min_count
        if i == WINDOW_SIZE or window[i] == '<DUMMY>': 
            continue
        train_data.append((window[WINDOW_SIZE], window[i]))

X_p = []
y_p = []

for tr in train_data:
    X_p.append(prepare_word(tr[0], word2index).view(1, -1))
    y_p.append(prepare_word(tr[1], word2index).view(1, -1))
    
train_data = list(zip(X_p, y_p))

len(train_data)

50242

In [10]:
Z = 0.001

In [11]:
word_count = Counter(flatten(corpus))
num_total_words = sum([c for w, c in word_count.items() if w not in exclude])
unigram_table = []

for vo in vocab:
    unigram_table.extend([vo] * int(((word_count[vo]/num_total_words)**0.75)/Z))

In [12]:
print(len(vocab), len(unigram_table))

478 3500


In [13]:
def negative_sampling(targets, unigram_table, k):
    batch_size = targets.size(0)
    neg_samples = []
    for i in range(batch_size):
        nsample = []
        target_index = targets[i].data.cpu().tolist()[0] if USE_CUDA else targets[i].data.tolist()[0]
        while len(nsample) < k: # num of sampling
            neg = random.choice(unigram_table)
            if word2index[neg] == target_index:
                continue
            nsample.append(neg)
        neg_samples.append(prepare_sequence(nsample, word2index).view(1, -1))
    
    return torch.cat(neg_samples)

In [14]:
class SkipgramNegSampling(nn.Module):
    
    def __init__(self, vocab_size, projection_dim):
        super(SkipgramNegSampling, self).__init__()
        self.embedding_v = nn.Embedding(vocab_size, projection_dim) # center embedding
        self.embedding_u = nn.Embedding(vocab_size, projection_dim) # out embedding
        self.logsigmoid = nn.LogSigmoid()
                
        initrange = (2.0 / (vocab_size + projection_dim))**0.5 # Xavier init
        self.embedding_v.weight.data.uniform_(-initrange, initrange) # init
        self.embedding_u.weight.data.uniform_(-0.0, 0.0) # init
        
    def forward(self, center_words, target_words, negative_words):
        center_embeds = self.embedding_v(center_words) # B x 1 x D
        target_embeds = self.embedding_u(target_words) # B x 1 x D
        
        neg_embeds = -self.embedding_u(negative_words) # B x K x D
        
        positive_score = target_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2) # Bx1
        negative_score = torch.sum(neg_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2), 1).view(negs.size(0), -1) # BxK -> Bx1
        
        loss = self.logsigmoid(positive_score) + self.logsigmoid(negative_score)
        
        return -torch.mean(loss)
    
    def prediction(self, inputs):
        embeds = self.embedding_v(inputs)
        
        return embeds

In [15]:
EMBEDDING_SIZE = 30 
BATCH_SIZE = 256
EPOCH = 100
NEG = 10 # Num of Negative Sampling

In [16]:
losses = []
model = SkipgramNegSampling(len(word2index), EMBEDDING_SIZE)
if USE_CUDA:
    model = model.cuda()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [17]:
for epoch in range(EPOCH):
    for i,batch in enumerate(getBatch(BATCH_SIZE, train_data)):
        
        inputs, targets = zip(*batch)
        
        inputs = torch.cat(inputs) # B x 1
        targets = torch.cat(targets) # B x 1
        negs = negative_sampling(targets, unigram_table, NEG)
        model.zero_grad()
        loss = model(inputs, targets, negs)
        
        loss.backward()
        optimizer.step()
        losses.append(loss.data.tolist())
    if epoch % 10 == 0:
        print("Epoch : %d, mean_loss : %.02f" % (epoch, np.mean(losses)))
        losses = []

Epoch : 0, mean_loss : 1.06
Epoch : 10, mean_loss : 0.86
Epoch : 20, mean_loss : 0.79
Epoch : 30, mean_loss : 0.74
Epoch : 40, mean_loss : 0.71
Epoch : 50, mean_loss : 0.69
Epoch : 60, mean_loss : 0.67
Epoch : 70, mean_loss : 0.65
Epoch : 80, mean_loss : 0.64
Epoch : 90, mean_loss : 0.63


In [18]:
def word_similarity(target, vocab):
    if USE_CUDA:
        target_V = model.prediction(prepare_word(target, word2index))
    else:
        target_V = model.prediction(prepare_word(target, word2index))
    similarities = []
    for i in range(len(vocab)):
        if vocab[i] == target: 
            continue
        
        if USE_CUDA:
            vector = model.prediction(prepare_word(list(vocab)[i], word2index))
        else:
            vector = model.prediction(prepare_word(list(vocab)[i], word2index))
        
        cosine_sim = F.cosine_similarity(target_V, vector).data.tolist()[0]
        similarities.append([vocab[i], cosine_sim])
    return sorted(similarities, key=lambda x: x[1], reverse=True)[:10]

In [42]:
test = random.choice(list(vocab))
print(test)
word_similarity(test, vocab)

does


[['thinks', 0.7355144619941711],
 ['tell', 0.6772541403770447],
 ['city', 0.6555599570274353],
 ['penny', 0.6512199640274048],
 ['pale', 0.6497499942779541],
 ['till', 0.6416484117507935],
 ['why', 0.624262273311615],
 ['think', 0.6155771017074585],
 ['hear', 0.6021900177001953],
 ['if', 0.5793417692184448]]

# 질문

## 1. 두 코드가 다른가?

In [None]:
USE_CUDA = torch.cuda.is_available()
if USE_CUDA:
    torch.cuda.set_device(0)
    FloatTensor = torch.cuda.FloatTensor
    LongTensor = torch.cuda.LongTensor
    ByteTensor = torch.cuda.ByteTensor
else:
    FloatTensor = torch.FloatTensor
    LongTensor = torch.cuda.LongTensor
    ByteTensor = torch.cuda.ByteTensor

In [None]:
USE_CUDA = torch.cuda.is_available()

FloatTensor = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if USE_CUDA else torch.LongTensor
ByteTensor = torch.cuda.ByteTensor if USE_CUDA else torch.ByteTensor