# 1. Skip-gram with naive softmax
- Implementation of the 'Skip-gram with naive softmax' model
- https://nbviewer.jupyter.org/github/DSKSD/DeepNLP-models-Pytorch/blob/master/notebooks/01.Skip-gram-Naive-Softmax.ipynb

## 1. Import

In [656]:
import torch  # torch
import torch.nn as nn  # neural network
from torch.autograd import Variable  # variable function(grad 계산 가능하게)
import torch.optim as optim  # optimizer
import torch.nn.functional as F  # 뭔지 모름
import nltk  # Natural Language Processing Toolkit
import random
import numpy as np
from collections import Counter

def flatten(l):
    """Flatten one level of nesting: a list of lists becomes a single list."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat


random.seed(1024)  # fix the seed used by random.shuffle / random.choice

## 2. Setting Torch

In [657]:
# Print the versions of the libraries in use.
print('torch version : ', torch.__version__)
print('nltk version : ', nltk.__version__)

torch version :  0.4.1
nltk version :  3.2.4


In [658]:
# Choose the tensor constructors once: CUDA tensor types when a GPU is
# available, CPU tensor types otherwise.
USE_CUDA = torch.cuda.is_available()
gpus = [0]
if USE_CUDA:
    torch.cuda.set_device(gpus[0])  # make the first listed GPU the current device
    FloatTensor = torch.cuda.FloatTensor
    LongTensor = torch.cuda.LongTensor
    ByteTensor = torch.cuda.ByteTensor
else:
    FloatTensor = torch.FloatTensor
    LongTensor = torch.LongTensor
    ByteTensor = torch.ByteTensor

## 3. Get Batch

In [659]:
def getBatch(batch_size, train_data):
    """Shuffle ``train_data`` in place and yield it in consecutive mini-batches.

    Args:
        batch_size: Maximum number of samples per batch; must be positive.
        train_data: List of samples. It is shuffled in place, so the caller's
            list is reordered each time the generator is consumed.

    Yields:
        Slices of ``train_data`` with ``batch_size`` samples each; the final
        batch holds the remainder and may be shorter. Nothing is yielded when
        ``train_data`` is empty.

    Raises:
        ValueError: If ``batch_size`` is not positive. Because this is a
            generator, the error surfaces when iteration starts.
    """
    # A non-positive batch size would never advance through the data.
    if batch_size <= 0:
        raise ValueError(
            "batch_size must be a positive integer, got %r" % (batch_size,))

    random.shuffle(train_data)
    for sindex in range(0, len(train_data), batch_size):
        yield train_data[sindex:sindex + batch_size]

## 4. Training Word Sequence

In [660]:
# 아래 함수들을 직관적으로 바꾸기


def prepare_sequence(seq, word2index):
    """Convert a sequence of words into a LongTensor of vocabulary indices.

    Words that have no entry in ``word2index`` are mapped to the index of
    the "<UNK>" token.
    """
    indices = []
    for token in seq:
        index = word2index.get(token)
        if index is None:
            # Out-of-vocabulary word: fall back to the "<UNK>" index.
            index = word2index["<UNK>"]
        indices.append(index)
    # Build a LongTensor from the index list and wrap it in a Variable.
    return Variable(LongTensor(indices))


def prepare_word(word, word2index):
    """Convert a single word into a one-element LongTensor holding its index.

    Args:
        word: The word to look up.
        word2index: Mapping from word to integer index; must contain '<UNK>'.

    Returns:
        A Variable wrapping a LongTensor of shape (1,) with the word's index,
        or the '<UNK>' index when the word is not in ``word2index``.
    """
    index = word2index.get(word)
    if index is None:
        index = word2index['<UNK>']
    # The index must be wrapped in a list: LongTensor(n) with a bare int
    # allocates an uninitialised tensor of length n instead of storing n.
    # Both branches now return a Variable so the return type is consistent.
    return Variable(LongTensor([index]))

In [661]:
def prepare_sequence(seq, word2index):
    """Map every word in ``seq`` to its index ("<UNK>" index if unknown)."""
    idxs = [
        word2index["<UNK>"] if word2index.get(token) is None else word2index[token]
        for token in seq
    ]
    return Variable(LongTensor(idxs))


def prepare_word(word, word2index):
    """Return a one-element index tensor for ``word`` ("<UNK>" if unknown)."""
    key = word if word2index.get(word) is not None else "<UNK>"
    return Variable(LongTensor([word2index[key]]))

## 5. Data
- Load corpus : **Gutenberg corpus**
- gutenberg corpus가 없다면 **nltk.download()**를 통해 다운받을 수 있다.

In [662]:
nltk.corpus.gutenberg.fileids()  # list the documents in the Gutenberg corpus

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [663]:
# Sample a small corpus for experimentation: the first 100 sentences of
# Moby Dick, each stored as a list of lower-cased tokens.
corpus = [
    list(map(str.lower, sent))
    for sent in list(nltk.corpus.gutenberg.sents('melville-moby_dick.txt'))[:100]
]

In [664]:
# Select stopwords: the most frequent and the least frequent 1% of word types.
word_count = Counter(flatten(corpus))  # occurrences of each distinct word
border = int(len(word_count) * 0.01)  # 1% of the number of distinct words

# `border` entries from the top of the frequency ranking plus `border`
# entries from the bottom (taken from the reversed ranking).
stopwords = word_count.most_common(border) + word_count.most_common()[::-1][:border]
len(stopwords)

10

In [665]:
# Keep only the word from each (word, count) pair.
stopwords = [word for word, _count in stopwords]
stopwords

[',', '.', 'the', 'of', 'and', 'man', 'artificial', 'civitas', '--(', 'state']

## 6. Build Voca

In [666]:
# Vocabulary = distinct corpus words minus the stopwords, plus an '<UNK>'
# entry. The order of `vocab` follows set iteration order.
vocab = list(set(flatten(corpus)) - set(stopwords))  # remove the stopwords
vocab.append('<UNK>')
print('Number of word in corpus : ', len(set(flatten(corpus))))
print('Number of word in vocab : ', len(vocab))

Number of word in corpus :  592
Number of word in vocab :  583


In [667]:
# '<UNK>' always gets index 0.
word2index = {'<UNK>': 0}

# Give every vocabulary word the next free index; words that already have
# an index (such as '<UNK>') keep it.
for vo in vocab:
    word2index.setdefault(vo, len(word2index))

# Invert the mapping: index -> word.
index2word = dict((i, w) for w, i in word2index.items())
index2word[1]

'with'

## 7. Prepare Data
- **nltk.ngrams(sequence, n, pad_left = False, pad_right = False, pad_symbol = None)**
- http://madhukaudantha.blogspot.com/2015/05/nltk-tutorial03-n-gram.html

In [668]:
# Pad each sentence with `window_size` '<DUMMY>' tokens on both sides, then
# take every n-gram of length 2 * window_size + 1 so that each real token
# appears once in the middle position of a window.
window_size = 3
windows = flatten([list(nltk.ngrams(['<DUMMY>'] * window_size + c + ['<DUMMY>'] * window_size,window_size * 2 + 1)) for c in corpus])
windows[0]

('<DUMMY>', '<DUMMY>', '<DUMMY>', '[', 'moby', 'dick', 'by')

In [669]:
# Build (center word, context word) training pairs from every window.
train_data = []

for win in windows:
    center = win[window_size]  # the middle token of the window
    # Pair the center with every other position, skipping padding tokens.
    train_data.extend(
        (center, context)
        for i, context in enumerate(win)
        if i != window_size and context != '<DUMMY>'
    )
print(train_data[:window_size * 2])

[('[', 'moby'), ('[', 'dick'), ('[', 'by'), ('moby', '['), ('moby', 'dick'), ('moby', 'by')]


In [670]:
# Show how many (center, context) pairs were produced and the first pair.
print('Number of train_data : ', len(train_data))
print(train_data[0])

Number of train_data :  7606
('[', 'moby')


In [671]:
# Convert every (center, context) word pair into index tensors reshaped to 1 x 1.
X_p = [prepare_word(center, word2index).view(1, -1) for center, _ in train_data]
y_p = [prepare_word(context, word2index).view(1, -1) for _, context in train_data]

In [672]:
train_data = list(zip(X_p, y_p))  # pairs of (center-word tensor, context-word tensor)
len(train_data)

7606

## 8. Modeling

In [673]:
class Skipgram(nn.Module):
    """Skip-gram model trained with a full (naive) softmax over the vocabulary.

    Two embedding tables are kept: ``embedding_v`` is looked up for center
    words and ``embedding_u`` for target/outer (context) words.
    """

    def __init__(self, vocab_size, projection_dim):
        """Create both embedding tables.

        Args:
            vocab_size: Number of rows in each embedding table.
            projection_dim: Dimensionality of every word vector.
        """
        super(Skipgram, self).__init__()
        self.embedding_v = nn.Embedding(vocab_size, projection_dim)  # center-word vectors
        self.embedding_u = nn.Embedding(vocab_size, projection_dim)  # context-word vectors

        self.embedding_v.weight.data.uniform_(-1, 1)  # initialise uniformly in [-1, 1]
        self.embedding_u.weight.data.uniform_(0, 0)  # empty range: initialises to all zeros

    def forward(self, center_words, target_words, outer_words):
        """Return the mean negative log-likelihood of the targets.

        Args:
            center_words: Index tensor of center words (B x 1).
            target_words: Index tensor of the observed context words (B x 1).
            outer_words: Index tensor of the words to normalise over (B x V).

        Returns:
            Scalar tensor: the negative mean over the batch of
            ``score(target) - log(sum(exp(score(outer))))``.
        """
        center_embeds = self.embedding_v(center_words)  # B x 1 x D
        target_embeds = self.embedding_u(target_words)  # B x 1 x D
        outer_embeds = self.embedding_u(outer_words)  # B x V x D

        scores = target_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)  # Bx1xD * BxDx1 -> Bx1
        norm_scores = outer_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)  # BxVxD * BxDx1 -> BxV

        # log(exp(s) / sum(exp(n))) == s - logsumexp(n). The log-sum-exp form
        # avoids the inf / inf = nan that explicit exp() gives for large scores.
        log_probs = scores - torch.logsumexp(norm_scores, dim=1, keepdim=True)  # Bx1
        nll = -torch.mean(log_probs)

        return nll

    def prediction(self, inputs):
        """Return the center-word embeddings (``embedding_v``) for ``inputs``."""
        embeds = self.embedding_v(inputs)

        return embeds

## 9. Train

In [674]:
# Hyperparameters.
embedding_size = 30  # dimensionality of the word vectors
batch_size = 256  # samples per mini-batch
epoch = 100  # number of passes over train_data

losses = []  # per-batch losses collected between progress reports
model = Skipgram(len(word2index), embedding_size)
if USE_CUDA:
    # nn.Module provides .cuda(); there is no .CUDA() attribute, so the
    # original call raised AttributeError on GPU machines.
    model = model.cuda()
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [675]:
# Train the model; every 10th epoch print the mean of the batch losses
# collected since the previous report, then reset the collection.

# The vocabulary index tensor is the same for every batch, so build it once
# and only expand it to the current batch size inside the loop.
vocab_indices = prepare_sequence(list(vocab), word2index)

# Use a separate loop variable so the `epoch` setting is not overwritten
# (re-running this cell would otherwise train for a different epoch count).
for epoch_idx in range(epoch):
    for batch in getBatch(batch_size, train_data):
        inputs, targets = zip(*batch)

        inputs = torch.cat(inputs)  # B x 1
        targets = torch.cat(targets)  # B x 1
        vocabs = vocab_indices.expand(inputs.size(0), len(vocab))  # B x V
        model.zero_grad()

        loss = model(inputs, targets, vocabs)

        loss.backward()
        optimizer.step()

        losses.append(loss.item())
    if epoch_idx % 10 == 0:
        print("Epoch : %d, mean_loss ; %.02f" % (epoch_idx, np.mean(losses)))
        losses = []

Epoch : 0, mean_loss ; 6.19
Epoch : 10, mean_loss ; 4.38
Epoch : 20, mean_loss ; 3.47
Epoch : 30, mean_loss ; 3.31
Epoch : 40, mean_loss ; 3.26
Epoch : 50, mean_loss ; 3.24
Epoch : 60, mean_loss ; 3.23
Epoch : 70, mean_loss ; 3.21
Epoch : 80, mean_loss ; 3.21
Epoch : 90, mean_loss ; 3.20


## 10. Test

In [676]:
def word_similarity(target, word):
    """Return the 10 vocabulary words most similar to ``target``.

    Similarity is the cosine similarity between the model's center-word
    embeddings, obtained through ``model.prediction``.

    Args:
        target: The query word.
        word: Unused; kept so existing calls keep working. Candidates are
            always taken from the module-level ``vocab``.

    Returns:
        A list of up to 10 ``[word, cosine_similarity]`` entries, sorted by
        descending similarity. ``target`` itself is excluded.
    """
    # The original CUDA / non-CUDA branches were identical, so a single code
    # path is enough.
    target_V = model.prediction(prepare_word(target, word2index))

    similarities = []
    for candidate in vocab:
        if candidate == target:
            continue
        vector = model.prediction(prepare_word(candidate, word2index))
        cosine_sim = F.cosine_similarity(target_V, vector).data.tolist()[0]
        similarities.append([candidate, cosine_sim])

    # Highest similarity first; keep the top 10.
    return sorted(similarities, key=lambda x: x[1], reverse=True)[:10]

In [702]:
# Pick a random vocabulary word and show the words most similar to it.
test = random.choice(list(vocab))
print(test)
word_similarity(test, vocab)

mockingly


[['embellished', 0.7563604116439819],
 ['handkerchief', 0.6995153427124023],
 ['gay', 0.6726282238960266],
 ['tail', 0.6395581960678101],
 ['queer', 0.6295337677001953],
 ['sore', 0.6193282008171082],
 ['eyes', 0.5807200074195862],
 ['known', 0.5707562565803528],
 ['incontinently', 0.5502141118049622],
 ['t', 0.5393688678741455]]

## 새롭게 알게 된 내용
### 1. 두 list의 차집합
- 그냥 set(list1) - set(list2)를 빼면 된다.
- 단, 더 많은 list에서 빼야 확인 가능!

### 2. zip(*list)
- 두 list가 한 list에 있을 때 따로 zip하는거랑 같은 효과
https://stackoverflow.com/questions/29139350/difference-between-ziplist-and-ziplist/29139418

### 3. Dimension의 개념
- 내가 평소에 쓰던 pandas의 axis와 같은 개념(0 : 행, 1 : 열)

### 4. iterable의 개념
- member를 하나씩 차례로 반환 가능한 object를 말한다.
- iterable의 예로는 sequence type인 list, str, tuple 등이 대표적이다.
- http://bluese05.tistory.com/55

### 5. Python의 map()
- built-in 함수로 list 나 dictionary와 같은 iterable 한 데이터를 인자로 받아 개별 item을 함수의 인자로 전달하고,<br>
그 결과를 차례로 돌려주는 함수이다. Python 3에서는 list가 아니라 iterator(map 객체)를 반환하므로, list가 필요하면 `list(map(...))`처럼 감싸야 한다.
- `map(str, [1, 2, 3])`와 같이 int, float, str 등의 자료형을 넣을 수도 있고, `plus_ten`처럼 직접 만든 함수(클래스)를 넣어도 된다.
- http://bluese05.tistory.com/58

### 6. Python의 lambda로 함수 만들기
- https://dojang.io/mod/page/view.php?id=1059

## 사용한 PyTorch 함수

### 1. torch.bmm
![torch.bmm](image/torch.bmm.png)

### 2. nn.Embedding
![nn.Embedding](image/nn.embedding.png)

### 3. torch.cat
![torch.cat](image/torch.cat.png)