# 1. Skip-gram with naiive softmax
- Implementation model 'Skip-gram with naiive softmax'
- https://nbviewer.jupyter.org/github/DSKSD/DeepNLP-models-Pytorch/blob/master/notebooks/01.Skip-gram-Naive-Softmax.ipynb

## 1. Import

In [259]:
import torch  # torch
import torch.nn as nn  # neural network
from torch.autograd import Variable  # variable function(grad 계산 가능하게)
import torch.optim as optim  # optimizer
import torch.nn.functional as F  # 뭔지 모름
import nltk  # Natural Language Processing Toolkit
import random
import numpy as np
from collections import Counter

# 일자로 펴주기
flatten = lambda l: [item for sublist in l for item in sublist]
random.seed(1024)  # seec값 고정

## 2. Setting Torch

In [260]:
print('torch version : ', torch.__version__)
print('nltk version : ', nltk.__version__)

torch version :  0.4.1
nltk version :  3.2.4


In [261]:
USE_CUDA = torch.cuda.is_available()  # Cuda 사용 가능 여부(GUP)
gpus = [0]
if USE_CUDA:
    torch.cuda.set_device(gpus[0])

FloatTensor = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if USE_CUDA else torch.LongTensor
ByteTensor = torch.cuda.ByteTensor if USE_CUDA else torch.ByteTensor

## 3. Get Batch

In [262]:
def getBatch(batch_size, train_data):
    random.shuffle(train_data)
    sindex = 0
    eindex = batch_size
    while eindex < len(train_data):
        batch = train_data[sindex:eindex]
        temp = eindex
        eindex = eindex + batch_size
        sindex = temp
        yield batch

    if eindex >= len(train_data):
        batch = train_data[sindex:]
        yield batch

## 4. Training Word Sequence

In [325]:
# 아래 함수들을 직관적으로 바꾸기


def prepare_sequence(seq,
                     word2index):  # seqence(list 등)이 왔을 때 각 단어들을 인덱스(숫자)로 바꿔라
    idxs = []
    for w in seq:
        if word2index.get(
                w) != None:  # 우리가 만든 voca에 해당 단어가 있으면 해당 단어에 상응하는 인덱스로 바꿔라
            idxs.append(word2index[w])
        else:
            idxs.append(
                word2index["<UNK>"])  # voca에 해당 단어가 없으면 '<UNK>'의 인덱스(0)로 바꿔라
    return Variable(LongTensor(
        idxs))  # 인덱스가 모인 list를 DataTpye이 LongTensor인 Tensor로 바꾸고 Variable을 씌어라


def prepare_word(word, word2index):  # word가 들어올 때 해당 word를 인덱스로 바꿔라
    if word2index.get(word) != None:  # 해당 word가 있다면 LongTensor로 변환하고 출력해라
        return Variable(LongTensor(word2index[word]))
    else:
        return LongTensor(
            [word2index['<UNK>']])  # 해당 word가 없다면 UNK인덱스(0)로 출력해라

In [326]:
def prepare_sequence(seq, word2index):
    idxs = list(
        map(
            lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"],
            seq))
    return Variable(LongTensor(idxs))


def prepare_word(word, word2index):
    return Variable(
        LongTensor([word2index[word]]) if word2index.get(word) is not None else
        LongTensor([word2index["<UNK>"]]))

## 5. Data
- Load corpus : **Gutenberg corpis**
- gutenberg corpus가 없다면 **nltk.download()**를 통해 다운받을 수 있다.

In [264]:
nltk.corpus.gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [265]:
# 시험을 위해 문장 샘플링
corpus = list(nltk.corpus.gutenberg.sents('melville-moby_dick.txt'))[:100]
corpus = [[word.lower() for word in sent] for sent in corpus]  # 문장 분해, 소문자 처리

In [266]:
# Stopword 뽑아내기
word_count = Counter(flatten(corpus))  # 각각의 단어의 개수, sorted
border = int(len(word_count) * 0.01)  # 단어 종류의 개수 * 0.01

# 가장 많이 사용된 문자 상위 border개, 하위 border개
stopwords = word_count.most_common()[:border] + list(
    reversed(word_count.most_common()))[:border]

In [267]:
stopwords

[(',', 96),
 ('.', 66),
 ('the', 58),
 ('of', 36),
 ('and', 35),
 ('man', 1),
 ('artificial', 1),
 ('civitas', 1),
 ('--(', 1),
 ('state', 1)]

In [268]:
stopwords = [s[0] for s in stopwords]  # (word, number) 중 word만 추출
stopwords

[',', '.', 'the', 'of', 'and', 'man', 'artificial', 'civitas', '--(', 'state']

## 6. Build Voca

In [269]:
vocab = list(set(flatten(corpus)) - set(stopwords))  # stopword 제거하기
vocab.append('<UNK>')
print('Number of word in corpus : ', len(set(flatten(corpus))))
print('Number of word in vocab : ', len(vocab))

Number of word in corpus :  592
Number of word in vocab :  583


In [270]:
word2index = {'<UNK>': 0}  # '<UNK>'fmf 0 index로 만들어주기

# 각 단어마다 index 할당해주기
for vo in vocab:
    if word2index.get(vo) is None:
        word2index[vo] = len(word2index)
# index : word -> word : index로 변환하기
index2word = {i: w for w, i in word2index.items()}
index2word

{0: '<UNK>',
 1: 'with',
 2: 'on',
 3: 'please',
 4: 'vessel',
 5: 'which',
 6: 'an',
 7: 'his',
 8: 'handkerchief',
 9: 'sunrise',
 10: 'fare',
 11: 'were',
 12: 'french',
 13: 'roll',
 14: 'besides',
 15: 'while',
 16: 'glancing',
 17: 'promiscuously',
 18: 'lord',
 19: 'vaticans',
 20: "'",
 21: 'aloft',
 22: 'grammar',
 23: 'sea',
 24: 'to',
 25: 'version',
 26: 'could',
 27: ')',
 28: 'verbal',
 29: 'valuable',
 30: 'apology',
 31: 'every',
 32: 'vast',
 33: 'etymology',
 34: 'dut',
 35: 'sallow',
 36: 'queer',
 37: 'slay',
 38: 'baleine',
 39: 'hampton',
 40: 'unpleasant',
 41: 'worm',
 42: 'seethe',
 43: 'hamlet',
 44: 'lins',
 45: 'will',
 46: 'street',
 47: 'pale',
 48: 'whale',
 49: 'school',
 50: 'one',
 51: 'take',
 52: 'as',
 53: 'ships',
 54: 'quantity',
 55: 'warm',
 56: 'that',
 57: 'davenant',
 58: 'mote',
 59: 'body',
 60: 'thro',
 61: 'sovereignest',
 62: 'described',
 63: 'been',
 64: 'scarcely',
 65: 'or',
 66: 'at',
 67: 'beast',
 68: 'own',
 69: 'nations',
 70: '

## 7. Prepare Data
- **nltk.ngrams(sequence, n, pad_lef = False, pad_right = False, pad_symbol = None)**
- http://madhukaudantha.blogspot.com/2015/05/nltk-tutorial03-n-gram.html

In [271]:
window_size = 3
windows = flatten([
    list(
        nltk.ngrams(['<DUMMY>'] * window_size + c + ['DUMMY'] * window_size,
                    window_size * 2 + 1)) for c in corpus
])
windows[0]

('<DUMMY>', '<DUMMY>', '<DUMMY>', '[', 'moby', 'dick', 'by')

In [297]:
# My code
train_data = []  # (center word, context word)

for win in windows:
    for i in win:
        if i == '<DUMMY>' or win[window_size] == i:  # << 이 부분 오류
            continue
        train_data.append((win[window_size], i))
print(train_data[:window_size * 2])

[('[', 'moby'), ('[', 'dick'), ('[', 'by'), ('moby', '['), ('moby', 'dick'), ('moby', 'by')]


**내 코드의 오류**<br>
- if조건문에서 win[window_size] == 1 부분을 보면 window에서 하나씩 출력한 단어가<br> window size에 해당하는 단어(window의 중심단어)와 같으면 생략하라는 코드였다.
- 하지만 이렇게 하면 window 내에 같은 단어가 쓰이는 경우도 생략해 버린다.
- 예를 들어 ('a', 'b', 'c', 'a', 'd', 'e', 'f') 처럼 중심단어 a와 첫 단어 a가 같아도 생략해 버린다.
- 따라서 잘못된 코드

In [302]:
# My code
train_data = []  # (center word, context word)

for win in windows:
    for i in range(window_size * 2 + 1):  # 수정된 부분
        if win[i] == '<DUMMY>' or i == window_size:  # 수정된 부분
            continue
        train_data.append((win[window_size], i))
print(train_data[:window_size * 2])

[('[', 4), ('[', 5), ('[', 6), ('moby', 2), ('moby', 4), ('moby', 5)]


In [306]:
print('Number of train_data : ', len(train_data))
print(train_data[0])

Number of train_data :  7606
('[', 4)


In [332]:
X_p = []
y_p = []

for tr in train_data:
    X_p.append(prepare_word(tr[0], word2index).view(1, -1)) # center word 2차 행렬로 만들기
    y_p.append(prepare_word(tr[1], word2index).view(1, -1)) # context word 2차 행렬로 만들기

In [337]:
train_data = list(zip(X_p, y_p)) # (tensor(conter word), tensor(context word))
len(train_data)

7606

 ## 8. Modeling

In [None]:
class Skipgram(nn.Module):
    
    def __init__(self, vocab_size, projection_dim):
        super(Skipgram, self).__init__()
        self.embedding_v = nn.Embedding(vocab_size, projection_dim)
        self.embedding_u = nn.Embedding(vocab_size, projcetion_dim)
        
        self.embedding_v.weight.data.uniform_(-1, 1) # -1 ~ 1 값으로 초기화
        self.embedding_u.weight.data.uniform_(-1, 1) # -1 ~ 1 값으로 초기화
        #self.out = nn.Linear(procjection_dim, vocab_size)
        

## What is iterable?
iterable 의 의미는 member를 하나씩 차례로 반환 가능한 object를 말한다. 

 iterable 의 예로는 sequence type인 list, str, tuple 이 대표적이다. 



출처: http://bluese05.tistory.com/55 [ㅍㅍㅋㄷ]

## What is Python map()?
map() 함수는 built-in 함수로 list 나 dictionary 와 같은 iterable 한 데이터를 인자로 받아 list 안의 개별 item을 함수의 인자로 전달하여 결과를 list로 형태로 반환해 주는 함수이다. 글로 설명하면 복잡하니 아래 예제를 보자. 

출처: http://bluese05.tistory.com/58 [ㅍㅍㅋㄷ]

map(str, [1, 2, 3])와 같이 자료형 int, float, str 등을 넣었는데 plus_ten처럼 함수(클래스)를 직접 만들어서 넣어도 됩니다.

## lambda로 함수 만들기
https://dojang.io/mod/page/view.php?id=1059

In [173]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import nltk
import random
import numpy as np
from collections import Counter

In [174]:
flatten = lambda l: [item for sublist in l for item in sublist]
random.seed(1024)

In [175]:
# Replacement of lambda 'flatten'
co = []
for a1 in corpus:
    for a2 in a1:
        co.append(a2)
cor = list(set(co))

In [176]:
print(torch.__version__)
print(nltk.__version__)

0.4.1
3.2.4


In [177]:
USE_CUDA = torch.cuda.is_available()
USE_CUDA

False

In [178]:
if USE_CUDA:
    FloatTensor = torch.cuda.FloatTensor 
else:
    FloatTensor = torch.FloatTensor
# Upper code can be converted more simple

In [179]:
FloatTensor = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if USE_CUDA else torch.LongTensor
ByteTensor = torch.cuda.ByteTensor if USE_CUDA else torch.ByteTensor

In [180]:
def getBatch(batch_size, train_data):
    random.shuffle(train_data)
    sindex = 0
    eindex = batch_size
    while eindex < len(train_data):
        batch = train_data[sindex : eindex]
        temp = eindex
        eindex = eindex + batch_size
        sindex = temp
        yield batch
        
    if eindex >= len(train_data):
        batch = train_data[sindex:]
        yield batch


In [181]:
def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w : word2index[w] if word2index.get(w) is not None else word2index['<UNK'], seq))
    return Variable(LongTensor(idxs))

In [182]:
def prepare_word(word, word2index):
    return Variable(LongTensor([word2index[word]]) if word2index.get(word) is not None else LongTensor([word2index['<UNK>']]))
def prepare_word(word, word2index):
    return Variable(LongTensor([word2index[word]]) if word2index.get(word) is not None else LongTensor([word2index["<UNK>"]]))

## 1. Data Load
- Data : Gutenberg corpus

In [183]:
nltk.corpus.gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [184]:
corpus = list(nltk.corpus.gutenberg.sents('melville-moby_dick.txt'))[:100] # 텍스트를 scentence화
corpus = [[word.lower() for word in sent] for sent in corpus] # 단어들 소문자화

## 2. Extract stopwords from unigram distribution's tail

In [185]:
word_count = Counter(flatten(corpus))
word_count

Counter({'!': 5,
         '"': 26,
         "'": 20,
         '(': 3,
         ')': 3,
         ').': 1,
         ',': 96,
         '-': 24,
         '--': 27,
         '--(': 1,
         '.': 66,
         '."': 26,
         '...': 5,
         '1851': 1,
         '890': 1,
         ':': 1,
         ';': 12,
         '[': 1,
         ']': 1,
         'a': 21,
         'about': 1,
         'acres': 1,
         'affording': 1,
         'after': 1,
         'againe': 1,
         'against': 1,
         'alfred': 1,
         'all': 5,
         'allusions': 1,
         'almost': 1,
         'aloft': 1,
         'alone': 1,
         'also': 1,
         'altogether': 1,
         'am': 1,
         'among': 2,
         'an': 3,
         'ancient': 1,
         'and': 35,
         'anglo': 1,
         'animal': 1,
         'annals': 1,
         'any': 1,
         'anyways': 1,
         'apology': 1,
         'appeared': 1,
         'appearing': 1,
         'appears': 2,
         'arched': 1,
      

In [186]:
border = int(len(word_count) * 0.01)
border

5

In [187]:
word_count.most_common()[:border]

[(',', 96), ('.', 66), ('the', 58), ('of', 36), ('and', 35)]

In [188]:
list(reversed(word_count.most_common()))[:border]

[('man', 1), ('artificial', 1), ('civitas', 1), ('--(', 1), ('state', 1)]

In [189]:
stopwords = word_count.most_common()[:border] + list(reversed(word_count.most_common()))[:border]
stopwords

[(',', 96),
 ('.', 66),
 ('the', 58),
 ('of', 36),
 ('and', 35),
 ('man', 1),
 ('artificial', 1),
 ('civitas', 1),
 ('--(', 1),
 ('state', 1)]

In [190]:
stopwords = [s[0] for s in stopwords]

In [191]:
stopwords

[',', '.', 'the', 'of', 'and', 'man', 'artificial', 'civitas', '--(', 'state']

## 3. Build vocab

In [192]:
len(set(flatten(corpus)))
len(set(stopwords))

10

In [193]:
vocab = list(set(flatten(corpus)) - set(stopwords))
vocab.append('<UNK>')

In [194]:
print(len(set(flatten(corpus))), len(vocab))

592 583


In [195]:
word2index = {'<UNK>' : 0}

for vo in vocab:
    if word2index.get(vo) is None:
        word2index[vo] = len(word2index)
        
index2word = {v:k for k, v in word2index.items()}
index2word

{0: '<UNK>',
 1: 'with',
 2: 'on',
 3: 'please',
 4: 'vessel',
 5: 'which',
 6: 'an',
 7: 'his',
 8: 'handkerchief',
 9: 'sunrise',
 10: 'fare',
 11: 'were',
 12: 'french',
 13: 'roll',
 14: 'besides',
 15: 'while',
 16: 'glancing',
 17: 'promiscuously',
 18: 'lord',
 19: 'vaticans',
 20: "'",
 21: 'aloft',
 22: 'grammar',
 23: 'sea',
 24: 'to',
 25: 'version',
 26: 'could',
 27: ')',
 28: 'verbal',
 29: 'valuable',
 30: 'apology',
 31: 'every',
 32: 'vast',
 33: 'etymology',
 34: 'dut',
 35: 'sallow',
 36: 'queer',
 37: 'slay',
 38: 'baleine',
 39: 'hampton',
 40: 'unpleasant',
 41: 'worm',
 42: 'seethe',
 43: 'hamlet',
 44: 'lins',
 45: 'will',
 46: 'street',
 47: 'pale',
 48: 'whale',
 49: 'school',
 50: 'one',
 51: 'take',
 52: 'as',
 53: 'ships',
 54: 'quantity',
 55: 'warm',
 56: 'that',
 57: 'davenant',
 58: 'mote',
 59: 'body',
 60: 'thro',
 61: 'sovereignest',
 62: 'described',
 63: 'been',
 64: 'scarcely',
 65: 'or',
 66: 'at',
 67: 'beast',
 68: 'own',
 69: 'nations',
 70: '

## 4. Prepare train data

In [279]:
WINDOW_SIZE = 3
windows = flatten([list(nltk.ngrams(['<DUMMY>'] * WINDOW_SIZE + c + ['<DUMMY>'] * WINDOW_SIZE, WINDOW_SIZE * 2 + 1)) for c in corpus])
windows

[('<DUMMY>', '<DUMMY>', '<DUMMY>', '[', 'moby', 'dick', 'by'),
 ('<DUMMY>', '<DUMMY>', '[', 'moby', 'dick', 'by', 'herman'),
 ('<DUMMY>', '[', 'moby', 'dick', 'by', 'herman', 'melville'),
 ('[', 'moby', 'dick', 'by', 'herman', 'melville', '1851'),
 ('moby', 'dick', 'by', 'herman', 'melville', '1851', ']'),
 ('dick', 'by', 'herman', 'melville', '1851', ']', '<DUMMY>'),
 ('by', 'herman', 'melville', '1851', ']', '<DUMMY>', '<DUMMY>'),
 ('herman', 'melville', '1851', ']', '<DUMMY>', '<DUMMY>', '<DUMMY>'),
 ('<DUMMY>', '<DUMMY>', '<DUMMY>', 'etymology', '.', '<DUMMY>', '<DUMMY>'),
 ('<DUMMY>', '<DUMMY>', 'etymology', '.', '<DUMMY>', '<DUMMY>', '<DUMMY>'),
 ('<DUMMY>', '<DUMMY>', '<DUMMY>', '(', 'supplied', 'by', 'a'),
 ('<DUMMY>', '<DUMMY>', '(', 'supplied', 'by', 'a', 'late'),
 ('<DUMMY>', '(', 'supplied', 'by', 'a', 'late', 'consumptive'),
 ('(', 'supplied', 'by', 'a', 'late', 'consumptive', 'usher'),
 ('supplied', 'by', 'a', 'late', 'consumptive', 'usher', 'to'),
 ('by', 'a', 'late', 'c

In [277]:
train_data = []

for window in windows:
    for i in range(WINDOW_SIZE * 2 + 1):
        if i == WINDOW_SIZE or window[i] == '<DUMMY>':
            continue
        train_data.append((window[WINDOW_SIZE], window[i]))
train_data

[('[', 'moby'),
 ('[', 'dick'),
 ('[', 'by'),
 ('moby', '['),
 ('moby', 'dick'),
 ('moby', 'by'),
 ('moby', 'herman'),
 ('dick', '['),
 ('dick', 'moby'),
 ('dick', 'by'),
 ('dick', 'herman'),
 ('dick', 'melville'),
 ('by', '['),
 ('by', 'moby'),
 ('by', 'dick'),
 ('by', 'herman'),
 ('by', 'melville'),
 ('by', '1851'),
 ('herman', 'moby'),
 ('herman', 'dick'),
 ('herman', 'by'),
 ('herman', 'melville'),
 ('herman', '1851'),
 ('herman', ']'),
 ('melville', 'dick'),
 ('melville', 'by'),
 ('melville', 'herman'),
 ('melville', '1851'),
 ('melville', ']'),
 ('1851', 'by'),
 ('1851', 'herman'),
 ('1851', 'melville'),
 ('1851', ']'),
 (']', 'herman'),
 (']', 'melville'),
 (']', '1851'),
 ('etymology', '.'),
 ('.', 'etymology'),
 ('(', 'supplied'),
 ('(', 'by'),
 ('(', 'a'),
 ('supplied', '('),
 ('supplied', 'by'),
 ('supplied', 'a'),
 ('supplied', 'late'),
 ('by', '('),
 ('by', 'supplied'),
 ('by', 'a'),
 ('by', 'late'),
 ('by', 'consumptive'),
 ('a', '('),
 ('a', 'supplied'),
 ('a', 'by'),
 (

In [278]:
len(train_data)

7606

In [200]:
X_p = []
y_p = []

In [201]:
for tr in train_data:
    X_p.append(prepare_word(tr[0], word2index).view(1, -1))
    y_p.append(prepare_word(tr[1], word2index).view(1, -1))

In [202]:
train_data = list(zip(X_p, y_p))

In [203]:
len(train_data)

7606

In [52]:
a = [1,2,3]
b = [4,5,6]
a+ b

[1, 2, 3, 4, 5, 6]

In [53]:
class Skipgram(nn.Module):
    
    def __init__(self, vocab_size, projection_dim):
        super(Skipgram,self).__init__()
        self.embedding_v = nn.Embedding(vocab_size, projection_dim)
        self.embedding_u = nn.Embedding(vocab_size, projection_dim)

        self.embedding_v.weight.data.uniform_(-1, 1) # init
        self.embedding_u.weight.data.uniform_(0, 0) # init
        #self.out = nn.Linear(projection_dim,vocab_size)
    def forward(self, center_words,target_words, outer_words):
        center_embeds = self.embedding_v(center_words) # B x 1 x D
        target_embeds = self.embedding_u(target_words) # B x 1 x D
        outer_embeds = self.embedding_u(outer_words) # B x V x D
        
        scores = target_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2) # Bx1xD * BxDx1 => Bx1
        norm_scores = outer_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2) # BxVxD * BxDx1 => BxV
        
        nll = -torch.mean(torch.log(torch.exp(scores)/torch.sum(torch.exp(norm_scores), 1).unsqueeze(1))) # log-softmax
        
        return nll # negative log likelihood
    
    def prediction(self, inputs):
        embeds = self.embedding_v(inputs)
        
        return embeds 

## 5. Train

In [54]:
EMBEDDING_SIZE = 30
BATCH_SIZE = 256
EPOCH = 100

In [55]:
losses = []
model = Skipgram(len(word2index), EMBEDDING_SIZE)
if USE_CUDA:
    model = model.cuda()
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [62]:
loss.data.tolist()

6.368189811706543

In [63]:
for epoch in range(EPOCH):
    for i, batch in enumerate(getBatch(BATCH_SIZE, train_data)):
        
        inputs, targets = zip(*batch)
        
        inputs = torch.cat(inputs) # B x 1
        targets = torch.cat(targets) # B x 1
        vocabs = prepare_sequence(list(vocab), word2index).expand(inputs.size(0), len(vocab))  # B x V
        model.zero_grad()

        loss = model(inputs, targets, vocabs)
        
        loss.backward()
        optimizer.step()
   
        losses.append(loss.data.tolist())

    if epoch % 10 == 0:
        print("Epoch : %d, mean_loss : %.02f" % (epoch,np.mean(losses)))
        losses = []

Epoch : 0, mean_loss : 6.17
Epoch : 10, mean_loss : 4.37
Epoch : 20, mean_loss : 3.48
Epoch : 30, mean_loss : 3.31
Epoch : 40, mean_loss : 3.26
Epoch : 50, mean_loss : 3.24
Epoch : 60, mean_loss : 3.22
Epoch : 70, mean_loss : 3.21
Epoch : 80, mean_loss : 3.21
Epoch : 90, mean_loss : 3.20


In [64]:
def word_similarity(target, vocab):
    if USE_CUDA:
        target_V = model.prediction(prepare_word(target, word2index))
    else:
        target_V = model.prediction(prepare_word(target, word2index))
    similarities = []
    for i in range(len(vocab)):
        if vocab[i] == target: continue
        
        if USE_CUDA:
            vector = model.prediction(prepare_word(list(vocab)[i], word2index))
        else:
            vector = model.prediction(prepare_word(list(vocab)[i], word2index))
        cosine_sim = F.cosine_similarity(target_V, vector).data.tolist()[0] 
        similarities.append([vocab[i], cosine_sim])
    return sorted(similarities, key=lambda x: x[1], reverse=True)[:10] # sort by similarity

In [67]:
test = random.choice(list(vocab))
test

'eyes'

In [68]:
word_similarity(test, vocab)

[['empty', 0.8058958649635315],
 ['full', 0.7151128649711609],
 ['fegee', 0.6264484524726868],
 ['ponderous', 0.5834770202636719],
 ['t', 0.5713217854499817],
 ['raising', 0.5608559846878052],
 ['his', 0.5588749051094055],
 ['etymology', 0.5560470819473267],
 ['greek', 0.5502840280532837],
 ['erromangoan', 0.5272930264472961]]

## 새롭게 알게 된 내용
### 1. 두 list의 차집합
- 그냥 set(list1) - set(list2)를 빼면 된다.
- 단, 더 많은 list에서 빼야 확인 가능!