In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import nltk
import random
import numpy as np
from collections import Counter
def flatten(l):
    """Concatenate the items of every sub-iterable of ``l`` into one flat list.

    Only one level of nesting is removed: ``[[1, 2], [3]]`` -> ``[1, 2, 3]``.
    """
    return [item for sublist in l for item in sublist]


# Seed both random number generators the notebook draws from: Python's
# `random` (batch shuffling, picking a test word) and torch (embedding
# initialisation). Seeding only `random` leaves the model weights different
# on every run.
random.seed(1024)
torch.manual_seed(1024)

In [11]:
import nltk
# Download the NLTK 'punkt' tokenizer package into the local nltk_data folder.
# NOTE(review): presumably required by the corpus sentence reader used further
# down — confirm. Running it again is cheap once the package is present.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\test\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [2]:
# Record the library versions this notebook was executed with.
print(torch.__version__)
print(nltk.__version__)

0.4.1
3.3


In [4]:
# Decide once whether a GPU is available and alias the matching tensor
# constructors, so the rest of the notebook never has to branch on the device.
USE_CUDA = torch.cuda.is_available()
# To train on a specific GPU, uncomment the two lines below:
#gpus = [0]
#torch.cuda.set_device(gpus[0])

_tensor_namespace = torch.cuda if USE_CUDA else torch
FloatTensor = _tensor_namespace.FloatTensor
LongTensor = _tensor_namespace.LongTensor
ByteTensor = _tensor_namespace.ByteTensor

In [6]:
def getBatch(batch_size, train_data):
    """Shuffle ``train_data`` in place and yield it in consecutive mini-batches.

    (A torch ``DataLoader`` is the more standard tool for this job.)

    Args:
        batch_size: Maximum number of examples per batch; must be positive.
        train_data: List of training examples. It is shuffled in place with
            the global ``random`` generator before batching.

    Yields:
        Slices of ``train_data`` of length ``batch_size``; the last slice may
        be shorter. Nothing is yielded for empty ``train_data``.

    Raises:
        ValueError: If ``batch_size`` is not positive. Because this is a
            generator, the error surfaces on the first iteration.
    """
    # A non-positive batch size would otherwise never advance through the data.
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer, got %r" % (batch_size,))

    random.shuffle(train_data)
    # Stepping by batch_size covers every example exactly once and naturally
    # produces a shorter final batch when the sizes do not divide evenly.
    for start in range(0, len(train_data), batch_size):
        yield train_data[start:start + batch_size]

In [65]:
def prepare_sequence(seq, word2index):
    """Turn a sequence of tokens into a LongTensor of vocabulary indices.

    Any token whose lookup in ``word2index`` comes back as ``None`` (i.e. it
    is not in the mapping) is replaced by the index stored under ``"<UNK>"``.
    """
    indices = []
    for token in seq:
        index = word2index.get(token)
        if index is None:
            index = word2index["<UNK>"]
        indices.append(index)
    return Variable(LongTensor(indices))

def prepare_word(word, word2index):
    """Wrap the vocabulary index of a single token in a one-element LongTensor.

    A token that is not found in ``word2index`` falls back to the index
    stored under ``"<UNK>"``.
    """
    index = word2index.get(word)
    if index is None:
        index = word2index["<UNK>"]
    return Variable(LongTensor([index]))

In [8]:
# Download the Gutenberg corpus package, then list the texts it provides so a
# file id can be picked for training.
nltk.download('gutenberg')
nltk.corpus.gutenberg.fileids()

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\test\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [16]:
# Use only the first 100 sentences of Moby Dick as a small test sample and
# lower-case every token while reading them.
corpus = [
    [word.lower() for word in sent]
    for sent in list(nltk.corpus.gutenberg.sents('melville-moby_dick.txt'))[:100]
]

[['[', 'Moby', 'Dick', 'by', 'Herman', 'Melville', '1851', ']'],
 ['ETYMOLOGY', '.'],
 ['(',
  'Supplied',
  'by',
  'a',
  'Late',
  'Consumptive',
  'Usher',
  'to',
  'a',
  'Grammar',
  'School',
  ')'],
 ['The',
  'pale',
  'Usher',
  '--',
  'threadbare',
  'in',
  'coat',
  ',',
  'heart',
  ',',
  'body',
  ',',
  'and',
  'brain',
  ';',
  'I',
  'see',
  'him',
  'now',
  '.'],
 ['He',
  'was',
  'ever',
  'dusting',
  'his',
  'old',
  'lexicons',
  'and',
  'grammars',
  ',',
  'with',
  'a',
  'queer',
  'handkerchief',
  ',',
  'mockingly',
  'embellished',
  'with',
  'all',
  'the',
  'gay',
  'flags',
  'of',
  'all',
  'the',
  'known',
  'nations',
  'of',
  'the',
  'world',
  '.'],
 ['He',
  'loved',
  'to',
  'dust',
  'his',
  'old',
  'grammars',
  ';',
  'it',
  'somehow',
  'mildly',
  'reminded',
  'him',
  'of',
  'his',
  'mortality',
  '.'],
 ['"',
  'While',
  'you',
  'take',
  'in',
  'hand',
  'to',
  'school',
  'others',
  ',',
  'and',
  'to',
  'te

In [17]:
# Frequency of every token in the sampled corpus.
word_count = Counter(token for sentence in corpus for token in sentence)
# Number of words to treat as stopwords at each end of the frequency
# ranking: 1% of the distinct words, rounded down.
border = int(0.01 * len(word_count))

In [28]:
# Preview the `border` rarest (word, count) pairs, least frequent first.
list(reversed(word_count.most_common()))[:border]

[('man', 1),
 ('artificial', 1),
 ('Civitas', 1),
 ('Latin', 1),
 ('--(', 1),
 ('State', 1)]

In [18]:
# Stopword candidates as (word, count) pairs: the `border` most frequent
# words followed by the `border` least frequent ones.
_ranked_counts = word_count.most_common()
stopwords = _ranked_counts[:border] + _ranked_counts[::-1][:border]

In [19]:
# Reduce the (word, count) pairs to bare words. The isinstance guard keeps
# this cell safe to re-run: without it, a second execution would index into
# the words themselves and leave only their first characters.
stopwords = [s[0] if isinstance(s, tuple) else s for s in stopwords]

In [22]:
# Display the final stopword list (most frequent words first, then the rarest).
stopwords

[',',
 '.',
 'the',
 'of',
 'and',
 '--',
 'man',
 'artificial',
 'Civitas',
 'Latin',
 '--(',
 'State']

In [29]:
# Vocabulary = every distinct corpus token that is not a stopword. It is
# sorted because the iteration order of a set of strings can change between
# interpreter runs, which would otherwise assign different indices to the
# same word on every run.
vocab = sorted(set(flatten(corpus)) - set(stopwords))
# Tokens that occur in the corpus but not in the vocabulary are mapped to <UNK>.
vocab.append('<UNK>')

In [30]:
# Map each vocabulary token to a consecutive integer id; <UNK> is pinned to 0.
word2index = {'<UNK>': 0}
for token in vocab:
    # setdefault leaves tokens that already have an id (such as <UNK>) alone.
    word2index.setdefault(token, len(word2index))

# Reverse lookup: integer id -> token.
index2word = dict((index, token) for token, index in word2index.items())

In [36]:
WINDOW_SIZE = 3
# Pad every sentence with WINDOW_SIZE dummy tokens on each side, then slide a
# window of 2 * WINDOW_SIZE + 1 tokens over it. Every real token therefore
# sits at the centre of exactly one window, with up to WINDOW_SIZE context
# tokens on either side.
_padding = ['<DUMMY>'] * WINDOW_SIZE
windows = flatten([
    list(nltk.ngrams(_padding + sentence + _padding, WINDOW_SIZE * 2 + 1))
    for sentence in corpus
])

In [59]:
# Build (center word, context word) pairs: for every window, pair the token
# in the middle with each other position, skipping the padding tokens.
train_data = [
    (window[WINDOW_SIZE], window[position])
    for window in windows
    for position in range(WINDOW_SIZE * 2 + 1)
    if position != WINDOW_SIZE and window[position] != '<DUMMY>'
]

print(train_data[:WINDOW_SIZE * 2])

[('[', 'Moby'), ('[', 'Dick'), ('[', 'by'), ('Moby', '['), ('Moby', 'Dick'), ('Moby', 'by')]


In [63]:
# Containers for the tensor form of the training pairs:
# X_p collects center-word tensors, y_p the matching context-word tensors.
X_p = []
y_p = []

In [66]:
# Convert every (center, context) word pair into a pair of 1 x 1 index
# tensors. Both lists are rebuilt from scratch rather than appended to, so
# re-running this cell cannot duplicate the training data.
X_p = [prepare_word(tr[0], word2index).view(1, -1) for tr in train_data]
y_p = [prepare_word(tr[1], word2index).view(1, -1) for tr in train_data]

In [67]:
# Pair each center tensor with its context tensor.
# NOTE: this rebinds `train_data` from word pairs to tensor pairs.
train_data = list(zip(X_p, y_p))

In [68]:
# Number of (center, context) training pairs.
len(train_data)

7606

In [69]:
class Skipgram(nn.Module):
    """Skip-gram word-embedding model trained with a full-softmax loss.

    Two embedding tables are kept: ``embedding_v`` for words used as the
    center word and ``embedding_u`` for words used as context words.

    Args:
        vocab_size: Number of rows in each embedding table.
        projection_dim: Dimensionality of the embedding vectors.
    """

    def __init__(self, vocab_size, projection_dim):
        super(Skipgram, self).__init__()
        self.embedding_v = nn.Embedding(vocab_size, projection_dim)
        self.embedding_u = nn.Embedding(vocab_size, projection_dim)

        # Center vectors start uniformly in [-1, 1]; context vectors start at 0.
        self.embedding_v.weight.data.uniform_(-1, 1)
        self.embedding_u.weight.data.uniform_(0, 0)

    def forward(self, center_words, target_words, outer_words):
        """Return the mean negative log likelihood of the targets.

        Args:
            center_words: Index tensor of center words, expected B x 1
                (B = batch size).
            target_words: Index tensor with one observed context word per
                center word, expected B x 1.
            outer_words: Index tensor of every word the softmax normalises
                over, expected B x V.

        Returns:
            Scalar tensor: the batch mean of ``-log softmax(u . v)`` taken at
            the target word.
        """
        center_embeds = self.embedding_v(center_words)  # B x 1 x D
        target_embeds = self.embedding_u(target_words)  # B x 1 x D, one context word
        outer_embeds = self.embedding_u(outer_words)    # B x V x D, all words

        # bmm multiplies matrices per batch element, leaving the batch
        # dimension untouched.
        scores = target_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)      # B x 1
        norm_scores = outer_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)  # B x V

        # log-softmax written as score - logsumexp(all scores). Exponentiating
        # and dividing explicitly overflows to inf/inf = NaN once the scores
        # get large; logsumexp computes the same quantity stably.
        log_probs = scores - torch.logsumexp(norm_scores, 1, keepdim=True)  # B x 1

        return -torch.mean(log_probs)  # negative log likelihood

    def prediction(self, inputs):
        """Return the center-word embeddings for the given word indices."""
        return self.embedding_v(inputs)

In [70]:
# Training hyperparameters.
EMBEDDING_SIZE = 30  # dimensionality of the word vectors (projection_dim of Skipgram)
BATCH_SIZE = 256     # number of (center, context) pairs per mini-batch
EPOCH = 100          # number of passes over the training data

In [71]:
# Create the model (moved to the GPU when one is available), its optimizer,
# and the list that accumulates per-batch losses between progress reports.
model = Skipgram(len(word2index), EMBEDDING_SIZE)
model = model.cuda() if USE_CUDA else model
optimizer = optim.Adam(model.parameters(), lr=0.01)
losses = []

In [73]:
# The index tensor covering the whole vocabulary is identical for every
# batch, so build it once instead of once per training step.
vocab_indices = prepare_sequence(list(vocab), word2index)

for epoch in range(EPOCH):
    for batch in getBatch(BATCH_SIZE, train_data):

        inputs, targets = zip(*batch)

        inputs = torch.cat(inputs)    # B x 1 center-word indices
        targets = torch.cat(targets)  # B x 1 context-word indices
        vocabs = vocab_indices.expand(inputs.size(0), len(vocab))  # B x V

        # The standard training step — these four lines are worth remembering.
        model.zero_grad()
        loss = model(inputs, targets, vocabs)
        loss.backward()
        optimizer.step()

        # The loss is a scalar tensor, so .item() extracts the Python float.
        # (`loss.data.tolist[0]` subscripted the method itself and raised
        # a TypeError.)
        losses.append(loss.item())

    # Every 10 epochs, report the mean loss since the previous report.
    if epoch % 10 == 0:
        print("Epoch : %d, mean_loss : %.02f" % (epoch,np.mean(losses)))
        losses = []

  app.launch_new_instance()


Epoch : 0, mean_loss : 6.26
Epoch : 10, mean_loss : 4.38
Epoch : 20, mean_loss : 3.41
Epoch : 30, mean_loss : 3.25
Epoch : 40, mean_loss : 3.20
Epoch : 50, mean_loss : 3.18
Epoch : 60, mean_loss : 3.17
Epoch : 70, mean_loss : 3.16
Epoch : 80, mean_loss : 3.15
Epoch : 90, mean_loss : 3.15


In [74]:
def word_similarity(target, vocab):
    """Return the ten vocabulary words whose embeddings are closest to ``target``.

    Similarity is the cosine similarity between the center-word embeddings
    produced by the global ``model``; indices come from the global
    ``word2index``.

    Args:
        target: The query word. A word missing from ``word2index`` is looked
            up as ``<UNK>`` by ``prepare_word``.
        vocab: Iterable of candidate words; ``target`` itself is skipped.

    Returns:
        Up to ten ``[word, cosine_similarity]`` pairs, most similar first.
    """
    # prepare_word already builds the tensor on the right device, so no
    # separate CUDA branch is needed here.
    target_V = model.prediction(prepare_word(target, word2index))

    similarities = []
    # Iterate over the words directly; re-materialising list(vocab) for
    # every index made the loop quadratic.
    for word in vocab:
        if word == target:
            continue

        vector = model.prediction(prepare_word(word, word2index))
        # Both embeddings are 1 x D, so the result holds a single value.
        cosine_sim = F.cosine_similarity(target_V, vector).item()
        similarities.append([word, cosine_sim])

    return sorted(similarities, key=lambda x: x[1], reverse=True)[:10]  # sort by similarity

In [75]:
# Pick a random vocabulary word to sanity-check the learned embeddings.
test = random.choice(list(vocab))
test

'poets'

In [76]:
# Show the ten words most similar to the sampled test word.
word_similarity(test, vocab)

[['here', 0.7442889213562012],
 ['generally', 0.6463077068328857],
 ['appearing', 0.6241486668586731],
 ['these', 0.5828226208686829],
 ['as', 0.5827580094337463],
 ['well', 0.5705728530883789],
 ['ISAIAH', 0.5673191547393799],
 ['whether', 0.544781506061554],
 ['however', 0.5162895321846008],
 ['devil', 0.5149803161621094]]