# Implementation of Word2Vec(skip-gram)

1. prepare dataset


2. preprocess
    
    2-1. create word index
    
    2-2. subsampling
    
    2-3. center target & context generation
    
    2-4. negative sampling
    
    2-5. batch generation


3. model definition


4. train model


5. apply word embedding 

In [2]:
from mxnet import autograd, gluon, nd
from mxnet.gluon import data as gdata, loss as gloss, nn
from tqdm import tqdm_notebook
from konlpy.tag import Okt

import sys
import collections
import math
import random
import sys
import time
import zipfile
import mxnet as mx

## prepare dataset

In [7]:
with open('data/tworld_QA_dataset.csv','r') as f:
    lines = f.readlines()
    raw_dataset = [st.split() for st in lines[1:]]

print ('# sentences: {}'.format(len(raw_dataset)))

# sentences: 29087


### create word index

In [8]:
# Count every token of the raw corpus, then keep only tokens that occur at
# least 5 times.
counter = collections.Counter(tk for st in raw_dataset for tk in st)
counter = {tk: freq for tk, freq in counter.items() if freq >= 5}

# Index <-> token lookup tables; an index is the token's position in the
# filtered counter's iteration order.
idx_to_token = list(counter)
token_to_idx = {tk: idx for idx, tk in enumerate(idx_to_token)}

# Re-encode each sentence as a list of indices, dropping filtered-out tokens.
dataset = [[token_to_idx[tk] for tk in st if tk in token_to_idx] for st in raw_dataset]
num_tokens = sum(len(st) for st in dataset)
print ('# tokens: {}'.format(num_tokens))

# tokens: 337232


In [9]:
# Notebook inspection only: display the index-encoded sentences.
dataset

[[0,
  1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  8,
  10,
  8,
  11,
  12,
  13,
  14,
  15,
  11,
  16,
  17,
  8,
  18],
 [19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 29, 18],
 [31, 32, 33, 34, 35, 36],
 [37, 38, 39, 35, 40, 41, 42],
 [43, 44, 45, 46, 47, 48, 49, 50, 51, 23, 52],
 [53, 54, 55, 56],
 [57, 58, 59, 34, 35, 60, 61, 62],
 [63, 64, 65, 66, 34, 67, 68, 69, 70, 71, 72],
 [73, 74, 75, 76, 77, 78, 79, 80, 81, 82],
 [83, 34, 84, 85, 85],
 [86, 34, 87, 88, 89, 90],
 [43, 91, 34, 92, 93, 94, 95],
 [96, 97, 34, 98, 40, 99, 100, 101],
 [43, 102, 103, 34, 98, 104, 105, 106, 85],
 [107, 108, 34, 35, 60, 93, 109, 110],
 [111,
  112,
  113,
  34,
  35,
  112,
  114,
  115,
  116,
  117,
  118,
  119,
  114,
  120,
  121,
  122,
  123,
  124,
  125,
  126],
 [127, 128, 129, 24, 130, 131, 132, 34, 35, 128, 133, 134, 135, 136],
 [133, 137, 138, 139, 34, 140, 141, 142, 85],
 [138, 131, 105, 143, 144, 145, 146, 147, 85],
 [148, 149, 150, 85],
 [151, 151, 152, 153, 154, 155],
 [156, 3

In [10]:
# Notebook inspection only: display the index -> token list.
idx_to_token

['지원금',
 '얼마에요?',
 '★공시지원금',
 '및',
 '추가지원금(T지원금',
 '약정',
 '24개월)',
 '-공시지원금',
 ':',
 '-추가지원금',
 '-할부원금',
 '-월',
 '할부금:',
 '5.9%',
 '이자',
 '포함)',
 '청구',
 '예상금액',
 '입니다.',
 '할부이자는',
 '총',
 '얼만지',
 '알',
 '수',
 '있나요?',
 '24개월',
 '할부',
 '기준',
 '공시지원금',
 '경우',
 '선택약정',
 '좀',
 '더',
 '알아보고',
 '네',
 '고객님',
 '^^',
 '할부원금',
 '이하로',
 '되는게',
 '죄송합니다.',
 '오류가',
 '있어',
 '네.',
 '배정된',
 '전담',
 '상담사',
 '없다면',
 '채팅상담',
 '그대로',
 '안내',
 '받아보실',
 '있습니다.^^',
 '예를들면',
 '할부금이',
 '방식은',
 '아니지만',
 '또한,',
 '통신요금',
 '할인되는',
 '제가',
 '접수까지',
 '도움을',
 '요금제를',
 '올려도',
 '매월',
 '변동이',
 '그렇습니다.',
 '선택약정할인은',
 '요금제로',
 '할인을',
 '해드리는',
 '방식입니다.',
 '실버,',
 '그레이,',
 '블루',
 '순으로',
 '있습니다.',
 '레드는',
 '안타깝게도',
 '출시가',
 '되지',
 '않아',
 '확인부탁드립니다.',
 '알겠습니다.',
 '^',
 '언제쯤',
 '없으면',
 '내일',
 '수령도',
 '가능합니다~',
 '만약',
 '그냥',
 '대리점에',
 '방문을',
 '하시면',
 '주문내역에',
 '월',
 '맞습니다.',
 '아직',
 '적용이',
 '안되어서',
 '신청서',
 '작성후',
 '그래야',
 '대리점에서',
 '신청서가',
 '사용하려면',
 '가족할인',
 '전화를',
 '하여',
 '감사합니다.',
 'T기프트는',
 '내일이나',
 '개통',
 '익일',
 'T기프트',
 '센터로',
 

In [11]:
# Notebook inspection only: display the token -> index mapping.
token_to_idx

{'지원금': 0,
 '얼마에요?': 1,
 '★공시지원금': 2,
 '및': 3,
 '추가지원금(T지원금': 4,
 '약정': 5,
 '24개월)': 6,
 '-공시지원금': 7,
 ':': 8,
 '-추가지원금': 9,
 '-할부원금': 10,
 '-월': 11,
 '할부금:': 12,
 '5.9%': 13,
 '이자': 14,
 '포함)': 15,
 '청구': 16,
 '예상금액': 17,
 '입니다.': 18,
 '할부이자는': 19,
 '총': 20,
 '얼만지': 21,
 '알': 22,
 '수': 23,
 '있나요?': 24,
 '24개월': 25,
 '할부': 26,
 '기준': 27,
 '공시지원금': 28,
 '경우': 29,
 '선택약정': 30,
 '좀': 31,
 '더': 32,
 '알아보고': 33,
 '네': 34,
 '고객님': 35,
 '^^': 36,
 '할부원금': 37,
 '이하로': 38,
 '되는게': 39,
 '죄송합니다.': 40,
 '오류가': 41,
 '있어': 42,
 '네.': 43,
 '배정된': 44,
 '전담': 45,
 '상담사': 46,
 '없다면': 47,
 '채팅상담': 48,
 '그대로': 49,
 '안내': 50,
 '받아보실': 51,
 '있습니다.^^': 52,
 '예를들면': 53,
 '할부금이': 54,
 '방식은': 55,
 '아니지만': 56,
 '또한,': 57,
 '통신요금': 58,
 '할인되는': 59,
 '제가': 60,
 '접수까지': 61,
 '도움을': 62,
 '요금제를': 63,
 '올려도': 64,
 '매월': 65,
 '변동이': 66,
 '그렇습니다.': 67,
 '선택약정할인은': 68,
 '요금제로': 69,
 '할인을': 70,
 '해드리는': 71,
 '방식입니다.': 72,
 '실버,': 73,
 '그레이,': 74,
 '블루': 75,
 '순으로': 76,
 '있습니다.': 77,
 '레드는': 78,
 '안타깝게도': 79,
 '출시가': 80,
 

### subsampling

In [12]:
def discard(idx):
    """Randomly decide whether one occurrence of token index `idx` is dropped.

    The keep threshold is sqrt(1e-4 / count * num_tokens), where `count` is the
    token's corpus count from the module-level `counter`. A uniform draw in
    [0, 1] below `1 - threshold` means "discard", so frequent tokens are
    dropped more often. Each call consumes one random draw.

    Returns:
        True if the occurrence should be discarded, False otherwise.
    """
    keep_threshold = math.sqrt(1e-4 / counter[idx_to_token[idx]] * num_tokens)
    draw = random.uniform(0, 1)
    return draw < 1 - keep_threshold

# Apply discard() to every token occurrence (one random draw each) and keep
# the survivors; sentence boundaries are preserved.
subsampled_dataset = [[tk for tk in st if not discard(tk)] for st in dataset]
print ('# subsampled tokens: {}'.format(sum([len(st) for st in subsampled_dataset])))

def compare_counts(token):
    """Print how often `token` occurs before and after subsampling.

    Looks the token up in the module-level `token_to_idx` and counts its index
    in `dataset` (before) and `subsampled_dataset` (after).

    Raises:
        KeyError: if `token` is not in the vocabulary.
    """
    token_id = token_to_idx[token]
    occurrences_before = sum(st.count(token_id) for st in dataset)
    occurrences_after = sum(st.count(token_id) for st in subsampled_dataset)

    print ('token: {}, before count: {}, after_count: {}'.format(token, occurrences_before, occurrences_after))

# Subsampling sanity check: print before/after counts for two vocabulary tokens.
compare_counts('네')
compare_counts('지원금')

# subsampled tokens: 188690
token: 네, before count: 8780, after_count: 523
token: 지원금, before count: 367, after_count: 116


### center target & context generation

In [14]:
def get_centers_and_contexts(dataset, max_window_size):
    """Build skip-gram (center, contexts) pairs from index-encoded sentences.

    Every token of every sentence with at least two tokens becomes a center.
    For each center a window size is drawn uniformly from
    [1, max_window_size], and the tokens within that distance on either side
    (clipped at the sentence boundaries, center excluded) become its contexts.

    Args:
        dataset: iterable of sentences, each a sequence of token indices.
        max_window_size: largest window radius that may be drawn.

    Returns:
        (centers, contexts): `centers` is a flat list of token indices and
        `contexts[i]` is the list of context indices for `centers[i]`.
    """
    centers, contexts = [], []

    for sentence in dataset:
        length = len(sentence)
        # A single-token (or empty) sentence cannot form any pair.
        if length < 2:
            continue

        centers.extend(sentence)
        for pos in range(length):
            radius = random.randint(1, max_window_size)
            lo = max(0, pos - radius)
            hi = min(length, pos + 1 + radius)
            contexts.append([sentence[j] for j in range(lo, hi) if j != pos])

    return centers, contexts

# Demo on a toy corpus of two sentences (7 and 3 tokens) with a maximum
# window size of 2; prints each center with its sampled contexts.
tiny_dataset = [list(range(7)), list(range(7,10))]
print ('test dataset', tiny_dataset)
for center, context in zip(*get_centers_and_contexts(tiny_dataset, 2)):
    print ('center', center, 'has contexts', context)

# Real run: build pairs from the subsampled corpus with a maximum window size of 5.
all_centers, all_contexts = get_centers_and_contexts(subsampled_dataset, 5)

test dataset [[0, 1, 2, 3, 4, 5, 6], [7, 8, 9]]
center 0 has contexts [1]
center 1 has contexts [0, 2]
center 2 has contexts [1, 3]
center 3 has contexts [1, 2, 4, 5]
center 4 has contexts [3, 5]
center 5 has contexts [4, 6]
center 6 has contexts [4, 5]
center 7 has contexts [8, 9]
center 8 has contexts [7, 9]
center 9 has contexts [8]


### negative sampling

In [15]:
# negative sampling
def get_negatives(all_contexts, sampling_weights, K):
    all_negatives, neg_candidates, i = [], [], 0
    population = list(range(len(sampling_weights)))
    for contexts in tqdm_notebook(all_contexts, desc='getting negative samples'):
        negatives = []
        while len(negatives) < len(contexts) * K:
            if i == len(neg_candidates):
                i, neg_candidates = 0, random.choices(population, sampling_weights, k=int(1e5))
            
            neg, i = neg_candidates[i], i + 1

            if neg not in set(contexts):
                negatives.append(neg)

        #print (negatives)       
        all_negatives.append(negatives)

    return all_negatives

# Sampling weight of each token is its corpus count raised to the power 0.75.
sampling_weights = [counter[w] ** 0.75 for w in idx_to_token]
# Draw 5 negatives per context word for every center.
all_negatives = get_negatives(all_contexts, sampling_weights, 5)

HBox(children=(IntProgress(value=0, description='getting negative samples', max=186759), HTML(value='')))




### batch generation

In [9]:
def batchify(data):
    """Collate (center, contexts, negatives) examples into padded NDArrays.

    Each example's contexts and negatives are concatenated and right-padded
    with 0 up to the longest concatenation in the batch.

    Args:
        data: list of (center, context, negative) triples, where `context`
            and `negative` are lists of token indices.

    Returns:
        A 4-tuple of NDArrays:
        - centers, reshaped to (-1, 1)
        - contexts_negatives, padded with 0
        - masks: 1 for real (context or negative) positions, 0 for padding
        - labels: 1 for context positions, 0 for negatives and padding
    """
    longest = max(len(context) + len(negative) for _, context, negative in data)
    centers, contexts_negatives, masks, labels = [], [], [], []
    for center, context, negative in data:
        n_context = len(context)
        n_real = n_context + len(negative)
        n_pad = longest - n_real
        centers.append(center)
        contexts_negatives.append(context + negative + [0] * n_pad)
        masks.append([1] * n_real + [0] * n_pad)
        labels.append([1] * n_context + [0] * (longest - n_context))

    return (
        nd.array(centers).reshape((-1, 1)),
        nd.array(contexts_negatives),
        nd.array(masks),
        nd.array(labels),
    )


batch_size = 512
# No loader worker processes on Windows, 4 elsewhere.
num_workers = 0 if sys.platform.startswith('win32') else 4
# Use a dedicated name here: rebinding the global `dataset` would overwrite
# the index-encoded sentence list that the subsampling cell and
# compare_counts() read, silently breaking them when cells are re-run.
train_dataset = gdata.ArrayDataset(all_centers, all_contexts, all_negatives)
data_iter = gdata.DataLoader(train_dataset, batch_size, shuffle=True, batchify_fn = batchify, num_workers=num_workers)

# Peek at the first batch only, to check the shapes produced by batchify().
for batch in data_iter:
    for name, data in zip(['centers','contexts_negatives','masks','labels'], batch):
        print (name, 'shape:', data.shape)
    break

centers shape: (512, 1)
contexts_negatives shape: (512, 60)
masks shape: (512, 60)
labels shape: (512, 60)


### model definition

In [10]:
# Embedding Layer test
embed = nn.Embedding(input_dim=20, output_dim=4)
embed.initialize()
print (embed.weight)

x = nd.array([[1,2,3],[4,5,6]])
print (embed(x))

Parameter embedding0_weight (shape=(20, 4), dtype=float32)

[[[ 0.01438687  0.05011239  0.00628365  0.04861524]
  [-0.01068833  0.01729892  0.02042518 -0.01618656]
  [-0.00873779 -0.02834515  0.05484822 -0.06206018]]

 [[ 0.06491279 -0.03182812 -0.01631819 -0.00312688]
  [ 0.0408415   0.04370362  0.00404529 -0.0028032 ]
  [ 0.00952624 -0.01501013  0.05958354  0.04705103]]]
<NDArray 2x3x4 @cpu(0)>


In [11]:
# batch_dot test
X = nd.ones((2, 1, 4))
Y = nd.ones((2, 4, 6))
print (nd.batch_dot(X, Y).shape)

(2, 1, 6)


In [16]:
# loss test
loss_fn = gloss.SigmoidBinaryCrossEntropyLoss()

print (loss_fn)

pred = nd.array([[1.5, 0.3, -1, 2], [1.1, -0.6, 2.2, 0.4]])
# 1 and 0 in the label variables label represent context words and the noise
# words, respectively
label = nd.array([[1, 0, 0, 0], [1, 1, 0, 0]])
mask = nd.array([[1, 1, 1, 1], [1, 1, 1, 0]])  # Mask variable
print (loss_fn(pred, label, mask) * mask.shape[1] / mask.sum(axis=1))

SigmoidBinaryCrossEntropyLoss(batch_axis=0, w=None)

[0.8739896 1.2099689]
<NDArray 2 @cpu(0)>


In [13]:
def skip_gram(center, contexts_and_negatives, embed_v, embed_u):
    """Score each center word against its context/negative candidates.

    Looks up `center` with `embed_v` and `contexts_and_negatives` with
    `embed_u`, then returns the batched dot product of the center vectors
    with the candidate vectors (axes 1 and 2 of the latter swapped).

    Args:
        center: center-word indices.
        contexts_and_negatives: candidate (context + negative) indices.
        embed_v: embedding layer applied to the centers.
        embed_u: embedding layer applied to the candidates.

    Returns:
        The result of `nd.batch_dot` over the two embedded inputs.
    """
    center_vecs = embed_v(center)
    candidate_vecs = embed_u(contexts_and_negatives)
    return nd.batch_dot(center_vecs, candidate_vecs.swapaxes(1, 2))

# Init Model Params
embed_size = 30 #BERT small model embedding size
net = nn.Sequential()
net.add(nn.Embedding(input_dim=len(idx_to_token), output_dim=embed_size), nn.Embedding(input_dim=len(idx_to_token), output_dim=embed_size))

### train model

In [14]:
def train(net, lr, num_epochs):
    """Train the skip-gram embeddings with Adam and print per-epoch statistics.

    Reads the module-level `data_iter`, `loss_fn` and `batch_size`. The network
    is (re)initialized on GPU 0 when a GPU is available, otherwise on the CPU.

    Args:
        net: container whose element 0 is the center embedding and element 1
            the context/negative embedding.
        lr: learning rate for the Adam trainer.
        num_epochs: number of passes over `data_iter`.
    """
    if mx.context.num_gpus() > 0:
        ctx = mx.gpu(0)
    else:
        ctx = mx.cpu(0)
    net.initialize(ctx=ctx, force_reinit=True)
    trainer = gluon.Trainer(net.collect_params(), 'adam', {'learning_rate': lr})

    for epoch in range(num_epochs):
        started_at = time.time()
        running_loss = 0.0
        num_examples = 0
        for batch in data_iter:
            center, context_negative, mask, label = (part.as_in_context(ctx) for part in batch)
            with autograd.record():
                pred = skip_gram(center, context_negative, net[0], net[1])
                masked_loss = loss_fn(pred.reshape(label.shape), label, mask)
                # Rescale by row width / number of unmasked positions per row.
                loss = masked_loss * mask.shape[1] / mask.sum(axis=1)
            loss.backward()
            trainer.step(batch_size)
            running_loss += loss.sum().asscalar()
            num_examples += loss.size
        print ('epoch %d, loss %.2f, tim %.2fs' % (epoch + 1, running_loss / num_examples, time.time() - started_at))


# Train for 10 epochs with learning rate 0.005.
train (net, 0.005, 10)

epoch 1, loss 0.52, tim 30.76s
epoch 2, loss 0.41, tim 30.56s
epoch 3, loss 0.37, tim 30.47s
epoch 4, loss 0.34, tim 30.39s
epoch 5, loss 0.32, tim 31.05s
epoch 6, loss 0.30, tim 30.40s
epoch 7, loss 0.29, tim 30.77s
epoch 8, loss 0.28, tim 30.41s
epoch 9, loss 0.27, tim 30.70s
epoch 10, loss 0.27, tim 30.41s


### apply word embedding

In [15]:
def get_similar_tokens(query_token, k, embed):
    """Print the k tokens whose embeddings are most cosine-similar to a query.

    Args:
        query_token: token to look up in the module-level `token_to_idx`.
        k: number of neighbours to print.
        embed: embedding layer whose weight matrix is searched.

    Raises:
        KeyError: if `query_token` is not in the vocabulary.
    """
    weights = embed.weight.data()
    query_vec = weights[token_to_idx[query_token]]
    row_sq_norms = nd.sum(weights * weights, axis=1)
    query_sq_norm = nd.sum(query_vec * query_vec)
    # The 1e-9 term keeps the denominator away from zero.
    cos = nd.dot(weights, query_vec) / (row_sq_norms * query_sq_norm + 1e-9).sqrt()
    ranked = nd.topk(cos, k=k+1, ret_typ='indices').asnumpy().astype('int32')
    # Ask for k+1 hits and skip the first, which is expected to be the query itself.
    for idx in ranked[1:]:
        print('cosine sim=%.3f: %s' % (cos[idx].asscalar(), (idx_to_token[idx])))

In [16]:
# Print the 3 nearest neighbours of the query token in the center-word embedding (net[0]).
get_similar_tokens('요금제', 3, net[0])

cosine sim=0.818: 패밀리
cosine sim=0.735: YT
cosine sim=0.721: band
