In [1]:
import numpy as np
import json
from random import random
import math
from collections import OrderedDict,Counter


In [2]:
def generate_tokens(path):
    """Lazily yield the whitespace-delimited tokens of the file at ``path``.

    The file is read one character at a time and only the token currently
    being assembled is buffered.  A run of consecutive whitespace characters
    acts as a single separator, so empty tokens are never produced.

    Args:
        path: path of the file to tokenize; it is opened with mode ``'r'``.

    Yields:
        Each maximal run of non-whitespace characters, in file order.
    """
    with open(path, 'r') as fp:
        buf = []
        while True:
            ch = fp.read(1)
            if ch == '':
                # read(1) returns an empty string only at end of file.
                break
            elif ch.isspace():
                if buf:
                    yield ''.join(buf)
                    buf = []
            else:
                buf.append(ch)
        # Flush the token still being assembled when the file does not end
        # with whitespace; without this the final token would be lost.
        if buf:
            yield ''.join(buf)

def generate_tokens_ch(path):
    """Yield the tokens of the file at ``path`` as text (unicode) strings.

    Tokens come from ``generate_tokens``.  Byte-string tokens are decoded as
    UTF-8; tokens that are already text are passed through unchanged.

    Args:
        path: path of the file to tokenize.

    Yields:
        One text string per whitespace-delimited token.
    """
    for w in generate_tokens(path):
        # On Python 2 the tokens are byte strings (``bytes`` is ``str``) and
        # must be decoded.  On Python 3 a text-mode read already yields
        # ``str``, which has no ``.decode`` method.
        yield w.decode('utf-8') if isinstance(w, bytes) else w


In [4]:
# Count how often each token occurs in the corpus, then keep the counts in an
# ordered mapping sorted from most to least frequent.
wcount = Counter(generate_tokens_ch("poem.txt"))
word_dict = OrderedDict(sorted(wcount.items(), reverse=True, key=lambda x: x[1]))

# Persist the token -> count table.  The ``with`` block guarantees the file is
# closed even if serialization or the write raises.
with open("w2v_dict_poem.json", "w") as fj:
    fj.write(json.dumps(word_dict, indent=4))


In [28]:
# Vocabulary size, including one shared slot (id ``s_vocab - 1``) for every
# token that is not among the most frequent ``s_vocab - 1`` tokens.
s_vocab = 3000

# Build the frequency-ranked (token, count) list from ``wcount`` rather than
# from ``word_dict``: this cell rebinds ``word_dict`` to a token -> id mapping,
# so reading ``word_dict`` here would sort by id instead of by count whenever
# the cell is executed a second time.
word_dict_list = sorted(wcount.items(), reverse=True, key=lambda x: x[1])[:s_vocab - 1]

# token -> integer id, where the id is the token's rank by frequency.
word_dict = OrderedDict((token, rank) for rank, (token, _count) in enumerate(word_dict_list))

# Re-encode the corpus as space-separated ids; unknown tokens get s_vocab - 1.
with open("poem_idx.txt", "w") as f2:
    for w in generate_tokens_ch("poem.txt"):
        f2.write("%s " % word_dict.get(w, s_vocab - 1))


In [33]:
# (word id, corpus count) pairs; the id is the position in word_dict_list.
# A list comprehension is used because the list is appended to below, which
# requires a real list rather than the iterator ``map`` returns on Python 3.
word_id_freq_list = [(word_id, entry[1]) for word_id, entry in enumerate(word_dict_list)]

# Add one extra id (len(word_dict_list)) and give it the largest count seen.
negsamp_max_count = max(freq for _word_id, freq in word_id_freq_list)
word_id_freq_list.append((len(word_dict_list), negsamp_max_count))

# Sampling table: each id appears as many times as its count, so picking a
# uniformly random slot draws an id with probability proportional to its count.
negsamp_array_size = sum(freq for _word_id, freq in word_id_freq_list)
negsamp_array = np.repeat(
    [word_id for word_id, _freq in word_id_freq_list],
    [freq for _word_id, freq in word_id_freq_list],
)


In [52]:
# Training hyper-parameters.

# Length of each embedding vector (second dimension of W_emb).
s_embed = 30
# Number of consecutive token ids kept in the sliding context; the training
# loop also draws this many negative samples per step.
s_window = 3
# Numerator of the decaying learning rate: eta = eta_init / log_5(w_count + 2).
eta_init = 1.
# NOTE(review): not referenced by any visible cell — confirm whether it is still needed.
rho = 0.8
# Presumably the number of passes over the corpus.
# NOTE(review): confirm that the training loop reads this name.
ite = 10
# NOTE(review): not referenced by any visible cell — confirm whether it is still needed.
batch_size = 1000

In [None]:
# Train input embeddings (W_emb) and output weights (W_o) with per-step
# stochastic gradient descent: for every window position, one positive
# (center, context) pair and s_window negative pairs are scored with a
# sigmoid of the dot product and both matrices are updated.

# Small random initial weights centred on zero.
W_emb = (0.5 - np.random.rand(s_vocab, s_embed)) / math.sqrt(s_vocab + s_embed)
W_o = (0.5 - np.random.rand(s_embed, s_vocab)) / math.sqrt(s_vocab + s_embed)

# NOTE(review): these two arrays are allocated but never read in this cell.
W_o_rp = 0.1 + np.zeros((s_embed, s_vocab))
W_emb_rp = 0.1 + np.zeros((s_vocab, s_embed))

# Sliding window of the most recent token ids.  It is not cleared between
# passes, so the first windows of a pass still hold ids from the previous one.
context = []
toprint = 10000  # report the running error every `toprint` training steps
avg_err = 0.
w_count = 0
# Index of the center word inside the window; integer division keeps it usable
# as a list index on both Python 2 and Python 3.
i_pos = s_window // 2

# `ite` is the pass count defined with the other hyper-parameters; the builtin
# `iter` is a function and cannot be passed to range().
for it in range(ite):
    print("iter %s" % it)
    for w in generate_tokens("poem_idx.txt"):
        # Learning rate decays with the logarithm (base 5) of the step count.
        eta = eta_init / math.log((w_count + 2), 5)
        context.append(int(w))
        if len(context) > s_window:
            context.pop(0)
            i_wid = context[i_pos]
            # Gradient w.r.t. the center embedding, accumulated over all
            # samples of this step and applied once at the end.
            h_err = np.zeros((s_embed))
            for negcount in range(s_window + 1):
                if negcount == 0:
                    # Positive sample: a uniformly chosen window position.
                    # NOTE(review): o_pos may equal i_pos, in which case the
                    # center word is trained to predict itself.
                    o_pos = int(math.floor(s_window * random()))
                    o_wid = context[o_pos]
                    o_golden = 1
                else:
                    # Negative sample: redraw until the id is not in the window.
                    o_wid = negsamp_array[int(math.floor(random() * negsamp_array_size))]
                    while o_wid in context:
                        o_wid = negsamp_array[int(math.floor(random() * negsamp_array_size))]
                    o_golden = 0
                o_pred = 1 / (1 + np.exp(- np.dot(W_emb[i_wid, :], W_o[:, o_wid])))
                o_err = o_pred - o_golden
                # Use W_o before it is updated so the gradient is consistent.
                h_err += o_err * W_o[:, o_wid]
                W_o[:, o_wid] -= eta * o_err * W_emb[i_wid]
                avg_err += abs(o_err)
            W_emb[i_wid, :] -= eta * h_err
            w_count += 1
            if w_count % toprint == 0:
                print("count: %s, eta %s" % (w_count, eta))
                print("avg_err: %s" % (avg_err / float(toprint * (s_window + 1))))
                avg_err = 0.
    # Snapshot the input embeddings after every pass.
    model_name = "w2v_model_%s.json" % (it)
    print("save model: %s" % (model_name))
    with open(model_name, "w") as fm:
        fm.write(json.dumps(W_emb.tolist(), indent=4))

iter 0
count: 10000, eta 0.174740603957
avg_err: 0.498393238625
count: 20000, eta 0.162511416158
avg_err: 0.465046838269
count: 30000, eta 0.156119904854
avg_err: 0.342064729645
count: 40000, eta 0.151881626193
avg_err: 0.360685890166
count: 50000, eta 0.148749345428
avg_err: 0.378291523453
count: 60000, eta 0.146284390945
avg_err: 0.350175375407
count: 70000, eta 0.144263150824
avg_err: 0.343263973645
count: 80000, eta 0.142556883985
avg_err: 0.325994073362
count: 90000, eta 0.141085003853
avg_err: 0.314083784991
count: 100000, eta 0.139793879444
avg_err: 0.305908434273
count: 110000, eta 0.138646104479
avg_err: 0.300968859986
count: 120000, eta 0.137614602125
avg_err: 0.300016839212
count: 130000, eta 0.136679172915
avg_err: 0.299559380643
count: 140000, eta 0.13582436701
avg_err: 0.293834065939
count: 150000, eta 0.135038116289
avg_err: 0.296055602717
count: 160000, eta 0.134310823367
avg_err: 0.294560170406
count: 170000, eta 0.133634736416
avg_err: 0.289266871915
count: 180000, et

In [63]:
# Load a saved embedding matrix (the training loop writes one
# "w2v_model_<it>.json" file per pass).  Alternative file: "w2v_result_1.json".
# The ``with`` block closes the file even if parsing fails.
with open("w2v_model_2.json", "r") as f2:
    w2v_model = np.array(json.load(f2))

# Inverse of word_dict: integer id -> token, used to print readable results.
word_dict_reverse = OrderedDict((word_id, token) for token, word_id in word_dict.items())

def get_top(word, topn=10):
    """Print the tokens whose embeddings score highest against ``word``.

    The score is the dot product between each row of ``w2v_model`` and the
    row belonging to ``word``.  The query word is printed first, followed by
    one "<token> <score>" line per result, highest score first.

    Args:
        word: token to look up in ``word_dict``.
        topn: number of results to print (default 10).

    Raises:
        ValueError: if ``word`` is not present in ``word_dict``.
    """
    word_id = word_dict.get(word)
    if word_id is None:
        # Indexing the matrix with None would add an axis and fail later
        # with an unrelated shape error, so fail early and clearly.
        raise ValueError("word not in vocabulary: %r" % (word,))
    scores = np.dot(w2v_model, np.expand_dims(w2v_model[word_id], axis=1))[:, 0]
    # A stable sort of the negated scores gives descending order with ties
    # kept in id order, without building a Python list of the whole vocabulary.
    ranking = np.argsort(-scores, kind="mergesort")[:topn]
    print(word)
    for idx in ranking:
        print("%s %s" % (word_dict_reverse.get(int(idx)), scores[idx]))



In [80]:
# Show the highest-scoring neighbours for a handful of probe characters.
for probe in (u"山", u"峰", u"河", u"日", u"母"):
    get_top(probe)

山
山 6.89817657156
仞 5.48508210203
邙 5.39748526383
嶂 5.37567603867
峴 5.15190922531
巔 5.02733352642
巒 5.01471489602
岫 4.91171794409
瀑 4.76158334431
隈 4.72631456419
峰
峰 5.64263792097
仞 5.11608931768
嶂 5.05273391601
巔 5.04045708199
梧 4.84342319956
聳 4.74719410906
萊 4.67863771074
岫 4.57765406941
嵯 4.52987783543
巒 4.48762491327
河
河 8.3437007543
浸 5.99621961846
浙 5.80059411055
漳 5.74756745907
塞 5.52111699118
脈 5.41092866703
堤 5.4010228908
涇 5.34611187256
渭 5.34433910114
滔 5.29767971582
日
日 7.53227209683
昨 5.64449264962
今 5.26162105001
曛 5.2149646021
柘 5.09228546713
曈 4.9494310479
終 4.81157483618
暮 4.61797751569
夕 4.39978962394
噪 4.33253007418
母
母 7.70722898138
阿 5.77156595399
父 5.11348880257
儂 5.1105969784
王 5.01025728547
氏 4.66142323047
慈 4.58391209858
妾 4.34954229719
萊 4.30786085723
荊 4.25904308548


In [86]:
def get_calculated_top(w1, w2, w3, topn=10):
    """Print the tokens scoring highest against ``vec(w1) + vec(w2) - vec(w3)``.

    The score is the dot product between each row of ``w2v_model`` and the
    combined query vector.  A header line "<w1> + <w2> - <w3>" is printed
    first, followed by one "<token> <score>" line per result, highest first.

    Args:
        w1: token whose vector is added.
        w2: token whose vector is added.
        w3: token whose vector is subtracted.
        topn: number of results to print (default 10).

    Raises:
        ValueError: if any of the three tokens is not present in ``word_dict``.
    """
    vectors = []
    for token in (w1, w2, w3):
        token_id = word_dict.get(token)
        if token_id is None:
            # Indexing the matrix with None would add an axis and fail later
            # with an unrelated shape error, so fail early and clearly.
            raise ValueError("word not in vocabulary: %r" % (token,))
        vectors.append(w2v_model[token_id])
    v1, v2, v3 = vectors
    scores = np.dot(w2v_model, np.expand_dims(v1 + (v2 - v3), axis=1))[:, 0]
    # A stable sort of the negated scores gives descending order with ties
    # kept in id order, without building a Python list of the whole vocabulary.
    ranking = np.argsort(-scores, kind="mergesort")[:topn]
    print("%s + %s - %s" % (w1, w2, w3))
    for idx in ranking:
        print("%s %s" % (word_dict_reverse.get(int(idx)), scores[idx]))


In [87]:
# Vector-arithmetic probe: print the ten tokens whose embeddings have the
# largest dot product with vec(女) + vec(父) - vec(男).
get_calculated_top(u"女",u"父",u"男")

女 + 父 - 男
父 6.89753825887
陶 6.02543823868
公 5.79068975061
邀 5.54295744312
訪 5.34483994806
馮 5.32571235076
崔 5.2276940566
漁 5.22450417756
女 5.14851257927
稚 5.13421773684
