In [1]:
import json
import numpy as np
import pickle

In [3]:
from gensim.models.word2vec import Word2Vec

In [4]:
# load word2vec model
wv_model = Word2Vec.load("word2vec_w5_d300.model")

In [9]:
tests = ['電影', '導演', '好看', '冷場', '精彩']
for t in tests:
    print(wv_model.wv.most_similar(t))

[('影片', 0.72809237241745), ('本片', 0.6814365386962891), ('恐怖片', 0.6719830632209778), ('該片', 0.668270468711853), ('恐怖電影', 0.6650274991989136), ('歌舞片', 0.6590173840522766), ('片子', 0.6352458000183105), ('此片', 0.6315019130706787), ('經典電影', 0.626384437084198), ('喜劇片', 0.6222329139709473)]
[('執導', 0.7640059590339661), ('編劇', 0.7586658000946045), ('製片人', 0.695167064666748), ('編導', 0.6375844478607178), ('製片', 0.6323392391204834), ('名導', 0.5960099697113037), ('監製', 0.5837352275848389), ('製作人', 0.5760589838027954), ('導演獎', 0.5740428566932678), ('蔡明亮', 0.5559418201446533)]
[('好笑', 0.6956039071083069), ('難看', 0.6751818060874939), ('好聽', 0.6735363602638245), ('不好意思', 0.6708608865737915), ('很棒', 0.6529251933097839), ('好吃', 0.6468552350997925), ('嚇人', 0.6464645862579346), ('很帥', 0.6401540040969849), ('發笑', 0.6363775730133057), ('我喜歡', 0.6327226758003235)]
[('發笑', 0.6970230937004089), ('好笑', 0.6831991672515869), ('笑點', 0.6590982675552368), ('無釐頭', 0.6452662348747253), ('入戲', 0.6396948099136353), ('過癮',

In [5]:
# load movie review sentences (tokenized)
with open('data/mid_to_reviews_in_sents_token.json', 'r') as fin:
    mid_to_reviews = json.load(fin)

In [24]:
# mid_to_reviews['4334']
# 4334', '5722', '3722', '5596', '2908', '3058', '2851', '3616', '2703', '3291', '6206', '2585', '6433', '2687', '3796', '3520', '3278', '3658', '4592', '3125', '3998', '5202', '6773', '4926', '3719', '2893', '6634', '6356', '3753', '5934', '4067', '4749', '5062', '4382', '6143', '3764', '6091', '3497', '6822', '4376', '3624', '2966', '6248', '4838', '3365', '4920', '2846', '6271', '4833', '3793', '2569', '3772', '3725', '3653', '4258', '3957', '4971', '4150', '4687', '4553', '2881', '5458', '4053', '5267', '2764', '2369', '3526', '5852', '5067', '2973', '3306', '6064', '5904', '3462', '4634', '6162', '6353', '3634', '4029'

In [11]:
all_sentences = []
for review_lst in mid_to_reviews.values():
    for review in review_lst:
        for sent in review:
            all_sentences.append(sent.split())

In [14]:
# continute training with review sentences data
wv_model.train(all_sentences, total_examples=len(all_sentences), epochs=wv_model.epochs )

  


(16537100, 21970355)

In [16]:
wv_model.save('word2vec_w5_d300_movies.model')

In [15]:
tests = ['電影', '導演', '好看', '冷場', '精彩']
for t in tests:
    print(wv_model.wv.most_similar(t))

[('片子', 0.7098431587219238), ('影片', 0.6068821549415588), ('娛樂片', 0.5257107615470886), ('片會', 0.5106683969497681), ('恐怖片', 0.5097385048866272), ('國片', 0.5052480697631836), ('好片', 0.49426159262657166), ('恐怖電影', 0.4899733066558838), ('武打片', 0.4879536032676697), ('商業片', 0.4859732985496521)]
[('編劇', 0.6366939544677734), ('麥可貝', 0.5947068929672241), ('執導', 0.5490449666976929), ('李安', 0.5106890797615051), ('導的', 0.508110761642456), ('劇本', 0.5044748783111572), ('導戲', 0.5009427070617676), ('製片人', 0.4949568510055542), ('編導', 0.4848693907260895), ('溫子仁', 0.47281962633132935)]
[('難看', 0.5881759524345398), ('好太多', 0.5857882499694824), ('好好看', 0.5855975151062012), ('值得一看', 0.5660040378570557), ('有感覺', 0.5510066747665405), ('爛的', 0.5458192825317383), ('精采', 0.5360756516456604), ('神作', 0.5235428214073181), ('沒看', 0.5116943120956421), ('說真的', 0.5116703510284424)]
[('高潮迭起', 0.5892795324325562), ('懸念', 0.538292646408081), ('爆點', 0.5344079732894897), ('目不轉睛', 0.5216884613037109), ('精采', 0.5146604776382446

In [31]:
def sentence2vec(sentence, wv_model):
    if isinstance(sentence, str):
        sentence = sentence.split()
        
    count = 0
    vec_sum = np.zeros(wv_model.wv.vector_size)
    for term in sentence:
        try:
            vec = wv_model.wv[term]
            vec_sum += vec
            count += 1
        except KeyError as e:
            continue
            
    if count > 0:
        sentence_vec = vec_sum/count
    else:
        sentence_vec = np.zeros(wv_model.wv.vector_size) #如果都不在字典裡 先給0向量..
    
    return sentence_vec
    

In [44]:
sent = '電影 好看 !'
print(sentence2vec(sent, wv_model)[:10])
print(wv_model.wv['電影'][:10])
print(wv_model.wv['好看'][:10])


[-1.50207072 -0.55481391  0.25416207  1.38143179  0.94366115 -1.43416676
 -0.13643944  0.09388094  0.14647318  1.39382511]
[-2.1676483  -1.1425434   0.23905104  1.8591454   0.9915221  -2.2312312
  0.19820139 -0.07185328  0.20347176  2.0366611 ]
[-0.83649313  0.03291561  0.2692731   0.9037182   0.89580023 -0.6371023
 -0.47108027  0.25961515  0.0894746   0.7509891 ]


In [46]:
mid_to_review_in_sents_vector = {}
for mid, review_lst in mid_to_reviews.items():
    mid_to_review_in_sents_vector[mid] = []
    for review in review_lst:
        sent_vectors = []
        for sent in review:
            vec = sentence2vec(sent, wv_model)
            sent_vectors.append(vec)
        mid_to_review_in_sents_vector[mid].append(sent_vectors)

In [55]:
print(len(mid_to_review_in_sents_vector['4334']))
print(len(mid_to_reviews['4334']))
print(mid_to_review_in_sents_vector['4334'][2])
print(mid_to_reviews['4334'][2])

367
367
[array([-1.56203955e+00,  2.74699003e-01,  2.68455751e-01,  3.14681631e-01,
        1.05318385e+00, -8.97030346e-01, -4.06093299e-02,  5.52427411e-01,
       -1.86006725e-02,  2.59720519e-01, -5.84490046e-01,  1.08964387e-02,
       -1.15836960e+00,  3.86526063e-01, -4.64236438e-02,  2.86321029e-01,
       -4.12603408e-01, -2.31736585e-01, -9.35240924e-01, -2.13365529e-01,
       -1.05786321e+00, -4.85775784e-01, -4.19464111e-02,  2.27475762e-02,
        2.78569609e-02,  7.00375248e-01, -6.31806180e-01, -3.33746672e-02,
       -7.30010718e-01, -1.87807806e-01,  3.63525271e-01,  2.15715587e-01,
        1.03701636e+00,  4.23004776e-01, -6.27119958e-01,  3.19458496e-01,
        4.27933997e-01, -2.98955485e-01, -3.90526578e-01,  2.21329074e-01,
       -1.45003557e+00,  7.35550426e-01, -6.71250224e-02,  4.19246353e-01,
       -7.18680829e-01,  2.79566154e-01,  4.91223708e-02, -4.31338504e-01,
       -5.23147088e-02, -4.89702195e-01,  4.92193952e-01, -6.27267897e-01,
        1.145159

In [57]:
with open('data/mid_to_reviews_in_sents_vector.bin', 'wb') as fout:
    pickle.dump(mid_to_review_in_sents_vector, fout)