In [1]:
import os
import jieba
# Point jieba at the dictionary file under jieba_data/. The path is relative,
# so the notebook has to be run from the directory that contains jieba_data/.
# NOTE(review): no later cell visible here calls jieba; presumably the
# segmentation that produced segDone.txt happened elsewhere -- confirm.
jieba.set_dictionary('jieba_data/dict.txt.big')


In [2]:
from gensim.models import word2vec

In [3]:
# Wrap the corpus file in a LineSentence object that is handed to Word2Vec below.
# NOTE(review): presumably segDone.txt holds one pre-segmented,
# whitespace-separated sentence per line -- confirm against the step that wrote it.
sentences = word2vec.LineSentence("segDone.txt")

In [4]:
import time

# Train the embedding model and report how long training took.
#   size=300   -> dimensionality of each word vector
#   iter=100   -> number of passes over the corpus
#   sg=0       -> CBOW training (sg=1 would select skip-gram)
#   window=15  -> context window size
#   workers=3  -> worker threads used for training
# NOTE: `size` and `iter` are the gensim 3.x keyword names; gensim 4 renamed
# them to `vector_size` and `epochs`, so this cell needs a 3.x install as written.
#
# perf_counter() is a monotonic clock, so the reported duration cannot be
# distorted by wall-clock adjustments that happen while training runs.
start_time = time.perf_counter()
model = word2vec.Word2Vec(sentences, size=300, iter=100, sg=0, window=15, workers=3)
print("--- spend %s seconds ---" % (time.perf_counter() - start_time))
print(model)

--- spend 604.0763547420502 seconds ---
Word2Vec(vocab=44371, size=300, alpha=0.025)


In [5]:
# Look the embedding up through the KeyedVectors object (model.wv), as the
# other cells in this notebook do. Indexing the model object itself is
# deprecated in gensim 3.x and no longer supported in gensim 4.
model.wv['刺激']

  """Entry point for launching an IPython kernel.


array([ 1.5832433 ,  0.8411537 ,  1.257613  ,  1.5754268 , -2.6639652 ,
       -2.4317436 , -0.10831738,  0.79727536, -3.336154  , -0.10593793,
       -0.65166944, -0.71367204,  1.3416454 ,  2.1186647 , -2.8434203 ,
       -0.87519515,  1.3488885 , -2.6940336 , -0.8632555 ,  2.1478722 ,
        0.21606117, -1.0137422 , -2.411381  , -0.64776915,  3.49      ,
        1.3262768 ,  0.9221236 , -0.04997907, -0.03895083, -0.10094943,
       -0.25060692, -0.35674128, -1.5411103 , -2.312758  , -0.52727556,
       -1.9426056 ,  0.6505088 ,  0.28770596,  0.85416734,  0.78241146,
       -0.95047826,  1.3027853 ,  2.1053474 , -0.9640512 , -0.10532653,
        1.0535754 ,  1.7231354 ,  0.62891674, -0.36159062,  0.85371625,
        2.0247977 , -1.0596825 , -1.8171668 , -2.4876697 ,  1.0215887 ,
       -2.178407  ,  1.5153672 ,  0.7010719 ,  1.2207719 ,  0.87790626,
       -0.8777417 ,  1.265752  , -0.35122204, -1.321759  ,  0.9077327 ,
        2.6806881 ,  2.6606054 ,  0.4225234 ,  2.0995789 ,  0.79

In [6]:
from sklearn.decomposition import IncrementalPCA    # initial reduction
from sklearn.manifold import TSNE                   # final reduction
import numpy as np                                  # array handling

In [7]:
def reduce_dimensions(model):
    """Project every vocabulary word vector of ``model`` onto two dimensions.

    Each word in ``model.wv.vocab`` is looked up in ``model.wv`` and the
    resulting matrix is embedded with t-SNE (``random_state=0``).

    Args:
        model: trained model exposing ``wv.vocab`` (iterable of words) and
            ``wv[word]`` (vector lookup).

    Returns:
        A tuple ``(x_vals, y_vals, labels)``: two Python lists with the first
        and second t-SNE coordinate of every word, and a numpy array of the
        words themselves in the same order.
    """
    target_dims = 2  # dimensionality of the t-SNE output

    # Fix the iteration order once so vectors and labels stay aligned.
    vocabulary = list(model.wv.vocab)
    embedding_matrix = np.asarray([model.wv[token] for token in vocabulary])
    labels = np.asarray(vocabulary)

    # Non-linear reduction of the full embedding matrix down to target_dims.
    projector = TSNE(n_components=target_dims, random_state=0)
    projected = projector.fit_transform(embedding_matrix)

    x_vals = [point[0] for point in projected]
    y_vals = [point[1] for point in projected]
    return x_vals, y_vals, labels


# Run the t-SNE projection over the whole vocabulary of the trained model and
# keep the 2-D coordinates and their word labels for the plotting cell below.
x_vals, y_vals, labels = reduce_dimensions(model)

In [8]:
def plot_with_plotly(x_vals, y_vals, labels, plot_in_notebook=True):
    """Draw the points with plotly as a text-mode scatter (labels as markers).

    Args:
        x_vals: x coordinate of each point.
        y_vals: y coordinate of each point, parallel to ``x_vals``.
        labels: text shown at each point, parallel to ``x_vals``.
        plot_in_notebook: when true, initialise plotly's notebook mode and
            render inline with ``iplot``; otherwise call ``plot`` with the
            file name ``word-embedding-plot.html``.
    """
    # plotly is imported lazily so it is only required when this backend is used.
    from plotly.offline import init_notebook_mode, iplot, plot
    import plotly.graph_objs as go

    traces = [go.Scatter(x=x_vals, y=y_vals, mode='text', text=labels)]

    if not plot_in_notebook:
        plot(traces, filename='word-embedding-plot.html')
        return

    init_notebook_mode(connected=True)
    iplot(traces, filename='word-embedding-plot')


def plot_with_matplotlib(x_vals, y_vals, labels, num_labels=25):
    """Scatter-plot the points and annotate a random subset with their labels.

    Args:
        x_vals: x coordinate of each point.
        y_vals: y coordinate of each point, parallel to ``x_vals``.
        labels: label of each point, parallel to ``x_vals``.
        num_labels: maximum number of points to annotate (default 25). When
            fewer points are available, all of them are annotated.

    Returns:
        None. A new figure (``figsize=(12, 12)``) is created and drawn on;
        this function does not call ``plt.show()`` itself.
    """
    # matplotlib is imported lazily so it is only required when this backend is used.
    import matplotlib.pyplot as plt
    import random

    plt.figure(figsize=(12, 12))
    plt.scatter(x_vals, y_vals)

    # A private generator seeded with 0 picks the same points as seeding the
    # module-level generator would, but leaves the caller's global random
    # state untouched.
    rng = random.Random(0)

    # Clamp the sample size: random.sample raises ValueError when asked for
    # more items than the population holds (e.g. a tiny or empty vocabulary).
    sample_size = min(num_labels, len(labels))
    for i in rng.sample(range(len(labels)), sample_size):
        plt.annotate(labels[i], (x_vals[i], y_vals[i]))

# Choose the plotting backend. get_ipython is not imported anywhere in the
# visible cells; presumably it only exists as a builtin when the code runs
# under IPython/Jupyter, so the call doubles as an environment probe.
try:
    get_ipython()
except Exception:
    # The probe failed: fall back to the static matplotlib plot.
    plot_function = plot_with_matplotlib
else:
    # The probe succeeded: use the plotly plot.
    plot_function = plot_with_plotly

# Draw the 2-D projection computed by reduce_dimensions with the chosen backend.
plot_function(x_vals, y_vals, labels)

In [9]:
# Similarity score the trained vectors give to the word pair '刺激' / '動作'.
model.wv.similarity('刺激','動作')

0.35098907

In [10]:
# Same query for the pair '刺激' / '緊張', for comparison with the previous cell.
model.wv.similarity('刺激','緊張')

0.3002588

In [11]:
# Words the model ranks closest to '刺激'; the recorded output lists ten
# (word, score) pairs in descending score order.
model.wv.similar_by_word('刺激')

[('緊湊', 0.4762323498725891),
 ('精采', 0.42637738585472107),
 ('精彩', 0.42366698384284973),
 ('打鬥', 0.4167701005935669),
 ('驚險刺激', 0.38420557975769043),
 ('槍戰', 0.3737303912639618),
 ('懸疑', 0.37154996395111084),
 ('場面', 0.3622686266899109),
 ('驚悚', 0.35343724489212036),
 ('動作', 0.35098907351493835)]

In [12]:
# Words the model ranks closest to '劇情'; the recorded output lists ten
# (word, score) pairs in descending score order.
model.wv.similar_by_word('劇情')

[('內容', 0.6205664873123169),
 ('情節', 0.5480560064315796),
 ('故事', 0.5061472654342651),
 ('故事情節', 0.46323686838150024),
 ('部分', 0.4580608904361725),
 ('整部', 0.44470202922821045),
 ('劇本', 0.41570115089416504),
 ('據情', 0.41197270154953003),
 ('不夠', 0.40096765756607056),
 ('鋪陳', 0.3961719274520874)]

In [13]:
# Words the model ranks closest to '好看'; the recorded output lists ten
# (word, score) pairs in descending score order.
model.wv.similar_by_word('好看')

[('難看', 0.5933089256286621),
 ('真的', 0.5526486039161682),
 ('還好', 0.4982813000679016),
 ('好笑', 0.4918505847454071),
 ('覺得', 0.47926071286201477),
 ('看過', 0.4752006530761719),
 ('無聊', 0.4715016484260559),
 ('好好看', 0.4190765619277954),
 ('不錯', 0.4148150384426117),
 ('精采', 0.3798362612724304)]