# Applications of word2vec: word similarity and clustering

In [1]:
from gensim.models import word2vec
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd
import os

## Move the working directory up one level so that the relative paths used
## below (e.g. 'word2vec_model/CBOW') are resolved from the parent directory.
## NOTE(review): the path is relative, so re-running this cell moves up one
## more level each time; restart the kernel before running it again.
os.chdir("../")
os.getcwd()

'/project/ta-hsi/hsinchu_courses/NLP'

In [2]:
## Load the trained word2vec model from disk
## (produced by part1 03_word2vec_build.ipynb).
model = word2vec.Word2Vec.load('word2vec_model/CBOW')

## similarity

In [3]:
## List the words most similar to the given word, as (word, similarity) pairs.
model.wv.most_similar('KMT')

[('DPP', 0.6578985452651978),
 ('kmt', 0.6458199620246887),
 ('dpp', 0.6360074877738953),
 ('國民黨', 0.6352492570877075),
 ('執政', 0.5434491634368896),
 ('阿扁', 0.5327768325805664),
 ('民進黨', 0.52995765209198),
 ('時力', 0.5226317048072815),
 ('白色恐怖', 0.5202020406723022),
 ('李登輝', 0.5197052955627441)]

In [4]:
## Analogy query: list the words most similar to the `positive` words taken
## together while being dissimilar to the `negative` word
## (roughly "DPP is to 綠吱 as KMT is to ?").
model.wv.most_similar(positive=['KMT', '綠吱'], negative=['DPP'])

[('周處', 0.43969738483428955),
 ('八旗', 0.40904492139816284),
 ('包壺', 0.40349867939949036),
 ('紂王', 0.4023209512233734),
 ('鼠輩', 0.3923807740211487),
 ('賣國賊', 0.3919026255607605),
 ('明末', 0.3907436430454254),
 ('三姓', 0.38343679904937744),
 ('先祖', 0.37808701395988464),
 ('異端', 0.3770984411239624)]

## clustering

In [5]:
## Build a lookup table: each vocabulary word -> its `count` attribute.
## NOTE(review): `model.wv.vocab` is the gensim 3.x API — confirm the
## installed gensim version before re-running.
words = dict(
    (token, entry.count) for token, entry in model.wv.vocab.items()
)

In [6]:
## Rank the (word, count) pairs by count in descending order, keep the first
## 10000 pairs, and retain only the word column as a NumPy string array.
words = np.array(
    sorted(words.items(), key=lambda pair: pair[1], reverse=True)[:10000]
)[:, 0]
words

array(['人', '八卦', '有沒有', ..., '唯有', '考到', '派遣'], dtype='<U20')

In [7]:
## Look up the embedding vector of every selected word
## (indexing `model.wv` with the array of words).
vecs = model.wv[words]

In [8]:
## Run the clustering algorithm: partition the word vectors into 50 clusters.
## KMeans starts from randomly chosen centroids, so the seed is fixed here;
## without it every re-run assigns different cluster ids and the tables
## below change.
kmeans = KMeans(n_clusters=50, random_state=42)
## `cluster` holds one cluster id per row of `vecs`.
cluster = kmeans.fit_predict(vecs)

In [9]:
## Put each word next to the cluster id it was assigned.
## Building the frame from a dict of columns (instead of a list of rows that
## is then transposed) keeps the cluster ids as an integer column rather than
## turning both columns into generic Python objects.
df = pd.DataFrame({'words': words, 'no. cluster': cluster})
df.head(n=5)

Unnamed: 0,words,no. cluster
0,人,23
1,八卦,23
2,有沒有,23
3,說,23
4,好,19


In [10]:
## Show the words of every cluster side by side: one column per cluster id,
## each column listing that cluster's words in the row order of `df`.
## `Series.rename(cluster_id)` sets the Series name, which becomes the column
## label after concatenation. (`rename(columns=...)` is a DataFrame-only
## argument and is rejected for a Series by current pandas.)
data = pd.concat(
    [
        group['words'].reset_index(drop=True).rename(cluster_id)
        for cluster_id, group in df.groupby('no. cluster')
    ],
    axis=1,
)
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,字,最近,突然,相關,一個,肥宅,一堆,my,喜歡,錢,...,卻,冷氣,買,神,發文,當兵,板規,政治,穿,抓
1,討論,題,走,禁止,看到,from,幹,iPhone,可愛,萬,...,世界,停電,手機,seafood,版,單位,之內,支持,衣服,殺
2,寫,前,站,需要,中,on,根本,ASUS,正妹,元,...,社會,發電,賣,宗教,兩篇,役,兩則,事件,一件,攻擊
3,英文,剛剛,裡,無法,發現,問卦,死,Asus,長,賺,...,希望,反核,電腦,信,板,訓練,仔細閱讀,媒體,內褲,派
4,意思,小弟,旁邊,一種,小,JPTT,搞,HTC,臉,花,...,生活,台電,便宜,師父,被刪,國軍,の,行為,戴,地震
5,內容,安安,小心,認為,算,QQ,笑,Sony,正,一年,...,機會,環保,貴,信徒,標題,退,い,覺青,t,救
6,名字,時,外面,使用,只,喔,罵,Samsung,年輕,超過,...,人生,限電,一台,師傅,文,替代,な,自由,穿著,阻止
7,教,以前,後面,重要,下,推,垃圾,the,好看,一個月,...,邊緣,核能,技術,信仰,文章,長官,ん,歧視,褲子,槍
8,研究,之前,聲音,能力,開,欸,整天,of,妹,房子,...,未來,核電,價格,妙禪,PTT,軍,る,風向,制服,斬
9,中文,本魯,坐,所有,一些,老婆,不行,I,帥哥,億,...,願意,缺電,設計,上帝,ptt,軍人,っ,團體,黑色,死亡
