# the application of word2vec

In [1]:
from gensim.models import word2vec
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd
import os

## turn back to main directory
os.chdir("../")
os.getcwd()

'/home/jovyan/Davis_Practice/NLP'

In [2]:
## load word2vec model (由 part1 03_word2vec_build.ipynb 產生)
model = word2vec.Word2Vec.load('part2/word2vec_model/1999_CBOW')

## similarity

In [3]:
## get most similarity with given words
model.wv.most_similar('系統')

[('項目', 0.8080171346664429),
 ('點選', 0.7968190908432007),
 ('首頁', 0.7850562334060669),
 ('畫面', 0.7785091400146484),
 ('連結', 0.7778873443603516),
 ('介面', 0.7551491260528564),
 ('按鈕', 0.7540853023529053),
 ('停留', 0.7499284744262695),
 ('頁面', 0.7397742867469788),
 ('卻', 0.7371840476989746)]

In [13]:
## get most similarity with given words's relationship
model.wv.most_similar(positive=['MRO'], negative=['系統'])

[('ITEM', 0.5975996255874634),
 ('P', 0.5881931185722351),
 ('cancel', 0.5705850124359131),
 ('GR', 0.54735267162323),
 ('PO', 0.5463650226593018),
 ('N', 0.5440875291824341),
 ('金額', 0.515935480594635),
 ('数量', 0.5047597885131836),
 ('提供', 0.5003150105476379),
 ('以下', 0.49566370248794556)]

In [16]:
model.wv.doesnt_match("SAP無法用")

'無'

## clustering

In [6]:
## create a dictionary: words as key ; count as values
words = {word: vocab.count for word, vocab in model.wv.vocab.items()}

In [7]:
## sort and select the top 10000 count of words
words = sorted(words.items(), key=lambda x: x[1], reverse=True)
words = words[:10000]
words = np.array(words)[:, 0]
words

array(['無法', 'SAP', '登入', ..., '將其', '本機', 'Critix'], dtype='<U14')

In [8]:
## extract the word vectors 
vecs = model.wv[words]

In [9]:
## run clustering algorithm
kmeans = KMeans(n_clusters=50)
cluster = kmeans.fit_predict(vecs)

In [10]:
## print the result
df = pd.DataFrame([words.tolist(), cluster.tolist()], index=['words', 'no. cluster']).T
df.head(n=5)

Unnamed: 0,words,no. cluster
0,無法,37
1,SAP,37
2,登入,6
3,協助,36
4,請,28


In [11]:
## print every cluster of words
data = pd.concat([d['words'].reset_index(drop=True).rename(columns={0: k}) for k, d in df.groupby('no. cluster')], axis=1)
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,檔案,No,中,in,key,煩請,登入,EC,申請,license,...,未,T,BOM,新增,新,做,can,開啟,詢問,列印
1,上傳,CS12,內部,not,down,人員,進入,Dear,請問,Citrix,...,找,,說,更新,重新,執行,,E,,設定
2,,F,應用,issue,excel,同仁,密碼,Sharry,部門,提示,...,看到,,已經,文件,今天,跑,,Pro,,印表機
3,,行,myquanta,is,抓,,連線,Shirley,目前,Logon,...,採購,,看不到,相同,需,SO,,License,,
4,,Hi,icon,system,Client,,電腦,,現在,不見,...,自動,,PDM,客人,完,開,,e,,
5,,S,Quanta,create,bom,,安裝,,人,KEY,...,廠商,,轉到,加入,成功,動作,,PRO,,
6,,CS20,Myquanta,fail,item,,輸入,,同事,System,...,發現,,同步,原本,工作,價格,,pro,,
7,,Item,首頁,file,檔,,CAMP,,大陸,入,...,抽單,,,以前,軟體,delivery,,,,
8,,CS50,在家,of,QH10,,工號,,手機,登入後,...,產生,,,好,直接,invoice,,,,
9,,CS40,Icon,on,裡面,,公司,,,亂碼,...,內容,,,確定,mail,cost,,,,
