# 단어 임베딩 튜토리얼

In [1]:
import warnings
# Silence every warning for the rest of the session so the tutorial
# output stays readable.
warnings.filterwarnings('ignore')

In [2]:
import os
import sys

# Put the project root and its models/ directory at the front of the import
# path. Each insert goes to index 0, so the last directory listed ends up first.
for search_dir in ('/notebooks/embedding', '/notebooks/embedding/models'):
    sys.path.insert(0, os.path.abspath(search_dir))

# Work from the project root so relative paths used later in the notebook resolve.
os.chdir('/notebooks/embedding')

In [3]:
# Optional download of kor_ws353.csv into data/raw. Left commented out,
# presumably because the file only needs to be fetched once; uncomment to run.
# ! wget https://github.com/dongjun-Lee/kor2vec/raw/master/test_dataset/kor_ws353.csv -P /notebooks/embedding/data/raw

In [4]:
from models.word_eval import WordEmbeddingEvaluator

## 코드 4-5 Word2Vec 코사인 유사도 상위 단어 목록 체크

In [23]:
# Settings handed to WordEmbeddingEvaluator for the Word2Vec vectors.
vecs_txt_fname = '/notebooks/embedding/data/word-embeddings/word2vec/word2vec'
method = 'word2vec'
dim = 100  # embedding dimensionality given to the evaluator
tokenizer_name = 'mecab'

In [24]:
from models.word_eval import WordEmbeddingEvaluator

In [25]:
# Build the evaluator from the Word2Vec settings defined above; `model` is
# the object queried by the cells that follow.
model = WordEmbeddingEvaluator(
    vecs_txt_fname=vecs_txt_fname,
    method=method,
    dim=dim,
    tokenizer_name=tokenizer_name,
)

In [6]:
# Five nearest neighbours of '희망' (cosine similarity, per the section
# heading) in whichever evaluator `model` currently refers to.
model.most_similar('희망', topn=5)

[('소망', 0.79329586),
 ('행복', 0.7861444),
 ('희망찬', 0.76918393),
 ('꿈', 0.76410115),
 ('열망', 0.7336163)]

In [27]:
# Check whether the misspelled word '서울특벌시' (presumably a deliberate typo
# of '서울특별시') is in the vocabulary. NOTE: the leading underscore marks
# this as a non-public helper of the evaluator.
model._is_in_vocabulary('서울특벌시')

False

In [26]:
# Neighbour query for the same misspelled word. The recorded output lists
# neighbours even though the vocabulary check returned False; how the
# evaluator derives a vector for such a word is not visible in this notebook.
model.most_similar('서울특벌시', topn=5)

[('특별시', 0.7592423437826079),
 ('동대문', 0.7466565542196317),
 ('관악', 0.7372439826844083),
 ('영등포', 0.7357625674365258),
 ('서대문', 0.7306555874308116)]

## 코드 4-9 FastText Skip-gram 모델의 코사인 유사도 상위 단어 목록 체크

In [8]:
# Settings handed to WordEmbeddingEvaluator for the FastText vectors.
# Both the text (.vec) and binary (.bin) model files are supplied.
vecs_txt_fname = '/notebooks/embedding/data/word-embeddings/fasttext/fasttext.vec'
vecs_bin_fname = '/notebooks/embedding/data/word-embeddings/fasttext/fasttext.bin'
method = 'fasttext'
dim = 100  # embedding dimensionality given to the evaluator
tokenizer_name = 'mecab'

In [9]:
from models.word_eval import WordEmbeddingEvaluator

In [10]:
# Rebind `model` to an evaluator built from the FastText settings above.
model = WordEmbeddingEvaluator(
    vecs_txt_fname=vecs_txt_fname,
    vecs_bin_fname=vecs_bin_fname,
    method=method,
    dim=dim,
    tokenizer_name=tokenizer_name,
)




In [11]:
# Five nearest neighbours of '희망' in the FastText evaluator, for comparison
# with the Word2Vec result earlier in the notebook.
model.most_similar('희망', topn=5)

[('행복', 0.779841546336235),
 ('희망찬', 0.7223989696190132),
 ('소망', 0.7158186282535396),
 ('땀방울', 0.6873366793848128),
 ('희망특강', 0.6866479743108849)]

## 그림 4-12 '하였다'와 가장 유사한 FastText 단어 목록

In [12]:
# Five nearest neighbours of the inflected form '하였다'; the recorded output
# is dominated by morphemes and endings that share characters with the query.
model.most_similar('하였다', topn=5)

[('하', 0.9295729665862918),
 ('다', 0.907324941314357),
 ('했', 0.8929994169029608),
 ('였으며', 0.8632510577839813),
 ('했으며', 0.8549427906656639)]

## 그림 4-13 미등록 단어에 대한 FastText 임베딩 체크

In [13]:
# Confirm that the misspelled word '서울특벌시' is not in the FastText
# vocabulary (recorded result: False) before requesting its vector.
model._is_in_vocabulary('서울특벌시')

False

In [14]:
# Show the first 10 components of the vector returned for the
# out-of-vocabulary word; the recorded output is a float32 array.
model.get_word_vector('서울특벌시')[:10]

array([-0.34943753,  0.35756463, -0.13093197, -0.2630131 , -0.46309552,
       -0.06662391, -0.01265321, -0.11453624,  0.20499213,  0.25899005],
      dtype=float32)

In [15]:
# Five nearest neighbours of the out-of-vocabulary word in the FastText
# evaluator; the recorded neighbours mostly begin with '서울'.
model.most_similar('서울특벌시', topn=5)

[('서울색', 0.7196167662285975),
 ('서울한강체', 0.661677125632246),
 ('서울새남굿', 0.6590039219164663),
 ('철화문', 0.65209296055566),
 ('서울서체', 0.6515671876001969)]

### 코드 4-12 한글 자소분해 예시

In [16]:
from preprocess import jamo_sentence, get_tokenizer
# Tokenise a sample sentence with mecab, re-join the morphemes with single
# spaces, then print the jamo-decomposed form of that string.
tokenizer = get_tokenizer("mecab")
morphemes = tokenizer.morphs("나는 학교에 간다")
tokens = " ".join(morphemes)
print(jamo_sentence(tokens))

ㄴㅏ- ㄴㅡㄴ ㅎㅏㄱㄱㅛ- ㅇㅔ- ㄱㅏㄴㄷㅏ-


### 코드 4-13 은전한닢 mecab으로 형태소 분석된 말뭉치를 자소 단위로 분해

In [37]:
import os
# Run from the project root so the relative script path below resolves.
os.chdir('/notebooks/embedding')
# Decompose the mecab-tokenised corpus into jamo and write corpus_mecab_jamo.txt.
# NOTE(review): the recorded output below ends in KeyboardInterrupt, i.e. this
# run was stopped by hand, so the output file may be incomplete. Re-run to
# completion before training on it.
! python preprocess/unsupervised_nlputils.py --preprocess_mode jamo \
            --input_path /notebooks/embedding/data/tokenized/corpus_mecab.txt \
            --output_path /notebooks/embedding/data/tokenized/corpus_mecab_jamo.txt

^C
Traceback (most recent call last):
  File "preprocess/unsupervised_nlputils.py", line 170, in <module>
    process_jamo(args.input_path, args.output_path)
  File "preprocess/unsupervised_nlputils.py", line 141, in process_jamo
    processed_sentence = jamo_sentence(sentence)
  File "preprocess/unsupervised_nlputils.py", line 128, in jamo_sentence
    if character_is_korean(char):
  File "/usr/local/lib/python3.5/dist-packages/soynlp/hangle/_hangle.py", line 94, in character_is_korean
    i = to_base(c)
  File "/usr/local/lib/python3.5/dist-packages/soynlp/hangle/_hangle.py", line 106, in to_base
    def to_base(c):
KeyboardInterrupt


In [2]:
import os
# Switch to the project root; the shell commands in the next cell use
# paths relative to this directory.
os.chdir('/notebooks/embedding')

In [None]:
# Train a FastText skip-gram model on the jamo-decomposed corpus.
# All paths are relative, so the working directory must be /notebooks/embedding.
# The -output value is a file prefix; the later cell loads fasttext-jamo.vec
# and fasttext-jamo.bin from the same directory.
# NOTE(review): no -dim flag is passed while the evaluator is later given
# dim=100; presumably that matches fastText's default, confirm.
! mkdir -p data/word-embeddings/fasttext-jamo
! models/fastText/fasttext skipgram \
  -input data/tokenized/corpus_mecab_jamo.txt \
  -output data/word-embeddings/fasttext-jamo/fasttext-jamo

### 코드 4-15 자소 단위 FastText Skip-gram 모델의 유사어 상위 목록 체크

In [17]:
# Settings handed to WordEmbeddingEvaluator for the jamo-level FastText vectors.
vecs_txt_fname = '/notebooks/embedding/data/word-embeddings/fasttext-jamo/fasttext-jamo.vec'
vecs_bin_fname = '/notebooks/embedding/data/word-embeddings/fasttext-jamo/fasttext-jamo.bin'
method = 'fasttext-jamo'
dim = 100  # embedding dimensionality given to the evaluator
tokenizer_name = 'mecab'

In [18]:
from models.word_eval import WordEmbeddingEvaluator
# Rebind `model` to an evaluator built from the jamo-level FastText settings.
model = WordEmbeddingEvaluator(
    vecs_txt_fname=vecs_txt_fname,
    vecs_bin_fname=vecs_bin_fname,
    method=method,
    dim=dim,
    tokenizer_name=tokenizer_name,
)




In [19]:
# Five nearest neighbours of '희망' in the jamo-level FastText evaluator.
model.most_similar('희망', topn=5)

[('희망찬', 0.8146420631099898),
 ('행복', 0.782225588355358),
 ('희망특강', 0.7528447282282469),
 ('희망자', 0.7333699883345198),
 ('소망', 0.7317962077164233)]

### 코드 4-16 미등록 단어에 대한 자소 단위 FastText 임베딩 체크

In [20]:
# Confirm that the misspelled word is also absent from the jamo-level
# vocabulary (recorded result: False).
model._is_in_vocabulary("서울특벌시")

False

In [21]:
# First 10 components of the vector the jamo-level model returns for the
# out-of-vocabulary word.
model.get_word_vector("서울특벌시")[:10]

array([ 0.27308005, -0.03842273, -0.14564085, -0.6462154 ,  0.11989901,
        0.3353665 ,  0.03407207, -0.02902705,  0.38201326, -0.22358143],
      dtype=float32)

In [22]:
# Five nearest neighbours of the out-of-vocabulary word in the jamo-level
# model; the recorded output includes '서울시' and '특별시'.
model.most_similar('서울특벌시', topn=5)

[('서울시', 0.7747652879791547),
 ('특별시', 0.767032326418248),
 ('서울특별시장', 0.7537271226369762),
 ('특별시세', 0.736772918595682),
 ('성동격서', 0.736006622807833)]

# LSA 잠재 의미 분석

## 코드 4-22 코사인 유사도 상위 단어 목록 체크 (단어-문맥 행렬 + LSA)

In [57]:
# Settings handed to WordEmbeddingEvaluator for the LSA vectors stored in
# lsa-cooc.vecs (word-context matrix variant, per the section heading).
vecs_txt_fname = '/notebooks/embedding/data/word-embeddings/lsa/lsa-cooc.vecs'
method = 'lsa'
dim = 100  # embedding dimensionality given to the evaluator
tokenizer_name = 'mecab'

In [58]:
from models.word_eval import WordEmbeddingEvaluator
# Rebind `model` to an evaluator built from the LSA (lsa-cooc.vecs) settings.
model = WordEmbeddingEvaluator(
    vecs_txt_fname=vecs_txt_fname,
    method=method,
    dim=dim,
    tokenizer_name=tokenizer_name,
)

In [59]:
# Five nearest neighbours of '희망' in the LSA evaluator loaded from
# lsa-cooc.vecs.
model.most_similar('희망', topn=5)

[('진실', 0.9481062178606177),
 ('의식', 0.9450190576762693),
 ('즐거움', 0.9365101544335024),
 ('사냥', 0.933343812149678),
 ('인내심', 0.9328922247781231)]

## 코드 4-23 코사인 유사도 상위 단어 목록 체크 (PPMI + LSA)

In [60]:
# Settings handed to WordEmbeddingEvaluator for the LSA vectors stored in
# lsa-pmi.vecs (PPMI variant, per the section heading).
vecs_txt_fname = '/notebooks/embedding/data/word-embeddings/lsa/lsa-pmi.vecs'
method = 'lsa'
dim = 100  # embedding dimensionality given to the evaluator
tokenizer_name = 'mecab'

In [61]:
from models.word_eval import WordEmbeddingEvaluator
# Rebind `model` to an evaluator built from the LSA (lsa-pmi.vecs) settings.
model = WordEmbeddingEvaluator(
    vecs_txt_fname=vecs_txt_fname,
    method=method,
    dim=dim,
    tokenizer_name=tokenizer_name,
)

In [62]:
# Five nearest neighbours of '희망' in the LSA evaluator loaded from
# lsa-pmi.vecs.
# NOTE(review): the recorded output has the same five words in the same
# order as the lsa-cooc query, with scores differing only from the fourth
# decimal place on. Confirm lsa-pmi.vecs was really built from the PPMI matrix.
model.most_similar('희망', topn=5)

[('진실', 0.9483922027456372),
 ('의식', 0.9451445814676003),
 ('즐거움', 0.9365892430718368),
 ('사냥', 0.9333096097760885),
 ('인내심', 0.9329586789540774)]

# GloVe

## 코드 4-27 GloVe 모델의 코사인 유사도 상위 단어 체크

In [63]:
# Settings handed to WordEmbeddingEvaluator for the GloVe vectors.
vecs_txt_fname = '/notebooks/embedding/data/word-embeddings/glove/glove.txt'
method = 'glove'
dim = 100  # embedding dimensionality given to the evaluator
tokenizer_name = 'mecab'

In [65]:
# Rebind `model` to an evaluator built from the GloVe settings above.
# WordEmbeddingEvaluator is expected to be imported by an earlier cell.
model = WordEmbeddingEvaluator(
    vecs_txt_fname=vecs_txt_fname,
    method=method,
    dim=dim,
    tokenizer_name=tokenizer_name,
)

In [66]:
# Five nearest neighbours of '희망' in the GloVe evaluator.
model.most_similar('희망', topn=5)

[('행복', 0.7593905742455305),
 ('꿈', 0.7190308221412974),
 ('사랑', 0.6961724218705125),
 ('미래', 0.6795513141617532),
 ('세상', 0.672559694141116)]

# Swivel

## 코드 4-31 Swivel 모델의 코사인 유사도 목록 체크

In [69]:
# Settings handed to WordEmbeddingEvaluator for the Swivel vectors
# (read from row_embedding.tsv).
vecs_txt_fname = '/notebooks/embedding/data/word-embeddings/swivel/row_embedding.tsv'
method = 'swivel'
dim = 100  # embedding dimensionality given to the evaluator
tokenizer_name = 'mecab'

In [70]:
# Rebind `model` to an evaluator built from the Swivel settings above.
# WordEmbeddingEvaluator is expected to be imported by an earlier cell.
model = WordEmbeddingEvaluator(
    vecs_txt_fname=vecs_txt_fname,
    method=method,
    dim=dim,
    tokenizer_name=tokenizer_name,
)

In [71]:
# Five nearest neighbours of '희망' in the Swivel evaluator.
model.most_similar('희망', topn=5)

[('행복', 0.6927502964154575),
 ('꿈', 0.6193135728601431),
 ('우리', 0.5831879804568582),
 ('젊은이', 0.5795773384886462),
 ('사랑', 0.5489307751831267)]

In [16]:
# word2vec: spearman corr: 0.5770993871014621 , pearson corr: 0.5956751142850295
# fasttext: spearman corr: 0.5770993871014621 , pearson corr: 0.5956751142850295
#   NOTE(review): the fasttext figures are digit-for-digit identical to the word2vec line above; likely a copy-paste slip. Re-run the evaluation and confirm.
# glove: spearman corr: 0.49029953452220065 , pearson corr: 0.5383746018370396
# swivel: spearman corr: 0.549541215508716 , pearson corr: 0.5727286333920304