# 10.1 Embedding

In [None]:
# 단어 및 문장 간 관련성 계산
# 의미적 혹은 문법적 정보의 함축

## 10.1.1 Sparse Representation based Embedding

In [2]:
import pandas as pd
from sklearn import preprocessing

# Load the sample table; the 'class2' column is the one that gets encoded.
class2 = pd.read_csv('data/chap10/class2.csv')

# Instantiate both encoders; only the label encoder is applied in this cell.
onehot_encoder = preprocessing.OneHotEncoder()
label_encoder = preprocessing.LabelEncoder()

# Fit on the column and convert each value to its integer code in one step.
class2_labels = class2['class2']
train_x = label_encoder.fit_transform(class2_labels)
train_x

array([2, 2, 1, 0, 1, 0])

In [None]:
""" One-hot Encoding의 단점 

1. 단어끼리의 관계성(유의어, 반의어) 없이 서로 독립적인 관계
2. 차원이 너무 커지는 문제가 발생

"""

## 10.1.2 Counting based Embedding

In [3]:
# Corpus에 counter vector 적용

from sklearn.feature_extraction.text import CountVectorizer
# Five short example sentences used as the corpus for the count vectorizer.
corpus = [
    'This is last chance.',
    'and if you do not have this chance.',
    'you will never get any chance.',
    'will you do get this one?',
    'please, get this chance',
]

# fit() returns the fitted vectorizer, so construction and fitting are chained.
vect = CountVectorizer().fit(corpus)
vect.vocabulary_

{'this': 13,
 'is': 7,
 'last': 8,
 'chance': 2,
 'and': 0,
 'if': 6,
 'you': 15,
 'do': 3,
 'not': 10,
 'have': 5,
 'will': 14,
 'never': 9,
 'get': 4,
 'any': 1,
 'one': 11,
 'please': 12}

In [4]:
# Encode a single sentence with the fitted vocabulary and show it as a dense array.
vect.transform(['you will never get any chance.']).toarray()

array([[0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1]])

In [5]:
# Words to exclude from the vocabulary when the vectorizer is refitted.
custom_stop_words = ["and", "is", "please", "this"]
vect = CountVectorizer(stop_words=custom_stop_words)
vect.fit(corpus)
vect.vocabulary_

{'last': 6,
 'chance': 1,
 'if': 5,
 'you': 11,
 'do': 2,
 'not': 8,
 'have': 4,
 'will': 10,
 'never': 7,
 'get': 3,
 'any': 0,
 'one': 9}

In [None]:
# TF-IDF : Term Frequency-Inverse Document Frequency

# TF : 특정 문서 d에서 특정 단어 t의 등장 횟수
# DF : 특정 단어 t가 포함된 문서 개수
# IDF : Inverse Document Frequency

# 키워드 검색을 기반으로 하는 검색 엔진
# 중요 키워드 분석
# 검색 엔진에서 검색 결과의 순위를 결정 

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

doc = ['I like machine learning', 'I love deep learning', 'I run everyday']

# Build the TF-IDF document-term matrix for the three example documents.
tfidf_vectorizer = TfidfVectorizer(min_df=1)
tfidf_matrix = tfidf_vectorizer.fit_transform(doc)

# Product of the TF-IDF matrix with its own transpose: one row and one
# column per document.
doc_distance = tfidf_matrix * tfidf_matrix.T
n_rows, n_cols = doc_distance.get_shape()
print('유사도를 위한', str(n_rows), 'x', str(n_cols), '행렬을 만들었습니다.')
print(doc_distance.toarray())

유사도를 위한 3 x 3 행렬을 만들었습니다.
[[1.       0.224325 0.      ]
 [0.224325 1.       0.      ]
 [0.       0.       1.      ]]


## 10.1.3 Prediction based Embedding

### Word2Vec

In [None]:
import nltk
# Download the NLTK 'punkt' resource ahead of the sent_tokenize / word_tokenize calls further down.
nltk.download('punkt')

In [7]:
from nltk.tokenize import sent_tokenize, word_tokenize
import warnings
warnings.filterwarnings(action='ignore')
import gensim
from gensim.models import Word2Vec

# Read the whole text in one go. The context manager closes the file handle
# as soon as the contents are in memory; a bare open() would leave it open.
with open('/content/peter.txt', "r", encoding='UTF8') as sample:
  s = sample.read()

# Replace line breaks with spaces so the text is one continuous string.
f = s.replace('\n', ' ')

# One list of lower-cased tokens per sentence.
data = [
  [token.lower() for token in word_tokenize(sentence)]
  for sentence in sent_tokenize(f)
]

data[0]

['once',
 'upon',
 'a',
 'time',
 'in',
 'london',
 ',',
 'the',
 'darlings',
 'went',
 'out',
 'to',
 'a',
 'dinner',
 'party',
 'leaving',
 'their',
 'three',
 'children',
 'wendy',
 ',',
 'jhon',
 ',',
 'and',
 'michael',
 'at',
 'home',
 '.']

In [8]:
# Train a CBOW model (sg=0) on the tokenized sentences.
# NOTE(review): `size=` is the gensim 3.x keyword; gensim >= 4.0 renamed it to
# `vector_size` — confirm against the installed version before upgrading.
model1 = gensim.models.Word2Vec(data, min_count=1, size=100, window=5, sg=0)
# Query through the trained word vectors (`.wv`), as the FastText cells in this
# notebook do; calling similarity() on the model object itself is deprecated.
print("Cosine similarity between 'peter' " + "'wendy' - CBOW : ", model1.wv.similarity('peter', 'wendy'))

Cosine similarity between 'peter' 'wendy' - CBOW :  -0.059717968


In [9]:
# Use the word vectors (`.wv`) for the lookup; similarity() on the model
# object itself is deprecated.
print("Cosine similarity between 'peter' " + "'hook' - CBOW : ", model1.wv.similarity('peter', 'hook'))

Cosine similarity between 'peter' 'hook' - CBOW :  0.17057094


In [10]:
# Train a Skip-Gram model (sg=1) with otherwise identical settings.
# NOTE(review): `size=` is the gensim 3.x keyword; gensim >= 4.0 renamed it to
# `vector_size` — confirm against the installed version before upgrading.
model2 = gensim.models.Word2Vec(data, min_count=1, size=100, window=5, sg=1)
# Query through the trained word vectors (`.wv`); similarity() on the model
# object itself is deprecated.
print("Cosine similarity between 'peter' " + "'wendy' - Skip Gram : ", model2.wv.similarity('peter', 'wendy'))

Cosine similarity between 'peter' 'wendy' - Skip Gram :  0.23208164


In [11]:
# Use the word vectors (`.wv`) for the lookup; similarity() on the model
# object itself is deprecated.
print("Cosine similarity between 'peter' " + "'hook' - Skip Gram : ", model2.wv.similarity('peter', 'hook'))

Cosine similarity between 'peter' 'hook' - Skip Gram :  0.5484486


In [12]:
from gensim.test.utils import common_texts
from gensim.models import FastText

# Train a FastText model on the same tokenized sentences.
model = FastText(
    data,
    size=4,
    window=3,
    min_count=1,
    iter=10,
)

In [13]:
# Similarity of the two words according to the FastText word vectors.
word_pair = ('peter', 'wendy')
sim_score = model.wv.similarity(*word_pair)
print(sim_score)

0.6989395


In [14]:
# Same lookup for a second word pair.
word_pair = ('peter', 'hook')
sim_score = model.wv.similarity(*word_pair)
print(sim_score)

-0.23571663


In [13]:
from __future__ import print_function
from gensim.models import KeyedVectors

# Load word vectors from a word2vec-format file into a KeyedVectors object.
# NOTE(review): the file name suggests Korean Wikipedia vectors — confirm the source.
model_kr = KeyedVectors.load_word2vec_format('data/chap10/wiki.ko.vec')

In [14]:
find_similar_to = '노력'

# Each result is a (word, similarity) pair; unpack it directly in the loop.
for word, score in model_kr.similar_by_word(find_similar_to):
  print("Word: {0}, Similarity: {1:.2f}".format(word, score))

Word: 노력함, Similarity: 0.80
Word: 노력중, Similarity: 0.75
Word: 노력만, Similarity: 0.72
Word: 노력과, Similarity: 0.71
Word: 노력의, Similarity: 0.69
Word: 노력가, Similarity: 0.69
Word: 노력이나, Similarity: 0.69
Word: 노력없이, Similarity: 0.68
Word: 노력맨, Similarity: 0.68
Word: 노력보다는, Similarity: 0.68


In [None]:
# model_kr is already a KeyedVectors object, so it is queried directly (as the
# similar_by_word cell does). The extra `.wv` hop is only a deprecated alias
# in gensim 3.x and raises AttributeError in gensim >= 4.0.
similarities = model_kr.most_similar(positive=['동물', '육식동물'], negative=['사람'])
print(similarities)

## 10.1.4 Counting/Prediction based Embedding

In [None]:
# GloVe: Global Vectors for Word Representation

# LSA : Latent Semantic Analysis
# Word2Vec
# 두 Models의 단점을 보완

In [19]:
import numpy as np
%matplotlib notebook
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from sklearn.decomposition import PCA
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

# Source GloVe vectors.
# NOTE(review): datapath() is given an absolute path here, so it presumably
# returns it unchanged — confirm; a path relative to the notebook would be
# more portable.
glove_file = datapath('/Users/mini/TensorFlow/data/chap10/glove.6B.100d.txt')
# Write the converted copy to a separate temporary file. Pointing the output
# at the same path as the input overwrites the original GloVe file, so the
# cell cannot be re-run on a fresh kernel.
word2vec_glove_file = get_tmpfile("glove.6B.100d.word2vec.txt")
glove2word2vec(glove_file, word2vec_glove_file)

  glove2word2vec(glove_file, word2vec_glove_file)


(400000, 100)

In [20]:
# Load the converted GloVe vectors. This rebinds `model`, which earlier in the
# notebook referred to the FastText model.
model = KeyedVectors.load_word2vec_format(word2vec_glove_file)
# Show the entries most similar to 'bill'.
model.most_similar('bill')

[('legislation', 0.8072139620780945),
 ('proposal', 0.7306863069534302),
 ('senate', 0.7142540812492371),
 ('bills', 0.704440176486969),
 ('measure', 0.6958035230636597),
 ('passed', 0.6906244158744812),
 ('amendment', 0.6846879720687866),
 ('provision', 0.6845566630363464),
 ('plan', 0.6816462874412537),
 ('clinton', 0.6663140058517456)]

In [21]:
# Show the entries most similar to 'cherry'.
model.most_similar('cherry')

[('peach', 0.688809871673584),
 ('mango', 0.6838189959526062),
 ('plum', 0.6684104204177856),
 ('berry', 0.659035861492157),
 ('grove', 0.6581552028656006),
 ('blossom', 0.6503506302833557),
 ('raspberry', 0.6477391123771667),
 ('strawberry', 0.6442098021507263),
 ('pine', 0.6390928626060486),
 ('almond', 0.6379212141036987)]

In [22]:
# Query with 'cherry' supplied only as a negative term.
model.most_similar(negative=['cherry'])

[('kazushige', 0.4834350347518921),
 ('askerov', 0.4778185784816742),
 ('lakpa', 0.46915262937545776),
 ('ex-gay', 0.45713332295417786),
 ('tadayoshi', 0.4522107243537903),
 ('turani', 0.44810065627098083),
 ('saglam', 0.4469599425792694),
 ('aijun', 0.4435270130634308),
 ('adjustors', 0.44235295057296753),
 ('nyum', 0.4423117935657501)]

In [23]:
# Analogy query: 'woman' and 'king' as positive terms, 'man' as the negative term.
result = model.most_similar(negative=['man'], positive=['woman', 'king'])

# Print only the top-ranked (word, score) pair.
top_word, top_score = result[0]
print("{}: {:.4f}".format(top_word, top_score))

queen: 0.7699


In [24]:
def analogy(x1, x2, y1):
  """Return the top-ranked word for the analogy "x1 is to x2 as y1 is to ?".

  Queries the module-level ``model`` with ``y1`` and ``x2`` as positive terms
  and ``x1`` as the negative term, then returns only the word of the first
  result; its similarity score is discarded.
  """
  best_match = model.most_similar(negative=[x1], positive=[y1, x2])[0]
  return best_match[0]

# 'australia' is to 'beer' as 'france' is to ...?
analogy('australia', 'beer', 'france')

'champagne'

In [25]:
# 'tall' is to 'tallest' as 'long' is to ...?
analogy('tall', 'tallest', 'long')

'longest'

In [26]:
# Split the phrase into four words and print the one doesnt_match() singles out.
print(model.doesnt_match('breakfast cereal dinner lunch'.split()))

cereal
