# 단어의 임베딩
- 빈도수 계산: 빈도기반 - TF 상대빈도
- TDM (Term-Document Matrix) : TF를 행렬로 만든 것, 사전을 이용한 단순 빈도
- TF-IDF : TF x IDF
    - IDF : 역 문서 빈도

In [3]:
# Sample text: three short sentences used as the toy corpus.
text = 'John likes to watch movies. Mary likes movies too. Mary also likes to watch football games.'
# Delete every period, then split on whitespace to get the word tokens.
strip_periods = str.maketrans('', '', '.')
words = text.translate(strip_periods).split()
words

['John',
 'likes',
 'to',
 'watch',
 'movies',
 'Mary',
 'likes',
 'movies',
 'too',
 'Mary',
 'also',
 'likes',
 'to',
 'watch',
 'football',
 'games']

In [4]:
# Build the term-frequency (TF) table: unique words and their counts.
import numpy as np

# np.unique returns the sorted unique words; return_counts=True adds a
# parallel array holding the number of occurrences of each word.
unique_words, occurrences = np.unique(words, return_counts=True)
word_count = (unique_words, occurrences)
word_count

(array(['John', 'Mary', 'also', 'football', 'games', 'likes', 'movies',
        'to', 'too', 'watch'], dtype='<U8'),
 array([1, 2, 1, 1, 1, 3, 2, 2, 1, 2]))

In [5]:
# Turn the (words, counts) pair into a word -> count mapping.
# Unpacking word_count into zip walks both arrays in lockstep, so each
# unique word is paired with its own count.
word_to_cnt = dict(zip(*word_count))
word_to_cnt

{np.str_('John'): np.int64(1),
 np.str_('Mary'): np.int64(2),
 np.str_('also'): np.int64(1),
 np.str_('football'): np.int64(1),
 np.str_('games'): np.int64(1),
 np.str_('likes'): np.int64(3),
 np.str_('movies'): np.int64(2),
 np.str_('to'): np.int64(2),
 np.str_('too'): np.int64(1),
 np.str_('watch'): np.int64(2)}

In [6]:
# Look up the count stored for the word 'movies'.
word_to_cnt['movies']

np.int64(2)

# TDM
- 단어의 빈도수를 계산하여 행렬로!

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

# Two-document corpus: every string is one document (one matrix row).
corpus = [
    'John likes to watch movies. Mary likes movies too.',
    'Mary also likes to watch football games.',
]

# Learn the vocabulary and count the terms of each document, then turn
# the result into a dense array so it can be displayed directly.
vector = CountVectorizer()
tdm_sparse = vector.fit_transform(corpus)
tdm_array = tdm_sparse.toarray()
tdm_array

array([[0, 0, 0, 1, 2, 1, 2, 1, 1, 1],
       [1, 1, 1, 0, 1, 1, 0, 1, 0, 1]])

In [8]:
# vocabulary_ is the fitted vectorizer's term -> column-index mapping
# (indices into the columns of tdm_array); it is not a table of
# term frequencies, despite the variable name.
tf_dic = vector.vocabulary_
print(tf_dic)

{'john': 3, 'likes': 4, 'to': 7, 'watch': 9, 'movies': 6, 'mary': 5, 'too': 8, 'also': 0, 'football': 1, 'games': 2}


In [9]:
import pandas as pd

# Re-order the vocabulary by column index so that the key order matches
# the column order of the term-document matrix.
terms_by_index = sorted(tf_dic, key=tf_dic.get)
tf_dic_sorted = {term: tf_dic[term] for term in terms_by_index}
tf_dic_sorted

{'also': 0,
 'football': 1,
 'games': 2,
 'john': 3,
 'likes': 4,
 'mary': 5,
 'movies': 6,
 'to': 7,
 'too': 8,
 'watch': 9}

In [10]:
# Label the term-document matrix columns with the index-ordered terms;
# each row corresponds to one document of the corpus.
column_labels = list(tf_dic_sorted)
df = pd.DataFrame(data=tdm_array, columns=column_labels)
df

Unnamed: 0,also,football,games,john,likes,mary,movies,to,too,watch
0,0,0,0,1,2,1,2,1,1,1
1,1,1,1,0,1,1,0,1,0,1


# TF-IDF

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF vectorizer on the same corpus and keep a dense copy of
# the weighted term-document matrix for display.
tfidf_vec = TfidfVectorizer()
tfidf_sparse = tfidf_vec.fit_transform(corpus)
tfidf_array = tfidf_sparse.toarray()
tfidf_array

array([[0.        , 0.        , 0.        , 0.32369906, 0.46062909,
        0.23031454, 0.64739811, 0.23031454, 0.32369906, 0.23031454],
       [0.44610081, 0.44610081, 0.44610081, 0.        , 0.3174044 ,
        0.3174044 , 0.        , 0.3174044 , 0.        , 0.3174044 ]])

In [14]:
# Term -> column-index mapping learned by the TF-IDF vectorizer.
tfidf_dic = tfidf_vec.vocabulary_
# Order the terms by column index so they line up with tfidf_array.
ordered_terms = sorted(tfidf_dic, key=tfidf_dic.get)
tfidf_dic_sorted = {term: tfidf_dic[term] for term in ordered_terms}
# Wrap the TF-IDF weights in a DataFrame with the terms as column labels.
tfidf_tdm = pd.DataFrame(data=tfidf_array, columns=list(tfidf_dic_sorted))
tfidf_tdm

Unnamed: 0,also,football,games,john,likes,mary,movies,to,too,watch
0,0.0,0.0,0.0,0.323699,0.460629,0.230315,0.647398,0.230315,0.323699,0.230315
1,0.446101,0.446101,0.446101,0.0,0.317404,0.317404,0.0,0.317404,0.0,0.317404


In [2]:
from gensim.models import Word2Vec

# Same two-sentence corpus as in the TDM / TF-IDF cells above.
corpus = [
    'John likes to watch movies. Mary likes movies too.',
    'Mary also likes to watch football games.',
]

# Will hold one list of word tokens per sentence (filled in the next cell).
word_list = list()

In [3]:
# Tokenize every sentence of the corpus: drop the periods, split on
# whitespace, and store the resulting token list.
# NOTE: this adds to the existing word_list, so re-running the cell
# without resetting word_list stores the sentences a second time.
word_list.extend(sentence.replace('.', '').split() for sentence in corpus)

word_list

[['John', 'likes', 'to', 'watch', 'movies', 'Mary', 'likes', 'movies', 'too'],
 ['Mary', 'also', 'likes', 'to', 'watch', 'football', 'games']]

In [8]:
# Train a small Word2Vec model on the tokenized sentences.
#   sg=0        -> CBOW training (sg=1 would select skip-gram)
#   min_count=1 -> keep every word, even those that occur only once
model = Word2Vec(
    sentences=word_list,
    vector_size=100,
    window=3,
    min_count=1,
    sg=0,
)
# The second positional argument of most_similar is `negative`, so this
# ranks words that are close to 'John' and far from 'games'. It is NOT
# the similarity between the two words (model.wv.similarity gives that).
model.wv.most_similar(positive='John', negative='games')

[('likes', 0.18567818403244019),
 ('too', -0.01069658063352108),
 ('football', -0.011570061556994915),
 ('also', -0.0238940492272377),
 ('movies', -0.0652000680565834),
 ('Mary', -0.07073411345481873),
 ('watch', -0.2023678719997406),
 ('to', -0.2099374383687973)]