### TF

In [1]:
text = "John likes to watch movies. Mary likes movies too.\
 Mary also likes to watch football games."

In [2]:
words = text.replace('.', '').split()
print(words)

['John', 'likes', 'to', 'watch', 'movies', 'Mary', 'likes', 'movies', 'too', 'Mary', 'also', 'likes', 'to', 'watch', 'football', 'games']


In [3]:
import numpy as np
word_count = np.unique(words, return_counts=True)
print(word_count)

(array(['John', 'Mary', 'also', 'football', 'games', 'likes', 'movies',
       'to', 'too', 'watch'], dtype='<U8'), array([1, 2, 1, 1, 1, 3, 2, 2, 1, 2]))


In [4]:
word_to_cnt = {}
for word, cnt in zip(*word_count):
    word_to_cnt[word] = cnt
print(word_to_cnt)

{'John': 1, 'Mary': 2, 'also': 1, 'football': 1, 'games': 1, 'likes': 3, 'movies': 2, 'to': 2, 'too': 1, 'watch': 2}


### DTM

In [5]:
corpus = [
    "John likes to watch movies. Mary likes movies too.",
    "Mary also likes to watch football games."
]

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
vector = CountVectorizer()
dtm_array = vector.fit_transform(corpus).toarray()
tf_dic = vector.vocabulary_
print(dtm_array)
print(tf_dic)

[[0 0 0 1 2 1 2 1 1 1]
 [1 1 1 0 1 1 0 1 0 1]]
{'john': 3, 'likes': 4, 'to': 7, 'watch': 9, 'movies': 6, 'mary': 5, 'too': 8, 'also': 0, 'football': 1, 'games': 2}


In [7]:
import pandas as pd
tf_dic_sorted = dict(sorted(tf_dic.items(),
                            key=lambda item: item[1]))
df = pd.DataFrame(dtm_array, columns=tf_dic_sorted.keys())
print(df)

   also  football  games  john  likes  mary  movies  to  too  watch
0     0         0      0     1      2     1       2   1    1      1
1     1         1      1     0      1     1       0   1    0      1


### TF-IDF

In [8]:
pd.set_option("display.max_columns", 100)

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vec = TfidfVectorizer()
tfidf_array = tfidf_vec.fit_transform(corpus).toarray()
tfidf_dic = tfidf_vec.vocabulary_
tfidf_dic_sorted = dict(sorted(tfidf_dic.items(),
                               key=lambda item: item[1]))
tfidf_dtm = pd.DataFrame(tfidf_array,
                         columns=tfidf_dic.keys())
print(tfidf_dtm)

       john     likes        to     watch    movies      mary       too  \
0  0.000000  0.000000  0.000000  0.323699  0.460629  0.230315  0.647398   
1  0.446101  0.446101  0.446101  0.000000  0.317404  0.317404  0.000000   

       also  football     games  
0  0.230315  0.323699  0.230315  
1  0.317404  0.000000  0.317404  


### Word2Vec

In [10]:
!pip install --upgrade gensim==3.8.3

Collecting gensim==3.8.3
  Downloading gensim-3.8.3.tar.gz (23.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.4/23.4 MB[0m [31m34.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: gensim
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py bdist_wheel[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Building wheel for gensim (setup.py) ... [?25lerror
[31m  ERROR: Failed building wheel for gensim[0m[31m
[0m[?25h  Running setup.py clean for gensim
Failed to build gensim
[31mERROR: Could not build wheels for gensim, which is required to install pyproject.toml-based projects[0m[31m
[0m

In [12]:
corpus = ["John likes to watch movies. Mary likes movies too",
          "Mary also likes to watch football games."]

word_list = []
for word in corpus:
    word_list.append(word.replace('.', '').split())

from gensim.models import Word2Vec
model = Word2Vec(word_list, sg=0, vector_size=100,
                 window=3, min_count=1)

print(model.wv.most_similar('likes'))
print(model.wv.similarity('movies', 'games'))

[('John', 0.21617142856121063), ('also', 0.09291722625494003), ('too', 0.027057476341724396), ('football', 0.016134677454829216), ('Mary', -0.010840574279427528), ('to', -0.02775036357343197), ('movies', -0.05234673246741295), ('games', -0.059876296669244766), ('watch', -0.111670583486557)]
0.0640898


In [1]:
!pip install --upgrade gensim==3.8.3

Collecting gensim==3.8.3
  Using cached gensim-3.8.3.tar.gz (23.4 MB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: gensim
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py bdist_wheel[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Building wheel for gensim (setup.py) ... [?25lerror
[31m  ERROR: Failed building wheel for gensim[0m[31m
[0m[?25h  Running setup.py clean for gensim
Failed to build gensim
[31mERROR: Could not build wheels for gensim, which is required to install pyproject.toml-based projects[0m[31m
[0m

In [1]:
# https://github.com/Kyubyong/wordvectors
#  -> 화면 아래 Pre-trained models에서 Korean (w) 클릭 후 zip 파일 내려받으세요.
# 압축 풀어서 ko.bin 파일을 런타임에 드래그해서 업로드 한 후 실행하세요.
from gensim.models import Word2Vec
model = Word2Vec.load("/content/ko.bin")

ERROR:gensim.models.word2vec:Model load error. Was model saved using code from an older Gensim Version? Try loading older model using gensim-3.8.3, then re-saving, to restore compatibility with current code.


AttributeError: 'Word2Vec' object has no attribute 'wv'

In [None]:
print(model.wv.most_similar("인공지능"))