<a href="https://colab.research.google.com/github/ghostfm3/ksks/blob/master/SVD02.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!apt-get -q -y install sudo file mecab libmecab-dev mecab-ipadic-utf8 git curl python-mecab > /dev/null
!git clone --depth 1 https://github.com/neologd/mecab-ipadic-neologd.git > /dev/null 
!echo yes | mecab-ipadic-neologd/bin/install-mecab-ipadic-neologd -n > /dev/null 2>&1
!pip install mecab-python3 > /dev/null
!echo mecab-config --dicdir"/mecab-ipadic-neologd"

Cloning into 'mecab-ipadic-neologd'...
remote: Enumerating objects: 75, done.[K
remote: Counting objects: 100% (75/75), done.[K
remote: Compressing objects: 100% (74/74), done.[K
remote: Total 75 (delta 5), reused 54 (delta 0), pack-reused 0[K
Unpacking objects: 100% (75/75), done.
mecab-config --dicdir/mecab-ipadic-neologd


In [3]:
pip install unidic-lite

Collecting unidic-lite
[?25l  Downloading https://files.pythonhosted.org/packages/74/d2/a4233f65f718f27065a4cf23a2c4f05d8bd4c75821e092060c4efaf28e66/unidic-lite-1.0.7.tar.gz (47.3MB)
[K     |████████████████████████████████| 47.3MB 96kB/s 
[?25hBuilding wheels for collected packages: unidic-lite
  Building wheel for unidic-lite (setup.py) ... [?25l[?25hdone
  Created wheel for unidic-lite: filename=unidic_lite-1.0.7-cp36-none-any.whl size=47556593 sha256=ae750557ec79efdcc72d8185b286914d43a5b7dd4a4320d46990c504fc3cd94a
  Stored in directory: /root/.cache/pip/wheels/a8/82/7d/086724645e33a575aafd0b1dae2835c37d2c00c6a0a96ee3a0
Successfully built unidic-lite
Installing collected packages: unidic-lite
Successfully installed unidic-lite-1.0.7


In [4]:
import sys
sys.path.append('..')
import numpy as np
import matplotlib.pyplot as plt
import MeCab

def preprocess(text):
    text = text.lower()
    text = text.replace('.', ' .')
    words = text.split(' ')

    word_to_id = {}
    id_to_word = {}
    for word in words:
        if word not in word_to_id:
            new_id = len(word_to_id)
            word_to_id[word] = new_id
            id_to_word[new_id] = word

    corpus = np.array([word_to_id[w] for w in words])

    return corpus, word_to_id, id_to_word

def create_co_matrix(corpus, vocab_size, window_size=1):
    '''共起行列の作成
    :param corpus: コーパス（単語IDのリスト）
    :param vocab_size:語彙数
    :param window_size:ウィンドウサイズ（ウィンドウサイズが1のときは、単語の左右1単語がコンテキスト）
    :return: 共起行列
    '''
    corpus_size = len(corpus)
    co_matrix = np.zeros((vocab_size, vocab_size), dtype=np.int32)

    for idx, word_id in enumerate(corpus):
        for i in range(1, window_size + 1):
            left_idx = idx - i
            right_idx = idx + i

            if left_idx >= 0:
                left_word_id = corpus[left_idx]
                co_matrix[word_id, left_word_id] += 1

            if right_idx < corpus_size:
                right_word_id = corpus[right_idx]
                co_matrix[word_id, right_word_id] += 1

    return co_matrix

def ppmi(C, verbose=False, eps = 1e-8):
    '''PPMI（正の相互情報量）の作成
    :param C: 共起行列
    :param verbose: 進行状況を出力するかどうか
    :return:
    '''
    M = np.zeros_like(C, dtype=np.float32)
    N = np.sum(C)
    S = np.sum(C, axis=0)
    total = C.shape[0] * C.shape[1]
    cnt = 0

    for i in range(C.shape[0]):
        for j in range(C.shape[1]):
            pmi = np.log2(C[i, j] * N / (S[j]*S[i]) + eps)
            M[i, j] = max(0, pmi)

            if verbose:
                cnt += 1
                if cnt % (total//100 + 1) == 0:
                    print('%.1f%% done' % (100*cnt/total))
    return M

m = MeCab.Tagger('-Owakati')
result = m.parse('PPIとはプロトンポンプ阻害薬のことです')

text = result
corpus, word_to_id, id_to_word = preprocess(text)
vocab_size = len(id_to_word)
C = create_co_matrix(corpus, vocab_size, window_size=1)
W = ppmi(C)

U, S, V = np.linalg.svd(W)

In [5]:
print(C[0])

[0 1 0 0 0 0 0 0 0 0 0]


In [6]:
print(W[0])

[0.       3.321928 0.       0.       0.       0.       0.       0.
 0.       0.       0.      ]


In [7]:
print(U[0])
print(U[1])
print(U[2])
print(U[3])
print(U[4])
print(U[5])
print(U[6])
print(U[7])
print(U[8])

[ 0.32511944  0.         -0.4528665   0.          0.          0.4448483
 -0.4435661   0.         -0.44317868  0.          0.3133066 ]
[ 0.         -0.45558053  0.         -0.6049245  -0.5054269   0.
  0.         -0.36615074  0.          0.19232789  0.        ]
[ 0.44819164  0.         -0.5081365   0.          0.          0.18513913
  0.20218253  0.          0.5146336   0.         -0.44824043]
[ 0.         -0.4429389   0.         -0.36615074  0.20448261  0.
  0.          0.6049245   0.         -0.51185155  0.        ]
[ 0.43979722  0.         -0.19159645  0.          0.         -0.5175263
  0.51222193  0.         -0.19683726  0.          0.44824043]
[ 0.0000000e+00 -4.3875167e-01  0.0000000e+00 -6.1062266e-16
  6.3675821e-01  0.0000000e+00  0.0000000e+00 -4.1993215e-16
  0.0000000e+00  6.3406307e-01  0.0000000e+00]
[ 0.43979722  0.          0.19159645  0.          0.         -0.5175263
 -0.51222193  0.         -0.19683726  0.         -0.44824043]
[ 0.         -0.4429389   0.          0.