In [3]:
import gensim
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
import MeCab
from collections import defaultdict
from sklearn.mixture import GaussianMixture
%matplotlib inline
sns.set()

In [4]:
model = gensim.models.KeyedVectors.load_word2vec_format('../entity_vector/entity_vector.model.bin', binary=True)

In [10]:
mecab = MeCab.Tagger('-Ochasen')
mecab.parse('')

'EOS\n'

In [8]:
with open('./dogura_magura.txt') as file:
    lines = file.readlines()

In [11]:
words = []
for line in lines:
    text = ''
    line = line.replace('\n', '')
    for p in mecab.parse(line).splitlines()[:-1]:
        parsed = p.split('\t')
        if not '非自立' in parsed[3] and '名詞' in parsed[3]:
            text += parsed[2]
            text += ','
    if text != '':
        words.append(text[:-1])

In [12]:
def get_word_vector(word):
    try:
        return model[word]
    except:
        return np.zeros(200, )

In [13]:
def calc_idf(n, word_freq):
    return np.log2(n / word_freq) + 1.0

In [14]:
def calc_freq(words):
    frequency = defaultdict(int)
    for i, word in enumerate(words):
        for token in word.split(','):
            frequency[token] += 1
    return frequency

In [15]:
def calc_GMM(freq, n_cluster):
    gmm = GaussianMixture(n_components=n_cluster, covariance_type='tied', max_iter=500)
    X_train = []
    for word in freq:
        vec = get_word_vector(word)
        if vec != '':
            vec = vec.reshape(1, 200)
            X_train.append(vec[0])
    gmm.fit(np.array(X_train))
    return gmm

In [None]:
# word frequency
freq = calc_freq(words)
N = len(freq)

# GMM train
cluster = 3
gmm = calc_GMM(freq, cluster)

In [17]:
wtv = []
for row in words:
    wtv_i = np.zeros(cluster*200, )
    for word in row.split(','):
        # word vector
        wv_vector = get_word_vector(word)
        if np.all(wv_vector == 0): continue
        # idf
        if freq[word] == 0: continue
        idf = calc_idf(N, freq[word])
        # gmm prob
        word_probs = gmm.predict_proba(wv_vector.reshape(1, 200))
        
        # wcv_ik
        wcv_ik = [wv_vector * word_prob for word_prob in word_probs[0]]
        
        # wtv_i
        con = np.concatenate((wcv_ik[0], wcv_ik[1]))
        if len(wcv_ik) < 3: continue
        for wcv in wcv_ik[2:]:
            con = np.concatenate((con, wcv))
        wtv_i += con * idf
    wtv.append(wtv_i/len(row.split(',')))

  


In [20]:
wtv[5]

array([-1.83967228e+00,  2.42057900e+00, -4.47635649e+00,  1.70168588e+00,
        6.56745765e+00,  1.86969229e+00, -6.61973635e-02,  1.38817411e-01,
        2.67943371e+00, -1.02770899e+00,  1.02821139e+00,  2.34375061e+00,
        1.17281003e+00, -1.58644483e+00,  1.01944699e+01,  8.42663056e-01,
        2.34063101e+00,  1.39468206e+00, -2.11685867e+00,  2.59672629e+00,
       -3.13173341e+00, -1.44002525e+00, -5.16789517e+00,  1.70253091e+00,
       -1.61085870e+00,  2.33653479e+00,  1.46787976e+00,  4.71103867e+00,
       -2.70253313e+00, -4.72814717e+00,  9.43150155e-01, -2.92969471e+00,
       -7.44254566e-01,  5.56580147e+00, -3.79341257e+00,  1.64360835e+00,
       -3.33838191e-01,  2.11610519e+00,  2.34752223e+00,  2.43326764e+00,
       -1.12978322e+00, -1.46368183e+00, -4.39874721e+00,  6.27188162e-01,
        2.89060392e+00, -1.05335365e+00,  2.73523534e+00, -3.14068353e+00,
       -3.20657091e+00,  4.58366845e+00,  4.61255681e+00, -9.93299832e-01,
        6.38991635e+00,  