In [None]:
import os
from collections import defaultdict

import gensim
import numpy as np
import pandas as pd
import scipy as sp
from sklearn.mixture import GaussianMixture

In [None]:
# Load a pre-trained word-vector model file.
def load_model(model_type='word2vec'):
    """Load pre-trained word vectors through gensim.

    Parameters
    ----------
    model_type : str, default 'word2vec'
        'fasttext' reads the text-format file '../fastText/build/model.vec';
        'word2vec' reads the binary file './entity_vector/wiki.model.bin'.

    Returns
    -------
    The object returned by
    ``gensim.models.KeyedVectors.load_word2vec_format`` for the chosen file.

    Raises
    ------
    ValueError
        If ``model_type`` is neither 'fasttext' nor 'word2vec'. Without this
        check the function would fall off the end and return ``None``, and
        the mistake would only surface later when the result is used.
    """
    if model_type == 'fasttext':
        return gensim.models.KeyedVectors.load_word2vec_format('../fastText/build/model.vec', binary=False)
    if model_type == 'word2vec':
        return gensim.models.KeyedVectors.load_word2vec_format('./entity_vector/wiki.model.bin', binary=True)
    raise ValueError(
        "unknown model_type {!r}; expected 'fasttext' or 'word2vec'".format(model_type)
    )

In [None]:
# Look up the vector of each word in the loaded model (.bin version).
def get_word_vector(words, vec_size):
    """Collect one vector per word from the notebook-level ``model``.

    Parameters
    ----------
    words : iterable of str
        Words to look up.
    vec_size : int
        Length of the zero vector stored for words the model does not know.

    Returns
    -------
    collections.defaultdict(list)
        Maps each word to a list; one vector is appended per occurrence of
        the word in ``words``. A word missing from the model gets
        ``np.zeros(vec_size)`` instead.

    Raises
    ------
    NameError
        If no ``model`` is defined at notebook level when the function runs.

    Notes
    -----
    Only ``KeyError`` (the lookup failure for an unknown key) is turned into
    a zero vector. A bare ``except`` here would also hide a missing or
    already-deleted ``model`` and silently return zero vectors for every
    word.
    """
    word_vec = defaultdict(list)
    for word in words:
        try:
            word_vec[word].append(model[word])
        except KeyError:
            # Out-of-vocabulary word: keep a placeholder so callers can
            # still index the result by this word.
            word_vec[word].append(np.zeros(vec_size, ))
    return word_vec

In [None]:
# Count occurrences (a word counts at most once per data record).
def count_freq(words):
    """Return how many records each token appears in.

    Parameters
    ----------
    words : iterable of str
        One comma-separated token string per record.

    Returns
    -------
    collections.defaultdict(int)
        Token -> number of records containing it. Keys are in order of
        first appearance.
    """
    doc_freq = defaultdict(int)
    for record in words:
        # dict.fromkeys drops repeated tokens within the record while
        # keeping their first-occurrence order.
        for term in dict.fromkeys(record.split(',')):
            doc_freq[term] += 1
    return doc_freq

In [None]:
# IDF computation.
def calc_idf(n, word_freq):
    """Return ``log2(n / word_freq) + 1.0``.

    Parameters
    ----------
    n : number
        Numerator of the ratio (the caller passes the number of records).
    word_freq : number
        Denominator of the ratio (the caller passes the word's frequency).
    """
    ratio = n / word_freq
    return 1.0 + np.log2(ratio)

In [None]:
# Fit a Gaussian mixture model on the word vectors.
def training_GMM(wv, n_cluster, vec_size, random_state=None):
    """Fit a tied-covariance Gaussian mixture on the non-zero word vectors.

    Parameters
    ----------
    wv : mapping
        Word -> sequence whose first element is that word's vector.
    n_cluster : int
        Number of mixture components.
    vec_size : int
        Expected length of every vector; each vector is reshaped to exactly
        this length, so a vector of another size raises.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``GaussianMixture``. Pass an int to make the fit
        reproducible; the default leaves it unseeded.

    Returns
    -------
    The fitted ``GaussianMixture`` instance.
    """
    # All-zero vectors are excluded from the training data.
    x = [
        np.asarray(vec[0]).reshape(vec_size)
        for vec in wv.values()
        if not np.all(vec[0] == 0)
    ]

    gmm = GaussianMixture(
        n_components=n_cluster,
        covariance_type='tied',
        random_state=random_state,
    )
    gmm.fit(np.array(x))
    return gmm

In [None]:
# Input data: read one column of the CSV; count_freq later splits each value
# on ',' to obtain tokens.
path = 'test.csv'
scdv_column = 'hoge'
words = pd.read_csv(path)[scdv_column].values

# Word frequency: number of records each token appears in, plus the total
# record count N (both are used later for the IDF weights).
freq = count_freq(words)
N = len(words)
# Drop the raw column; a later cell re-reads it from `path`.
del words

# Load the model file.
model = load_model('fasttext')

# Vector size of the model.
vec_size = model.vector_size
print('vector length:{0}'.format(vec_size))

# Get the word vector of every distinct token. get_word_vector reads the
# notebook-level `model`, so this call must come before `del model` below.
wv = get_word_vector(freq.keys(), vec_size)

# Delete the model; only `wv` and `vec_size` are needed from here on.
del model

In [None]:
# Fit the Gaussian mixture on the word vectors; `cluster` is the number of
# mixture components and is reused when the document vectors are built.
cluster = 5
gmm = training_GMM(wv, cluster, vec_size)

In [None]:
# SCDV computation: build one (cluster * vec_size)-long vector per record.
words = pd.read_csv(path)[scdv_column].values
sentence_vec = []
# Cluster-membership probabilities per distinct token. predict_proba gives
# the same answer for the same token every time, so it is evaluated once per
# token instead of once per occurrence. Only `cluster` floats are kept per
# token, so the cache stays small.
topic_probs = {}
for row in words:
    tokens = row.split(',')
    wtv = np.zeros(cluster * vec_size, )
    for word in tokens:
        vec = wv[word][0]
        # Skip tokens stored as all-zero vectors (unknown to the model).
        # The test must look at the vector itself: wv[word] is a list, and
        # comparing a list with 0 is always False, so testing the list never
        # skips anything.
        if np.all(vec == 0):
            continue
        # idf
        idf = calc_idf(N, freq[word])
        # wcv_ik: the word vector scaled by each cluster probability
        probs = topic_probs.get(word)
        if probs is None:
            probs = gmm.predict_proba(vec.reshape(1, -1))[0]
            topic_probs[word] = probs
        # wtv_i: concatenate the per-cluster vectors (works for any number
        # of clusters, including 1) and weight by idf
        wtv_i = np.concatenate([prob * vec for prob in probs]) * idf
        wtv += wtv_i
    # Average over all tokens of the record, skipped ones included.
    sentence_vec.append(wtv / len(tokens))
del words

In [None]:
# Assemble all document vectors into one frame with a single constructor
# call: one row per document, labelled d_0, d_1, ...
# Growing the frame row by row with DataFrame.append is quadratic, and
# DataFrame.append no longer exists in pandas 2.0+.
scdv_df = pd.DataFrame(
    sentence_vec,
    index=['d_' + str(i) for i in range(len(sentence_vec))],
)

In [None]:
# Write the document vectors to CSV. index=False means the d_<i> row labels
# are not written; rows are identified by position only.
output_path = "output/test_output.csv"
# Create the target directory first so a fresh checkout does not fail on a
# missing 'output/' folder.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
scdv_df.to_csv(output_path, index=False)