In [1]:
#!pip install -U nltk
#nltk.download()

In [2]:
#from nltk.book import *

In [21]:
import nltk
import random
import numpy as np
import time

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

'''
# 必要であれば変更してください
import os
os.chdir('/root/userspace/team9_project')

'''
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [4]:
import pandas as pd

  (fname, cnt))
  (fname, cnt))


In [5]:
def load_data(path):
    """『こころ』を読み込むための関数

    :param path: str, 『こころ』のパス
    :return text: list of list of str, 各文がトークナイズされた『こころ』
    """
    text = []
    with open(path, "r") as f:
        for line in f:
            line = line.strip()
            line = nltk.word_tokenize(line)
            text.append(line)
    return text

In [6]:
te = "hello, i am mako."
nltk.word_tokenize(te)

['hello', ',', 'i', 'am', 'mako', '.']

In [7]:
text = load_data("dataset/Material/recipe1M/recipe1M_0.txt")
#text = load_data("dataset/Material/recipes2.txt")
#text = load_data("dataset/kokoro.txt")

In [26]:
text += load_data("dataset/Material/recipe1M/recipe1M_1.txt")

In [28]:
text += load_data("dataset/Material/recipe1M/recipe1M_2.txt")

In [29]:
# 特殊なトークンとそのIDは事前に定義しておきます。
PAD_TOKEN = '<PAD>' # あとで説明するpaddingに使います
UNK_TOKEN = '<UNK>' # 辞書にない単語は全てこのUNKトークンに置き換えます。(UNKの由来はunkownです)
PAD = 0 # <PAD>のID
UNK = 1 # <UNK>のID

In [30]:
# 辞書の初期化
word2id = {
    PAD_TOKEN: PAD,
    UNK_TOKEN: UNK,
}

# 辞書に含める単語の最低出現回数
# 今回はコーパスのサイズが小さいので、全ての単語を辞書に含めることにします
MIN_COUNT = 10

In [31]:
class Vocab(object):
    """語彙を管理するためのクラス"""
    def __init__(self, word2id={}):
        """
        :param word2id: 単語(str)をインデックス(int)に変換する辞書
        """
        self.word2id = dict(word2id)
        self.id2word = {v: k for k, v in self.word2id.items()}    

    def build_vocab(self, sentences, min_count=1):
        """コーパスから語彙の辞書を構築するメソッド

        :param sentences: list of list of str, コーパス
        :param min_count: int, 辞書に含める単語の最小出現回数
        """
        # 各単語の出現回数をカウントする辞書を作成します
        word_counter = {}
        for sentence in sentences:
            for word in sentence:
                # dict.get(key, 0)はdictにkeyがあればdict[key]を、なければ0を返すメソッドです
                word_counter[word] = word_counter.get(word, 0) + 1

        # min_count回以上出現する単語のみ語彙に加えます
        # 出現回数の高い単語から順にword2idに追加していきます
        # 出現回数に-1をかけた値でsortすることで出現回数の降順になるようにしています
        for word, count in sorted(word_counter.items(), key=lambda x: -x[1]):
            if count < min_count:
                break
            _id = len(self.word2id)
            self.word2id.setdefault(word, _id)
            self.id2word[_id] = word

        # 語彙に含まれる単語の出現回数を保持します（あとで使います）
        self.raw_vocab = {w: word_counter[w] for w in self.word2id.keys() if w in word_counter}

In [32]:
vocab = Vocab(word2id=word2id)
vocab.build_vocab(text, min_count=MIN_COUNT)
print("語彙数:", len(vocab.word2id))

語彙数: 18322


In [33]:
def sentence_to_ids(vocab, sen):
    """
    単語のリストをIDのリストに変換する関数

    :param vocab: class `Vocab` object
    :param sen: list of str, 文を分かち書きして得られた単語のリスト
    :return out: list of int, 単語IDのリスト
    """
    out = [vocab.word2id.get(word, UNK) for word in sen] # 辞書にない単語にはUNKのIDを割り振ります
    return out

In [34]:
# 日本語のテキストを単語IDに変換します。
id_text = [sentence_to_ids(vocab, sen) for sen in text]

In [35]:
print(len(text))
print(len(id_text))

3
3


In [36]:
def pad_seq(seq, max_length):
    """Paddingを行う関数

    :param seq: list of int, 単語のインデックスのリスト
    :param max_length: int, バッチ内の系列の最大長
    :return seq: list of int, 単語のインデックスのリスト
    """
    seq += [PAD for i in range(max_length - len(seq))]
    return seq

## CBOW

In [38]:
# Hyper Parameters
batch_size = 64 # ミニバッチのサイズ
n_batches = 500 # 今回学習するミニバッチの数
vocab_size = len(vocab.word2id) # 語彙の総数
embedding_size = 300 # 各単語に割り当てるベクトルの次元数

In [39]:
class TestIter(object):
    def __init__(self):
        self.iter = 0
        self.max_iter = 5
    
    def __iter__(self): # 必須
        print("iter関数が呼び出されました")
        return self
    
    def __next__(self):
        self.iter += 1
        print("next関数が呼び出されました({}回目)".format(self.iter))
        if self.iter < self.max_iter:
            return None
        else:
            print("max_iterに達したので終了します")
            raise StopIteration

In [40]:
class DataLoaderCBOW(object):
    """CBOWのデータローダー"""
    def __init__(self, text, batch_size, window=3):
        """
        :param text: list of list of int, 単語をIDに変換したデータセット
        :param batch_size: int, ミニバッチのサイズ
        :param window: int, 周辺単語とターゲットの単語の最大距離
        """
        self.text = text
        self.batch_size = batch_size
        self.window = window
        self.s_pointer = 0 # データセット上を走査する文単位のポインタ
        self.w_pointer = 0 # データセット上を走査する単語単位のポインタ
        self.max_s_pointer = len(text) # データセットに含まれる文の総数

    def __iter__(self):
        return self

    def __next__(self):
        batch_X = []
        batch_Y = []
        while len(batch_X) < self.batch_size:
            # 走査する対象の文
            sen = self.text[self.s_pointer]
            
            # 予測すべき単語
            word_Y = sen[self.w_pointer]
            
            # 入力となる単語群を取得
            start = max(0, self.w_pointer - self.window)
            word_X = sen[start:self.w_pointer] + \
                sen[self.w_pointer + 1:self.w_pointer + self.window + 1]
            word_X = pad_seq(word_X, self.window * 2)
            
            batch_X.append(word_X)
            batch_Y.append(word_Y)
            self.w_pointer += 1
            
            if self.w_pointer >= len(sen):
                # 文を走査し終わったら次の文の先頭にポインタを移行する
                self.w_pointer = 0
                self.s_pointer += 1
                if self.s_pointer >= self.max_s_pointer:
                    # 全ての文を走査し終わったら終了する
                    self.s_pointer = 0
                    raise StopIteration

        # データはtorch.Tensorにする必要があります。dtype, deviceも指定します。
        batch_X = torch.tensor(batch_X, dtype=torch.long, device=device)
        batch_Y = torch.tensor(batch_Y, dtype=torch.long, device=device)

        return batch_X, batch_Y

In [41]:
class CBOW(nn.Module):
    def __init__(self, vocab_size, embedding_size):
        super(CBOW, self).__init__()
        """
        :param vocab_size: int, 語彙の総数
        :param embedding_size: int, 単語埋め込みベクトルの次元
        """
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size

        # 埋め込み層
        self.embedding = nn.Embedding(self.vocab_size, self.embedding_size)
        # 全結合層(バイアスなし)
        self.linear = nn.Linear(self.embedding_size, self.vocab_size, bias=False)
        
        nn.init.xavier_uniform_(self.embedding.weight)
        nn.init.xavier_uniform_(self.linear.weight)

    def forward(self, batch_X, batch_Y):
        """
        :param batch_X: torch.Tensor(dtype=torch.long), (batch_size, window*2)
        :param batch_Y: torch.Tensor(dtype=torch.long), (batch_size,)
        :return loss: torch.Tensor(dtype=torch.float), CBOWのloss
        """
        emb_X = self.embedding(batch_X) # (batch_size, window*2, embedding_size)
        # paddingした部分を無視するためにマスクをかけます
        emb_X = emb_X * (batch_X != PAD).float().unsqueeze(-1) # (batch_size, window*2, embedding_size)
        sum_X = torch.sum(emb_X, dim=1) # (batch_size, embedding_size)
        lin_X = self.linear(sum_X) # (batch_size, vocab_size)
        log_prob_X = F.log_softmax(lin_X, dim=-1) # (batch_size, vocab_size)
        loss = F.nll_loss(log_prob_X, batch_Y)
        return loss

In [42]:
# モデル
cbow = CBOW(vocab_size, embedding_size).to(device) # iLectで実行する場合warning (GPU is too old) が出ますが, 実行に問題はないので気にせず進めてください.
# optimizer
optimizer_cbow = optim.Adam(cbow.parameters())
# データローダー
dataloader_cbow = DataLoaderCBOW(id_text, batch_size)

In [43]:
def compute_loss(model, inputs, optimizer=None, is_train=True):
    """lossを計算するための関数
    
    is_train=Trueならモデルをtrainモードに、
    is_train=Falseならモデルをevaluationモードに設定します
    
    :param model: 学習させるモデル
    :param inputs: モデルへの入力
    :param optimizer: optimizer
    :param is_train: bool, モデルtrainさせるか否か
    """
    model.train(is_train)

    # lossを計算します。
    loss = model(*inputs)

    if is_train:
        # .backward()を実行する前にmodelのparameterのgradientを全て0にセットします
        optimizer.zero_grad()
        # parameterのgradientを計算します。
        loss.backward()
        # parameterのgradientを用いてparameterを更新します。
        optimizer.step()

    return loss.item()

In [44]:
start_at = time.time()

for batch_id, (batch_X, batch_Y) in enumerate(dataloader_cbow):
    loss = compute_loss(cbow, (batch_X, batch_Y), optimizer=optimizer_cbow, is_train=True)
    if batch_id % 100 == 0:
        print("batch:{}, loss:{:.4f}".format(batch_id, loss))
    if batch_id >= n_batches:
        break

end_at = time.time()

print("Elapsed time: {:.2f} [sec]".format(end_at - start_at))

batch:0, loss:9.8155
batch:100, loss:6.6067
batch:200, loss:6.3358
batch:300, loss:6.5026
batch:400, loss:5.8339
batch:500, loss:6.1636
Elapsed time: 4.95 [sec]


In [45]:
# 埋め込み層のパラメータのみを保存する
torch.save(cbow.embedding.weight.data.cpu().numpy(),  "dataset/recipe_en/recipe_en_cbow_embedding_{}b_min_{}_dim_{}.pth".format(n_batches,MIN_COUNT,embedding_size))
#torch.save(cbow.embedding.weight.data.cpu().numpy(),  "recipe_cbow_embedding_{}b_min_{}_dim_{}.pth".format(n_batches,MIN_COUNT,embedding_size))

## Skipgram

In [46]:
class DataLoaderSG(object):
    """Skipgramのためのデータローダー"""
    def __init__(self, text, batch_size, window=3):
        """
        :param text: list of list of int, 単語をIDに変換したデータセット
        :param batch_size: int, ミニバッチのサイズ
        :param window: int, 周辺単語と入力単語の最大距離
        """
        self.text = text
        self.batch_size = batch_size
        self.window = window
        self.s_pointer = 0 # データセット上を走査する文単位のポインタ
        self.w_pointer = 0 # データセット上を走査する単語単位のポインタ
        self.max_s_pointer = len(text) # データセットに含まれる文の総数

    def __iter__(self):
        return self

    def __next__(self):
        batch_X = []
        batch_Y = []

        while len(batch_X) < self.batch_size:
            sen = self.text[self.s_pointer]
            
            # Skipgramでは入力が1単語
            word_X = sen[self.w_pointer]

            # 出力は周辺単語
            start = max(0,self.w_pointer-self.window)
            word_Y = sen[start:self.w_pointer]+sen[self.w_pointer+1:self.w_pointer+self.window+1]
            word_Y = pad_seq(word_Y,self.window*2)

            batch_X.append(word_X)
            batch_Y.append(word_Y)
            self.w_pointer += 1

            if self.w_pointer >= len(sen):
                self.w_pointer = 0
                self.s_pointer += 1
                if self.s_pointer >= self.max_s_pointer:
                    self.s_pointer = 0
                    raise StopIteration

        batch_X = torch.tensor(batch_X, dtype=torch.long, device=device)
        batch_Y = torch.tensor(batch_Y, dtype=torch.long, device=device)

        return batch_X, batch_Y

In [47]:
class Skipgram(nn.Module):
    def __init__(self, vocab_size, embedding_size):
        """
        :param vocab_size: int, 語彙の総数
        :param embedding_size: int, 単語埋め込みベクトルの次元
        """
        super(Skipgram, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size

        self.embedding = nn.Embedding(self.vocab_size, self.embedding_size)
        self.linear = nn.Linear(self.embedding_size, self.vocab_size, bias=False)

    def forward(self, batch_X, batch_Y):
        """
        :param batch_X: torch.Tensor(dtype=torch.long), (batch_size,)
        :param batch_Y: torch.Tensor(dtype=torch.long), (batch_size, window*2)
        :return loss: torch.Tensor(dtype=torch.float), Skipgramのloss
        """
        emb_X = self.embedding(batch_X)
        lin_X = self.linear(emb_X)
        log_prob_X = F.log_softmax(lin_X,dim=-1)
        log_prob_X = torch.gather(log_prob_X, 1, batch_Y) # (batch_size, window*2)
        # paddingした単語のlossは計算しないようにマスクをかけます(=lossの該当部分を0にします)
        log_prob_X = log_prob_X * (batch_Y != PAD).float() # (batch_size, window*2)
        loss = log_prob_X.sum(1).mean().neg()
        return loss

In [48]:
sg = Skipgram(vocab_size, embedding_size).to(device)
optimizer_sg = optim.Adam(sg.parameters())
dataloader_sg = DataLoaderSG(id_text, batch_size)

In [49]:
start_at = time.time()
for batch_id, (batch_X, batch_Y) in enumerate(dataloader_sg):
    loss = compute_loss(sg, (batch_X, batch_Y), optimizer=optimizer_sg, is_train=True)
    if batch_id % 100 == 0:
        print("batch:{}, loss:{:.4f}".format(batch_id, loss))
    if batch_id >= n_batches:
        break
end_at = time.time()
print("Elapsed time: {:.2f} [sec]".format(end_at - start_at))

batch:0, loss:58.9649
batch:100, loss:50.3753
batch:200, loss:48.0131
batch:300, loss:44.3007
batch:400, loss:39.4146
batch:500, loss:44.1356
Elapsed time: 4.89 [sec]


In [50]:
# 埋め込み層のパラメータのみを保存する
torch.save(sg.embedding.weight.data.cpu().numpy(),  "dataset/recipe_en/recipe_en_sg_embedding_{}b_min_{}_dim_{}.pth".format(n_batches,MIN_COUNT,embedding_size))
#torch.save(sg.embedding.weight.data.cpu().numpy(),  "recipe_sg_embedding_{}b_min_{}_dim_{}.pth".format(n_batches,MIN_COUNT,embedding_size))

## 4. Skipgram with Negative Sampling

In [51]:
# negative samplingに使う確率分布
weights = np.power([0, 0] + list(vocab.raw_vocab.values()), 0.75)
weights = weights / weights.sum()

In [52]:
class DataLoaderSGNS(object):
    def __init__(self, text, batch_size, window=3, n_negative=5, weights=None):
        """
        :param text: list of list of int, 単語をIDに変換したデータセット
        :param batch_size: int, ミニバッチのサイズ
        :param window: int, 周辺単語と入力単語の最大距離
        :param n_negative: int, 負例の数
        :param weights: numpy.ndarray, Negative Samplingで使う確率分布
        """
        self.text = text
        self.batch_size = batch_size
        self.window = window
        self.n_negative = n_negative
        self.weights = None
        if weights is not None:
            self.weights = torch.FloatTensor(weights) # negative samplingに使う確率分布
        self.s_pointer = 0 # 文のポインタ
        self.w_pointer = 0 # 単語のポインタ
        self.max_s_pointer = len(text)

    def __iter__(self):
        return self
    
    def __next__(self):
        batch_X = []
        batch_Y = []
        batch_N = [] # 負例
        while len(batch_X) < self.batch_size:
            sen = self.text[self.s_pointer]
            start = max(0, self.w_pointer - self.window)
            word_X = sen[self.w_pointer]
            word_Y = sen[start:self.w_pointer] + \
                sen[self.w_pointer + 1:self.w_pointer + self.window + 1]
            word_Y = pad_seq(word_Y, self.window * 2)
            batch_X.append(word_X)
            batch_Y.append(word_Y)

            # 多項分布で負例をサンプリング
            # 実装を簡略化するために、正例の除去は行っていません
            negative_samples = torch.multinomial(self.weights, self.n_negative) # (n_negative,)
            batch_N.append(negative_samples.unsqueeze(0)) # (1, n_negative)

            self.w_pointer += 1
            if self.w_pointer >= len(sen):
                self.w_pointer = 0
                self.s_pointer += 1
                if self.s_pointer >= self.max_s_pointer:
                    self.s_pointer = 0
                    raise StopIteration

        batch_X = torch.tensor(batch_X, dtype=torch.long, device=device)
        batch_Y = torch.tensor(batch_Y, dtype=torch.long, device=device)
        batch_N = torch.cat(batch_N, dim=0).to(device) # (batch_size, n_negative)

        return batch_X, batch_Y, batch_N

In [53]:
class SGNS(nn.Module):
    def __init__(self, vocab_size, embedding_size):
        """
        :param vocab_size: int, 語彙の総数
        :param embedding_size: int, 単語埋め込みベクトルの次元
        """
        super(SGNS, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size

        # 入力単語の埋め込み層
        self.i_embedding = nn.Embedding(self.vocab_size, self.embedding_size)
        # 出力単語の埋め込み層
        self.o_embedding = nn.Embedding(self.vocab_size, self.embedding_size)
        
        nn.init.xavier_uniform_(self.i_embedding.weight)
        nn.init.xavier_uniform_(self.o_embedding.weight)

    def forward(self, batch_X, batch_Y, batch_N):
        """
        :param batch_x: torch.Tensor(dtype=torch.long), (batch_size,)
        :param batch_y: torch.Tensor(dtype=torch.long), (batch_size, window*2)
        :param batch_n: torch.Tensor(dtype=torch.long), (batch_size, n_negative)
        """
        embed_X = self.i_embedding(batch_X).unsqueeze(2) # (batch_size, embedding_size, 1)
        embed_Y = self.o_embedding(batch_Y) # (batch_size, window*2, embedding_size)
        embed_N = self.o_embedding(batch_N).neg() # (batch_size, n_negative, embedding_size)
        loss_Y = torch.bmm(embed_Y, embed_X).squeeze().sigmoid().log() # (batch_size, window*2)
        loss_Y = loss_Y * (batch_Y != PAD).float() # (batch_size, window*2)
        loss_Y = loss_Y.sum(1) # (batch_size,)
        loss_N = torch.bmm(embed_N, embed_X).squeeze().sigmoid().log().sum(1) # (batch_size,)
        return -(loss_Y + loss_N).mean()

In [54]:
sgns = SGNS(vocab_size, embedding_size).to(device)
optimizer_sgns = optim.Adam(sgns.parameters())
dataloader_sgns = DataLoaderSGNS(id_text, batch_size, n_negative=5, weights=weights)
start_at = time.time()
for batch_id, (batch_X, batch_Y, batch_N) in enumerate(dataloader_sgns):
    loss = compute_loss(sgns, (batch_X, batch_Y, batch_N), optimizer=optimizer_sgns, is_train=True)
    if batch_id % 100 == 0:
        print("batch:{}, loss:{:.4f}".format(batch_id, loss))
    if batch_id >= n_batches:
        break
end_at = time.time()
print("Elapsed time: {:.2f} [sec]".format(end_at - start_at))

batch:0, loss:7.5590
batch:100, loss:7.4037
batch:200, loss:7.2772
batch:300, loss:7.3338
batch:400, loss:6.9210
batch:500, loss:7.2747
Elapsed time: 44.34 [sec]


In [55]:
# Embeddingのパラメータのみを保存する
torch.save(sgns.i_embedding.weight.data.cpu().numpy(),  "dataset/recipe_en/recipe_en_sgns_embedding_{}b_min_{}_dim_{}.pth".format(n_batches,MIN_COUNT,embedding_size))
#torch.save(sgns.i_embedding.weight.data.cpu().numpy(),  "recipe_sgns_embedding_{}b_min_{}_dim_{}.pth".format(n_batches,MIN_COUNT,embedding_size))

In [56]:
def compute_word_similarity(embedding_path, word, n):
    """
    与えられた単語に最も似ている単語とcos類似度を返す関数

    :param embedding_path: str, 保存した埋め込み層のパラメータのパス
    :param word: str, 単語
    :param n: int
    :return out: str, 上位n個の類似単語とそのcos類似度
    """
    embedding = torch.load(embedding_path)

    # 単語ベクトルを全て単位ベクトルにする
    norm = np.linalg.norm(embedding, ord=2, axis=1, keepdims=True)
    norm = np.where(norm==0, 1, norm) # 0で割ることを避ける
    embedding /= norm
    e = embedding[vocab.word2id[word]]

    # 単語ベクトル同士のcos類似度を計算する
    cos_sim = np.dot(embedding, e.reshape(-1, 1)).reshape(-1,)
    most_sim = np.argsort(cos_sim)[::-1][1:n+1] # 自分は除く
    most_sim_words = [vocab.id2word[_id] for _id in most_sim]
    top_cos_sim = cos_sim[most_sim]
    out = ", ".join([w+"({:.4f})".format(v) for w, v in zip(most_sim_words, top_cos_sim)])
    return out

In [58]:
models = ["cbow", "sg", "sgns"]
for model in models:
    print(model+"\t:", compute_word_similarity(
        "dataset/recipe_en/recipe_en_"+model + "_embedding_500b_min_10_dim_300.pth", "salt", 5))

cbow	: green(0.8927), sugar(0.8760), parsley(0.8740), powder(0.8670), pepper(0.8665)
sg	: another(0.2469), 4mg(0.1997), 2min(0.1919), U.S.(0.1883), 450*F.(0.1882)
sgns	: pepper(0.8969), sugar(0.8910), red(0.8875), black(0.8870), season(0.8787)


In [59]:
models = ["cbow", "sg", "sgns"]
for model in models:
    print(model+"\t:", compute_word_similarity(
        "dataset/recipe_en/recipe_en_"+model + "_embedding_500b_min_10_dim_300.pth", "mushrooms", 5))

cbow	: honey(0.9445), eggs(0.9438), apple(0.9429), rosemary(0.9419), fridge(0.9407)
sg	: various(0.2140), undissolved(0.2120), cocotte(0.2094), sec(0.2000), Cabernet(0.1960)
sgns	: thyme(0.8698), bacon(0.8666), onions(0.8619), honey(0.8613), peppers(0.8608)


In [60]:
def cosine_similarity(e1, e2):
    """
    与えられた単語に最も似ている単語とcos類似度を返す関数

    :param embedding_path: str, 保存した埋め込み層のパラメータのパス
    :param word: str, 単語
    :param n: int
    :return out: str, 上位n個の類似単語とそのcos類似度
    """

    norm2 = np.linalg.norm(e2, ord=2)
    norm2 = np.where(norm2==0, 1, norm2) # 0で割ることを避ける
    e2 /= norm2
    
    norm1 = np.linalg.norm(e1, ord=2)
    norm1 = np.where(norm1==0, 1, norm1) # 0で割ることを避ける
    e1 /= norm1
    
    # 単語ベクトル同士のcos類似度を計算する
    cos_sim = np.dot(e1, e2)
 
    return cos_sim

In [61]:
embedding_path = "dataset/recipe_en/recipe_en_cbow_embedding_500b_min_10_dim_300.pth"
embedding = torch.load(embedding_path)
print(embedding.shape)

(18322, 300)


In [62]:
e1 = embedding[vocab.word2id['salt']]
e2 = embedding[vocab.word2id['pepper']]

In [63]:
cosine_similarity(e1,e2)

0.86653715

In [64]:
clusterVec = [embedding[0]]     # tracks sum of vectors in a cluster
clusterIdx = []    # array of index arrays. e.g. [[1, 3, 5], [2, 4, 6]]
ncluster = 0

In [65]:
# probablity to create a new table if new customer
# is not strongly "similar" to any existing table
pnew = 1.0/ (1 + ncluster)  
N = len(embedding)
#rands = random.rand(N)         # N rand variables sampled from U(0, 1)
print(N)

18322


In [66]:
v = embedding[0]
sim = cosine_similarity(v, clusterVec[0])
print(sim)

1.0


In [73]:
 for i in range(N):
    maxSim = -float('inf')
    maxIdx = 0
    v = embedding[i]
    for j in range(ncluster):
        sim = cosine_similarity(v, clusterVec[j])
        if sim > maxSim:
            maxIdx = j
            maxSim = sim
    # 新しいクラスタを作成
    if maxSim < pnew:
        if random.random() < pnew:
            clusterVec.append(v)
            clusterIdx.append([i])
            ncluster += 1
            pnew = 1.0 / (1 + ncluster)
            continue
    clusterVec[maxIdx] = clusterVec[maxIdx] +v
    clusterIdx[maxIdx].append(i)

In [74]:
print(len(clusterIdx))

24


In [75]:
for i in range(0,len(clusterIdx)):
    print(len(clusterIdx[i]))

1712
3318
1964
1540
1551
1482
1497
1545
1449
1545
1552
1555
1519
1551
1464
3241
1429
1365
1339
1258
1570
812
369
17


In [76]:
for i in range(0, len(clusterIdx[1])):
    print(vocab.id2word[clusterIdx[1][i]])

<UNK>
the
Add
salt
)
it
add
water
oil
sugar
butter
pepper
cook
sauce
cheese
chicken
then
top
mix
flour
stir
remaining
cream
together
garlic
cup
dough
skillet
boil
small
Pour
Mix
out
as
Serve
Cook
Heat
dish
be
aside
Combine
Cover
serve
not
serving
can
cooking
bread
Sprinkle
cake
warm
Cut
I
dry
min
20
this
time
hour
bake
use
1/4
but
Let
tomatoes
minute
Bring
taste
If
8
melted
When
batter
inch
down
Put
bring
that
sprinkle
Transfer
6
so
pour
till
whisk
beat
using
tablespoons
rack
browned
Drain
evenly
liquid
spoon
pasta
completely
Season
occasionally
slightly
desired
they
saute
another
temperature
coat
soup
Top
broth
grill
Set
beans
To
thick
toss
Cool
drain
bacon
some
chopped
The
least
two
room
pork
clean
350F
comes
Refrigerate
wire
Roll
inches
casserole
Once
Then
Make
Fold
-
25
sure
container
possibly
overnight
may
chill
uncovered
skin
Repeat
Simmer
cilantro
Use
whites
coconut
balls
muffin
piece
floured
heated
again
topping
fold
electric
vegetable
soy
platter
size
honey
microwave
Divide
ad

Bakes
State
dealing
350deg
darkest
Straight
pithy
spits
bobotie
chantilly
Foam
Nancy
choco
8-by-11-inch
passion-fruit
70mg
3-count
9x11-inch
spatual
yomogi
milliliters
Buzz
ship
bleached
*At
boulettes
335F
waterproof
2min
dishwashing
eith
210F
bowl..
Rigatoni
11-inch-diameter
Burrito
13in
butter/
satchel
28-33
closes
Plantains
zag
cuisines
Fillings
hug
minutes.Preheat
C/375
elevate
Spreadable
outlined
Wakame
pf
shoestrings
tapas
crisp-
JELLY
clingwrap
mezcal
Jiggle
garage
cheating
Ed
origin
Covering
Blanco
accident
attempts
owned
egg-wash
insane
Garlic-Parmesan
cents
gramflour
Pounds
carambola
Shirataki
doness
surimi
137
Divid
CREAMY
chilles
JARS
lenghthwise
Puerto
2-3mins
paillard
atta
Thomas
BTW
dew
majoram
stand-up
cookie/baking
munutes
reincorporate
time-consuming
LASAGNA
Salisbury
Marjoram
tracing
fern
radiating
lable
paper/foil
browed
shaper
cashewnut
laksa
tomato-meat
dishtowels
8X4
vinigar
4-to
9-1/2-inch
OnlineAbout
kasseri
wildly
well-distributed
vintage
Concord
stems
feta
Wo

In [91]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

  'Matplotlib is building the font cache using fc-list. '


In [78]:
#!pip install scikit.learn

Collecting scikit.learn
[?25l  Downloading https://files.pythonhosted.org/packages/5e/82/c0de5839d613b82bddd088599ac0bbfbbbcbd8ca470680658352d2c435bd/scikit_learn-0.20.3-cp36-cp36m-manylinux1_x86_64.whl (5.4MB)
[K    100% |████████████████████████████████| 5.4MB 10.9MB/s ta 0:00:01
[?25hCollecting scipy>=0.13.3 (from scikit.learn)
[?25l  Downloading https://files.pythonhosted.org/packages/7f/5f/c48860704092933bf1c4c1574a8de1ffd16bf4fde8bab190d747598844b2/scipy-1.2.1-cp36-cp36m-manylinux1_x86_64.whl (24.8MB)
[K    100% |████████████████████████████████| 24.8MB 2.8MB/s eta 0:00:01
Installing collected packages: scipy, scikit.learn
Successfully installed scikit.learn scipy-1.2.1


In [129]:
clusters = KMeans(n_clusters=5, random_state=0).fit_predict(embedding)

In [131]:
clusters

array([2, 4, 0, ..., 1, 0, 0], dtype=int32)

In [132]:
clusters==1

array([False, False, False, ...,  True, False, False])

In [136]:
for i in range(len(np.where(clusters == 0)[0])):
    print(vocab.id2word[np.where(clusters==0)[0][i]])

.
!
by
F.
adding
between
along
made
than
color
custard
sandwich
stems
candy
chile
strainer
portion
griddle
springform
1/8
larger
...
finished
Uncover
feta
basting
whipping
sherry
Working
200
sea
barely
delicious
lift
try
shred
cloth
looks
%
gas
14
rounded
six
heatproof
invert
maker
what
6-8
chickpeas
liqueur
head
jalapenos
regular
tight
blade
hole
whatever
de
corner
crusty
*
internal
discarding
?
against
crusts
lasagna
drops
different
punch
ending
remainder
Frost
preferably
dinner
brownie
garnished
vodka
crepe
remain
following
keeping
popcorn
save
chestnuts
pipe
throughout
quick
absorb
250
manufacturer
move
tenderloin
fairly
wo
We
Skim
ravioli
Cooking
had
moisten
Try
oysters
oyster
stage
continuously
frost
High
wait
ten
fryer
fraiche
risen
loose
Calories
espresso
[
Alternatively
medium-rare
solid
splash
logs
chunky
recommend
One
length
Grind
crepes
cracked
bird
paring
evaporates
pecan
Evenly
Pam
okra
vary
available
simply
figs
pouring
martini
souffle
our
already
alongside
wipe
layering

server
dough-like
Version
zip-loc
ornament
-3
pinkish
permeate
Thousand
D.
TOMATO
67
fly
southern
Herman
lava
began
MUFFINS
12x8
Ryan
integrate
Holiday
asked
vital
overflowing
Four
E
nonpareils
MORE
altered
otoshibuta
VEGETABLES
robust
sauced
feathery
Except
SIDE
soboro
taters
Rhubarb
pleasant
Smart
Ok
Pampered
8x11
Procedure
cabinet
Continuing
remaning
appeal
F/180
Levi
well-incorporated
windows
precise
velvet
SURE
Ale
hamburg
jambalaya
STORE
Refrigerated
270
Reese
even-sized
peppered
guarantee
leafs
mineral
overmixing
dhal
semi-circle
Gourmet
shoestring
Package
done-ness
extended
Kale
Ghirardelli
Offer
poussins
thinking
partway
Meals
Oct
yoke
satisfaction
bash
food-storage
thermos
intended
Rolling
Hours
lunches
RINSE
venting
KEEP
beige
eighth
nestling
COMPLEMENT
crossing
finding
refrigerateat
Christopher
caserole
DELICIOUS
chesse
chrysanthemum
ladlefuls
Wear
pilot
mentaiko
dishwasher
tortelloni
2015
fanning
Kneed
energy
hinge
spokes
375*F
ove
crunchiness
whir
Obviously
dies
lying
abs

Toothpick
tableside
Bonnet
storebought
boarder
dolmades
TATER
Puck
groceries
grasping
leaven
mummy
saury
PUMPKIN
Above
cocotte
tracks
grew
Tater
sleep
Firm
kidding
Sub
Pickapeppa
tug
Creating
blotting
intake
additives
serious
napoleon
letters
2/3-cup
fourteen
watched
350oF
babas
decision
retrieve
lightly-oiled
NIGHT
actively
cross-section
28-ounce
cream-cheese
amout
mmm
tea-towel
*Daily
1minute
onions/garlic
stubborn
cubing
temperture
neccesary
namul
Triscuits
Advance
brightly
located
Lawrys
dances
Treats
Herbes
230F
overlapped
breeze
Crank
Tranfer
mono
photograph
pupil
MASH
lan
trembles
pisco
400-degrees
smack
MELTED
class
User
reliable
Pepperoni
5-gallon
3-1/2-
ingreds
mellows
shed
lb..
158
madeira
ala
pellets
torpedo
minutos
gound
drainage
Everyday
grease-free
200C/gas
Vermouth
crystallizing
ozs
golfball
cram
7-in
off-set
Places
12in
Cobbler
joes
tedious
mnutes
lait
shielding
cinamon
Sweetener
danish
fade
bouillabaisse
City
Lunch
speeds
dove
grows
helper
Lard
marshmellow
parley
80F


Ziti
grill/broil
accross
wobbles
8x
baths
darkest
Cloves
bakeries
Straight
-8
four-inch
4-pound
potpie
east
425-degree
***This
160-170
Bird
Twice
medium-hight
reshaping
Rotisserie
cheese/spinach
Lighly
spits
pomelo
kneadable
strawberry-rhubarb
remainig
voluminous
pokes
*Variation
Piri
large-holed
Sold
chantilly
teakettle
BEER
Fava
350^F
resulted
credit
Write
roll-style
millimeters
1.7g
mam
dishpan
severed
satisfactory
tamping
OLIVE
.3
Nancy
glistens
vase
abundance
mile
domestic
saffron-infused
pastelike
wringing
flour/baking
Finger
passion-fruit
un-mold
falafels
well-stocked
orgeat
samosa
blowing
Show
yoks
lists
minichips
E-Z
TABASCO
sous
REFRIGERATOR
EDGES
138
carrier
raab
3-count
3775
eyeballed
BACK
tissues
mead
butterflies
Ceviche
saurkraut
semi-circles
spatual
Peruvian
PERFECT
chirashi
Steel
wired
no-boil
sea-salted
smooth.2
milliliters
hoja
approximatley
filberts
poatoes
vapor
wok/pan
warm.Add
mallows
1.Heat
originated
Oregon
Rupp
piano
ship
mimic
Kimchi
masalas
Papaya
Bologna
jam

In [114]:
len(np.where(clusters == 1))

1

In [113]:
vocab.id2word[np.where(clusters==1)[0][2]]

','

In [124]:
for i in range(4):
    print(len(np.where(clusters == i)[0]))

5377
2349
5153
5443
