In [1]:
#!pip install -U nltk
#nltk.download()

In [2]:
#from nltk.book import *

In [14]:
import nltk
import random
import numpy as np
import time

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

'''
# 必要であれば変更してください
import os
os.chdir('/root/userspace/team9_project')

'''
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


## 形態要素解析

In [15]:
import pandas as pd

In [16]:
def load_data(path):
    """『こころ』を読み込むための関数

    :param path: str, 『こころ』のパス
    :return text: list of list of str, 各文がトークナイズされた『こころ』
    """
    text = []
    with open(path, "r") as f:
        for line in f:
            line = line.strip()
            line = nltk.word_tokenize(line)
            text.append(line)
    return text

### レシピ文章を読み込む

In [91]:
raw = pd.read_csv(r"dataset/Material/title_recipe/title_recipe_60.csv", index_col=0 )
#text = load_data("dataset/Material/recipe1M/recipe1M_0.txt")
#text = load_data("dataset/Material/recipes2.txt")
#text = load_data("dataset/kokoro.txt")

In [26]:
#text += load_data("dataset/Material/recipe1M/recipe1M_1.txt")

In [132]:
#text += load_data("dataset/Material/recipe1M/recipe1M_2.txt")

In [133]:
raw.shape[0]

134921

In [134]:
all_recipe = []
for i in range(raw.shape[0]):
    all_recipe.append(nltk.word_tokenize(raw.iloc[i, 1].lower()))

In [135]:
text = all_recipe

In [136]:
print(text[1100])

['combine', 'the', 'salt', ',', 'sugar', ',', 'rum', ',', 'cola', ',', 'and', 'honey', 'until', 'they', 'are', 'completely', 'dissolved', '.', 'then', ',', 'add', 'the', 'pork', 'tenderloin', 'and', 'marinate', 'in', 'the', 'refrigerator', 'for', '48', 'hours', '.', 'preheat', 'a', 'grill', '.', 'cut', 'into', '2-inch', 'medallions', 'and', 'grill', 'for', '5', 'to', '7', 'minutes', 'on', 'each', 'side', '.']


In [137]:
# 特殊なトークンとそのIDは事前に定義しておきます。
PAD_TOKEN = '<PAD>' # あとで説明するpaddingに使います
UNK_TOKEN = '<UNK>' # 辞書にない単語は全てこのUNKトークンに置き換えます。(UNKの由来はunkownです)
PAD = 0 # <PAD>のID
UNK = 1 # <UNK>のID

In [141]:
# 辞書の初期化
word2id = {
    PAD_TOKEN: PAD,
    UNK_TOKEN: UNK,
}

# 辞書に含める単語の最低出現回数
# 今回はコーパスのサイズが小さいので、全ての単語を辞書に含めることにします
MIN_COUNT = 3

In [142]:
class Vocab(object):
    """語彙を管理するためのクラス"""
    def __init__(self, word2id={}):
        """
        :param word2id: 単語(str)をインデックス(int)に変換する辞書
        """
        self.word2id = dict(word2id)
        self.id2word = {v: k for k, v in self.word2id.items()}    

    def build_vocab(self, sentences, min_count=1):
        """コーパスから語彙の辞書を構築するメソッド

        :param sentences: list of list of str, コーパス
        :param min_count: int, 辞書に含める単語の最小出現回数
        """
        # 各単語の出現回数をカウントする辞書を作成します
        word_counter = {}
        for sentence in sentences:
            for word in sentence:
                # dict.get(key, 0)はdictにkeyがあればdict[key]を、なければ0を返すメソッドです
                word_counter[word] = word_counter.get(word, 0) + 1

        # min_count回以上出現する単語のみ語彙に加えます
        # 出現回数の高い単語から順にword2idに追加していきます
        # 出現回数に-1をかけた値でsortすることで出現回数の降順になるようにしています
        for word, count in sorted(word_counter.items(), key=lambda x: -x[1]):
            if count < min_count:
                break
            _id = len(self.word2id)
            self.word2id.setdefault(word, _id)
            self.id2word[_id] = word

        # 語彙に含まれる単語の出現回数を保持します（あとで使います）
        self.raw_vocab = {w: word_counter[w] for w in self.word2id.keys() if w in word_counter}

In [143]:
vocab = Vocab(word2id=word2id)
vocab.build_vocab(text, min_count=MIN_COUNT)
print("語彙数:", len(vocab.word2id))

語彙数: 25291


In [144]:
def sentence_to_ids(vocab, sen):
    """
    単語のリストをIDのリストに変換する関数

    :param vocab: class `Vocab` object
    :param sen: list of str, 文を分かち書きして得られた単語のリスト
    :return out: list of int, 単語IDのリスト
    """
    out = [vocab.word2id.get(word, UNK) for word in sen] # 辞書にない単語にはUNKのIDを割り振ります
    return out

In [145]:
# 日本語のテキストを単語IDに変換します。
id_text = [sentence_to_ids(vocab, sen) for sen in text]

In [146]:
print(text[2])
print(len(id_text))

['add', 'the', 'tomatoes', 'to', 'a', 'food', 'processor', 'with', 'a', 'pinch', 'of', 'salt', 'and', 'puree', 'until', 'smooth', '.', 'combine', 'the', 'onions', ',', 'bell', 'peppers', 'and', 'cucumbers', 'with', 'the', 'tomato', 'puree', 'in', 'a', 'large', 'bowl', '.', 'chill', 'at', 'least', '1', 'hour', '.', 'drizzle', 'with', 'olive', 'oil', ',', 'garnish', 'with', 'chopped', 'basil', 'and', 'serve', '.']
134921


In [147]:
def pad_seq(seq, max_length):
    """Paddingを行う関数

    :param seq: list of int, 単語のインデックスのリスト
    :param max_length: int, バッチ内の系列の最大長
    :return seq: list of int, 単語のインデックスのリスト
    """
    seq += [PAD for i in range(max_length - len(seq))]
    return seq

## CBOW

### Hyper Parameters

In [148]:
batch_size = 64 # ミニバッチのサイズ
n_batches = 500 # 今回学習するミニバッチの数
vocab_size = len(vocab.word2id) # 語彙の総数
embedding_size = 300 # 各単語に割り当てるベクトルの次元数

In [149]:
class TestIter(object):
    def __init__(self):
        self.iter = 0
        self.max_iter = 5
    
    def __iter__(self): # 必須
        print("iter関数が呼び出されました")
        return self
    
    def __next__(self):
        self.iter += 1
        print("next関数が呼び出されました({}回目)".format(self.iter))
        if self.iter < self.max_iter:
            return None
        else:
            print("max_iterに達したので終了します")
            raise StopIteration

In [150]:
class DataLoaderCBOW(object):
    """CBOWのデータローダー"""
    def __init__(self, text, batch_size, window=3):
        """
        :param text: list of list of int, 単語をIDに変換したデータセット
        :param batch_size: int, ミニバッチのサイズ
        :param window: int, 周辺単語とターゲットの単語の最大距離
        """
        self.text = text
        self.batch_size = batch_size
        self.window = window
        self.s_pointer = 0 # データセット上を走査する文単位のポインタ
        self.w_pointer = 0 # データセット上を走査する単語単位のポインタ
        self.max_s_pointer = len(text) # データセットに含まれる文の総数

    def __iter__(self):
        return self

    def __next__(self):
        batch_X = []
        batch_Y = []
        while len(batch_X) < self.batch_size:
            # 走査する対象の文
            sen = self.text[self.s_pointer]
            
            # 予測すべき単語
            word_Y = sen[self.w_pointer]
            
            # 入力となる単語群を取得
            start = max(0, self.w_pointer - self.window)
            word_X = sen[start:self.w_pointer] + \
                sen[self.w_pointer + 1:self.w_pointer + self.window + 1]
            word_X = pad_seq(word_X, self.window * 2)
            
            batch_X.append(word_X)
            batch_Y.append(word_Y)
            self.w_pointer += 1
            
            if self.w_pointer >= len(sen):
                # 文を走査し終わったら次の文の先頭にポインタを移行する
                self.w_pointer = 0
                self.s_pointer += 1
                if self.s_pointer >= self.max_s_pointer:
                    # 全ての文を走査し終わったら終了する
                    self.s_pointer = 0
                    raise StopIteration

        # データはtorch.Tensorにする必要があります。dtype, deviceも指定します。
        batch_X = torch.tensor(batch_X, dtype=torch.long, device=device)
        batch_Y = torch.tensor(batch_Y, dtype=torch.long, device=device)

        return batch_X, batch_Y

In [151]:
class CBOW(nn.Module):
    def __init__(self, vocab_size, embedding_size):
        super(CBOW, self).__init__()
        """
        :param vocab_size: int, 語彙の総数
        :param embedding_size: int, 単語埋め込みベクトルの次元
        """
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size

        # 埋め込み層
        self.embedding = nn.Embedding(self.vocab_size, self.embedding_size)
        # 全結合層(バイアスなし)
        self.linear = nn.Linear(self.embedding_size, self.vocab_size, bias=False)
        
        nn.init.xavier_uniform_(self.embedding.weight)
        nn.init.xavier_uniform_(self.linear.weight)

    def forward(self, batch_X, batch_Y):
        """
        :param batch_X: torch.Tensor(dtype=torch.long), (batch_size, window*2)
        :param batch_Y: torch.Tensor(dtype=torch.long), (batch_size,)
        :return loss: torch.Tensor(dtype=torch.float), CBOWのloss
        """
        emb_X = self.embedding(batch_X) # (batch_size, window*2, embedding_size)
        # paddingした部分を無視するためにマスクをかけます
        emb_X = emb_X * (batch_X != PAD).float().unsqueeze(-1) # (batch_size, window*2, embedding_size)
        sum_X = torch.sum(emb_X, dim=1) # (batch_size, embedding_size)
        lin_X = self.linear(sum_X) # (batch_size, vocab_size)
        log_prob_X = F.log_softmax(lin_X, dim=-1) # (batch_size, vocab_size)
        loss = F.nll_loss(log_prob_X, batch_Y)
        return loss

In [152]:
# モデル
cbow = CBOW(vocab_size, embedding_size).to(device) # iLectで実行する場合warning (GPU is too old) が出ますが, 実行に問題はないので気にせず進めてください.
# optimizer
optimizer_cbow = optim.Adam(cbow.parameters())
# データローダー
dataloader_cbow = DataLoaderCBOW(id_text, batch_size)

In [153]:
def compute_loss(model, inputs, optimizer=None, is_train=True):
    """lossを計算するための関数
    
    is_train=Trueならモデルをtrainモードに、
    is_train=Falseならモデルをevaluationモードに設定します
    
    :param model: 学習させるモデル
    :param inputs: モデルへの入力
    :param optimizer: optimizer
    :param is_train: bool, モデルtrainさせるか否か
    """
    model.train(is_train)

    # lossを計算します。
    loss = model(*inputs)

    if is_train:
        # .backward()を実行する前にmodelのparameterのgradientを全て0にセットします
        optimizer.zero_grad()
        # parameterのgradientを計算します。
        loss.backward()
        # parameterのgradientを用いてparameterを更新します。
        optimizer.step()

    return loss.item()

In [154]:
start_at = time.time()

for batch_id, (batch_X, batch_Y) in enumerate(dataloader_cbow):
    loss = compute_loss(cbow, (batch_X, batch_Y), optimizer=optimizer_cbow, is_train=True)
    if batch_id % 100 == 0:
        print("batch:{}, loss:{:.4f}".format(batch_id, loss))
    if batch_id >= n_batches:
        break

end_at = time.time()

print("Elapsed time: {:.2f} [sec]".format(end_at - start_at))

batch:0, loss:10.1377
batch:100, loss:6.5077
batch:200, loss:6.1347
batch:300, loss:6.0533
batch:400, loss:5.7315
batch:500, loss:6.6532
Elapsed time: 59.91 [sec]


In [155]:
# 埋め込み層のパラメータのみを保存する
torch.save(cbow.embedding.weight.data.cpu().numpy(),  "dataset/recipe_en/recipe_en_cbow_embedding_{}b_min_{}_dim_{}.pth".format(n_batches,MIN_COUNT,embedding_size))
#torch.save(cbow.embedding.weight.data.cpu().numpy(),  "recipe_cbow_embedding_{}b_min_{}_dim_{}.pth".format(n_batches,MIN_COUNT,embedding_size))

## Skipgram

In [156]:
class DataLoaderSG(object):
    """Skipgramのためのデータローダー"""
    def __init__(self, text, batch_size, window=3):
        """
        :param text: list of list of int, 単語をIDに変換したデータセット
        :param batch_size: int, ミニバッチのサイズ
        :param window: int, 周辺単語と入力単語の最大距離
        """
        self.text = text
        self.batch_size = batch_size
        self.window = window
        self.s_pointer = 0 # データセット上を走査する文単位のポインタ
        self.w_pointer = 0 # データセット上を走査する単語単位のポインタ
        self.max_s_pointer = len(text) # データセットに含まれる文の総数

    def __iter__(self):
        return self

    def __next__(self):
        batch_X = []
        batch_Y = []

        while len(batch_X) < self.batch_size:
            sen = self.text[self.s_pointer]
            
            # Skipgramでは入力が1単語
            word_X = sen[self.w_pointer]

            # 出力は周辺単語
            start = max(0,self.w_pointer-self.window)
            word_Y = sen[start:self.w_pointer]+sen[self.w_pointer+1:self.w_pointer+self.window+1]
            word_Y = pad_seq(word_Y,self.window*2)

            batch_X.append(word_X)
            batch_Y.append(word_Y)
            self.w_pointer += 1

            if self.w_pointer >= len(sen):
                self.w_pointer = 0
                self.s_pointer += 1
                if self.s_pointer >= self.max_s_pointer:
                    self.s_pointer = 0
                    raise StopIteration

        batch_X = torch.tensor(batch_X, dtype=torch.long, device=device)
        batch_Y = torch.tensor(batch_Y, dtype=torch.long, device=device)

        return batch_X, batch_Y

In [157]:
class Skipgram(nn.Module):
    def __init__(self, vocab_size, embedding_size):
        """
        :param vocab_size: int, 語彙の総数
        :param embedding_size: int, 単語埋め込みベクトルの次元
        """
        super(Skipgram, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size

        self.embedding = nn.Embedding(self.vocab_size, self.embedding_size)
        self.linear = nn.Linear(self.embedding_size, self.vocab_size, bias=False)

    def forward(self, batch_X, batch_Y):
        """
        :param batch_X: torch.Tensor(dtype=torch.long), (batch_size,)
        :param batch_Y: torch.Tensor(dtype=torch.long), (batch_size, window*2)
        :return loss: torch.Tensor(dtype=torch.float), Skipgramのloss
        """
        emb_X = self.embedding(batch_X)
        lin_X = self.linear(emb_X)
        log_prob_X = F.log_softmax(lin_X,dim=-1)
        log_prob_X = torch.gather(log_prob_X, 1, batch_Y) # (batch_size, window*2)
        # paddingした単語のlossは計算しないようにマスクをかけます(=lossの該当部分を0にします)
        log_prob_X = log_prob_X * (batch_Y != PAD).float() # (batch_size, window*2)
        loss = log_prob_X.sum(1).mean().neg()
        return loss

In [158]:
sg = Skipgram(vocab_size, embedding_size).to(device)
optimizer_sg = optim.Adam(sg.parameters())
dataloader_sg = DataLoaderSG(id_text, batch_size)

In [159]:
start_at = time.time()
for batch_id, (batch_X, batch_Y) in enumerate(dataloader_sg):
    loss = compute_loss(sg, (batch_X, batch_Y), optimizer=optimizer_sg, is_train=True)
    if batch_id % 100 == 0:
        print("batch:{}, loss:{:.4f}".format(batch_id, loss))
    if batch_id >= n_batches:
        break
end_at = time.time()
print("Elapsed time: {:.2f} [sec]".format(end_at - start_at))

batch:0, loss:60.9421
batch:100, loss:50.3137
batch:200, loss:44.8178
batch:300, loss:42.8776
batch:400, loss:39.9723
batch:500, loss:44.4829
Elapsed time: 59.80 [sec]


In [160]:
# 埋め込み層のパラメータのみを保存する
torch.save(sg.embedding.weight.data.cpu().numpy(),  "dataset/recipe_en/recipe_en_sg_embedding_{}b_min_{}_dim_{}.pth".format(n_batches,MIN_COUNT,embedding_size))
#torch.save(sg.embedding.weight.data.cpu().numpy(),  "recipe_sg_embedding_{}b_min_{}_dim_{}.pth".format(n_batches,MIN_COUNT,embedding_size))

## Skipgram with Negative Sampling

In [161]:
# negative samplingに使う確率分布
weights = np.power([0, 0] + list(vocab.raw_vocab.values()), 0.75)
weights = weights / weights.sum()

In [162]:
class DataLoaderSGNS(object):
    def __init__(self, text, batch_size, window=3, n_negative=5, weights=None):
        """
        :param text: list of list of int, 単語をIDに変換したデータセット
        :param batch_size: int, ミニバッチのサイズ
        :param window: int, 周辺単語と入力単語の最大距離
        :param n_negative: int, 負例の数
        :param weights: numpy.ndarray, Negative Samplingで使う確率分布
        """
        self.text = text
        self.batch_size = batch_size
        self.window = window
        self.n_negative = n_negative
        self.weights = None
        if weights is not None:
            self.weights = torch.FloatTensor(weights) # negative samplingに使う確率分布
        self.s_pointer = 0 # 文のポインタ
        self.w_pointer = 0 # 単語のポインタ
        self.max_s_pointer = len(text)

    def __iter__(self):
        return self
    
    def __next__(self):
        batch_X = []
        batch_Y = []
        batch_N = [] # 負例
        while len(batch_X) < self.batch_size:
            sen = self.text[self.s_pointer]
            start = max(0, self.w_pointer - self.window)
            word_X = sen[self.w_pointer]
            word_Y = sen[start:self.w_pointer] + \
                sen[self.w_pointer + 1:self.w_pointer + self.window + 1]
            word_Y = pad_seq(word_Y, self.window * 2)
            batch_X.append(word_X)
            batch_Y.append(word_Y)

            # 多項分布で負例をサンプリング
            # 実装を簡略化するために、正例の除去は行っていません
            negative_samples = torch.multinomial(self.weights, self.n_negative) # (n_negative,)
            batch_N.append(negative_samples.unsqueeze(0)) # (1, n_negative)

            self.w_pointer += 1
            if self.w_pointer >= len(sen):
                self.w_pointer = 0
                self.s_pointer += 1
                if self.s_pointer >= self.max_s_pointer:
                    self.s_pointer = 0
                    raise StopIteration

        batch_X = torch.tensor(batch_X, dtype=torch.long, device=device)
        batch_Y = torch.tensor(batch_Y, dtype=torch.long, device=device)
        batch_N = torch.cat(batch_N, dim=0).to(device) # (batch_size, n_negative)

        return batch_X, batch_Y, batch_N

In [163]:
class SGNS(nn.Module):
    def __init__(self, vocab_size, embedding_size):
        """
        :param vocab_size: int, 語彙の総数
        :param embedding_size: int, 単語埋め込みベクトルの次元
        """
        super(SGNS, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size

        # 入力単語の埋め込み層
        self.i_embedding = nn.Embedding(self.vocab_size, self.embedding_size)
        # 出力単語の埋め込み層
        self.o_embedding = nn.Embedding(self.vocab_size, self.embedding_size)
        
        nn.init.xavier_uniform_(self.i_embedding.weight)
        nn.init.xavier_uniform_(self.o_embedding.weight)

    def forward(self, batch_X, batch_Y, batch_N):
        """
        :param batch_x: torch.Tensor(dtype=torch.long), (batch_size,)
        :param batch_y: torch.Tensor(dtype=torch.long), (batch_size, window*2)
        :param batch_n: torch.Tensor(dtype=torch.long), (batch_size, n_negative)
        """
        embed_X = self.i_embedding(batch_X).unsqueeze(2) # (batch_size, embedding_size, 1)
        embed_Y = self.o_embedding(batch_Y) # (batch_size, window*2, embedding_size)
        embed_N = self.o_embedding(batch_N).neg() # (batch_size, n_negative, embedding_size)
        loss_Y = torch.bmm(embed_Y, embed_X).squeeze().sigmoid().log() # (batch_size, window*2)
        loss_Y = loss_Y * (batch_Y != PAD).float() # (batch_size, window*2)
        loss_Y = loss_Y.sum(1) # (batch_size,)
        loss_N = torch.bmm(embed_N, embed_X).squeeze().sigmoid().log().sum(1) # (batch_size,)
        return -(loss_Y + loss_N).mean()

In [164]:
sgns = SGNS(vocab_size, embedding_size).to(device)
optimizer_sgns = optim.Adam(sgns.parameters())
dataloader_sgns = DataLoaderSGNS(id_text, batch_size, n_negative=5, weights=weights)
start_at = time.time()
for batch_id, (batch_X, batch_Y, batch_N) in enumerate(dataloader_sgns):
    loss = compute_loss(sgns, (batch_X, batch_Y, batch_N), optimizer=optimizer_sgns, is_train=True)
    if batch_id % 100 == 0:
        print("batch:{}, loss:{:.4f}".format(batch_id, loss))
    if batch_id >= n_batches:
        break
end_at = time.time()
print("Elapsed time: {:.2f} [sec]".format(end_at - start_at))

batch:0, loss:7.5602
batch:100, loss:7.3962
batch:200, loss:7.2146
batch:300, loss:7.0158
batch:400, loss:6.9683
batch:500, loss:7.5047
Elapsed time: 87.98 [sec]


In [165]:
# Embeddingのパラメータのみを保存する
torch.save(sgns.i_embedding.weight.data.cpu().numpy(),  "dataset/recipe_en/recipe_en_sgns_embedding_{}b_min_{}_dim_{}.pth".format(n_batches,MIN_COUNT,embedding_size))
#torch.save(sgns.i_embedding.weight.data.cpu().numpy(),  "recipe_sgns_embedding_{}b_min_{}_dim_{}.pth".format(n_batches,MIN_COUNT,embedding_size))

## 評価

### 類語が正しく出るか

In [166]:
def compute_word_similarity(embedding_path, word, n):
    """
    与えられた単語に最も似ている単語とcos類似度を返す関数

    :param embedding_path: str, 保存した埋め込み層のパラメータのパス
    :param word: str, 単語
    :param n: int
    :return out: str, 上位n個の類似単語とそのcos類似度
    """
    embedding = torch.load(embedding_path)

    # 単語ベクトルを全て単位ベクトルにする
    norm = np.linalg.norm(embedding, ord=2, axis=1, keepdims=True)
    norm = np.where(norm==0, 1, norm) # 0で割ることを避ける
    embedding /= norm
    e = embedding[vocab.word2id[word]]

    # 単語ベクトル同士のcos類似度を計算する
    cos_sim = np.dot(embedding, e.reshape(-1, 1)).reshape(-1,)
    most_sim = np.argsort(cos_sim)[::-1][1:n+1] # 自分は除く
    most_sim_words = [vocab.id2word[_id] for _id in most_sim]
    top_cos_sim = cos_sim[most_sim]
    out = ", ".join([w+"({:.4f})".format(v) for w, v in zip(most_sim_words, top_cos_sim)])
    return out

In [173]:
models = ["cbow", "sg", "sgns"]
for model in models:
    print(model+"\t:", compute_word_similarity(
        "dataset/recipe_en/recipe_en_"+model + "_embedding_500b_min_3_dim_300.pth", "spinach", 5))

cbow	: mushrooms(0.9583), apple(0.9581), cornstarch(0.9570), outside(0.9544), eggs(0.9524)
sg	: chanterelles(0.2160), evenings(0.2059), glued(0.2043), 371(0.2000), sterilised(0.1981)
sgns	: milk(0.9160), peanut(0.9157), chocolate(0.9105), vanilla(0.9089), tomatoes(0.9084)


In [174]:
models = ["cbow", "sg", "sgns"]
for model in models:
    print(model+"\t:", compute_word_similarity(
        "dataset/recipe_en/recipe_en_"+model + "_embedding_500b_min_3_dim_300.pth", "salt", 5))

cbow	: pepper(0.9212), sugar(0.9201), green(0.9139), powder(0.9053), cinnamon(0.8951)
sg	: trima(0.2444), sensor(0.2376), high-sided(0.2329), merrier(0.2315), fruits(0.2240)
sgns	: pepper(0.9186), black(0.8966), powder(0.8907), season(0.8831), green(0.8814)


### 2単語間のcos類似度

In [171]:
def cosine_similarity(e1, e2):
    """
    2単語間のcos類似度を返す関数

    :param embedding_path: str, 保存した埋め込み層のパラメータのパス
    :param word: str, 単語
    :param n: int
    :return out: str, 上位n個の類似単語とそのcos類似度
    """

    norm2 = np.linalg.norm(e2, ord=2)
    norm2 = np.where(norm2==0, 1, norm2) # 0で割ることを避ける
    e2 /= norm2
    
    norm1 = np.linalg.norm(e1, ord=2)
    norm1 = np.where(norm1==0, 1, norm1) # 0で割ることを避ける
    e1 /= norm1
    
    # 単語ベクトル同士のcos類似度を計算する
    cos_sim = np.dot(e1, e2)
 
    return cos_sim

embeddingは全単語のベクトル

In [175]:
embedding_path = "dataset/recipe_en/recipe_en_sgns_embedding_500b_min_3_dim_300.pth"
embedding = torch.load(embedding_path)
print(embedding.shape)

(25291, 300)


## クラスタリング1

In [176]:
clusterVec = [embedding[0]]     # tracks sum of vectors in a cluster
clusterIdx = []    # array of index arrays. e.g. [[1, 3, 5], [2, 4, 6]]
ncluster = 0

In [177]:
# probablity to create a new table if new customer
# is not strongly "similar" to any existing table
pnew = 1.0/ (1 + ncluster)  
N = len(embedding)
#rands = random.rand(N)         # N rand variables sampled from U(0, 1)
print(N)

25291


In [178]:
v = embedding[0]
sim = cosine_similarity(v, clusterVec[0])
print(sim)

1.0000001


In [179]:
 for i in range(N):
    maxSim = -float('inf')
    maxIdx = 0
    v = embedding[i]
    for j in range(ncluster):
        sim = cosine_similarity(v, clusterVec[j])
        if sim > maxSim:
            maxIdx = j
            maxSim = sim
    # 新しいクラスタを作成
    if maxSim < pnew:
        if random.random() < pnew:
            clusterVec.append(v)
            clusterIdx.append([i])
            ncluster += 1
            pnew = 1.0 / (1 + ncluster)
            continue
    clusterVec[maxIdx] = clusterVec[maxIdx] +v
    clusterIdx[maxIdx].append(i)

In [180]:
print(len(clusterIdx))

24


In [181]:
for i in range(0,len(clusterIdx)):
    print(len(clusterIdx[i]))

1539
1352
1216
2179
1180
1141
1047
1115
1062
1124
1055
1111
1052
1056
1100
1050
1040
1013
935
817
904
654
458
91


In [188]:
for i in range(0, len(clusterIdx[8])):
    print(vocab.id2word[clusterIdx[8][i]])

shred
sandwiches
could
10-inch
pancake
phyllo
incorporate
25-30
toppings
brownie
charred
corner
stack
7.
pretty
crusty
letting
've
proof
were
5-6
month
orzo
pecan
100
bird
pickle
basic
film
door
chutney
tent
enchilada
preferred
heart
wonton
candies
case
sizzle
canning
giblets
9x9
visible
properly
slurry
ever
savory
non-reactive
slivers
narrow
vermouth
cob
skin-side
scald
balance
blot
6-inch
rocks
soupy
her
reduction
matzo
worth
flavours
picante
deflate
bought
percent
say
well-greased
yams
flaky
gold
cases
normally
everyone
twelve
blackberry
wilts
shows
gingerroot
3-1/2
au
pots
bitters
purple
tangerine
pumpkins
2-
seeded
popping
clinging
emulsion
chickpea
9x5-inch
verde
twisting
eliminate
ladleful
seitan
goose
gills
shorter
3g
consomme
published
26
smoky
dribble
personally
parcel
hull
9-in
refrigerating
instructed
fajita
soba
courtesy
mini-muffin
piercing
knots
expect
boullion
well-drained
stripes
deviled
info
granny
roll-ups
th
unbleached
bombe
typical
ok.
tripe
arborio
caramelizes
swe

## クラスタリング2 (k-means)

In [189]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

In [190]:
clusters = KMeans(n_clusters=5, random_state=0).fit_predict(embedding)

In [196]:
for i in range(len(np.where(clusters == 4)[0])):
    print(vocab.id2word[np.where(clusters==4)[0][i]])

that
still
get
much
sweet
flakes
slow
lamb
there
3.
best
servings
custard
24
tart
see
baked
fingers
finish
crumble
we
how
because
served
fitted
fennel
better
cherry
vinaigrette
really
shred
beer
easy
1/8
yellow
springform
sticking
sprouts
reheat
setting
feta
create
pancakes
cm
pesto
ramekins
split
skewer
strawberry
basting
flame
bone
10-inch
log
gas
whipping
cooks
save
circles
six
registers
pita
vigorously
artichoke
pressure
bags
months
table
move
distribute
smoking
coals
brownie
what
finger
6-8
calories
cheesecloth
head
2-quart
corner
whatever
stack
forming
starch
450f
following
twist
near
clove
dash
polenta
power
fall
ending
internal
crusty
ravioli
remainder
wrapped
garnished
#
jello
molds
lasagna
oatmeal
brine
tap
letting
brisket
've
wo
chorizo
scallion
care
variation
5-6
alternatively
rectangles
tabasco
160
centers
month
eight
chunky
piping
taking
putting
omelet
available
baguette
bright
had
loose
orzo
ones
though
crescent
heavy-duty
mark
gone
risen
old
concentrate
know
teaspoonful

entries
grown-up
om
sweet-potato
13in
krab
penetration
325-degree
srinkle
womans
30+
sprinkly
drainer
war
stollen
canvas
acquired
zero
thinness
face-down
tendrils
torres
de-seeded
coffee-flavored
needing
162
beach
hazel
2o
compliments
calf
kishka
13.5
385
bratwursts
baja
lowered
model
chinese-style
alarm
1/4-in
mrs
*please
smack
vinaigrettes
blanquette
remelt
wisking
diablo
mediun
tators
destroy
10minutes
lettuce-covered
generations
well-incorporated
peperonata
dumped
pan*
borage
denbu
guzvar
considerable
layer-
visually
waterbath
clips
reposition
vineger
softball
pleasure
61
clever
marscapone
neely
partridges
storebought
poacher
rellenos
ink
variable
tbe
consistencey
cms
shortcuts
sates
donna
4qt
tasks
butter.add
bowman
40g
/gas
3rds
65mg
texture..
oven-dried
tiniest
basketball
s-l-o-w-l-y
cut-away
.then
restir
best-
4-serving
chesses
remeining
negras
mornings
butter/brown
mariniade
approximation
onion-celery
valentina
impressed
appareil
ka
coat.5
cheeseballs
actively
gingered
fine-ch

resident
yuba
fastened
7.5cm
mileage
dolce
bean-cooking
lush
brulees
cioppino
enriched
rums
24-48
pepers
infant
sensation
skin-side-down
unwavering
redefine
66
bill
asparagas
zipper-top
tamping
drowned
layout
nudged
navajo
mile
addd
rangoon
franco
pot.add
24hrs
scrapes
listening
*cooks
tipped
whizzing
counting
redcurrants
well-sprayed
teaspoonsfuls
multi-grain
controlled
butter-side-down
joining
cranks
breadcrumb-like
fraises
enjoy~
speciality
l0
sheet-
teatowel
pt
premix
flaking
1mg
careful-
unbelievably
strict
reuben
completley
mouli
carne
cookery
protecting
budget
subsequent
wholesome
wide-necked
reprocess
w/out
citrusy
theme
madeline
frankie
enuf
2oz
involtini
factory-made
rack.yield
serving-dish
hoisin-soy
top-proof
375degree
7in/18cm
stil
*fat
vregetables
baaking
marinaro
brown.meanwhile
robot-coup
chocalatey
attributed
carter
cripsness
symptoms
comforted
sirupy
loose-bottom
knfe
cheesecloths
rotini/broccoli
spinach/egg
clenching
rice/lentils
welded
richer-tasting
weils
16cmx13cm

**active
tendereat
vinchicken
white-braised
'nakkach
bombard
tia
refrigerateroast
ridiculous
northstar
14400-81400
everyones
cradling
shisho
proposal
affair
delivering
ensue
oiled/dusted
1/2-inch-in-diameter
madame
herb-
tomato-sprinkle
outersides
sugar.then
handle.make
smooth.heat
dnead
re-space
2-48
sugar-side
su-mei
therebut
aliminium
juice/jello
depressor
pistacchios
liquida
knepp
pizzettas
8qt
8minutes
okra.boil
base/pie
dish/plate
topping/dollop
onion/chop
jam-
j.r.
deperession
sikil
151/2x101/2
mintues-40
cake-in
cripsp-tender
situation
krusteaz
zip-lok
grown-ups
*addendum
melt-in-the-mouth
meiji
mash-looking
syrup-all
yogurts
whimsical
polka
overloaded
stips
120mg
adviser
riv
spinkled
12-by-6-inch
7/4/92
taipan
racquetballs
866-438-4642
*wheneverworking
de-sleeved
11.00
banana-flour
mixture.each
herbs-chives
whippedcream
secondsthe
m-high
ionions
broth.bring
13-by-8-inch
corning/pyrex
6-ounces
chinks
bananaz
fluidy
frustration
swallownests
marbilized
mid-rib
peach-y
sibzamini
o

kaluah
approximations
foodie
maintained
regualr
assuring
110*
6mg
herbs/seasoning
navels
sedanini
pan-grilled
2-cm
four-cheese
6-muffin
scored-side
talented
born
grillpan
semolina-dusted
platting
inevitably
evently
starch/bread
cook/stirring
ther
pre-baking
medium-coarse
preseasoning
claim
container.cover
frugal
subsiding
eet
~3-5
liq
rasher
85f/30c
eyeballed
glycemic
low-temperature
lovetobakeandcook
15in
meanwile
pour/spoon
detergent
addendum
over-crowd
porterhouse
9x11-inch
peans
gobs
thereabouts
greet
potato-onion
slics
wraped
bitsy
youngsters
11x15-inch
cinamon
submersion
april
carrying
plink
frill
imprints
pebble-sized
zucchine
thick-handled
transformed
speckling
corkscrew
steeps
chile-cheese
str
whitest
award
2.beat
3.bake
oblongs
cabbge
pintos
cloth-lined
bell-pepper
50.
salmonberries
upland
igredients
blind-bake
18-23
lover
amazu
sugar/water
whip/pudding
juanita
cou-cou
streams
fainted
veat
10x16
physalis
county
albondigas
ginger-lime
relleno
tomatilla
lawn
strozzapreti
guazze

In [195]:
for i in range(4):
    print(len(np.where(clusters == i)[0]))

5881
6346
6098
1795


## レシピを取ってくる

In [12]:
raw = pd.read_csv(r"dataset/Material/title_recipe/title_recipe_60.csv", index_col=0 )

In [52]:
raw.iloc[1,1]

'Cook macaroni according to package directions; drain well. Cold. Combine macaroni, cheese cubes, celery, green pepper and pimento. Blend together mayonnaise or possibly salad dressing, vinegar, salt and dill weed; add in to macaroni mix. Toss lightly. Cover and refrigeratewell. Serve salad in lettuce lined bowl if you like. Makes 6 servings. '

In [114]:
raw.shape[0]

134921

In [90]:
raw.iloc[130000,1]+raw.iloc[13000,1]

"Blend dry ingredients up in a food processor. Mix together peanut butter, applesauce, and honey or agave nectar in a bowl. Once the wet ingredients are mixed well, add the dry ingredients and mix together. I usually finish by mixing with my hands. Form small balls and set them on a cookie sheet after spraying it with a nonstick spray. Use a fork to press down on the ball to flatten the cookies. Bake in an oven for 10 minutes only at 350 degrees. Remember that peanut butter cookie's burn easily so you need to take them out right away even if they don't look done, because by the time they are brown on top they are usually burnt on the bottom. SPICY BUTTER melt butter then add fine chili powder and quick mix well then off heat PAN FRY HAM quick pan fry ham for half a minute on each side then set aside BAKE BREAD slice a loaf into half then cut a square in the bread put 1 slice of chedder cheese then add 1 eggs fisrt and broil it add another egg and broil till egg white set cut a square h