In [1]:
from tqdm import tqdm_notebook as tqdm
import pandas as pd
import json
import word2vec
import itertools
import pickle
from sklearn import preprocessing
import matplotlib.pyplot as plt
import nltk
from nltk.stem import WordNetLemmatizer
#nltk.download('wordnet')
wordnet_lemmatizer = WordNetLemmatizer()
# Will allow us to embed images in the notebook
%matplotlib inline

In [None]:
"""
D1
"""
file_path = "../data/Im2Recipe/det_ingrs.json"
# JSON is UTF-8 by specification; be explicit so the platform default encoding
# cannot break the load.
with open(file_path, encoding="utf-8") as file_json:
    # ingredient-level annotations: one dict per recipe
    im2recipe_recipes = json.load(file_json)

# Normalised names of every ingredient flagged as valid in at least one recipe.
im2recipe_ingredients_set_lemmatized = set()

for recipe in tqdm(im2recipe_recipes):
    # recipe['ingredients'] is a list of {'text': ...} dicts and recipe['valid']
    # holds one flag per ingredient; they are walked in parallel.
    for recipe_ingredient, is_valid in zip(recipe['ingredients'], recipe['valid']):
        # Skip undetected ingredients before doing any lemmatization work.
        if not is_valid:
            continue
        # Normalisation: lemmatize, lower-case, then turn '-' and ' ' into '_'.
        ingredient = (
            wordnet_lemmatizer.lemmatize(recipe_ingredient['text'])
            .lower()
            .replace('-', ' ')
            .replace(' ', '_')
        )
        # Ignore names that are empty strings.
        if ingredient:
            im2recipe_ingredients_set_lemmatized.add(ingredient)

In [None]:
"""
D2
"""
# Load the FlavorDB ingredient table.
df_info = pd.read_csv("../data/FlavorDB/ingredients_info.csv", sep=",")

# Normalise every ingredient name: lemmatize, lower-case, then turn '-' and
# ' ' into '_'.
fdb_vocab = []
for raw_name in df_info['ingredient_name'].values.tolist():
    lemma = wordnet_lemmatizer.lemmatize(raw_name)
    fdb_vocab.append(lemma.lower().replace('-', ' ').replace(' ', '_'))

# Write the table back out with the normalised names in place of the originals.
df_info['ingredient_name'] = fdb_vocab
df_info.to_csv("../data/FlavorDB/D2_fdb_info.csv", sep=",", index=False)

In [None]:
"""
D3
"""
im2recipe_embeddings = word2vec.load("../data/Im2Recipe/im2recipe-vocab.bin")

# Re-key every embedding by its normalised token (lemmatize, lower-case, then
# '-' and ' ' -> '_').
# NOTE: raw tokens that normalise to the same key overwrite each other, so the
# last one visited wins.
im2recipe_embeddings_pkl = {}
for ingr in im2recipe_embeddings.vocab:
    ingr_modified = wordnet_lemmatizer.lemmatize(ingr).lower().replace('-', ' ').replace(' ', '_')
    im2recipe_embeddings_pkl[ingr_modified] = im2recipe_embeddings[ingr]

# Use a context manager so the pickle is flushed and the handle closed
# deterministically instead of whenever the file object is garbage-collected.
with open("../data/Im2Recipe/D3_im2recipe-vocab-vectors.pkl", 'wb') as pickle_file:
    pickle.dump(im2recipe_embeddings_pkl, pickle_file)
im2recipe_vocab = im2recipe_embeddings_pkl.keys()

In [None]:
"""
D4
"""
# Load the FlavorNet vocabulary/compound table.
df = pd.read_csv("../data/FlavorNet/flavornet-vocab-compounds.csv", sep=",")

# Normalise every label: lemmatize, lower-case, then turn '-' and ' ' into '_'.
fnet_vocab = list(
    map(
        lambda label: wordnet_lemmatizer.lemmatize(label).lower().replace('-', ' ').replace(' ', '_'),
        df['label'].values.tolist(),
    )
)

# Persist the table with the normalised labels in place of the originals.
df['label'] = fnet_vocab
df.to_csv("../data/FlavorNet/D4_fnet.csv", sep=",", index=False)

In [None]:
# Sizes of the four vocabularies, printed one per line in the order:
# recipe ingredients (D1), embedding vocabulary (D3), FlavorDB (D2), FlavorNet (D4).
for vocab in (
    im2recipe_ingredients_set_lemmatized,
    im2recipe_vocab,
    fdb_vocab,
    fnet_vocab,
):
    print(len(vocab))

In [None]:
# D5: ingredients that are valid in the recipes (D1) and also have an embedding (D3).
D5 = set(im2recipe_ingredients_set_lemmatized).intersection(im2recipe_vocab)
# Context manager guarantees the pickle is flushed and the handle closed.
# NOTE(review): a later cell loads "./data/vocab/D5_ingredient_vocab.pkl" --
# confirm which of the two locations is the intended one.
with open("../data/D5_ingredient_vocab.pkl", 'wb') as pickle_file:
    pickle.dump(D5, pickle_file)

In [None]:
# D6: the D5 ingredients that also appear in the normalised FlavorDB vocabulary (D2).
D6 = set(D5).intersection(fdb_vocab)
# Context manager guarantees the pickle is flushed and the handle closed.
with open("../data/D6_ingredient_vocab.pkl", 'wb') as pickle_file:
    pickle.dump(D6, pickle_file)

In [None]:
"""
D7
Filter the junk entries out of D5
"""

# Reload the pickled D5 vocabulary and display it.
# pickle.load can execute arbitrary code, so only use it on files you trust.
with open("./data/vocab/D5_ingredient_vocab.pkl", "rb") as vocab_handle:
    D5_vocab = pickle.load(vocab_handle)

D5_vocab

## Dealing with Categories

In [16]:
# Build the ingredient -> category lookup from the CSV's 'ingr' and 'category' columns.
ingr2category = pd.read_csv("./data/ingr2category.csv", sep=",")
ingr2category = ingr2category.set_index('ingr')['category'].to_dict()
# Context manager guarantees the pickle is flushed and the handle closed.
with open("./data/ingr2category.pkl", 'wb') as pickle_file:
    pickle.dump(ingr2category, pickle_file)

In [17]:
# Round-trip check: reload the pickled mapping and look up one known ingredient.
# pickle.load can execute arbitrary code, so only use it on files you trust.
with open("./data/ingr2category.pkl", "rb") as mapping_handle:
    ingr2category = pickle.load(mapping_handle)

ingr2category['pappadams']

'Bakery/Dessert/Snack'

In [24]:
# Distinct category labels, in a fixed order. Iterating a plain set of strings
# yields a different order from one interpreter run to the next (string hash
# randomisation), which would silently reshuffle the one-hot layout every time
# this cell is re-run. key=str keeps the sort well-defined even if a missing
# value (NaN) is present among the labels.
# NOTE: artifacts built from a previously saved category2rep.pkl must be
# regenerated if the layout differs from the one stored there.
categories = sorted(
    set(pd.read_csv("./data/ingr2category.csv", sep=",")['category'].values.tolist()),
    key=str,
)

# One-hot encode: category i gets a list of len(categories) zeros with a 1 at position i.
category2rep = {}
for index, category in enumerate(categories):
    rep = [0] * len(categories)
    rep[index] = 1
    category2rep[category] = rep

# Context manager guarantees the pickle is flushed and the handle closed.
with open("./data/category2rep.pkl", 'wb') as pickle_file:
    pickle.dump(category2rep, pickle_file)

In [25]:
# Reload the pickled category -> one-hot mapping.
# pickle.load can execute arbitrary code, so only use it on files you trust.
with open("./data/category2rep.pkl", "rb") as rep_handle:
    category2rep = pickle.load(rep_handle)

In [26]:
# Two-step lookup: ingredient -> category label -> one-hot vector.
pappadams_category = ingr2category['pappadams']
category2rep[pappadams_category]

[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [None]:
import numpy as np

def loadGloveModel(gloveFile):
    """Load GloVe text-format vectors into a dict keyed by normalised token.

    Every non-blank line is read as ``token v1 v2 ... vN`` separated by
    whitespace. The token is lower-cased and any '-' is replaced by '_' so the
    keys follow the same convention as the ingredient vocabularies.

    Parameters
    ----------
    gloveFile : str or path-like
        Path to the GloVe text file.

    Returns
    -------
    dict
        Maps each normalised token to a 1-D numpy array of floats. When
        several tokens normalise to the same key, the last one in the file
        wins.

    Raises
    ------
    ValueError
        If a value after the token cannot be parsed as a float.
    """
    print("Loading Glove Model...")
    model = {}
    # Explicit UTF-8 avoids depending on the platform default encoding, and the
    # context manager closes the file even if parsing fails part-way through.
    with open(gloveFile, 'r', encoding='utf-8') as f:
        for line in f:
            splitLine = line.split()
            # Blank lines (e.g. a trailing newline) carry no token; skip them
            # rather than failing on splitLine[0].
            if not splitLine:
                continue
            word = splitLine[0].lower().replace('-', ' ').replace(' ', '_')
            embedding = np.array([float(val) for val in splitLine[1:]])
            model[word] = embedding
    print("Done.",len(model)," words loaded!")
    return model

In [None]:
# GloVe text file to load (name as given: glove.6B.300d).
path = "./data/embeddings/glove.6B.300d.txt"

# Parse it into a {token: vector} dict, then report how many tokens were kept.
ingr2rep = loadGloveModel(path)
n_glove_tokens = len(ingr2rep)
print(n_glove_tokens)

In [43]:
ingredients = D5_vocab

# Random fallback vectors must match the dimensionality of the loaded
# embeddings; 300 is only used when no embeddings were loaded at all.
dim = len(next(iter(ingr2rep.values()))) if ingr2rep else 300

# Dedicated seeded generator: the fallback vectors are reproducible and the
# global numpy random state is left untouched.
rng = np.random.RandomState(42)

# Map every ingredient to a float32 vector: its GloVe embedding when one
# exists, otherwise a uniform random vector in [-1, 1).
dict_ingredients = {}
# Sorted iteration ties each random draw to a fixed ingredient; the vocabulary
# is presumably a set (verify), whose iteration order for strings changes
# between interpreter runs.
for ingr in sorted(ingredients):
    if ingr in ingr2rep:
        dict_ingredients[ingr] = np.array(ingr2rep[ingr], dtype='float32')
    else:
        dict_ingredients[ingr] = rng.uniform(low=-1, high=1, size=(dim,)).astype(np.float32)

In [67]:
# Read the pairing table and collapse it to a {(ingr1, ingr2): split} dict.
# Each pair is keyed exactly as it appears in the 'ingr1'/'ingr2' columns.
df = (
    pd.read_csv("./data/pairings/P21_im2recipe_pairings_split_FINAL.csv", sep=",")
    .set_index(["ingr1","ingr2"])['split']
    .to_dict()
)
df

{('basil', 'sorrel'): 'valid',
 ('red_pepper_flakes', 'brown_rice'): 'valid',
 ('pepperoni', 'seasoning_salt'): 'train',
 ('tomato_puree', 'vodka'): 'train',
 ('amaretto', 'sugar'): 'train',
 ('dark_chocolate_chips', 'margarine'): 'train',
 ('garam_masala', 'bacon'): 'train',
 ('sweet_chili_sauce', 'sesame_oil'): 'train',
 ('turkey_gravy', 'thyme'): 'train',
 ('yeast', 'basil_leaves'): 'test',
 ('red_wine', 'sourdough_bread'): 'test',
 ('ground_turmeric', 'potato'): 'train',
 ('green_cabbage', 'shredded_cheddar_cheese'): 'train',
 ('basmati_rice', 'asafoetida_powder'): 'train',
 ('fresh_basil_leaves', 'yellow_onion'): 'train',
 ('plum_tomato', 'cayenne'): 'valid',
 ('red_wine_vinegar', 'seasoning'): 'train',
 ('dried_shrimp', 'red_chilies'): 'train',
 ('red_pepper', 'hot_pepper_flakes'): 'train',
 ('frozen_spinach', 'dill'): 'test',
 ('apple_cider', 'black_pepper'): 'train',
 ('honey', 'vanilla_flavoring'): 'train',
 ('cilantro', 'italian_parsley'): 'train',
 ('baking_soda', 'cognac'):

In [71]:
# Look up the split assigned to one (ingr1, ingr2) pairing.
pair1, pair2 = "active_dry_yeast", "soymilk"

df[pair1, pair2]

'train'

In [72]:
# Same pairing with the two ingredients swapped. The mapping is keyed by
# (ingr1, ingr2) in one orientation, so this raises KeyError unless the
# swapped pair is listed as well.
reversed_pair = (pair2, pair1)
df[reversed_pair]

KeyError: ('soymilk', 'active_dry_yeast')