# Data Exploration of Recipes Dataset

In [1]:
# Imports
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import string
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
import numpy as np


import gensim 
from gensim.models import word2vec, phrases
from gensim.parsing.preprocessing import remove_stopwords, strip_punctuation, strip_numeric,\
                    strip_non_alphanum, strip_multiple_whitespaces, strip_short

In [2]:
# Loading the data
file = "../raw_data/recipes.csv"
recipes_df = pd.read_csv(file)

custom_stopwords_df = pd.read_csv("custom_stopwords.csv")
custom_stopwords = list(custom_stopwords_df['custom_stopwords'])

## Data sanity checks

In [3]:
recipes_df.describe()

Unnamed: 0.1,Unnamed: 0
count,13501.0
mean,6750.0
std,3897.547327
min,0.0
25%,3375.0
50%,6750.0
75%,10125.0
max,13500.0


In [4]:
recipes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13501 entries, 0 to 13500
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Unnamed: 0           13501 non-null  int64 
 1   Title                13496 non-null  object
 2   Ingredients          13501 non-null  object
 3   Instructions         13493 non-null  object
 4   Image_Name           13501 non-null  object
 5   Cleaned_Ingredients  13501 non-null  object
dtypes: int64(1), object(5)
memory usage: 633.0+ KB


In [5]:
recipes_df.head()

Unnamed: 0.1,Unnamed: 0,Title,Ingredients,Instructions,Image_Name,Cleaned_Ingredients
0,0,Miso-Butter Roast Chicken With Acorn Squash Pa...,"['1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher...","Pat chicken dry with paper towels, season all ...",miso-butter-roast-chicken-acorn-squash-panzanella,"['1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher..."
1,1,Crispy Salt and Pepper Potatoes,"['2 large egg whites', '1 pound new potatoes (...",Preheat oven to 400°F and line a rimmed baking...,crispy-salt-and-pepper-potatoes-dan-kluger,"['2 large egg whites', '1 pound new potatoes (..."
2,2,Thanksgiving Mac and Cheese,"['1 cup evaporated milk', '1 cup whole milk', ...",Place a rack in middle of oven; preheat to 400...,thanksgiving-mac-and-cheese-erick-williams,"['1 cup evaporated milk', '1 cup whole milk', ..."
3,3,Italian Sausage and Bread Stuffing,"['1 (¾- to 1-pound) round Italian loaf, cut in...",Preheat oven to 350°F with rack in middle. Gen...,italian-sausage-and-bread-stuffing-240559,"['1 (¾- to 1-pound) round Italian loaf, cut in..."
4,4,Newton's Law,"['1 teaspoon dark brown sugar', '1 teaspoon ho...",Stir together brown sugar and hot water in a c...,newtons-law-apple-bourbon-cocktail,"['1 teaspoon dark brown sugar', '1 teaspoon ho..."


## Checking for nulls and na

In [6]:
recipes_df.isna().sum()

Unnamed: 0             0
Title                  5
Ingredients            0
Instructions           8
Image_Name             0
Cleaned_Ingredients    0
dtype: int64

In [7]:
recipes_df[recipes_df["Title"].isna()]

Unnamed: 0.1,Unnamed: 0,Title,Ingredients,Instructions,Image_Name,Cleaned_Ingredients
11221,11221,,[],,roasted-game-hens-with-caramelized-root-vegeta...,['']
12373,12373,,[],,chicken-soup-with-rice-232605,['']
12378,12378,,[],,double-lemon-bars-232572,['']
12818,12818,,[],,pear-and-frangipane-crostata-with-raspberry-vi...,['']
12829,12829,,[],,hazelnut-shortbread-sticks-231311,['']


In [8]:
recipes_df[recipes_df["Instructions"].isna()]

Unnamed: 0.1,Unnamed: 0,Title,Ingredients,Instructions,Image_Name,Cleaned_Ingredients
4293,4293,Broccolini-Cheddar Gratin with Rye Breadcrumbs,"['1/4 loaf seeded rye bread, torn into 1"" piec...",,broccolini-cheddar-gratin-with-rye-breadcrumbs...,"['1/4 loaf seeded rye bread, torn into 1"" piec..."
9636,9636,Smoked Salmon with Egg Salad and Green beans,[],,smoked-salmon-with-egg-salad-and-green-beans-3...,['']
10356,10356,Royal Icing,"['Using electric mixer, beat 3 1/4 cups powder...",,royal-icing-240751,"['Using electric mixer', 'beat 3 1/4 cups powd..."
11221,11221,,[],,roasted-game-hens-with-caramelized-root-vegeta...,['']
12373,12373,,[],,chicken-soup-with-rice-232605,['']
12378,12378,,[],,double-lemon-bars-232572,['']
12818,12818,,[],,pear-and-frangipane-crostata-with-raspberry-vi...,['']
12829,12829,,[],,hazelnut-shortbread-sticks-231311,['']


## Cleaning the data

In [9]:
recipes_df = recipes_df.drop(columns ='Unnamed: 0')
recipes_df.dropna

<bound method DataFrame.dropna of                                                    Title  \
0      Miso-Butter Roast Chicken With Acorn Squash Pa...   
1                        Crispy Salt and Pepper Potatoes   
2                            Thanksgiving Mac and Cheese   
3                     Italian Sausage and Bread Stuffing   
4                                           Newton's Law   
...                                                  ...   
13496                               Brownie Pudding Cake   
13497  Israeli Couscous with Roasted Butternut Squash...   
13498  Rice with Soy-Glazed Bonito Flakes and Sesame ...   
13499                                        Spanakopita   
13500  Mexican Poblano, Spinach, and Black Bean "Lasa...   

                                             Ingredients  \
0      ['1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher...   
1      ['2 large egg whites', '1 pound new potatoes (...   
2      ['1 cup evaporated milk', '1 cup whole milk', ...   
3    

In [10]:
# Calculating the length of 'Cleaned_Ingredients' columns to identify any empty ingredients and then dropping them

recipes_df['clean_len'] = [len(i) for i in recipes_df["Cleaned_Ingredients"]]
recipes_df.drop(recipes_df[recipes_df['clean_len']<5].index, axis = 0, inplace= True)

In [11]:
# Function to get rid of punctuation, numbers and weird formatting

def clean_list(ingredient_list):
    '''function to clean ingredints list'''
    punctuation = string.punctuation
    
    # break string into list of individual items
    ingredient_list = ingredient_list.split("', \'")

    # iterate through each item in list to remove punctuation and non alpha characters
    for i in range(len(ingredient_list)):
        for punc in punctuation:
            ingredient_list[i] = ingredient_list[i].replace(punc, '')
        ingredient_list[i] = ''.join(char for char in ingredient_list[i] if char.isalpha() or char == ' ')
        ingredient_list[i] = ingredient_list[i].strip()
        ingredient_list[i] = ingredient_list[i].replace('  ', ' ')
        
    return ' '. join(ingredient_list)

recipes_df['Clean2'] = recipes_df['Cleaned_Ingredients'].map(clean_list)

## Bag of Word and Stop Words

In [12]:
# Setting bag of words scope - first 500

bag_of_ingredients = recipes_df["Clean2"][:2000]

In [13]:
# Removing regular stopwords

stop_words = set(stopwords.words('english')) 

for i in range(0, len(bag_of_ingredients)):
    word_tokens = word_tokenize(bag_of_ingredients[i])
    word_tokens = [w.lower() for w in word_tokens]
    full_stop_words = stop_words | set(custom_stopwords)
    bag_of_ingredients[i] = [w for w in word_tokens if not w in full_stop_words]

# Converting list to string
bag_of_ingredients = bag_of_ingredients.map(' '.join)

In [14]:
#Transform BoI in a list of list of strings
for i in range(len(bag_of_ingredients)):
    bag_of_ingredients[i] = bag_of_ingredients[i].split(" ")

In [15]:
#Test for fastText model:
#all_Sentences = []
#for sentence in bag_of_ingredients:
#    all_Sentences.extend(sentence)

In [16]:
#Test for fastText model:
#training fastText model on recipe
#from gensim.models import FastText
#model_fasttext = FastText(window=5, min_count=5, workers=4,sg=1)
#model_fasttext.build_vocab(corpus_iterable=all_Sentences)
#model_fasttext.train(corpus_iterable=all_Sentences, total_examples = model_fasttext.corpus_count, epochs=5)
#model_fasttext.wv.most_similar(positive=['chicken'], topn=10,restrict_vocab=50000)

In [75]:
#Implement Word2Vec model
from gensim.models import Word2Vec

# This line trains an entire embedding for the words in your train set
def word2vec_(bag_of_ingredients,vector_size=100,min_count=3):
    return Word2Vec(sentences=bag_of_ingredients,vector_size=vector_size, min_count=min_count)

In [76]:
bag_of_ingredients

0       [chicken, acorn, squash, sage, rosemary, butte...
1       [egg, whites, new, potatoes, ground, black, pe...
2       [evaporated, milk, milk, garlic, powder, onion...
3       [round, italian, loaf, olive, oil, sweet, ital...
4       [dark, brown, bourbon, fresh, lemon, juice, ap...
                              ...                        
1995    [olive, oil, white, onion, garlic, cloves, cor...
1996    [dried, chiles, árbol, seeds, ribs, tomatillos...
1997    [soy, sauce, sesame, oil, red, pepper, garlic,...
1998    [pork, shoulder, boston, butt, white, onion, r...
1999    [allpurpose, flour, stick, butter, granulated,...
Name: Clean2, Length: 2000, dtype: object

In [77]:
word2vec_(bag_of_ingredients)

<gensim.models.word2vec.Word2Vec at 0x7fb4e3e14880>

In [78]:
model = word2vec_(bag_of_ingredients)

In [79]:
list_of_words = model.wv.__dict__["index_to_key"]

In [82]:
#Test similarity of a given word
model.wv.most_similar('chicken', topn=10)

[('tomato', 0.9991054534912109),
 ('bay', 0.9990581274032593),
 ('onions', 0.9989761114120483),
 ('grill', 0.9989627003669739),
 ('aleppostyle', 0.9989341497421265),
 ('shallots', 0.9989337921142578),
 ('celery', 0.9988922476768494),
 ('shallot', 0.9988492727279663),
 ('peppers', 0.9987933039665222),
 ('broth', 0.9987426400184631)]

In [83]:
#Define a Recipe Embedding function: Check if is the best method posible!!

In [84]:
word2vec_keys = model.wv.__dict__["index_to_key"]

In [85]:
model.wv["chicken"]

array([ 3.66160125e-02,  4.45187449e-01,  2.22856730e-01, -5.71670644e-02,
        1.24682136e-01, -8.34624708e-01,  3.42595726e-01,  7.83286929e-01,
       -2.29389057e-01, -2.00646356e-01, -1.21000312e-01, -6.09387219e-01,
        3.62318195e-02,  2.15750486e-01,  2.21246511e-01, -2.58229226e-01,
        2.09623337e-01, -4.49134976e-01, -1.09358326e-01, -9.23055351e-01,
        3.48137885e-01,  3.45579684e-01,  1.46345258e-01, -1.30971566e-01,
       -1.98330134e-01, -9.31857228e-02, -2.75516301e-01, -4.24896896e-01,
       -2.89729357e-01, -1.67123321e-02,  4.27817047e-01,  1.12969369e-01,
        1.98928565e-01, -5.75489998e-01, -4.25539434e-01,  6.51699722e-01,
        2.05163732e-01, -3.96998674e-01, -9.97512639e-02, -8.55751872e-01,
        2.31737923e-02, -3.95603865e-01, -6.05092980e-02,  6.43986836e-02,
        5.51583767e-01, -1.23580731e-02, -5.27556956e-01, -2.43219465e-01,
        4.38484512e-02,  6.97516724e-02,  3.13758701e-01, -2.94866920e-01,
        1.71135083e-01, -

In [51]:
def getRecipeEmbedding(sentence):
    countFound = 0
    embeddingList = []
    for wordx in sentence:
        if wordx in word2vec_keys:
            vector1 = model.wv[wordx]
            embeddingList.append(vector1)
            countFound+=1
    return np.true_divide(sum(embeddingList), countFound)

In [52]:
sum(getRecipeEmbedding(bag_of_ingredients[899]))

1.1889913603663445

In [53]:
#Define a list of the embedded recipes:
def recipes_list(bag_of_ingredients):
    recipes_embed_list = []
    for i in bag_of_ingredients:
        if getRecipeEmbedding(i).size == 1:
            recipes_embed_list.append(np.zeros(50,))
        else:
            recipes_embed_list.append(getRecipeEmbedding(i))
    return recipes_embed_list

In [54]:
#recipes_embed_list = [getRecipeEmbedding(i) for i in bag_of_ingredients]

In [55]:
recipes_embed_list = recipes_list(bag_of_ingredients)

  return np.true_divide(sum(embeddingList), countFound)


In [87]:
recipes_embed_list

2000

In [57]:
aux = []
for i in range(0,2000):
    if sum(recipes_embed_list[i]) is False:
        aux.append(i)
aux

[]

In [88]:
ingredients = ["chicken","pasta","onion"]

In [89]:
ingredients_embedded = getRecipeEmbedding(ingredients)

In [90]:
from numpy import dot
from numpy.linalg import norm

def similar_recipe(embeddded_ingredients):
    cos_sim = []
    for i in range(0,len(recipes_embed_list)):
        if sum(recipes_embed_list[i]) == 0.0:
            cos_sim.append(0)
        else:    
            cos_sim.append(np.dot(ingredients_embedded,recipes_embed_list[i])/(norm(ingredients_embedded)*norm(recipes_embed_list[i])))
    dis_array = np.array(cos_sim)
    return dis_array



In [91]:
cos_sim = similar_recipe(ingredients_embedded)

In [92]:
(cos_sim).argsort()[:50]

array([898, 720, 571, 942, 464])

In [97]:
def getListofRecipes(cos_sim,n):
    n_index = (-cos_sim).argsort()[:n]
    titles = []
    for i in n_index:
        titles.append(recipes_df.iloc[i,0:2])
    return titles

In [106]:
new_df = pd.DataFrame(getListofRecipes(similar_recipe(ingredients_embedded),100))

In [112]:
new_df

Unnamed: 0,Title,Ingredients
1399,Arroz Caldo (Chicken Rice Porridge),"['200 g (7 oz / 1 cup) white glutinous rice', ..."
515,Turnips with Spicy Meyer Lemon Dressing,"['2 cups bonito flakes', '½ cup sugar', '½ cup..."
563,Coconut Milk–Braised Chicken,"['1 (13.5-oz.) can unsweetened coconut milk', ..."
1295,Ginger Spritz,"['1 thin slice peeled ginger', '2 oz. Lillet',..."
1877,Garlicky Peanut Dressing,"['1/4 cup natural creamy peanut butter', '1/4 ..."
...,...,...
1386,"Slow Cooker Corned Beef Brisket with Cabbage, ...","['2 cups apple juice', '2 tablespoons real map..."
1756,Thanksgiving Dinner for One,"['1 sweet potato', '1 teaspoon finely grated o..."
740,Skirt Steak with Spicy Coconut Dressing,"['1/2 red jalapeno, Fresno or serrano chile, f..."
294,Scallop Rice Bowls With Crunchy Spice Oil,"['1 lb. dry sea scallops, side muscles removed..."


In [121]:
list = [new_df["Title"].value[i] for i in new_df["Title"].values if "chicken" in new_df["Ingredients"].values]

In [122]:
list

[]