# Data Exploration of Recipes Dataset

In [65]:
# Imports

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import string
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize

In [2]:
# Loading the data
file = "../raw_data/recipes.csv"
recipes_df = pd.read_csv(file)

## Data sanity checks

In [5]:
recipes_df.describe()

Unnamed: 0,Title,Ingredients,Instructions,Image_Name,Cleaned_Ingredients
count,13496,13501,13493,13501,13501
unique,13305,13473,13464,13472,13473
top,Potato Latkes,[],Place ingredients in blender in the order list...,#NAME?,['']
freq,5,12,5,30,12


In [6]:
recipes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13501 entries, 0 to 13500
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Title                13496 non-null  object
 1   Ingredients          13501 non-null  object
 2   Instructions         13493 non-null  object
 3   Image_Name           13501 non-null  object
 4   Cleaned_Ingredients  13501 non-null  object
dtypes: object(5)
memory usage: 527.5+ KB


In [4]:
recipes_df.head()

Unnamed: 0,Title,Ingredients,Instructions,Image_Name,Cleaned_Ingredients
0,Miso-Butter Roast Chicken With Acorn Squash Pa...,"['1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher...","Pat chicken dry with paper towels, season all ...",miso-butter-roast-chicken-acorn-squash-panzanella,"['1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher..."
1,Crispy Salt and Pepper Potatoes,"['2 large egg whites', '1 pound new potatoes (...",Preheat oven to 400°F and line a rimmed baking...,crispy-salt-and-pepper-potatoes-dan-kluger,"['2 large egg whites', '1 pound new potatoes (..."
2,Thanksgiving Mac and Cheese,"['1 cup evaporated milk', '1 cup whole milk', ...",Place a rack in middle of oven; preheat to 400...,thanksgiving-mac-and-cheese-erick-williams,"['1 cup evaporated milk', '1 cup whole milk', ..."
3,Italian Sausage and Bread Stuffing,"['1 (¾- to 1-pound) round Italian loaf, cut in...",Preheat oven to 350°F with rack in middle. Gen...,italian-sausage-and-bread-stuffing-240559,"['1 (¾- to 1-pound) round Italian loaf, cut in..."
4,Newton's Law,"['1 teaspoon dark brown sugar', '1 teaspoon ho...",Stir together brown sugar and hot water in a c...,newtons-law-apple-bourbon-cocktail,"['1 teaspoon dark brown sugar', '1 teaspoon ho..."


## Checking for nulls and na

In [7]:
recipes_df.isna().sum()

Title                  5
Ingredients            0
Instructions           8
Image_Name             0
Cleaned_Ingredients    0
dtype: int64

In [8]:
recipes_df[recipes_df["Title"].isna()]

Unnamed: 0,Title,Ingredients,Instructions,Image_Name,Cleaned_Ingredients
11221,,[],,roasted-game-hens-with-caramelized-root-vegeta...,['']
12373,,[],,chicken-soup-with-rice-232605,['']
12378,,[],,double-lemon-bars-232572,['']
12818,,[],,pear-and-frangipane-crostata-with-raspberry-vi...,['']
12829,,[],,hazelnut-shortbread-sticks-231311,['']


In [9]:
recipes_df[recipes_df["Instructions"].isna()]

Unnamed: 0,Title,Ingredients,Instructions,Image_Name,Cleaned_Ingredients
4293,Broccolini-Cheddar Gratin with Rye Breadcrumbs,"['1/4 loaf seeded rye bread, torn into 1"" piec...",,broccolini-cheddar-gratin-with-rye-breadcrumbs...,"['1/4 loaf seeded rye bread, torn into 1"" piec..."
9636,Smoked Salmon with Egg Salad and Green beans,[],,smoked-salmon-with-egg-salad-and-green-beans-3...,['']
10356,Royal Icing,"['Using electric mixer, beat 3 1/4 cups powder...",,royal-icing-240751,"['Using electric mixer', 'beat 3 1/4 cups powd..."
11221,,[],,roasted-game-hens-with-caramelized-root-vegeta...,['']
12373,,[],,chicken-soup-with-rice-232605,['']
12378,,[],,double-lemon-bars-232572,['']
12818,,[],,pear-and-frangipane-crostata-with-raspberry-vi...,['']
12829,,[],,hazelnut-shortbread-sticks-231311,['']


## Cleaning the data

In [3]:
recipes_df = recipes_df.drop(columns ='Unnamed: 0')
recipes_df.dropna

In [11]:
# Calculating the length of 'Cleaned_Ingredients' columns to identify any empty ingredients and then dropping them

recipes_df['clean_len'] = [len(i) for i in recipes_df["Cleaned_Ingredients"]]
recipes_df.drop(recipes_df[recipes_df['clean_len']<5].index, axis = 0, inplace= True)

In [86]:
# Function to get rid of punctuation, numbers and weird formatting

def clean_list(ingredient_list):
    '''function to clean ingredints list'''
    punctuation = string.punctuation
    
    # break string into list of individual items
    ingredient_list = ingredient_list.split("', \'")

    # iterate through each item in list to remove punctuation and non alpha characters
    for i in range(len(ingredient_list)):
        for punc in punctuation:
            ingredient_list[i] = ingredient_list[i].replace(punc, '')
        ingredient_list[i] = ''.join(char for char in ingredient_list[i] if char.isalpha() or char == ' ')
        ingredient_list[i] = ingredient_list[i].strip()
        ingredient_list[i] = ingredient_list[i].replace('  ', ' ')
        
    return ' '. join(ingredient_list)

recipes_df['Clean2'] = recipes_df['Cleaned_Ingredients'].map(clean_list)

## Bag of Word and Stop Words

In [89]:
# Setting bag of words scope - first 500

bag_of_words = recipes_df["Clean2"][:500]

In [77]:
# Removing regular stopwords

stop_words = set(stopwords.words('english')) 

for i in range(0, len(bag_of_words)):
    word_tokens = word_tokenize(bag_of_words[i])
    bag_of_words[i] = [w for w in word_tokens if not w in stop_words] #or not w in custom-stop_words] 

# Converting list to string
bag_of_words = bag_of_words.map(' '.join)

In [111]:
#Creating the bag of words dataframe

vectorizer = CountVectorizer()

X = vectorizer.fit_transform(bag_of_words)

bag_of_word_df = pd.DataFrame(X.toarray(),columns = vectorizer.get_feature_names_out())

In [123]:
#Doing a count of the bag of words to identify custom stop words
#Denise is doing n00: n50

bag_of_word_df.sum().sort_values(ascending=False)[800:850] # Starting with the first 50 rows 


bar             3
roti            3
oranges         3
fermented       3
barley          3
bulbs           3
spaghetti       3
sushi           3
runs            3
using           3
chartreuse      3
fresco          3
urad            3
elderflower     3
pastry          3
beer            3
halfmoons       3
broken          3
batons          3
ham             3
russet          3
mexican         3
french          3
yams            3
hard            3
bags            3
harissa         3
flaxseeds       3
gem             3
grinder         3
brussels        3
bass            3
nori            3
flaxseed        3
surface         3
achiote         2
possible        2
racks           2
loose           2
blindbaked      2
ashanti         2
blackberries    2
asafoetida      2
attached        2
roast           2
puna            2
bitter          2
pollen          2
lillet          2
nut             2
dtype: int64

In [None]:
custom_stopwords = ['cup',
'tsp',
'tbsp',
'cups',
'oz',
'teaspoon',
'plus',
'tablespoons',
'sliced',
'large',
'finely',
'divided',
'cut',
'thinly',
'lb',
'ml',
'tablespoon',
'serving',
'optional',
'small',
'packed',
'torn',
'piece',
'preferably',
'lengthwise',
'roughly',
'trimmed',
'andor',
'pinch',
'rinsed',
'drained',
'bunch',
'pounds',
'slices',
'homemade',
'drizzling',
'fl',
'powdered',
'flaky',
'quartered',
'equipment',
'beaten',
'scrubbed',
'diagonal',
'cracked',
'coarse',
'separated',
'package',
'squeezed',
'grams',
'simple',
'cm',
'handful',
'patted',
'serve',
'sifted',
'bitesize',
'left',
'softened',
'assorted ',
'firm', 
'rings',
'size',
'sharp',
'slice',
'smashed',
'oldfashioned',
'depending',
'scant', 
'follows',
'steamed', 
'brushing',
'unbleached',
'nonstick',
'leftover',  
'sprinkling',
'dice',
'meal',
'layers',
'preferred',
'greasing',
'stemmed',
'boiling',
'split', 
'ozg',
'fillet',
'like',
'firmly', 
'lbg',
'rolled',
'sheet', 
'wide', 
'goodquality',
'get',
'additional',
'tbspg', 
'bowl',
'mashed',
'intact',
'loosely',
'links',
'ﬁnely',
'wellstirred',
'dash', 
'big',
'using',
'hard',
'surface',
'possible',
'attached',
'grinder' 
            ]
