# Data Exploration of Recipes Dataset

In [1]:
# Imports

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import string
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize

In [2]:
# Loading the data
file = "../raw_data/recipes.csv"
recipes_df = pd.read_csv(file)

## Data sanity checks

In [3]:
recipes_df.describe()

Unnamed: 0.1,Unnamed: 0
count,13501.0
mean,6750.0
std,3897.547327
min,0.0
25%,3375.0
50%,6750.0
75%,10125.0
max,13500.0


In [4]:
recipes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13501 entries, 0 to 13500
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Unnamed: 0           13501 non-null  int64 
 1   Title                13496 non-null  object
 2   Ingredients          13501 non-null  object
 3   Instructions         13493 non-null  object
 4   Image_Name           13501 non-null  object
 5   Cleaned_Ingredients  13501 non-null  object
dtypes: int64(1), object(5)
memory usage: 633.0+ KB


In [5]:
recipes_df.head()

Unnamed: 0.1,Unnamed: 0,Title,Ingredients,Instructions,Image_Name,Cleaned_Ingredients
0,0,Miso-Butter Roast Chicken With Acorn Squash Pa...,"['1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher...","Pat chicken dry with paper towels, season all ...",miso-butter-roast-chicken-acorn-squash-panzanella,"['1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher..."
1,1,Crispy Salt and Pepper Potatoes,"['2 large egg whites', '1 pound new potatoes (...",Preheat oven to 400°F and line a rimmed baking...,crispy-salt-and-pepper-potatoes-dan-kluger,"['2 large egg whites', '1 pound new potatoes (..."
2,2,Thanksgiving Mac and Cheese,"['1 cup evaporated milk', '1 cup whole milk', ...",Place a rack in middle of oven; preheat to 400...,thanksgiving-mac-and-cheese-erick-williams,"['1 cup evaporated milk', '1 cup whole milk', ..."
3,3,Italian Sausage and Bread Stuffing,"['1 (¾- to 1-pound) round Italian loaf, cut in...",Preheat oven to 350°F with rack in middle. Gen...,italian-sausage-and-bread-stuffing-240559,"['1 (¾- to 1-pound) round Italian loaf, cut in..."
4,4,Newton's Law,"['1 teaspoon dark brown sugar', '1 teaspoon ho...",Stir together brown sugar and hot water in a c...,newtons-law-apple-bourbon-cocktail,"['1 teaspoon dark brown sugar', '1 teaspoon ho..."


## Checking for nulls and na

In [6]:
recipes_df.isna().sum()

Unnamed: 0             0
Title                  5
Ingredients            0
Instructions           8
Image_Name             0
Cleaned_Ingredients    0
dtype: int64

In [7]:
recipes_df[recipes_df["Title"].isna()]

Unnamed: 0.1,Unnamed: 0,Title,Ingredients,Instructions,Image_Name,Cleaned_Ingredients
11221,11221,,[],,roasted-game-hens-with-caramelized-root-vegeta...,['']
12373,12373,,[],,chicken-soup-with-rice-232605,['']
12378,12378,,[],,double-lemon-bars-232572,['']
12818,12818,,[],,pear-and-frangipane-crostata-with-raspberry-vi...,['']
12829,12829,,[],,hazelnut-shortbread-sticks-231311,['']


In [8]:
recipes_df[recipes_df["Instructions"].isna()]

Unnamed: 0.1,Unnamed: 0,Title,Ingredients,Instructions,Image_Name,Cleaned_Ingredients
4293,4293,Broccolini-Cheddar Gratin with Rye Breadcrumbs,"['1/4 loaf seeded rye bread, torn into 1"" piec...",,broccolini-cheddar-gratin-with-rye-breadcrumbs...,"['1/4 loaf seeded rye bread, torn into 1"" piec..."
9636,9636,Smoked Salmon with Egg Salad and Green beans,[],,smoked-salmon-with-egg-salad-and-green-beans-3...,['']
10356,10356,Royal Icing,"['Using electric mixer, beat 3 1/4 cups powder...",,royal-icing-240751,"['Using electric mixer', 'beat 3 1/4 cups powd..."
11221,11221,,[],,roasted-game-hens-with-caramelized-root-vegeta...,['']
12373,12373,,[],,chicken-soup-with-rice-232605,['']
12378,12378,,[],,double-lemon-bars-232572,['']
12818,12818,,[],,pear-and-frangipane-crostata-with-raspberry-vi...,['']
12829,12829,,[],,hazelnut-shortbread-sticks-231311,['']


## Cleaning the data

In [9]:
recipes_df = recipes_df.drop(columns ='Unnamed: 0')
recipes_df.dropna

<bound method DataFrame.dropna of                                                    Title  \
0      Miso-Butter Roast Chicken With Acorn Squash Pa...   
1                        Crispy Salt and Pepper Potatoes   
2                            Thanksgiving Mac and Cheese   
3                     Italian Sausage and Bread Stuffing   
4                                           Newton's Law   
...                                                  ...   
13496                               Brownie Pudding Cake   
13497  Israeli Couscous with Roasted Butternut Squash...   
13498  Rice with Soy-Glazed Bonito Flakes and Sesame ...   
13499                                        Spanakopita   
13500  Mexican Poblano, Spinach, and Black Bean "Lasa...   

                                             Ingredients  \
0      ['1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher...   
1      ['2 large egg whites', '1 pound new potatoes (...   
2      ['1 cup evaporated milk', '1 cup whole milk', ...   
3    

In [10]:
# Calculating the length of 'Cleaned_Ingredients' columns to identify any empty ingredients and then dropping them

recipes_df['clean_len'] = [len(i) for i in recipes_df["Cleaned_Ingredients"]]
recipes_df.drop(recipes_df[recipes_df['clean_len']<5].index, axis = 0, inplace= True)

In [11]:
# Function to get rid of punctuation, numbers and weird formatting

def clean_list(ingredient_list):
    '''function to clean ingredints list'''
    punctuation = string.punctuation
    
    # break string into list of individual items
    ingredient_list = ingredient_list.split("', \'")

    # iterate through each item in list to remove punctuation and non alpha characters
    for i in range(len(ingredient_list)):
        for punc in punctuation:
            ingredient_list[i] = ingredient_list[i].replace(punc, '')
        ingredient_list[i] = ''.join(char for char in ingredient_list[i] if char.isalpha() or char == ' ')
        ingredient_list[i] = ingredient_list[i].strip()
        ingredient_list[i] = ingredient_list[i].replace('  ', ' ')
        
    return ' '. join(ingredient_list)

recipes_df['Clean2'] = recipes_df['Cleaned_Ingredients'].map(clean_list)

## Bag of Word and Stop Words

In [12]:
custom_stopwords = ['cup',
'tsp',
'tbsp',
'cups',
'oz',
'teaspoon',
'plus',
'tablespoons',
'sliced',
'large',
'finely',
'divided',
'cut',
'thinly',
'lb',
'ml',
'tablespoon',
'serving',
'optional',
'small',
'packed',
'torn',
'piece',
'preferably',
'lengthwise',
'roughly',
'trimmed',
'andor',
'pinch',
'rinsed',
'drained',
'bunch',
'pounds',
'slices',
'homemade',
'drizzling',
'fl',
'powdered',
'flaky',
'quartered',
'equipment',
'beaten',
'scrubbed',
'diagonal',
'cracked',
'coarse',
'separated',
'package',
'squeezed',
'grams',
'simple',
'cm',
'handful',
'patted',
'serve',
'sifted',
'bitesize',
'left',
'softened',
'assorted ',
'firm', 
'rings',
'size',
'sharp',
'slice',
'smashed',
'oldfashioned',
'depending',
'scant', 
'follows',
'steamed', 
'brushing',
'unbleached',
'nonstick',
'leftover',  
'sprinkling',
'dice',
'meal',
'layers',
'preferred',
'greasing',
'stemmed',
'boiling',
'split', 
'ozg',
'fillet',
'like',
'firmly', 
'lbg',
'rolled',
'sheet', 
'wide', 
'goodquality',
'get',
'additional',
'tbspg', 
'bowl',
'mashed',
'intact',
'loosely',
'links',
'ﬁnely',
'wellstirred',
'dash', 
'big',
'using',
'hard',
'surface',
'possible',
'attached',
'grinder' 
'cup',
 'medium',
 'water',
 'ounces',
 'removed',
 'sea',
 'teaspoons',
 'halved',
 'ounce',
 'coarsely',
 'toasted',
 'inch',
 'diced',
 'zest',
 'pound',
 'garnish',
 'minced',
 'temperature',
 'morton',
 'cooked',
 'diamond',
 'crystal',
 'pitted',
 'melted',
 'tender',
 'storebought',
 'parts',
 'pan',
 'cubes',
 'seeded',
 'needed',
 'sprigs',
 'hot',
 'lightly',
 'total',
 'sticks',
 'taste',
 'dusting',
 'stock',
 'extra',
 'see',
 'spray',
 'diameter',
 'countrystyle',
 'crumbled',
 'shaved',
 'cubed',
 'about',
 'choice',
 'cored',
 'hulled',
 'desired',
 'ends',
 'peel',
 'reserved',
 'blend',
 'tough',
 'kg',
 'little',
 'rind',
 'good',
 'heirloom',
 'doubleconcentrated',
 'confectioners',
 'prepared',
 'bag',
 'end',
 'long',
 'dutchprocess',
 'substitute',
 'regular',
 'instant',
 'recipe',
 'planks',
 'superfine',
 'outer',
 'plantains',
 'packages',
 'twists',
 'rim',
 'thickcut',
 'cleaned',
 'extrafirm',
 'quarts',
 'top',
 'creole',
 'reposado',
 'food',
 'mediumsize',
 'clean',
 'slightly',
 'heaping',
 'purpose',
 'proof',
 'old',
 'littleneck',
 'lo',
 'pot',
 'ritz',
 'approximately',
 'reserve',
 'percent',
 'lightlife',
 'noilly',
 'online',
 'according',
 'blackstrap',
 'agricole',
 'bias',
 'sanding']

In [21]:
custom_stopwords[170:]

['good',
 'heirloom',
 'doubleconcentrated',
 'confectioners',
 'prepared',
 'bag',
 'end',
 'long',
 'dutchprocess',
 'substitute',
 'regular',
 'instant',
 'recipe',
 'planks',
 'superfine',
 'outer',
 'plantains',
 'packages',
 'twists',
 'rim',
 'thickcut',
 'cleaned',
 'extrafirm',
 'quarts',
 'top',
 'creole',
 'reposado',
 'food',
 'mediumsize',
 'clean',
 'slightly',
 'heaping',
 'purpose',
 'proof',
 'old',
 'littleneck',
 'lo',
 'pot',
 'ritz',
 'approximately',
 'reserve',
 'percent',
 'lightlife',
 'noilly',
 'online',
 'according',
 'blackstrap',
 'agricole',
 'bias',
 'sanding']

In [13]:
# Setting bag of words scope - first 500

bag_of_words = recipes_df["Clean2"][:500]

In [14]:
# Removing regular stopwords

stop_words = set(stopwords.words('english')) 

for i in range(0, len(bag_of_words)):
    word_tokens = word_tokenize(bag_of_words[i])
    bag_of_words[i] = [w for w in word_tokens if not w in stop_words or not w in custom_stopwords] 

# Converting list to string
bag_of_words = bag_of_words.map(' '.join)

In [15]:
#Creating the bag of words dataframe

vectorizer = CountVectorizer()

X = vectorizer.fit_transform(bag_of_words)

bag_of_word_df = pd.DataFrame(X.toarray(),columns = vectorizer.get_feature_names_out())

In [16]:
#Doing a count of the bag of words to identify custom stop words
#Denise is doing n00: n50 (Denise up to 850)

bag_of_word_df.sum().sort_values(ascending=False)[:50] # Starting with the first 50 rows 


cup            1063
or              715
tsp             630
tbsp            588
salt            469
chopped         372
cups            356
oz              354
oil             348
ground          335
kosher          322
teaspoon        305
for             289
plus            287
pepper          278
and             275
tablespoons     269
sliced          257
large           255
finely          241
sugar           241
fresh           230
into            219
garlic          207
more            198
butter          183
divided         183
olive           183
red             177
cut             174
black           172
thinly          168
of              163
freshly         162
lb              159
juice           158
leaves          157
white           155
cloves          155
seeds           148
ml              147
unsalted        145
grated          140
tablespoon      135
lemon           132
serving         130
powder          127
peeled          126
flour           125
extravirgin     125
