# Exploratory Data Analysis

## Data Import

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw episode table; the relative path is resolved against the
# notebook's working directory.
raw_csv_path = 'data/chopped_raw.csv'
chopped_df = pd.read_csv(raw_csv_path)

## Data Cleaning

In [3]:
# Preview the first rows to see which columns the raw table carries.
chopped_df.head()

Unnamed: 0,season,season_episode,series_episode,episode_name,episode_notes,air_date,judge1,judge2,judge3,appetizer,entree,dessert,contestant1,contestant1_info,contestant2,contestant2_info,contestant3,contestant3_info,contestant4,contestant4_info
0,1,1,1,"""Octopus, Duck, Animal Crackers""",This is the first episode with only three offi...,"January 13, 2009",Marc Murphy,Alex Guarnaschelli,Aarón Sánchez,"baby octopus, bok choy, oyster sauce, smoked ...","duck breast, green onions, ginger, honey","prunes, animal crackers, cream cheese",Summer Kriegshauser,Private Chef and Nutrition Coach New York NY,Perry Pollaci,Private Chef and Sous chef Bar Blanc New Yo...,Katie Rosenhouse,Pastry Chef Olana Restaurant New York NY,Sandy Davis,Catering Chef Showstoppers Catering at Union...
1,1,2,2,"""Tofu, Blueberries, Oysters""",This is the first of a few episodes with five ...,"January 20, 2009",Aarón Sánchez,Alex Guarnaschelli,Marc Murphy,"firm tofu, tomato paste, prosciutto","daikon, pork loin, Napa cabbage, Thai chiles,...","phyllo dough, gorgonzola cheese, pineapple ri...",Raymond Jackson,Private Caterer and Culinary Instructor West...,Klaus Kronsteiner,Chef de cuisine Liberty National Golf Course...,Christopher Jackson,Executive Chef and Owner Ted and Honey Broo...,Pippa Calland,Owner and Chef Chef for Hire LLC Newville PA
2,1,3,3,"""Avocado, Tahini, Bran Flakes""",,"January 27, 2009",Aarón Sánchez,Alex Guarnaschelli,Marc Murphy,"lump crab meat, dried shiitake mushrooms, pin...","ground beef, cannellini beans, tahini paste, ...","brioche, cantaloupe, pecans, avocados",Margaritte Malfy,Executive Chef and Co-owner La Palapa New Y...,Rachelle Rodwell,Chef de cuisine SoHo Grand Hotel New York NY,Chris Burke,Private Chef New York NY,Andre Marrero,Chef tournant L’Atelier de Joël Robuchon Ne...
3,1,4,4,"""Banana, Collard Greens, Grits""","In the appetizer round, Chef Chuboda refused t...","February 3, 2009",Scott Conant,Amanda Freitag,Geoffrey Zakarian,"ground beef, wonton wrappers, cream of mushro...","scallops, collard greens, anchovies, sour cream","maple syrup, black plums, almond butter, waln...",Sean Chudoba,Executive Chef Ayza Wine Bar New York NY,Kyle Shadix,Chef Registered Dietician and Culinary Consu...,Luis Gonzales,Executive Chef Knickerbocker Bar & Grill Ne...,Einat Admony,Chef and Owner Taïm New York NY
4,1,5,5,"""Yucca, Watermelon, Tortillas""",,"February 10, 2009",Geoffrey Zakarian,Alex Guarnaschelli,Marc Murphy,"watermelon, canned sardines, pepper jack chee...","beef shoulder, yucca, raisins, ancho chiles, ...","flour tortillas, prosecco, Canadian bacon, ro...",John Keller,Personal Chef New York NY,Andrea Bergquist,Executive Chef New York NY,Ed Witt,Executive Chef / Wine Director Bloomingdale ...,Josh Emett,Chef de cuisine Gordon Ramsay at The London ...


In [4]:
# Keep only the three basket columns, one per course.
course_columns = ['appetizer', 'entree', 'dessert']
meals = chopped_df.loc[:, course_columns]

In [5]:
# Flag every episode that is missing at least one of its three baskets.
has_missing_basket = meals.isna().any(axis='columns')
null_meals = meals.loc[has_missing_basket]
null_meals

Unnamed: 0,appetizer,entree,dessert
555,,,


In [6]:
# Drop every episode with a missing basket rather than hard-coding a row
# label: a fixed label raises KeyError (or removes the wrong episode) as soon
# as the CSV gains, loses or reorders rows, and it would leave any other
# incomplete rows in place. how='any' is the same criterion the null check
# uses (at least one of the three courses is null).
full_meals = meals.dropna(how='any')

In [7]:
# Give each course's basket column the shared name 'ingredients' so the three
# single-column frames can be stacked into one long table.
appetizers = full_meals[['appetizer']].rename(columns={'appetizer': 'ingredients'})
entrees = full_meals[['entree']].rename(columns={'entree': 'ingredients'})
desserts = full_meals[['dessert']].rename(columns={'dessert': 'ingredients'})

# Stack in course order (appetizers, entrees, desserts) and renumber the rows.
course_frames = [appetizers, entrees, desserts]
baskets = pd.concat(course_frames, ignore_index=True)

## Data Check

In [8]:
# Sanity check: the first rows should be appetizer baskets, because the
# appetizer frame is the first one passed to pd.concat.
baskets.head()

Unnamed: 0,ingredients
0,"baby octopus, bok choy, oyster sauce, smoked ..."
1,"firm tofu, tomato paste, prosciutto"
2,"lump crab meat, dried shiitake mushrooms, pin..."
3,"ground beef, wonton wrappers, cream of mushro..."
4,"watermelon, canned sardines, pepper jack chee..."


In [9]:
# Confirm the stacked row count and that no null ingredient strings remain.
baskets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1704 entries, 0 to 1703
Data columns (total 1 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   ingredients  1704 non-null   object
dtypes: object(1)
memory usage: 13.4+ KB


In [10]:
# Count how often each exact basket string occurs.
baskets['ingredients'].value_counts()

 capon, pak choi, michelada mix, raisin bran                                 1
 mayonnaise, bananas, prickly pear cactus, pre-cooked pizza crust            1
 pickled eggs, ramps, instant ramen noodles, calf eyeballs                   1
 toybox eggplant, mission figs, sweetened condensed milk, matambre           1
 seafood paella, ground chicken, dry sherry, bread and butter pickles        1
                                                                            ..
 Kansas City style barbecue sauce, bananas, whole coconut, smoked almonds    1
 chickpea curry, beef shoulder, finger limes, coffee pods                    1
 sancocho, lulo, blue masa dough, chicharrón de cerdo                        1
 poussin, shakshuka, pea tendrils, whipped cream vodka                       1
 polenta cake, béchamel sauce, plums, pink Himalayan sea salt                1
Name: ingredients, Length: 1704, dtype: int64

# Preprocessing Text

In [11]:
import nltk
import re
import string

In [12]:
#text preprocessing steps - remove numbers, remove punctuation (including the
#commas between ingredients), strip end-whitespace, remove capitalization

def clean_ingredients(text):
    """Normalize one basket string for the vectorizers.

    Keeps every Unicode letter and whitespace character and drops everything
    else (digits, punctuation, symbols), then strips surrounding whitespace
    and lowercases the result.

    The earlier ASCII-only character class also deleted accented letters, so
    'béchamel' became 'bchamel' and 'chicharrón' became 'chicharrn'; it was
    also written in a non-raw string literal containing an invalid escape
    sequence. Filtering with str.isalpha / str.isspace avoids both problems.

    Parameters
    ----------
    text : str
        Raw comma-separated basket description.

    Returns
    -------
    str
        Lowercased text containing only letters and whitespace.
    """
    letters_and_spaces = ''.join(
        ch for ch in text if ch.isalpha() or ch.isspace()
    )
    return letters_and_spaces.strip().lower()


baskets['ingredients'] = baskets.ingredients.map(clean_ingredients)

#check
baskets.loc[0, 'ingredients']

'baby octopus bok choy oyster sauce smoked paprika'

# Count Vectorizer

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag of 1- to 4-grams over the cleaned basket strings, minus English stop words.
cv = CountVectorizer(stop_words='english', ngram_range=(1, 4))
X = cv.fit_transform(baskets.ingredients)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back on older releases.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# Densify only the five previewed rows: X.toarray() would materialize the
# entire sparse document-term matrix just to display its head.
pd.DataFrame(X[:5].toarray(), columns=feature_names)

Unnamed: 0,abalone,abalone curry,abalone curry leaves,abalone curry leaves serrano,abalone fresno,abalone fresno peppers,abalone fresno peppers bobbys,abalone wasabi,abalone wasabi candy,abalone wasabi candy canes,...,zucchini pig pickin cake,zucchini ribbon,zucchini ribbon skewers,zucchini ribbon skewers camel,zucchini survival,zucchini survival candy,zucchini wine,zucchini wine ice,zucchini wine ice cream,zwiebelkuchen
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Word2vec

In [14]:
# One token list ("sentence") per basket for Word2Vec, split on commas.
# NOTE(review): the cleaning step has already removed every comma, so each
# row comes back as a single-element list holding the whole basket string
# (see the sent[:2] output). A one-token sentence has no neighbouring tokens
# for Word2Vec to use as context. Later cells look vectors up by the full
# basket string, so changing the tokenization here also means changing those
# lookups - confirm the intended unit (word, ingredient or basket) first.
sent = baskets['ingredients'].str.split(',').tolist()

In [15]:
# Inspect the first two token lists that will be fed to Word2Vec.
sent[:2]

[['baby octopus bok choy oyster sauce smoked paprika'],
 ['firm tofu tomato paste prosciutto']]

In [16]:
from gensim.models import Word2Vec

# Skip-gram (sg=1) model with 50-dimensional vectors and a context window of
# 3; min_count=1 keeps every token, however rare.
w2v_options = dict(min_count=1, workers=3, window=3, sg=1)
try:
    # gensim >= 4.0 renamed the dimensionality keyword to `vector_size`.
    model = Word2Vec(sent, vector_size=50, **w2v_options)
except TypeError:
    # gensim 3.x rejects `vector_size` and only understands `size`.
    model = Word2Vec(sent, size=50, **w2v_options)

In [17]:
# Look the vector up through the KeyedVectors object (model.wv). Indexing the
# model itself is deprecated in gensim 3.x - presumably what produced the
# warning fragment in this cell's saved output - and is not supported in
# gensim 4. The key is the whole cleaned basket string.
model.wv['baby octopus bok choy oyster sauce smoked paprika']

  """Entry point for launching an IPython kernel.


array([ 0.00940975,  0.00572705,  0.00179597,  0.00839013, -0.00487955,
       -0.00782438,  0.00725232,  0.00261132, -0.00324737, -0.00085221,
       -0.00015065,  0.00139537, -0.00308207, -0.00684641, -0.0070388 ,
       -0.00231703,  0.0029226 , -0.00528657,  0.00140394,  0.00464613,
        0.00095449,  0.00587525,  0.00366082,  0.00787814, -0.00252235,
        0.00356555, -0.00159796, -0.00051812,  0.00568558,  0.00726013,
       -0.00784194, -0.00593997,  0.0056762 , -0.00826231, -0.00305512,
       -0.00476194, -0.00852246,  0.00851456, -0.00894912, -0.00703413,
       -0.00697013, -0.00033517,  0.00646908,  0.0099105 , -0.00752139,
       -0.00775776,  0.00363862,  0.00989887,  0.00251854, -0.00383531],
      dtype=float32)

## Compute Similarities

In [18]:
# Query nearest neighbours through model.wv: Word2Vec.most_similar is
# deprecated in gensim 3.x and absent from gensim 4, while
# KeyedVectors.most_similar is available in both.
model.wv.most_similar('baby octopus bok choy oyster sauce smoked paprika')

  """Entry point for launching an IPython kernel.


[('meatloaf mix sweet potatoes green beans old fashioned', 0.449171245098114),
 ('goat chops asafoetida petite french lentils quince paste',
  0.43560054898262024),
 ('wagyu ground beef swiss chard eggplant rollatini ranch dressing soda',
  0.4186896085739136),
 ('kholodets flaming shiso blue foot mushrooms goat head', 0.4138322174549103),
 ('red snapper sourdough bread purple cauliflower natto', 0.4091883897781372),
 ('chai tea latte gala apples black licorice puff pastry', 0.405636191368103),
 ('habanero chiles bittersweet chocolate fresh sardines', 0.40149879455566406),
 ('hollandaise sauce strawberries walnuts blue corn tortilla chips',
  0.3919934332370758),
 ('oysters pomegranate juice red chard gingerbread cookie dough',
  0.3877302408218384),
 ('muffin tops russian red kale soaked beans whole milk yogurt',
  0.3860570192337036)]

# Clustering