Purpose: Classify recipes by cuisine from their ingredient lists, using the machine learning techniques we practiced in class.

Data: https://www.kaggle.com/c/whats-cooking/data

Much of the code in the original draft was adapted from Phil Chodrow's PIC16B lecture (https://github.com/PhilChodrow/PIC16B/blob/master/lectures/tf/tf-3.ipynb)

Outline:
1. Load the data into a TensorFlow dataset
2. Build and compile the model
3. Classify!

In [1]:
# imports
import json
import pandas as pd
import numpy as np

import re
import string

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.layers.experimental.preprocessing import StringLookup

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import plotly.express as px 
import plotly.io as pio
pio.templates.default = "plotly_white"

In [4]:
# Read the training data: a JSON array of recipe records, each with an
# id, a cuisine label, and a list of ingredient names.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale.
with open("Data/train.json", encoding="utf-8") as f:
    data = json.load(f)
df = pd.DataFrame(data)
df

Unnamed: 0,id,cuisine,ingredients
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes..."
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g..."
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,22213,indian,"[water, vegetable oil, wheat, salt]"
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe..."
...,...,...,...
39769,29109,irish,"[light brown sugar, granulated sugar, butter, ..."
39770,11462,italian,"[KRAFT Zesty Italian Dressing, purple onion, b..."
39771,2238,irish,"[eggs, citrus fruit, raisins, sourdough starte..."
39772,41882,chinese,"[boneless chicken skinless thigh, minced garli..."


In [8]:
# Drop the recipe id; nothing downstream in this notebook uses it.
# DataFrame.drop returns a new frame rather than modifying df in place, so the
# result has to be assigned back. errors="ignore" keeps the cell safe to
# re-run once the column is already gone.
df = df.drop(columns=["id"], errors="ignore")

# Number of recipes per cuisine
df.groupby("cuisine").size()

cuisine
0      467
1      804
2     1546
3     2673
4      755
5     2646
6     1175
7     3003
8      667
9     7838
10     526
11    1423
12     830
13    6438
14     821
15     489
16    4320
17     989
18    1539
19     825
dtype: int64

In [6]:
# Flatten each recipe's list of ingredients into one space-separated string,
# which is the form the text vectorizer consumes later on.
df["ingredients"] = [
    " ".join(str(item) for item in items) for items in df["ingredients"]
]
df

Unnamed: 0,id,cuisine,ingredients
0,10259,greek,romaine lettuce black olives grape tomatoes ga...
1,25693,southern_us,plain flour ground pepper salt tomatoes ground...
2,20130,filipino,eggs pepper salt mayonaise cooking oil green c...
3,22213,indian,water vegetable oil wheat salt
4,13162,indian,black pepper shallots cornflour cayenne pepper...
...,...,...,...
39769,29109,irish,light brown sugar granulated sugar butter warm...
39770,11462,italian,KRAFT Zesty Italian Dressing purple onion broc...
39771,2238,irish,eggs citrus fruit raisins sourdough starter fl...
39772,41882,chinese,boneless chicken skinless thigh minced garlic ...


In [7]:
# Replace cuisine names with integer class ids. The fitted encoder `le` is
# kept around: le.classes_ maps ids back to names and gives the class count.
le = LabelEncoder()
cuisine_ids = le.fit_transform(df["cuisine"])
df["cuisine"] = cuisine_ids
df

Unnamed: 0,id,cuisine,ingredients
0,10259,6,romaine lettuce black olives grape tomatoes ga...
1,25693,16,plain flour ground pepper salt tomatoes ground...
2,20130,4,eggs pepper salt mayonaise cooking oil green c...
3,22213,7,water vegetable oil wheat salt
4,13162,7,black pepper shallots cornflour cayenne pepper...
...,...,...,...
39769,29109,8,light brown sugar granulated sugar butter warm...
39770,11462,9,KRAFT Zesty Italian Dressing purple onion broc...
39771,2238,8,eggs citrus fruit raisins sourdough starter fl...
39772,41882,3,boneless chicken skinless thigh minced garlic ...


In [11]:
# Build a TensorFlow dataset of (ingredient string, cuisine id) pairs
features = df["ingredients"]
labels = df["cuisine"]
data = tf.data.Dataset.from_tensor_slices((features, labels))

# Print the first five pairs (label first, then text) as a sanity check
for ingredients, cuisine in data.take(5):
    print(cuisine, ingredients, "", sep="\n")

tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(b'romaine lettuce black olives grape tomatoes garlic pepper purple onion seasoning garbanzo beans feta cheese crumbles', shape=(), dtype=string)

tf.Tensor(16, shape=(), dtype=int32)
tf.Tensor(b'plain flour ground pepper salt tomatoes ground black pepper thyme eggs green tomatoes yellow corn meal milk vegetable oil', shape=(), dtype=string)

tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(b'eggs pepper salt mayonaise cooking oil green chilies grilled chicken breasts garlic powder yellow onion soy sauce butter chicken livers', shape=(), dtype=string)

tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(b'water vegetable oil wheat salt', shape=(), dtype=string)

tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(b'black pepper shallots cornflour cayenne pepper onions garlic paste milk butter salt lemon juice water chili powder passata oil ground cumin boneless chicken skinless thigh garam masala double cream natural yogurt bay leaf', shape=(), dtype=string)

In [12]:
# Shuffle once, then carve the dataset into train / validation / test splits.
#
# reshuffle_each_iteration=False matters here: by default a shuffled dataset
# is re-shuffled every time it is iterated, so take()/skip() would select
# different records on each epoch and the three splits would overlap
# (validation/test examples would end up being trained on).
data = data.shuffle(buffer_size=len(data), reshuffle_each_iteration=False)

n_records = len(data)
train_size = int(0.75 * n_records)
val_size = int(0.1 * n_records)

train = data.take(train_size)
val = data.skip(train_size).take(val_size)
test = data.skip(train_size + val_size)  # whatever is left after train + val

In [13]:
def standardization(input_data):
    """Lowercase the text and delete all ASCII punctuation characters.

    Used as the ``standardize`` callable of the TextVectorization layer.
    Punctuation is removed rather than replaced by a space, so a hyphenated
    word is fused into a single token.

    Args:
        input_data: string tensor to normalize.

    Returns:
        A string tensor with lowercase, punctuation-free text.
    """
    punctuation_class = '[%s]' % re.escape(string.punctuation)
    return tf.strings.regex_replace(
        tf.strings.lower(input_data), punctuation_class, ''
    )

# Vocabulary size cap for the vectorizer; also the number of rows in the
# embedding layer built later.
max_tokens = 1500

# Number of token ids emitted per recipe (see output_sequence_length in the
# TextVectorization docs for the padding/truncation behaviour).
sequence_length = 40

# Turns a raw ingredient string into a fixed-length sequence of integer ids.
vectorize_layer = TextVectorization(
    max_tokens=max_tokens,
    standardize=standardization,
    output_sequence_length=sequence_length,
    output_mode='int',
)

In [14]:
# Learn the vocabulary from the training text only (labels are dropped),
# so the validation and test splits do not influence it.
ingredients = train.map(lambda text, _label: text)
vectorize_layer.adapt(ingredients)

In [None]:
def vectorize_ingr(text, label):
    """Convert one (text, label) pair into (token ids, [label]).

    A trailing axis is added to ``text`` before it goes through
    ``vectorize_layer``, and the label is wrapped in a one-element list so
    both outputs carry a matching leading dimension.

    Args:
        text: ingredient string tensor for one recipe.
        label: integer cuisine id for that recipe.

    Returns:
        Tuple of (vectorized text, ``[label]``).
    """
    text_column = tf.expand_dims(text, -1)
    token_ids = vectorize_layer(text_column)
    return token_ids, [label]

# Apply the same text -> token-id conversion to every split
train_vec, val_vec, test_vec = (
    split.map(vectorize_ingr) for split in (train, val, test)
)

In [None]:
# Materialize two vectorized training examples to inspect the
# (token ids, label) structure the model will receive.
list(train_vec.take(2))

In [None]:
# Build, compile, and train the model

In [None]:
# One output unit per cuisine known to the label encoder
n_cuisines = len(le.classes_)

# Embedding -> average over the token sequence -> linear classifier.
# The final Dense layer has no activation, so the model outputs raw logits.
model = tf.keras.Sequential(
    [
        # Named so the learned word vectors can be retrieved by name later
        layers.Embedding(max_tokens, output_dim=20, name="embedding"),
        layers.Dropout(0.2),
        layers.GlobalAveragePooling1D(),
        layers.Dropout(0.2),
        layers.Dense(n_cuisines),
    ]
)

In [None]:
# Labels are integer class ids and the model emits logits, hence the sparse
# categorical cross-entropy with from_logits=True.
cuisine_loss = losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(
    optimizer='adam',
    loss=cuisine_loss,
    metrics=['accuracy'],
)

In [None]:
# Train for three epochs, scoring the validation split after each one;
# `history` keeps the per-epoch metrics for plotting.
history = model.fit(
    train_vec,
    validation_data=val_vec,
    epochs=3,
)

In [None]:
from matplotlib import pyplot as plt

# Training vs. validation accuracy per epoch
ax = plt.gca()
for metric_key, curve_label in (("accuracy", "training"),
                                ("val_accuracy", "validation")):
    ax.plot(history.history[metric_key], label=curve_label)
ax.set(xlabel="epoch", ylabel="accuracy")
ax.legend()

In [None]:
# Score the trained model on the held-out test split
# (reports the loss and the accuracy metric set at compile time).
model.evaluate(test_vec)

In [None]:
# Pull the learned word vectors out of the layer named "embedding"
embedding_layer = model.get_layer('embedding')
weights = embedding_layer.get_weights()[0]

# Vocabulary learned by the vectorizer, used later to label the plotted points
vocab = vectorize_layer.get_vocabulary()

In [None]:
from sklearn.decomposition import PCA
# Reduce the embedding vectors to their first two principal components so
# they can be drawn on a 2-D scatter plot. Note this overwrites `weights`
# with the projected coordinates.
pca = PCA(n_components=2)
weights = pca.fit_transform(weights)

In [None]:
# Pad the vocabulary with blank labels until it has exactly one entry per row
# of the embedding matrix, so the two can sit side by side in a DataFrame.
# The target length is taken from `weights` itself rather than hard-coded:
# the previous constant (2000) did not match the embedding's max_tokens rows
# and made the DataFrame construction fail on unequal column lengths.
n_missing = len(weights) - len(vocab)
vocab.extend([" "] * n_missing)  # no-op when n_missing <= 0

In [None]:
# One row per vocabulary entry: the word and its two projected coordinates
embedding_columns = {
    "word": vocab,
    "x0": weights[:, 0],
    "x1": weights[:, 1],
}
embedding_df = pd.DataFrame(embedding_columns)
embedding_df

In [None]:
import plotly.express as px

# Same size value for every point; together with size_max this keeps the
# markers small and uniform.
marker_sizes = list(np.ones(len(embedding_df)))

# Interactive scatter of the 2-D word embedding; hovering shows the word
fig = px.scatter(
    embedding_df,
    x="x0",
    y="x1",
    size=marker_sizes,
    size_max=2,
    hover_name="word",
)

fig.show()