In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import re
import string

from tensorflow.keras import layers
from tensorflow.keras import losses

# requires update to tensorflow 2.4
# >>> conda activate PIC16B
# >>> pip install tensorflow==2.4
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.layers.experimental.preprocessing import StringLookup

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
import sqlite3

def retrieve(word):
    '''
    queries recipe database for titles including a specified word
    Parameters:
        word (str): keyword for parsing the recipe data
    Returns:
        pandas DataFrame with relevant recipes
    '''
    
    with sqlite3.connect("recipes1M.db") as conn:
        cmd = \
        f"""
        SELECT R.title
        FROM recipes R
        WHERE R.title LIKE "%{word}%"
        """
        df = pd.read_sql_query(cmd, conn)
    
    return df

In [43]:
df = retrieve("salmon")

In [4]:
df

Unnamed: 0,title
0,Salmon & Salad a La SPORTZ
1,Curried Pumpkin and Smoked Salmon Soup
2,Grilled Rosemary Salmon Spedini
3,Garlic and Dill Salmon
4,Spicy Grilled Orange Salmon
...,...
10788,Creamy Smoked Salmon and Dill Frittata
10789,Salmon and Spaghetti Casserole
10790,Salmon Pot Pie
10791,Pesto-Crusted Salmon Fillet With Citrus-Soy Sauce


In [None]:
# split up the data into the text and the ideal predicted label

In [61]:
def give_input_split(title):
    ''' gives the string but without the last space and the text following it'''
    split_text = title.rsplit(" ", maxsplit = 1)
    return split_text[0]

def give_output_split(title):
    ''' gives the word following the last space of the text'''
    split_text = title.rsplit(" ", maxsplit = 1)
    if(len(split_text) < 2):
        return ""
    return split_text[1]

In [62]:
# create new columns based on the previous functions
df["input"] = df["title"].apply(give_input_split)
df["predict"] = df["title"].apply(give_output_split)

In [65]:
df.drop(columns = ["next", "temp"])

Unnamed: 0,title,input,predict
0,Salmon & Salad a La SPORTZ,Salmon & Salad a La,SPORTZ
1,Curried Pumpkin and Smoked Salmon Soup,Curried Pumpkin and Smoked Salmon,Soup
2,Grilled Rosemary Salmon Spedini,Grilled Rosemary Salmon,Spedini
3,Garlic and Dill Salmon,Garlic and Dill,Salmon
4,Spicy Grilled Orange Salmon,Spicy Grilled Orange,Salmon
...,...,...,...
10788,Creamy Smoked Salmon and Dill Frittata,Creamy Smoked Salmon and Dill,Frittata
10789,Salmon and Spaghetti Casserole,Salmon and Spaghetti,Casserole
10790,Salmon Pot Pie,Salmon Pot,Pie
10791,Pesto-Crusted Salmon Fillet With Citrus-Soy Sauce,Pesto-Crusted Salmon Fillet With Citrus-Soy,Sauce


In [None]:
# so the idea is that we can use the input column to predict the output column

In [67]:
data = tf.data.Dataset.from_tensor_slices((df["input"], df["predict"]))

In [77]:
for headline, category in data.take(5):
    print(headline)
    print(category)
    print("")

tf.Tensor(b'Grilled Salmon Steaks with Lime', shape=(), dtype=string)
tf.Tensor(b'Butter', shape=(), dtype=string)

tf.Tensor(b'Trout/Salmon Fillets With Tarragon Cream', shape=(), dtype=string)
tf.Tensor(b'Sauce', shape=(), dtype=string)

tf.Tensor(b'Smoked Salmon and Pancetta', shape=(), dtype=string)
tf.Tensor(b'Crostini', shape=(), dtype=string)

tf.Tensor(b'Smoked Salmon and Sour Cream', shape=(), dtype=string)
tf.Tensor(b'Sauce', shape=(), dtype=string)

tf.Tensor(b"BgCtGal's Lemon Garlic Butter", shape=(), dtype=string)
tf.Tensor(b'Salmon', shape=(), dtype=string)



In [78]:
data = data.shuffle(buffer_size = len(data))
train_size = int(0.7*len(data))
val_size   = int(0.1*len(data))

train = data.take(train_size)
val   = data.skip(train_size).take(val_size)
test  = data.skip(train_size + val_size)
len(train), len(val), len(test)

(7555, 1079, 2159)

In [69]:
val

<TakeDataset shapes: ((), ()), types: (tf.string, tf.string)>

In [70]:
def standardize(input):
    lwer = tf.strings.lower(input)
    punc = tf.strings.regex_replace(lwer, '[%s]' % re.escape(string.punctuation), '')
    return punc 

In [71]:
max_tokens = 5000
sequence_length = 20

vectorize = TextVectorization(
    standardize = standardize,
    max_tokens = max_tokens,
    output_mode = 'int',
    output_sequence_length = sequence_length) 

In [72]:
# don't need this anymore
#data = tf.data.Dataset.from_tensor_slices(df) # convert to TensorFlow Dataset

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type NoneType).

In [79]:
recipes = train.map(lambda x, y: x)
vectorize.adapt(recipes)

In [73]:
# don't need this anymore
#vectorize.adapt(data) # set up the vectorizing

ValueError: The dataset passed to 'adapt' must contain a single tensor value.

In [75]:
for recipe in data.take(5):
    print(recipe)

(<tf.Tensor: shape=(), dtype=string, numpy=b'Micro Poached Salmon'>, <tf.Tensor: shape=(), dtype=string, numpy=b'Recipe'>)
(<tf.Tensor: shape=(), dtype=string, numpy=b"Bobby Flay's Tostones With Salmon Tartar And Relish">, <tf.Tensor: shape=(), dtype=string, numpy=b'Recipe'>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'Grilled Salmon With Tomato/Basil'>, <tf.Tensor: shape=(), dtype=string, numpy=b'Salsa'>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'Honey Mustard Grilled Salmon or Tuna'>, <tf.Tensor: shape=(), dtype=string, numpy=b'Steaks'>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'Salmon With Sesame Orange'>, <tf.Tensor: shape=(), dtype=string, numpy=b'Relish'>)


In [96]:

def vectorize_recipe(text, label):
    text = tf.expand_dims(text, -1)
    label = tf.expand_dims(label, -1)
    label = vectorize(label)
    return vectorize(text), [label]


#def vectorize_recipe(text):
 #   text = tf.expand_dims(text, -1)
  #  return vectorize(text)

train_vec = train.map(vectorize_recipe)
val_vec   = val.map(vectorize_recipe)
test_vec  = test.map(vectorize_recipe)

In [97]:
train_vec

<MapDataset shapes: ((None, 20), (1, None, 20)), types: (tf.int64, tf.int64)>

In [99]:
list(val_vec.take(1))

[(<tf.Tensor: shape=(1, 20), dtype=int64, numpy=
  array([[ 46,   9,   2,   3, 414,   0,   0,   0,   0,   0,   0,   0,   0,
            0,   0,   0,   0,   0,   0,   0]], dtype=int64)>,
  <tf.Tensor: shape=(1, 1, 20), dtype=int64, numpy=
  array([[[189,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
             0,   0,   0,   0,   0,   0,   0,   0]]], dtype=int64)>)]

In [84]:
vocabulary = vectorize.get_vocabulary() # collect all the words used

In [21]:
def num_to_str(vec, vocab):
    ''' converts numeric recipe to original title '''
    arr = vec.numpy()
    arr = arr[0]
    title = ""
    for num in arr:
        title += vocab[num] + " "
    return title

In [22]:
for item in val_vec:
    out = num_to_str(item, vocabulary)
    print(out)

smoked salmon chive and tomato pita crisps              
fresh lemon dill salmon with veggies               
lomi salmon salad                  
smoked salmon pasta                  
mustard soy salmon                  
salmon w roasted asparagus lemoncaper sauce               
salmon pilaf with green onions                
grilled salmon with citrus dill marinade               
salmon with white beans and kale               
teriyaki salmon                   
golden salmon on a bed of lentils              
honeymustardglazed grilled salmon                  
marinade for grilled salmon                 
smoked salmon with toasted homemade beer bread recipe             
pan seared salmon with fresh tomatobasil relish              
salmon with pineapple salsa                 
caramelized salmon over a warm potato salad              
seared wild salmon with late spring succotash              
broiled salmon with bacon raspberry beurre blanc recipe             
escalope of salmon with chant

cilantro salmon                   
delicious and simple baked salmon with fancy sauce             
greek salmon and seafood skewers                
panseared salmon with a tangy thai sauce              
christmas recipe for salmon                 
salmon en papillote                  
farfalle with creamy smoked salmon and vodka sauce             
grilled salmon with citrus salsa verde               
orange glazed salmon                  
peppered salmon with grilled corn succotash               
scallops and smoked salmon pasta                
bourbon glazed salmon                  
ovenroasted salmon asparagus and new potatoes               
blackened salmon                   
japanese salmon ramen                  
golden salmon coins                  
roasted salmon with lemon relish                
broiled salmon with cilantrolime compound butter               
salmon nicoise salad                  
bbq salmon over mixed greens                
salmon marsala with mushrooms and bro

In [23]:
def split_input_target(title):
    input_text = title[:-1]
    target_text = title[1:]
    
    return input_text, target_text

dataset_targeted = train_vec.map(split_input_target)

print(dataset_targeted)

<MapDataset shapes: ((None, 20), (None, 20)), types: (tf.int64, tf.int64)>


In [24]:
list(dataset_targeted.take(1).take(1))

[(<tf.Tensor: shape=(0, 20), dtype=int64, numpy=array([], shape=(0, 20), dtype=int64)>,
  <tf.Tensor: shape=(0, 20), dtype=int64, numpy=array([], shape=(0, 20), dtype=int64)>)]

In [26]:
list(train_vec.take(1))

[<tf.Tensor: shape=(1, 20), dtype=int64, numpy=
 array([[799,  96,   2,   9,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0]], dtype=int64)>]

In [27]:
VOCABULARY_SIZE = max_tokens +1

In [101]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    model = tf.keras.models.Sequential()

    model.add(tf.keras.layers.Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        batch_input_shape=[batch_size, None]
    ))

    model.add(tf.keras.layers.LSTM(
        units=rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer=tf.keras.initializers.GlorotNormal()
    ))

    model.add(tf.keras.layers.Dense(vocab_size))
    
    return model

model = build_model(
  vocab_size=VOCABULARY_SIZE,
  embedding_dim=256,
  rnn_units=1024,
  batch_size= sequence_length
)

model.summary()


Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (20, None, 256)           1280256   
_________________________________________________________________
lstm_1 (LSTM)                (20, None, 1024)          5246976   
_________________________________________________________________
dense_2 (Dense)              (20, None, 5001)          5126025   
Total params: 11,653,257
Trainable params: 11,653,257
Non-trainable params: 0
_________________________________________________________________


In [102]:
model.compile(loss=losses.SparseCategoricalCrossentropy(from_logits=True),
              optimizer='adam', 
              metrics=['accuracy'])

In [103]:
train_vec

<MapDataset shapes: ((None, 20), (1, None, 20)), types: (tf.int64, tf.int64)>

In [104]:
history = model.fit(train_vec, epochs = 10, validation_data = val_vec)

Epoch 1/10


InvalidArgumentError:  Input to reshape is a tensor with 20 values, but the requested shape has 400
	 [[node gradient_tape/sequential_2/embedding_1/embedding_lookup/Reshape_1 (defined at <ipython-input-104-eddd9874a904>:1) ]] [Op:__inference_train_function_89097]

Function call stack:
train_function


In [87]:
model = tf.keras.Sequential([
  layers.Embedding(max_tokens, output_dim = 3, name="embedding"),
  layers.Dropout(0.2),
  layers.GlobalAveragePooling1D(),
  layers.Dropout(0.2),
  layers.Dense(100)]
)

In [88]:
model.compile(loss=losses.SparseCategoricalCrossentropy(from_logits=True),
              optimizer='adam', 
              metrics=['accuracy'])

In [100]:
history = model.fit(train_vec, epochs = 10, validation_data = val_vec)

Epoch 1/10


InvalidArgumentError:  logits and labels must have the same first dimension, got logits shape [1,100] and labels shape [20]
	 [[node sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits (defined at <ipython-input-100-eddd9874a904>:1) ]] [Op:__inference_train_function_86520]

Function call stack:
train_function
