In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import re
import string

from tensorflow.keras import layers
from tensorflow.keras import losses

# requires update to tensorflow 2.4
# >>> conda activate PIC16B
# >>> pip install tensorflow==2.4
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.layers.experimental.preprocessing import StringLookup

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
import sqlite3

def retrieve(word):
    '''
    queries recipe database for titles including a specified word
    Parameters:
        word (str): keyword for parsing the recipe data; it is matched as a
            substring of the title. SQL LIKE wildcards ("%" and "_") inside
            the keyword keep their wildcard meaning.
    Returns:
        pandas DataFrame with relevant recipes (a single "title" column)
    '''

    # The keyword is passed as a bound parameter rather than spliced into the
    # SQL text, so quotes or SQL fragments in `word` cannot alter the query.
    cmd = \
        """
        SELECT R.title
        FROM recipes R
        WHERE R.title LIKE ?
        """
    pattern = f"%{word}%"

    # sqlite3's connection context manager only commits/rolls back; it does
    # not close the connection, so close it explicitly.
    conn = sqlite3.connect("recipes1M.db")
    try:
        df = pd.read_sql_query(cmd, conn, params=(pattern,))
    finally:
        conn.close()

    return df

In [None]:
# Pull every recipe title that contains "salmon" from the database.
df = retrieve("salmon")

In [None]:
# Display the retrieved titles.
df

In [None]:
def standardize(input):
    '''
    text standardization used by the vectorization layer
    Parameters:
        input: string tensor to clean
    Returns:
        string tensor, lowercased and with every character found in
        string.punctuation removed
    '''
    # character class that matches any single punctuation character
    punctuation_pattern = '[%s]' % re.escape(string.punctuation)
    lowercased = tf.strings.lower(input)
    return tf.strings.regex_replace(lowercased, punctuation_pattern, '')

In [None]:
# Vectorization settings.
max_tokens = 5000       # passed to TextVectorization as max_tokens
sequence_length = 20    # passed to TextVectorization as output_sequence_length

# Layer that turns title strings into integer sequences, using the custom
# `standardize` callable defined above for text cleanup.
vectorize = TextVectorization(
    standardize=standardize,
    max_tokens=max_tokens,
    output_mode='int',
    output_sequence_length=sequence_length,
)

In [None]:
# Convert the query result to a TensorFlow Dataset; from_tensor_slices slices
# along the first axis, so each DataFrame row becomes one dataset element.
data = tf.data.Dataset.from_tensor_slices(df)

In [None]:
# Fit the vectorization layer on the recipe dataset; this must run before the
# layer is applied or its vocabulary is read.
vectorize.adapt(data)

In [None]:
# Sanity check: print the first five raw dataset elements.
for recipe in data.take(5):
    print(recipe)

In [None]:
def vectorize_recipe(text):
    ''' appends a trailing axis to the title tensor and passes it through the vectorize layer '''
    return vectorize(tf.expand_dims(text, -1))

# run the vectorizer over every dataset element (recipes -> numbers)
data_vec = data.map(vectorize_recipe)

In [None]:
# Materialize the first five vectorized recipes to inspect the integer encoding.
list(data_vec.take(5))

In [None]:
# Collect the layer's vocabulary; num_to_str indexes into this list with the
# integer ids produced by the vectorizer.
vocabulary = vectorize.get_vocabulary()

In [None]:
def num_to_str(vec, vocab):
    '''
    converts numeric recipe to original title
    Parameters:
        vec: object whose .numpy() returns a 2-D array of integer token ids;
            only the first row is decoded (the vectorized recipes built in
            this notebook carry a leading axis of length 1)
        vocab (list of str): token strings, indexed by token id
    Returns:
        str with the decoded tokens separated by single spaces. Ids whose
        vocabulary entry is the empty string (presumably the padding slot)
        are skipped, so the title no longer ends in a run of spaces.
    '''
    token_ids = vec.numpy()[0]
    tokens = (vocab[token_id] for token_id in token_ids)
    # join once instead of growing the string inside the loop
    return " ".join(token for token in tokens if token)

In [None]:
# Decode each vectorized recipe back to text and print it. This walks the
# whole dataset (no .take), so the output has one line per recipe.
for item in data_vec:
    out = num_to_str(item, vocabulary)
    print(out)