In [2]:
import random

In [3]:
random.seed(1234)

In [7]:
def title_generator_v1(main, aux, adj):
    '''
    Generates a new recipe title based on existing data.
    Parameters:
        main (list): list of ingredients that can be the central ingredient
        aux (list): extra ingredients (the supporting cast)
        adj (list): potential adjectives to describe the dish
    Returns:
        string object containing the new recipe title.
    '''
    
    ingrMain = random.randint(0, len(main) - 1)
    ingrAux = random.randint(0, len(aux) - 1)
    ingrAdj = random.randint(0, len(adj) -1)
    
    return adj[ingrAdj] + " " + main[ingrMain] + " with " + aux[ingrAux]

In [8]:
# create fake data
main = ["meatballs", "steak", "pasta", "lasagna", "ramen", "sandwich"]
aux = ["spinach", "noodles", "rice", "vegetables", "carrots"]
adj = ["curried", "epic", "roasted", "baked", "fried", "boiled", "creamed"]

In [20]:
new_recipe = title_generator_v1(main, aux, adj)
print(new_recipe)

fried meatballs with carrots


In [None]:
# onto level 2

In [21]:
import pandas as pd
import sqlite3
import numpy as np

In [25]:
def title_generator_v2(start):
    '''
    Generates a new recipe title based on bigger set of raw data.
    Parameters:
        start (str): keyword for parsing the recipe data
    Returns:
        string object with new recipe title
    '''
    
    with sqlite3.connect("recipes1M.db") as conn:
        cmd = \
        f"""
        SELECT R.title
        FROM recipes R
        WHERE R.title LIKE "%{start}%"
        """
        df = pd.read_sql_query(cmd, conn)
    
    
    
    
    return df

In [26]:
title_generator_v2("meatball")

Unnamed: 0,title
0,Greek Meatballs in Wine Sauce
1,Gf Turkey Meatballs
2,Turkey Porcupine Meatballs
3,Magnificent Meatballs
4,Spicy Meatball Stroganoff
...,...
6242,Sweet and Sour Meatballs
6243,Surprise Cocktail Meatballs
6244,Amazing Turkey Meatballs Recipe
6245,Real Italian Meatballs


In [27]:
# learn about word embeddings (a strategy for encoding text for ML)
# https://www.tensorflow.org/tutorials/text/word_embeddings

In [32]:
# get some cleaned data
with sqlite3.connect("recipes1M.db") as conn:
    cmd = \
    f"""
    SELECT R.title
    FROM recipes R
    WHERE R.title LIKE "%potato%"
    """
    df = pd.read_sql_query(cmd, conn)

df.head(5)

Unnamed: 0,title
0,Crunchy Onion Potato Bake
1,Erin's Mashed Potatoes
2,"Leek, Potato, and Bacon Casserole"
3,Midnight Mashed Potatoes
4,Old-Fashioned Sweet Potato Pie


In [33]:
#import io
#import os
#import re
#import shutil
#import string
import tensorflow as tf

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [34]:
embedding_layer = tf.keras.layers.Embedding(1000, 5)

In [35]:
result = embedding_layer(tf.constant([1, 2, 3]))
result.numpy()

array([[ 0.02101742,  0.00475962,  0.00221165,  0.0060176 , -0.04316204],
       [ 0.04935468,  0.04484386, -0.03575759,  0.00706694, -0.0112227 ],
       [-0.00302869,  0.01437206, -0.02922799, -0.03614534, -0.03918508]],
      dtype=float32)

In [36]:
result = embedding_layer(tf.constant([[0, 1, 2], [3, 4, 5]]))
result.shape

TensorShape([2, 3, 5])

In [39]:
dataset = tf.data.Dataset.from_tensor_slices((df.values))

In [41]:
# Create a custom standardization function to strip HTML break tags '<br />'.
def custom_standardization(input_data):
  lowercase = tf.strings.lower(input_data)
  stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ')
  return tf.strings.regex_replace(stripped_html,
                                  '[%s]' % re.escape(string.punctuation), '')


# Vocabulary size and number of words in a sequence.
vocab_size = 10000
sequence_length = 100

# Use the text vectorization layer to normalize, split, and map strings to
# integers. Note that the layer uses the custom standardization defined above.
# Set maximum_sequence length as all samples are not of the same length.
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length)

# Make a text-only dataset (no labels) and call adapt to build the vocabulary.
text_ds = dataset.map(lambda x: x)
vectorize_layer.adapt(text_ds)

NameError: name 're' is not defined