# Recipe formatter

Consistently formats recipes from the models for human evaluation

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import pathlib
import pandas as pd

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [3]:
CACHE_DIR = "./drive/Shared drives/Capstone/tmp"
pathlib.Path(CACHE_DIR).mkdir(exist_ok=True)
dataset_path = os.path.join(CACHE_DIR, 'recipes.pkl')

In [4]:
!head -n 50 "./drive/Shared drives/Capstone/tmp/text_recipes.txt"

<TITLE>
Slow Cooker Chicken and Dumplings
<INGREDIENTS>
• 4 skinless, boneless chicken breast halves
• 2 tablespoons butter
• 2 (10.75 ounce) cans condensed cream of chicken soup
• 1 onion, finely diced
• 2 (10 ounce) packages refrigerated biscuit dough, torn into pieces
<INSTRUCTIONS>
‣ Place the chicken, butter, soup, and onion in a slow cooker, and fill with enough water to cover.
‣ Cover, and cook for 5 to 6 hours on High. About 30 minutes before serving, place the torn biscuit dough in the slow cooker. Cook until the dough is no longer raw in the center.
<DONE>
<TITLE>
Awesome Slow Cooker Pot Roast
<INGREDIENTS>
• 2 (10.75 ounce) cans condensed cream of mushroom soup
• 1 (1 ounce) package dry onion soup mix
• 1 1/4 cups water
• 5 1/2 pounds pot roast
<INSTRUCTIONS>
‣ In a slow cooker, mix cream of mushroom soup, dry onion soup mix and water. Place pot roast in slow cooker and coat with soup mixture.
‣ Cook on High setting for 3 to 4 hours, or on Low setting for 8 to 9 hours.
<DONE

In [5]:
if not os.path.exists(dataset_path):
    raise SystemExit("Run preprocess_pickle.ipynb to generate data file before continuing")
else:
    recipes = pd.read_pickle(dataset_path)

# TODO: Remove subsetting for final training
recipes = recipes[:20000]

In [6]:
recipes

Unnamed: 0,title,ingredients,instructions
0,Slow Cooker Chicken and Dumplings,"• 4 skinless, boneless chicken breast halves\n...","‣ Place the chicken, butter, soup, and onion i..."
1,Awesome Slow Cooker Pot Roast,• 2 (10.75 ounce) cans condensed cream of mush...,"‣ In a slow cooker, mix cream of mushroom soup..."
2,Brown Sugar Meatloaf,• 1/2 cup packed brown sugar\n• 1/2 cup ketchu...,‣ Preheat oven to 350 degrees F (175 degrees C...
3,Best Chocolate Chip Cookies,"• 1 cup butter, softened\n• 1 cup white sugar\...",‣ Preheat oven to 350 degrees F (175 degrees C...
4,Homemade Mac and Cheese Casserole,• 8 ounces whole wheat rotini pasta\n• 3 cups ...,‣ Preheat oven to 350 degrees F. Line a 2-quar...
...,...,...,...
20161,Georgia's Tennessee Jam Cake,"• 1 cup butter, softened\n• 2 cups white sugar...",‣ Preheat the oven to 350 degrees F (175 degre...
20162,Poached Eggs and Asparagus,• 4 eggs\n• 1 cube chicken bouillon (optional)...,‣ Fill a saucepan half way full of water. Brin...
20163,Bistecca alla Fiorentina (Tuscan Porterhouse),"• 4 sprigs fresh rosemary, chopped\n• 1 (2 1/2...",‣ Press chopped rosemary onto both sides of po...
20164,Courtney's Three Tomato Pasta Sauce,• 1/2 pound bulk mild Italian sausage\n• 1/2 p...,‣ Cook mild and hot Italian sausage in a large...


In [7]:
def recipe_to_str(recipe):
    # Combine components of recipe into a string
    return f"{recipe.title}<ING>{recipe.ingredients}<INS>{recipe.instructions}"

recipe_strings = recipes.apply(recipe_to_str, axis=1)

In [8]:
recipe_strings[:10]

0     Slow Cooker Chicken and Dumplings<ING>• 4 skin...
1     Awesome Slow Cooker Pot Roast<ING>• 2 (10.75 o...
2     Brown Sugar Meatloaf<ING>• 1/2 cup packed brow...
3     Best Chocolate Chip Cookies<ING>• 1 cup butter...
4     Homemade Mac and Cheese Casserole<ING>• 8 ounc...
5     Banana Banana Bread<ING>• 2 cups all-purpose f...
7     Mom's Zucchini Bread<ING>• 3 cups all-purpose ...
8     The Best Rolled Sugar Cookies<ING>• 1 1/2 cups...
9     Singapore Chili Crabs<ING>• Sauce:\n• 1/2 cup ...
10    Downeast Maine Pumpkin Bread<ING>• 1 (15 ounce...
dtype: object

#Import GPT-2 recipes and RNN recipes

GPT-2 recipes

In [11]:
CACHE_DIR = "./drive/Shared drives/Capstone/tmp"
#pathlib.Path(CACHE_DIR).mkdir(exist_ok=True)
gpt2_recipes_path = os.path.join(CACHE_DIR, 'gpt2_title_prompt_output_recipes')

In [13]:
pretty_print_path = os.path.join(CACHE_DIR, 'pretty_printed', 'gpt2_finetuned')

for i in range(500):
  print(f"> {i} out of 500")
  file_path = os.path.join(gpt2_recipes_path, f"gpt2_recipe_{i}.txt")
  recipe_string = "TITLE\n"
  for line in open(file_path, 'r'):
    recipe_string += line
  recipe_string = recipe_string.replace("<ING>", "\n\nINGREDIENTS").replace("<INS>", "\n\nINSTRUCTIONS")
  recipe_string = recipe_string.replace("•", "\n•").replace("‣", "\n‣")
  out_path = os.path.join(pretty_print_path, f"gpt2_recipe_{i}.txt")
  with open(out_path, 'w') as out_file:
    out_file.write(recipe_string)

> 0 out of 500
> 1 out of 500
> 2 out of 500
> 3 out of 500
> 4 out of 500
> 5 out of 500
> 6 out of 500
> 7 out of 500
> 8 out of 500
> 9 out of 500
> 10 out of 500
> 11 out of 500
> 12 out of 500
> 13 out of 500
> 14 out of 500
> 15 out of 500
> 16 out of 500
> 17 out of 500
> 18 out of 500
> 19 out of 500
> 20 out of 500
> 21 out of 500
> 22 out of 500
> 23 out of 500
> 24 out of 500
> 25 out of 500
> 26 out of 500
> 27 out of 500
> 28 out of 500
> 29 out of 500
> 30 out of 500
> 31 out of 500
> 32 out of 500
> 33 out of 500
> 34 out of 500
> 35 out of 500
> 36 out of 500
> 37 out of 500
> 38 out of 500
> 39 out of 500
> 40 out of 500
> 41 out of 500
> 42 out of 500
> 43 out of 500
> 44 out of 500
> 45 out of 500
> 46 out of 500
> 47 out of 500
> 48 out of 500
> 49 out of 500
> 50 out of 500
> 51 out of 500
> 52 out of 500
> 53 out of 500
> 54 out of 500
> 55 out of 500
> 56 out of 500
> 57 out of 500
> 58 out of 500
> 59 out of 500
> 60 out of 500
> 61 out of 500
> 62 out of 500
> 

In [15]:
pretty_print_path = os.path.join(CACHE_DIR, 'pretty_printed', 'ground_truth')

text_recipes_path = os.path.join(CACHE_DIR, 'text_recipes.txt')
with open(text_recipes_path, 'r') as in_file:
  recipe_string = ""
  count = 0
  for line in in_file:
    if "<DONE>" in line:
      recipe_string = recipe_string.replace("<TITLE>", "TITLE")
      recipe_string = recipe_string.replace("<INGREDIENTS>", "\nINGREDIENTS")
      recipe_string = recipe_string.replace("<INSTRUCTIONS>", "\nINSTRUCTIONS")
      out_path = os.path.join(pretty_print_path, f"ground_truth_recipe_{count}.txt")
      with open(out_path, 'w') as out_file:
        out_file.write(recipe_string)
      count += 1
      recipe_string = ""
    else:
      recipe_string += line
    if count > 499:
      break

In [18]:
pretty_print_path = os.path.join(CACHE_DIR, 'pretty_printed', 'rnn')

rnn_recipes_path = os.path.join(CACHE_DIR, 'rnn_char_title_prompt_output_recipes')

for i in range(500):
  print(f"> {i} out of 500")
  file_path = os.path.join(rnn_recipes_path, f"rnn_char_recipe_{i}.txt")
  recipe_string = "TITLE\n"
  for line in open(file_path, 'r'):
    recipe_string += line
  recipe_string = recipe_string.replace("\U0001F951", "INGREDIENTS").replace("\U0001f963", "INSTRUCTIONS")
  # recipe_string = recipe_string.replace("•", "\n•").replace("‣", "\n‣")
  print(recipe_string)
  out_path = os.path.join(pretty_print_path, f"rnn_recipe_{i}.txt")
  with open(out_path, 'w') as out_file:
    out_file.write(recipe_string)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
‣ Bake in preheated oven for 55 minutes or until heated through. Set aside to cool completely.
‣ Top butter chicken with 
> 117 out of 500
TITLE
Fusilli with Spinach and Asiago Cheese Sush and Hot Sauce, recipe follows
• 1 (6 ounce) can French-fruit pectin
• 2 tablespoons milk
• 1 teaspoon vanilla extract

INSTRUCTIONS
‣ Preheat an oven to 400 degrees F (200 degrees C). Grease a cookie sheet.
‣ In a medium saucepan over medium heat, combine the cream, flour, cinnamon, cloves, ginger and salt; pour the white sugar over the unbaked pie spinach. Pour over the hot syrup and mix. Crush lightly on all sides of the guace, and place over the top just apple mixture and top with the thawed frozen mixer to water. Sprinkle the crushed cranberries and pine nuts with the custard. Pour the smoothie evenly over the top of the top. Pour the remaining mixture over the top, and spread the mixture over the top. Cover the dish with aluminum f

In [19]:
gpt2_recipe_strings = []

for i in range(500):
  print(f"> {i} out of 500")
  file_path = os.path.join(gpt2_recipes_path, f"gpt2_recipe_{i}.txt")
  gpt2_recipe_string = ""
  for line in open(file_path, 'r'):
    gpt2_recipe_string += line
  gpt2_recipe_strings.append(gpt2_recipe_string)

> 0 out of 500
> 1 out of 500
> 2 out of 500
> 3 out of 500
> 4 out of 500
> 5 out of 500
> 6 out of 500
> 7 out of 500
> 8 out of 500
> 9 out of 500
> 10 out of 500
> 11 out of 500
> 12 out of 500
> 13 out of 500
> 14 out of 500
> 15 out of 500
> 16 out of 500
> 17 out of 500
> 18 out of 500
> 19 out of 500
> 20 out of 500
> 21 out of 500
> 22 out of 500
> 23 out of 500
> 24 out of 500
> 25 out of 500
> 26 out of 500
> 27 out of 500
> 28 out of 500
> 29 out of 500
> 30 out of 500
> 31 out of 500
> 32 out of 500
> 33 out of 500
> 34 out of 500
> 35 out of 500
> 36 out of 500
> 37 out of 500
> 38 out of 500
> 39 out of 500
> 40 out of 500
> 41 out of 500
> 42 out of 500
> 43 out of 500
> 44 out of 500
> 45 out of 500
> 46 out of 500
> 47 out of 500
> 48 out of 500
> 49 out of 500
> 50 out of 500
> 51 out of 500
> 52 out of 500
> 53 out of 500
> 54 out of 500
> 55 out of 500
> 56 out of 500
> 57 out of 500
> 58 out of 500
> 59 out of 500
> 60 out of 500
> 61 out of 500
> 62 out of 500
> 

In [None]:
len(gpt2_recipe_strings)

In [None]:
gpt2_recipe_strings_df = pd.DataFrame(gpt2_recipe_strings)

In [None]:
gpt2_recipe_strings_df

In [None]:
print(gpt2_recipe_strings[0])

Char-level RNN recipes

In [None]:
#CACHE_DIR = "./drive/Shared drives/CS 269: Recipe/tmp"
#pathlib.Path(CACHE_DIR).mkdir(exist_ok=True)
rnn_recipes_path = os.path.join(CACHE_DIR, 'rnn_output_recipes')

rnn_recipe_strings = []

for i in range(500):
  print(f"> {i} out of 500")
  file_path = os.path.join(rnn_recipes_path, f"rnn_recipe_{i}.txt")
  rnn_recipe_string = ""
  for line in open(file_path, 'r'):
    rnn_recipe_string += line
  rnn_recipe_strings.append(rnn_recipe_string)

In [None]:
rnn_recipe_strings

In [None]:
rnn_recipe_strings[7][20:30]

In [None]:
len(rnn_recipe_strings)

In [None]:
# Clean up RNN recipe strings

cleaned_rnn_recipe_strings = []

for rnn_recipe_string in rnn_recipe_strings:
  cleaned_string = ""
  for char in rnn_recipe_string:
    if char == "\n":
      pass
    elif char == "\U0001f963":
      cleaned_string += "<INS>"
    else:
      cleaned_string += char
  cleaned_rnn_recipe_strings.append(cleaned_string)

In [None]:
len(cleaned_rnn_recipe_strings)
cleaned_rnn_recipe_strings

In [None]:
rnn_recipe_strings_df = pd.DataFrame(rnn_recipe_strings)

In [None]:
rnn_recipe_strings_df

Save these datasets

In [None]:
dataset_path = os.path.join(CACHE_DIR, 'gpt2_finetuned_recipes.pkl')
gpt2_recipe_strings_df.to_pickle(dataset_path) 

In [None]:
dataset_path = os.path.join(CACHE_DIR, 'rnn_recipes.pkl')
rnn_recipe_strings_df.to_pickle(dataset_path) 

#Training

Load in datasets

In [None]:
gpt2_recipe_strings_df = pd.read_pickle(os.path.join(CACHE_DIR, "gpt2_finetuned_recipes.pkl"))
rnn_recipe_strings_df = pd.read_pickle(os.path.join(CACHE_DIR, "rnn_recipes.pkl"))

Training-Test split

In [None]:
# recipe_strings['label'] = 0 # ground truth

recipe_strings_df = pd.DataFrame(recipe_strings)
# recipe_strings_df.rename(columns={"0" : "text"})
recipe_strings_df['text'] = recipe_strings_df[0]
recipe_strings_df['label'] = 0 # ground truth
gpt2_recipe_strings_df['text'] = gpt2_recipe_strings_df[0]
gpt2_recipe_strings_df['label'] = 1 # GPT-2 finetuned
rnn_recipe_strings_df['text'] = rnn_recipe_strings_df[0]
rnn_recipe_strings_df['label'] = 2 # RNN

In [None]:
recipe_strings_df['label']

0        0
1        0
2        0
3        0
4        0
        ..
20161    0
20162    0
20163    0
20164    0
20165    0
Name: label, Length: 20000, dtype: int64

##Distinguishing GPT-2 from the ground truth

In [None]:
truth_vs_gpt2_df = pd.concat([recipe_strings_df.sample(n=100), gpt2_recipe_strings_df.sample(n=100)])

In [None]:
truth_vs_gpt2_df

Unnamed: 0,0,text,label
18395,Grilled Gingered Salmon<ING>• 1 cup soy sauce\...,Grilled Gingered Salmon<ING>• 1 cup soy sauce\...,0
17976,Southern Coleslaw<ING>• 1 tablespoon red wine ...,Southern Coleslaw<ING>• 1 tablespoon red wine ...,0
7788,Pork Chop Soup<ING>• 2 (8 ounce) bone-in pork ...,Pork Chop Soup<ING>• 2 (8 ounce) bone-in pork ...,0
19725,Creamy Family Style Rice Pudding<ING>• 1 1/3 c...,Creamy Family Style Rice Pudding<ING>• 1 1/3 c...,0
15876,Veggie Pate<ING>• 1 cup sunflower seeds\n• 1/2...,Veggie Pate<ING>• 1 cup sunflower seeds\n• 1/2...,0
...,...,...,...
148,Yeast-Free Strawberry Dessert<ING>• 1 1/2 cups...,Yeast-Free Strawberry Dessert<ING>• 1 1/2 cups...,1
343,Jollyrogers' Cake III<ING>• 1 (18.25 ounce) pa...,Jollyrogers' Cake III<ING>• 1 (18.25 ounce) pa...,1
31,Quebec-Style Au Gratin<ING>• 1 (3 pound) beef ...,Quebec-Style Au Gratin<ING>• 1 (3 pound) beef ...,1
240,(Chicken) Alfredo<ING>• 1 (8 ounce) package un...,(Chicken) Alfredo<ING>• 1 (8 ounce) package un...,1


In [None]:
truth_vs_gpt2_df['nwords'] = truth_vs_gpt2_df['text'].apply(lambda x: len(x.split()))

truth_vs_gpt2_df = truth_vs_gpt2_df[truth_vs_gpt2_df['nwords']<350]

truth_vs_gpt2_df

Unnamed: 0,0,text,label,nwords
18395,Grilled Gingered Salmon<ING>• 1 cup soy sauce\...,Grilled Gingered Salmon<ING>• 1 cup soy sauce\...,0,183
17976,Southern Coleslaw<ING>• 1 tablespoon red wine ...,Southern Coleslaw<ING>• 1 tablespoon red wine ...,0,89
7788,Pork Chop Soup<ING>• 2 (8 ounce) bone-in pork ...,Pork Chop Soup<ING>• 2 (8 ounce) bone-in pork ...,0,214
19725,Creamy Family Style Rice Pudding<ING>• 1 1/3 c...,Creamy Family Style Rice Pudding<ING>• 1 1/3 c...,0,86
15876,Veggie Pate<ING>• 1 cup sunflower seeds\n• 1/2...,Veggie Pate<ING>• 1 cup sunflower seeds\n• 1/2...,0,170
...,...,...,...,...
148,Yeast-Free Strawberry Dessert<ING>• 1 1/2 cups...,Yeast-Free Strawberry Dessert<ING>• 1 1/2 cups...,1,98
343,Jollyrogers' Cake III<ING>• 1 (18.25 ounce) pa...,Jollyrogers' Cake III<ING>• 1 (18.25 ounce) pa...,1,163
31,Quebec-Style Au Gratin<ING>• 1 (3 pound) beef ...,Quebec-Style Au Gratin<ING>• 1 (3 pound) beef ...,1,128
240,(Chicken) Alfredo<ING>• 1 (8 ounce) package un...,(Chicken) Alfredo<ING>• 1 (8 ounce) package un...,1,168


In [None]:
truth_vs_gpt2_df['nwords'].max()

332

Build DistilBERT

In [None]:
!pip install transformers
import transformers as ppb



In [None]:
model_class, tokenizer_class, pretrained_weights = ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased'

# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

In [None]:
tokenized = truth_vs_gpt2_df['text'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

tokenized

18395    [101, 26192, 2094, 14580, 2098, 11840, 1026, 1...
17976    [101, 2670, 5624, 14540, 10376, 1026, 13749, 1...
7788     [101, 15960, 24494, 11350, 1026, 13749, 1028, ...
19725    [101, 24519, 2155, 2806, 5785, 29593, 1026, 13...
15876    [101, 2310, 13871, 2666, 6986, 2063, 1026, 137...
                               ...                        
148      [101, 21957, 1011, 2489, 16876, 18064, 1026, 1...
343      [101, 22193, 3217, 15776, 1005, 9850, 3523, 10...
31       [101, 5447, 1011, 2806, 8740, 24665, 20363, 10...
240      [101, 1006, 7975, 1007, 19423, 1026, 13749, 10...
248      [101, 4589, 1010, 16876, 1010, 1998, 14722, 10...
Name: text, Length: 199, dtype: object

In [None]:
max_len = max([len(i) for i in tokenized.values])

padded = np.array([i + [0]*(max_len - len(i)) for i in tokenized.values])

attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(199, 472)

In [None]:
input_ids = torch.tensor(padded)
attention_mask = torch.tensor(attention_mask)

with torch.no_grad():
  last_hidden_states = model(input_ids, attention_mask=attention_mask)

In [None]:
features = last_hidden_states[0][:,0,:].numpy()

In [None]:
features.shape

(199, 768)

In [None]:
labels = truth_vs_gpt2_df['label']
labels

18395    0
17976    0
7788     0
19725    0
15876    0
        ..
148      1
343      1
31       1
240      1
248      1
Name: label, Length: 199, dtype: int64

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

train_features, test_features, train_labels, test_labels = train_test_split(features, labels)

ValueError: ignored

In [None]:
class FFNet(nn.Module):
  def __init__(self, input_dim=768, hidden_dim=1024, output_dim=1, dropout=0.8):
    super(FFNet, self).__init__()
    self.fc1 = nn.Sequential(
        nn.Linear(input_dim, hidden_dim),
        nn.Dropout(dropout),
        nn.LeakyReLU(),
        nn.BatchNorm1d(hidden_dim),
    )
    self.fc3 = nn.Sequential(
        nn.Linear(hidden_dim, output_dim),
        nn.Sigmoid(),
    )

  def forward(self, x):
    x = self.fc1(x)
    x = self.fc3(x)
    return x

ffnet = FFNet()

In [None]:
criterion = nn.BCELoss()
optimizer = optim.SGD(ffnet.parameters(), lr=0.001, momentum=0.9)

In [None]:
class Dataset(torch.utils.data.Dataset):
  def __init__(self, features, labels):
    self.features = features
    self.labels = labels
  def __len__(self):
    return self.features.shape[0]
  def __getitem__(self, index):
    X = self.features[index,:]
    y = self.labels[index]
    return X, y

train_dataset = Dataset(train_features, train_labels.to_numpy())

In [None]:
trainloader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=8,
    shuffle=True
)

In [None]:
NUM_EPOCHS = 1000

for epoch in range(NUM_EPOCHS):
  running_loss = 0.0
  for data, labels in trainloader:
    optimizer.zero_grad()
    outputs = ffnet(data)
    loss = criterion(outputs, labels.float().unsqueeze(1))
    loss.backward()
    optimizer.step()
    #print(outputs.view(1, -1))
    #print(labels.view(1, -1))

    running_loss += loss.item()
  if epoch % 50 == 0:
    print('Epoch {}, loss: {}'.format(epoch, running_loss))

print('Finished training')

Epoch 0, loss: 14.062329351902008
Epoch 50, loss: 12.031810313463211
Epoch 100, loss: 12.852627873420715
Epoch 150, loss: 13.759123593568802
Epoch 200, loss: 14.313785433769226
Epoch 250, loss: 14.727734088897705
Epoch 300, loss: 14.656821697950363
Epoch 350, loss: 13.173215806484222
Epoch 400, loss: 12.370548456907272
Epoch 450, loss: 14.940066009759903
Epoch 500, loss: 14.06834763288498
Epoch 550, loss: 13.959258615970612
Epoch 600, loss: 13.480918765068054
Epoch 650, loss: 14.402411460876465
Epoch 700, loss: 14.474220991134644
Epoch 750, loss: 14.299385607242584
Epoch 800, loss: 15.006634294986725
Epoch 850, loss: 12.757049798965454
Epoch 900, loss: 13.281872779130936
Epoch 950, loss: 14.127125859260559
Finished training


In [None]:
test_dataset = Dataset(test_features, test_labels.to_numpy())

testloader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=64,
    shuffle=True
)

In [None]:
y_pred = []
y = []
#total = 0
#correct = 0
with torch.no_grad():
  for data, labels in testloader:
    outputs = ffnet(data)
    predicted = torch.LongTensor(np.where(outputs > 0.5, 1, 0)).view(-1)
    y_pred.extend(predicted.tolist())
    y.extend(labels.tolist())
    #total += labels.size(0)
    #correct += (predicted == labels).sum().item()

print(y_pred)
print(y)

print('Accuracy of the FFNet trained on BERT sentence embeddings\non the test sentences: %0.3f %%' % accuracy_score(np.array(y), np.array(y_pred)))
print('F1-score of the FFNet trained on BERT sentence embeddings\non the test sentences: %0.3f %%' % f1_score(np.array(y), np.array(y_pred), average='micro'))

[1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1]
[0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1]
Accuracy of the FFNet trained on BERT sentence embeddings
on the test sentences: 0.560 %
F1-score of the FFNet trained on BERT sentence embeddings
on the test sentences: 0.560 %


##Distinguishing the RNN from the ground truth

In [None]:
truth_vs_rnn_df = pd.concat([recipe_strings_df.sample(n=100), rnn_recipe_strings_df.sample(n=100)])

truth_vs_rnn_df

Unnamed: 0,0,text,label
15677,Sweet-n-Sour Kielbasa<ING>• 2 pounds kielbasa ...,Sweet-n-Sour Kielbasa<ING>• 2 pounds kielbasa ...,0
15670,Green Bean and Asparagus Salad<ING>• 1 pound f...,Green Bean and Asparagus Salad<ING>• 1 pound f...,0
858,Sweet and Sour Pork III<ING>• 1 pound pork but...,Sweet and Sour Pork III<ING>• 1 pound pork but...,0
655,Chicken and Biscuit Casserole<ING>• 1/4 cup bu...,Chicken and Biscuit Casserole<ING>• 1/4 cup bu...,0
18224,"Menudo<ING>• 2 pounds beef tripe\n• 2 onions, ...","Menudo<ING>• 2 pounds beef tripe\n• 2 onions, ...",0
...,...,...,...
51,"Gorgonzola: Remove chopped chocolate chips, dr...","Gorgonzola: Remove chopped chocolate chips, dr...",2
30,D'Emily Shaped Cheese Cupcakes cookies a small...,D'Emily Shaped Cheese Cupcakes cookies a small...,2
238,Blackberry-Cinnamon Pie\n\n🥑\n• 4 (6 ounce) fa...,Blackberry-Cinnamon Pie\n\n🥑\n• 4 (6 ounce) fa...,2
432,Dippin's Fasici-Ancho yeast and Sour cream mak...,Dippin's Fasici-Ancho yeast and Sour cream mak...,2


In [None]:
truth_vs_rnn_df['nwords'] = truth_vs_rnn_df['text'].apply(lambda x: len(x.split()))

truth_vs_rnn_df = truth_vs_rnn_df[truth_vs_rnn_df['nwords']<350]

truth_vs_rnn_df

Unnamed: 0,0,text,label,nwords
15677,Sweet-n-Sour Kielbasa<ING>• 2 pounds kielbasa ...,Sweet-n-Sour Kielbasa<ING>• 2 pounds kielbasa ...,0,75
15670,Green Bean and Asparagus Salad<ING>• 1 pound f...,Green Bean and Asparagus Salad<ING>• 1 pound f...,0,201
858,Sweet and Sour Pork III<ING>• 1 pound pork but...,Sweet and Sour Pork III<ING>• 1 pound pork but...,0,312
655,Chicken and Biscuit Casserole<ING>• 1/4 cup bu...,Chicken and Biscuit Casserole<ING>• 1/4 cup bu...,0,251
18224,"Menudo<ING>• 2 pounds beef tripe\n• 2 onions, ...","Menudo<ING>• 2 pounds beef tripe\n• 2 onions, ...",0,97
...,...,...,...,...
51,"Gorgonzola: Remove chopped chocolate chips, dr...","Gorgonzola: Remove chopped chocolate chips, dr...",2,75
30,D'Emily Shaped Cheese Cupcakes cookies a small...,D'Emily Shaped Cheese Cupcakes cookies a small...,2,54
238,Blackberry-Cinnamon Pie\n\n🥑\n• 4 (6 ounce) fa...,Blackberry-Cinnamon Pie\n\n🥑\n• 4 (6 ounce) fa...,2,115
432,Dippin's Fasici-Ancho yeast and Sour cream mak...,Dippin's Fasici-Ancho yeast and Sour cream mak...,2,103


In [None]:
truth_vs_rnn_df['nwords'].max()

327

In [None]:
model_class, tokenizer_class, pretrained_weights = ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased'

# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

In [None]:
tokenized = truth_vs_rnn_df['text'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

tokenized

15677    [101, 4086, 1011, 1050, 1011, 14768, 20963, 22...
15670    [101, 2665, 14068, 1998, 2004, 28689, 12349, 1...
858      [101, 4086, 1998, 14768, 15960, 3523, 1026, 13...
655      [101, 7975, 1998, 20377, 28168, 16220, 10624, ...
18224    [101, 12183, 3527, 1026, 13749, 1028, 1528, 10...
                               ...                        
51       [101, 2175, 18581, 25650, 2721, 1024, 6366, 24...
30       [101, 1040, 1005, 6253, 5044, 8808, 2452, 1795...
238      [101, 25935, 1011, 21229, 11345, 100, 1528, 10...
432      [101, 16510, 8091, 1005, 1055, 6904, 19570, 20...
32       [101, 2531, 1003, 7427, 4487, 16643, 11001, 23...
Name: text, Length: 197, dtype: object

In [None]:
max_len = max([len(i) for i in tokenized.values])

padded = np.array([i + [0]*(max_len - len(i)) for i in tokenized.values])

attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(197, 469)

In [None]:
input_ids = torch.tensor(padded)
attention_mask = torch.tensor(attention_mask)

with torch.no_grad():
  last_hidden_states = model(input_ids, attention_mask=attention_mask)

In [None]:
features = last_hidden_states[0][:,0,:].numpy()

In [None]:
features.shape

(197, 768)

In [None]:
labels = truth_vs_rnn_df['label']
labels

15677    0
15670    0
858      0
655      0
18224    0
        ..
51       2
30       2
238      2
432      2
32       2
Name: label, Length: 197, dtype: int64

In [None]:
train_features, test_features, train_labels, test_labels = train_test_split(features, labels)

In [None]:
class FFNet(nn.Module):
  def __init__(self, input_dim=768, hidden_dim=1024, output_dim=1, dropout=0.8):
    super(FFNet, self).__init__()
    self.fc1 = nn.Sequential(
        nn.Linear(input_dim, hidden_dim),
        nn.Dropout(dropout),
        nn.LeakyReLU(),
        nn.BatchNorm1d(hidden_dim),
    )
    self.fc3 = nn.Sequential(
        nn.Linear(hidden_dim, output_dim),
        nn.Sigmoid(),
    )

  def forward(self, x):
    x = self.fc1(x)
    x = self.fc3(x)
    return x

ffnet = FFNet()

In [None]:
criterion = nn.BCELoss()
optimizer = optim.SGD(ffnet.parameters(), lr=0.001, momentum=0.9)

In [None]:
class Dataset(torch.utils.data.Dataset):
  def __init__(self, features, labels):
    self.features = features
    self.labels = labels
  def __len__(self):
    return self.features.shape[0]
  def __getitem__(self, index):
    X = self.features[index,:]
    y = self.labels[index]
    return X, y

train_dataset = Dataset(train_features, train_labels.to_numpy())

In [None]:
trainloader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=8,
    shuffle=True
)

In [None]:
NUM_EPOCHS = 1000

for epoch in range(NUM_EPOCHS):
  running_loss = 0.0
  for data, labels in trainloader:
    optimizer.zero_grad()
    outputs = ffnet(data)
    loss = criterion(outputs, labels.float().unsqueeze(1))
    loss.backward()
    optimizer.step()
    #print(outputs.view(1, -1))
    #print(labels.view(1, -1))

    running_loss += loss.item()
  if epoch % 50 == 0:
    print('Epoch {}, loss: {}'.format(epoch, running_loss))

print('Finished training')

Epoch 0, loss: 11.31788244843483
Epoch 50, loss: -194.97640949487686
Epoch 100, loss: -242.69247835874557
Epoch 150, loss: -200.0297458767891
Epoch 200, loss: -229.0006217956543
Epoch 250, loss: -311.5216683149338
Epoch 300, loss: -461.4820215702057
Epoch 350, loss: -306.5561623573303
Epoch 400, loss: -310.3633278235793
Epoch 450, loss: -332.66350173950195
Epoch 500, loss: -300.150194644928
Epoch 550, loss: -188.09028300642967
Epoch 600, loss: -300.6163331270218
Epoch 650, loss: -436.1982421875
Epoch 700, loss: -252.1306470632553
Epoch 750, loss: -274.73557567596436
Epoch 800, loss: -217.45036166906357
Epoch 850, loss: -244.4103483557701
Epoch 900, loss: -227.3741238117218
Epoch 950, loss: -151.6368461647071
Finished training


In [None]:
test_dataset = Dataset(test_features, test_labels.to_numpy())

testloader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=64,
    shuffle=True
)

In [None]:
y_pred = []
y = []
#total = 0
#correct = 0
with torch.no_grad():
  for data, labels in testloader:
    outputs = ffnet(data)
    predicted = torch.LongTensor(np.where(outputs > 0.5, 2, 0)).view(-1)
    y_pred.extend(predicted.tolist())
    y.extend(labels.tolist())
    #total += labels.size(0)
    #correct += (predicted == labels).sum().item()

print(y_pred)
print(y)

print('Accuracy of the FFNet trained on BERT sentence embeddings\non the test sentences: %0.3f %%' % accuracy_score(np.array(y), np.array(y_pred)))
print('F1-score of the FFNet trained on BERT sentence embeddings\non the test sentences: %0.3f %%' % f1_score(np.array(y), np.array(y_pred), average='micro'))

[2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2]
[2, 2, 2, 0, 0, 2, 0, 0, 2, 2, 0, 0, 2, 2, 0, 2, 0, 0, 0, 2, 2, 2, 0, 2, 0, 2, 0, 0, 2, 2, 2, 0, 0, 2, 0, 0, 2, 0, 2, 0, 2, 2, 0, 2, 0, 0, 0, 0, 2, 0]
Accuracy of the FFNet trained on BERT sentence embeddings
on the test sentences: 0.600 %
F1-score of the FFNet trained on BERT sentence embeddings
on the test sentences: 0.600 %
