In [None]:
# The code below is needed when running on Google Colab, so uncomment it if that is what you're using.
 
# Fetch the 'punkt' package into the local nltk data directory.
# NOTE(review): presumably required by nltk's sent_tokenize, which is imported
# further down — confirm it is still needed.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Mount Google Drive inside the Colab runtime, then change the working
# directory to the project folder on the mounted drive.
# NOTE(review): presumably this is what lets the local `mingpt` package be
# imported by later cells — confirm.
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/ECE1786 Project/Remy

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/ECE1786 Project/Remy


In [None]:
import torch 
import numpy as np

from nltk.tokenize import sent_tokenize 

from pathlib import Path 
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from torch.utils.data.dataloader import DataLoader
from mingpt.bpe import BPETokenizer 
from mingpt.utils import set_seed 
import pandas as pd
# Fix the random seed once, right after the imports, so repeated runs start
# from the same state (which RNGs are seeded is decided by mingpt.utils.set_seed).
set_seed(1234)

In [None]:
class RecipeDataset(Dataset):
    """Language-modelling dataset of cocktail recipes rendered as plain text.

    Every row of a pickled DataFrame (columns "Name", "Ingredients" and
    "Instructions", the last two being iterables of strings) is rendered as a
    text block with RECIPE NAME / RECIPE INGREDIENTS / RECIPE INSTRUCTIONS
    sections, tokenized with BPETokenizer and stored as a 1-D tensor in
    ``self.data``. Recipes with an ingredient containing "750" are dropped.
    """

    # Default location of the pickled recipe DataFrame (Colab Drive mount).
    DEFAULT_DATA_PATH = "/content/drive/MyDrive/ECE1786 Project/Remy/cocktail_dataset.pkl"

    def __init__(self, truncation=-1, data_path=None):
        """Load, format and tokenize the recipes.

        Args:
            truncation: when >= 0, every tokenized recipe is cut to at most
                this many tokens; a negative value keeps recipes untruncated.
            data_path: path of the pickled DataFrame; defaults to
                DEFAULT_DATA_PATH when None.
        """
        if data_path is None:
            data_path = self.DEFAULT_DATA_PATH
        # NOTE(review): unpickling can execute arbitrary code — only load
        # pickle files from a trusted source.
        df = pd.read_pickle(data_path)

        # Iterate positionally so the rows are read correctly whatever the
        # DataFrame's index labels are (label lookups with 0..n-1 only work
        # for a default RangeIndex).
        recipes = []
        for name, ingredients, instructions in zip(
            df["Name"], df["Ingredients"], df["Instructions"]
        ):
            recipe = self._format_recipe(name, ingredients, instructions)
            if recipe is not None:
                recipes.append(recipe)

        # Tokenize
        self.tokenizer = BPETokenizer()
        self.data = []  # List of 1-d pytorch tensor
        for text in recipes:
            tokens = self.tokenizer(text).view(-1)  # flatten to a 1-d tensor
            self.data.append(tokens[:truncation] if truncation >= 0 else tokens)

        # Fixed block size reported to the model.
        # NOTE(review): with truncation < 0 (or > 512) a recipe may be longer
        # than this value — confirm the model can handle that before relying
        # on untruncated data.
        self.max_sentence_length = 512 #np.max([len(d) for d in self.data])

    @staticmethod
    def _format_recipe(name, ingredients, instructions):
        """Render one recipe as text; return None for a batch-drink recipe."""
        text = "RECIPE NAME\n" + name + " \n\nRECIPE INGREDIENTS\n"
        for ingredient in ingredients:
            # str.replace returns a new string; the result must be kept for
            # the asterisks to actually be removed.
            ingredient = ingredient.replace('*', '')
            if "750" in ingredient:
                # skip batch drink recipes
                return None
            text += ingredient + "\n"
        text += "\nRECIPE INSTRUCTIONS\n"
        for instruction in instructions:
            # Lines starting with an asterisk are left out of the instructions.
            if instruction.startswith("*"):
                continue
            text += instruction + "\n"
        return text

    def __len__(self):
        """Return the number of recipes kept in the dataset."""
        return len(self.data)

    def get_vocab_size(self):
        """
        We have to set this to the max vocab size (i.e., that decided by the BPE tokenizer),
        but actually, only a small number of vocab is used, especially for the small text.
        """
        return 50257

    def __getitem__(self, idx):
        """
        Return the tuple (x, y) of pytorch tensors for recipe `idx`: x is the
        token sequence without its last token and y is the same sequence
        shifted left by one, so y[i] is the token that follows x[i].
        Please refer to the `run()` method in the mingpt/trainer.py script for
        how the x and y are going to be used.
        """
        tokens = self.data[idx]
        return (tokens[:-1], tokens[1:])

    def get_block_size(self):
        """
        block_size is the size at which lines are truncated to ensure they are equal-length.
        """
        return self.max_sentence_length
    

# Build the dataset with every recipe cut to at most 512 tokens, then report
# how many recipes were kept.
dataset = RecipeDataset(truncation=512)
print(len(dataset))



292
RECIPE NAME
Pineapple Mint Caipirinha 

RECIPE INGREDIENTS
4 1.5-inch pineapple chunks
2 mint leaves
1 ounce simple syrup
2 ounces unaged cachaça
Garnish: pineapple wedge

RECIPE INSTRUCTIONS
In a shaker, muddle the pineapple chunks, mint leaves and simple syrup.
Add the cachaça and ice and shake vigorously until well-chilled.
Pour (unstrained) into a rocks glass.
Garnish with a pineapple wedge.


In [None]:
# Sanity check: decode the input tokens (x) of item 5 back into text.
print(dataset.tokenizer.decode(dataset[5][0]))

RECIPE NAME
Red Hook 

RECIPE INGREDIENTS
2 ounces rye whiskey
1/2 ounce maraschino liqueur
1/2 ounce Punt e Mes
Garnish: maraschino cherry

RECIPE INSTRUCTIONS
Add the rye whiskey, maraschino liqueur and Punt e Mes into a mixing glass with ice and stir until well-chilled.
Strain into a cocktail glass.
Garnish with a maraschino cherry.


In [None]:
def lm_collate_fn(batch, device, pad_token_id=1):
    """Collate variable-length (x, y) pairs into two right-padded LongTensors.

    Args:
        batch: non-empty sequence of (x, y) pairs of 1-D token-id tensors,
            where each y has the same length as its x.
        device: device the two stacked batches are moved to.
        pad_token_id: id written into the positions past the end of a
            sequence. Defaults to 1, the value this function has always used.

    Returns:
        Tuple (padded_x, padded_y); each has shape (len(batch), longest x),
        dtype torch.long, and lives on `device`.

    Raises:
        ValueError: if `batch` is empty.

    NOTE(review): padded target positions hold `pad_token_id` too; whether the
    loss ignores them depends on the model — confirm against mingpt/model.py.
    """
    if len(batch) == 0:
        raise ValueError("lm_collate_fn received an empty batch")

    maxlen = max(len(item[0]) for item in batch)

    def _pad(seq):
        # new_full keeps the dtype and device of `seq`, so token ids never
        # round-trip through float32 and the padding lives next to the data.
        filler = seq.new_full((maxlen - len(seq),), pad_token_id)
        return torch.cat([seq, filler])

    padded_x = [_pad(item[0]) for item in batch]
    padded_y = [_pad(item[1]) for item in batch]
    return torch.stack(padded_x).long().to(device), torch.stack(padded_y).long().to(device)


In [None]:
from mingpt.model import GPT

# Build a gpt-nano model whose vocabulary and block size match the dataset,
# then initialise it from the previously saved checkpoint.
model_config = GPT.get_default_config()
model_config.model_type = 'gpt-nano'
model_config.vocab_size = dataset.get_vocab_size()
model_config.block_size = dataset.get_block_size()
model_config.n_classification_class = 2
model = GPT(model_config)
# map_location="cpu" lets the checkpoint be read even on a runtime without a
# GPU; load_state_dict copies the values into the model's existing parameters,
# so the model's own device placement is unaffected.
model.load_state_dict(torch.load("/content/drive/MyDrive/ECE1786 Project/Remy/model_large100K.pt", map_location="cpu"))

number of parameters: 2.52M


<All keys matched successfully>

In [None]:
# Create a Trainer object and set the core hyper-parameters
from mingpt.trainer import Trainer

# Start from the trainer's default configuration and override the core
# hyper-parameters for this fine-tuning run.
train_config = Trainer.get_default_config()
train_config.learning_rate = 5e-4 # the model we're using is so small that we can go a bit faster
train_config.max_iters = 5000  
train_config.num_workers = 0
train_config.batch_size = 4    # For small corpus, batch size of 4 is fine.  For large corpus use 16
# NOTE(review): the same dataset object is passed for both positional dataset
# arguments. If the second one is a validation/test set, any evaluation would
# run on the training data — confirm against mingpt/trainer.py.
trainer = Trainer(train_config, model, dataset, dataset, collate_fn=lm_collate_fn)

running on device cuda


In [None]:
# This function is called at the end of every batch in training
# and is used to report the amount of time per 100 batches, and the loss at that point

def batch_end_callback(trainer):
    """Print the per-iteration time and training loss every 100th iteration.

    Reads `iter_num`, `iter_dt` (reported in milliseconds after multiplying
    by 1000) and `loss` from the given trainer; prints nothing on any other
    iteration.
    """
    is_report_step = trainer.iter_num % 100 == 0
    if not is_report_step:
        return
    print(f"iter_dt {trainer.iter_dt * 1000:.2f}ms; iter {trainer.iter_num}: train loss {trainer.loss.item():.5f}")
# Register the progress reporter under the trainer's 'on_batch_end' event.
trainer.set_callback('on_batch_end', batch_end_callback)

# Train!
trainer.run()

iter_dt 0.00ms; iter 0: train loss 8.99142
iter_dt 18.49ms; iter 100: train loss 2.85900
iter_dt 15.56ms; iter 200: train loss 2.06186
iter_dt 18.95ms; iter 300: train loss 2.19352
iter_dt 20.52ms; iter 400: train loss 2.16243
iter_dt 17.49ms; iter 500: train loss 1.66448
iter_dt 14.83ms; iter 600: train loss 1.26252
iter_dt 19.04ms; iter 700: train loss 1.68055
iter_dt 15.44ms; iter 800: train loss 1.42126
iter_dt 17.25ms; iter 900: train loss 0.83758
iter_dt 18.99ms; iter 1000: train loss 1.42286
iter_dt 17.09ms; iter 1100: train loss 0.75403
iter_dt 14.67ms; iter 1200: train loss 1.10615
iter_dt 18.78ms; iter 1300: train loss 1.01842
iter_dt 20.75ms; iter 1400: train loss 0.81326
iter_dt 13.76ms; iter 1500: train loss 0.90966
iter_dt 16.17ms; iter 1600: train loss 0.77234
iter_dt 18.55ms; iter 1700: train loss 0.86690
iter_dt 19.40ms; iter 1800: train loss 0.79430
iter_dt 17.61ms; iter 1900: train loss 0.96363
iter_dt 14.82ms; iter 2000: train loss 0.96691
iter_dt 18.09ms; iter 2100

In [None]:
# Sample a recipe from the freshly trained model: the prompt supplies the
# recipe name and the ingredients header, and the model continues from there.
prompt = "RECIPE NAME\nWhiskey Sour \n\nRECIPE INGREDIENTS\n"
encoded_prompt = dataset.tokenizer(prompt).to(trainer.device)
# Switch to evaluation mode before sampling so that any dropout layers are
# disabled; the model is presumably still in training mode after trainer.run().
model.eval()
generated_sequence = model.generate(encoded_prompt,trainer.device, temperature=0.5, max_new_tokens=100, do_sample=True)
print(dataset.tokenizer.decode(generated_sequence[0]))

RECIPE NAME
Whiskey Sour 

RECIPE INGREDIENTS
1 ounce whiskey
1/2 ounce lemon juice, freshly squeezed
2 ounce simple syrup
2 ounce or to water
Garnish: lemon twist


RECIPE INSTRUCTIONS
Add the bourbon, lemon and lemon juice to a shaker with ice, and shake until well-chilled.
Strain into a chilled rocks glass.
Garnish with a lemon twist.
Garnish with a lime twist.
Garnish with a lemon twist.


In [None]:
# Persist the trained weights (state_dict only, not the whole model object)
# to Drive.
torch.save(model.state_dict(), "/content/drive/MyDrive/ECE1786 Project/Remy/recipe_baseline.pt")

In [None]:
from mingpt.model import GPT

# Inference in this cell runs on the CPU.
device = torch.device("cpu")

# Rebuild the gpt-nano architecture with the dataset's vocabulary and block
# size, then restore the saved recipe_baseline weights.
model_config = GPT.get_default_config()
model_config.model_type = 'gpt-nano'
model_config.vocab_size = dataset.get_vocab_size()
model_config.block_size = dataset.get_block_size()
model_config.n_classification_class = 2
model = GPT(model_config)
# map_location remaps the stored tensors onto the CPU, so a checkpoint written
# from a GPU run can still be read on a runtime without a GPU.
model.load_state_dict(torch.load("/content/drive/MyDrive/ECE1786 Project/Remy/recipe_baseline.pt", map_location=device))
# A newly constructed module starts in training mode; switch to evaluation
# mode so any dropout layers are disabled while sampling.
model.eval()

prompt = "RECIPE NAME\nWhiskey Sour \n\nRECIPE INGREDIENTS\n"
encoded_prompt = dataset.tokenizer(prompt).to(device)
generated_sequence = model.generate(encoded_prompt,device , temperature=0.7, max_new_tokens=100, do_sample=True)
print(dataset.tokenizer.decode(generated_sequence[0]))

number of parameters: 2.52M
RECIPE NAME
Whiskey Sour 

RECIPE INGREDIENTS
1 1/2 ounces amaro 
1/2 ounce St-proof rum 
1/2 ounces mezcal 
lemon twist 

RECIPE INSTRUCTIONS
Add the gin, sweet vermouth, orange liqueur, freshly squeezed into a mixing glass with ice and stir until well-chilled.
Strain into a rocks glass over fresh ice.
Top with the drink.
Garnish with a ground orange twist.
*C
