* https://huggingface.co/blog/how-to-generate
* http://jalammar.github.io/illustrated-gpt2/#part-2-illustrated-self-attention

In [4]:
!pip install -q git+https://github.com/huggingface/transformers.git

In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell runs, so edits to the project code under src/
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [7]:
# Change into the repository root. The `src.*` imports and the relative
# `data/raw/...` path used further down resolve against the working
# directory, so this cell has to run before them.
# Keep exactly one of the two %cd lines active for the environment in use.
# Local
# %cd ~/Documents/github/just-a-recipe-generator

# SageMaker
%cd ~/SageMaker/just-a-recipe-generator

/home/ec2-user/SageMaker/just-a-recipe-generator


In [5]:
import copy
import tensorflow as tf
from transformers import TFGPT2LMHeadModel, GPT2Tokenizer

In [8]:
from src.data.pickling import load_pickle
from src.features.clean_shoestring_data import clean_shoestring_recipes
from src.features.prepare_model_data import (ingredients_to_text, tokenize_text,
                                             create_sequences, split_input_target,
                                             make_training_data, text_from_ids)

[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [9]:
# Load the raw recipe data; the path is relative to the working directory.
# NOTE(review): presumably load_pickle unpickles the file — only load
# pickles that come from a trusted source.
shoestring_recipes0 = load_pickle("data/raw/shoestring_recipes.pickle")

Loading data/raw/shoestring_recipes.pickle for consumption...


In [10]:
# Work on a deep copy so the freshly loaded object stays untouched and the
# following steps can be re-run without reloading the pickle.
# NOTE(review): presumably needed because the cleaning step mutates its
# input — confirm.
shoestring_recipes = copy.deepcopy(shoestring_recipes0)

In [11]:
# Run the project's cleaning helper on the copied recipes.
# NOTE(review): the name suggests the result is a dict of recipes — confirm
# its structure in src.features.clean_shoestring_data.
recipe_dict = clean_shoestring_recipes(shoestring_recipes)

In [12]:
# Turn the cleaned recipes into a single text string (a slice of it is
# displayed in the next cell).
text = ingredients_to_text(recipe_dict)
# Disabled: an earlier training-data pipeline built from the same text, kept
# for reference only. NOTE(review): the helper names suggest character-level
# tokenization — confirm, and delete these lines if no longer needed.
# vocab, ids_from_chars, chars_from_ids = tokenize_text(text)
# sequences = create_sequences(text, ids_from_chars)
# dataset0 = sequences.map(split_input_target)
# dataset = make_training_data(dataset0)

In [13]:
# Peek at the first 50 characters as a quick sanity check of the text.
text[:50]

'Ingredients:\n 5.0 ounces canned pumpkin puree\n 0.5'

## Encode context and generate text

In [13]:
# Pretrained GPT-2 tokenizer from the "gpt2" checkpoint.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Pretrained GPT-2 language model (TensorFlow). The EOS token id is reused
# as the PAD token id to avoid warnings.
model = TFGPT2LMHeadModel.from_pretrained(
    "gpt2",
    pad_token_id=tokenizer.eos_token_id,
)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading tf_model.h5:   0%|          | 0.00/498M [00:00<?, ?B/s]

2023-04-11 02:24:30.104906: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:267] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [14]:
# Tokenize the recipe text into the prompt that generation is conditioned on.
# The encoding is truncated to at most 1024 tokens and returned as a
# TensorFlow tensor.
input_ids = tokenizer.encode(
    text,
    truncation=True,
    max_length=1024,
    return_tensors='tf',
)

## Beam Search

A decoder method
* Reduces the risk of missing hidden high-probability word sequences by keeping the `num_beams` most likely hypotheses at each time step and finally choosing the hypothesis with the highest overall probability.
* Always finds an output sequence with higher probability than greedy search.
* Is not guaranteed to find the most likely output.

In [None]:
# Beam search returning several hypotheses: num_return_sequences > 1
# (it must not exceed num_beams).
#
# GPT-2 can only handle model.config.n_positions tokens in total (prompt plus
# generated text), so a larger total length cannot be generated, and a prompt
# that already fills the whole window leaves no room for a continuation.
# Reserve MAX_NEW_TOKENS positions for the generated text and trim the prompt
# to the remaining budget.
MAX_NEW_TOKENS = 200  # length of the generated continuation; tune as needed
prompt_budget = model.config.n_positions - MAX_NEW_TOKENS
prompt_ids = input_ids[:, :prompt_budget]

beam_outputs = model.generate(
    prompt_ids,
    max_new_tokens=MAX_NEW_TOKENS,
    num_beams=5,
    no_repeat_ngram_size=2,
    num_return_sequences=5,
    early_stopping=True,
)


In [None]:
# Decode and show the first returned sequence. The generation cell binds its
# result to `beam_outputs`, which holds one row per returned sequence, so the
# other indices give the remaining candidates.
print(tokenizer.decode(beam_outputs[0], skip_special_tokens=True))