In [9]:
from collections import OrderedDict 
import re

import pandas as pd
import pickle
import torch
import tqdm
import transformers

print(torch.__version__)
print(transformers.__version__)

1.7.0
4.2.1


# Load tokenizer and fine-tuned model

In [3]:
TOKENIZER_PATH = "data/gpt2_runs/tokenizers/gpt2_large"
MODEL_PATH = "data/gpt2_runs/v2/step_290000/"

tokenizer = transformers.GPT2Tokenizer.from_pretrained(TOKENIZER_PATH)
model = transformers.AutoModelForCausalLM.from_pretrained(
  MODEL_PATH,
  pad_token_id=tokenizer.eos_token_id
)

model = model.to('cuda:1')

# Greedy Output

In [4]:
input_prompt = "<|startoftext|> [prompt] Walter Hansel Cuvee Alyce Chardonnay 2015 [response] "
input_ids = tokenizer.encode(input_prompt, return_tensors='pt').to('cuda:1')
print("Tokens: ", input_ids)

greedy_output = model.generate(input_ids, max_length=200)
print(tokenizer.decode(greedy_output[0]))

Tokens:  tensor([[50257, 50258, 21902,   353,  9530,   741, 14496,   303,    68, 47535,
           344,   609, 19917,    77,   323,  1853, 50259]], device='cuda:1')




<|startoftext|> [prompt] Walter Hansel Cuvee Alyce Chardonnay 2015 [response] [category_1] "White Wine" [category_2] Chardonnay [origin] " from Russian River, Sonoma County, California" [description] "This wine is a blend of fruit from the estate vineyard in the Russian River Valley. The majority of the blend is comprised of fruit from the younger vines in the Russian River Valley, and the balance is comprised of fruit from the warmer southern end of the Russian River Valley. The wine is barrel fermented, and malolactic fermentation occurs in 30% to 70% of the blend. The wine was aged in 30% French and 70% American oak barrels for 10 months. The wine was bottled unfined and unfiltered after 22 months barrel aging." <|endoftext|>


# Beam Output

In [5]:
input_prompt = "<|startoftext|> [prompt] Walter Hansel Cuvee Alyce Chardonnay 2015 [response] "
input_ids = tokenizer.encode(input_prompt, return_tensors='pt').to('cuda:1')

# activate beam search and early_stopping
beam_outputs = model.generate(
  input_ids, 
  max_length=200, 
  num_beams=5, 
  no_repeat_ngram_size=2,
  num_return_sequences=3, 
  early_stopping=True
)

print("Output:\n" + 80 * '-')
for i, beam_output in enumerate(beam_outputs):
  print("="*20)
  print("{}: {}".format(i, tokenizer.decode(beam_output, skip_special_tokens=True)))
  print('\n')

Output:
--------------------------------------------------------------------------------
0: [prompt] Walter Hansel Cuvee Alyce Chardonnay 2015 [response] [category_1] "White Wine" [category_2] "Other White Blends" [origin] " from Russian River, Sonoma County, California" [description]


1: [prompt] Walter Hansel Cuvee Alyce Chardonnay 2015 [response] [category_1] "Red Wine" [category_2] "Other White Blends" [origin] " from Russian River, Sonoma County, California" [description]


2: [prompt] Walter Hansel Cuvee Alyce Chardonnay 2015 [response] [category_1] "Red Wine" [category_2] "Other Red Blends" [origin] " from Russian River, Sonoma County, California" [description]




# Random Sampling Output

In [6]:
input_prompt = "<|startoftext|> " + "[prompt] " + "Vendange Chardonnay " + "[response] "
input_ids = tokenizer.encode(input_prompt, return_tensors='pt')
print(input_ids)

# Send to GPU
model.to('cuda:1')
input_ids = input_ids.to('cuda:1')

sample_output = model.generate(
    input_ids, 
    do_sample=True, 
    max_length=250, 
    top_p=0.8,
    top_k=200,
    temperature=0.9,
    eos_token_id=tokenizer.eos_token_id,
    bos_token_id=tokenizer.eos_token_id,
    early_stopping=True
)

print("Output:\n" + 100 * '-')
print("="*20)
print(tokenizer.decode(sample_output[0], skip_special_tokens=True))
print('\n')

tensor([[50257, 50258,    53,   437,   858,   609, 19917,    77,   323, 50259]])
Output:
----------------------------------------------------------------------------------------------------
[prompt] Vendange Chardonnay [response] [category_1] "White Wine" [category_2] Chardonnay [origin] " from Margaret River, Western Australia, Australia" [description] "The fruit was sourced from both estate-owned and contracted vineyards in the Margaret River region. The majority of the blend was fermented in stainless steel tanks, while the balance was aged in French and American oak barrels. The wine has a pale straw colour with green hues. Aromas of fresh cut pear, white nectarine and a hint of lime blossom are followed by a palate showing intense peach and grapefruit characters with a hint of spice and a soft, creamy texture. A wine of weight, balance and complexity that will develop further complexity with careful cellaring." 




# Generate descriptions on fake wine names

In [10]:
names_path = 'data/fake/fake_names_12184_2020-11-19.pickle'
with open(names_path, 'rb',) as file:
    fake_names = pickle.load(file)

In [11]:
fake_names.head(3)

AttributeError: 'list' object has no attribute 'head'

In [14]:
# Send to GPU
model.to('cuda:1')
#input_ids = input_ids.to('cuda:1')

generated_descriptions = OrderedDict()
for fake_name in tqdm.tqdm(fake_names[:1000]):
  #print(f"Name: {fake_name}")
  
  # Create token from fake wine name
  try:
    input_ids = tokenizer.encode(
      text=("<|startoftext|> [prompt] " + fake_name + " " + "[response] "), 
      return_tensors='pt'
    ).to('cuda:1')
  
    # Generate a fake description based on the name
    model_output = model.generate(
      input_ids, 
      do_sample=True, 
      max_length=300,
      min_length=80,
      top_p=0.8,
      top_k=200,
      temperature=0.9,
      eos_token_id=tokenizer.eos_token_id,
      bos_token_id=tokenizer.bos_token_id,
      early_stopping=True
    )

    generated_descriptions[fake_name] = tokenizer.decode(
      token_ids=model_output[0], 
      skip_special_tokens=True
    )
    
  except:
    continue

100%|████████████████████████████████████████████████████████████████████████████| 1000/1000 [1:22:55<00:00,  4.98s/it]


In [16]:
wine_df = pd.DataFrame.from_dict(generated_descriptions.items())
wine_df.columns = ['name', 'response']
print(wine_df.shape)

wine_df['category_1'] = wine_df['response'].str.split('\[category_1\]').str[1].str.split('\[category_2\]').str[0]
wine_df['category_2'] = wine_df['response'].str.split('\[category_2\]').str[1].str.split('\[origin\]').str[0]
wine_df['origin'] = wine_df['response'].str.split('\[origin\]').str[1].str.split('\[description\]').str[0]
wine_df['description'] = wine_df['response'].str.split('\[description\]').str[1]
#wine_df['description'] = wine_df['description'].str.strip()
#wine_df['description'] = wine_df['description'].str.strip('"')

#wine_df = wine_df[wine_df['description'].str.len() > 100]
wine_df = wine_df.applymap(str)\
            .applymap(lambda x: x.replace('"', ''))\
            .drop(['response'], axis=1)

print(wine_df.shape)
wine_df.head()

(1000, 2)
(1000, 5)


Unnamed: 0,name,category_1,category_2,origin,description
0,Piul Bothen Cabernet Sauvignon 2014,Red Wine,Cabernet Sauvignon,"from North Coast, California",The color of this wine is deep purple with ar...
1,Seacuscini Resantzass 2015,Red Wine,Other Red Blends,"from Tuscany, Italy","The perfect wine for the holidays, this wine ..."
2,Alpanena Vriestioge 2016,Red Wine,Tempranillo,from Spain,The wine is very approachable and fruit-forwa...
3,Sanmedarbecer Edena Cabernet,Red Wine,Cabernet Sauvignon,from Spain,The grapes for this wine come from a selectio...
4,Sauvignon 2001,White Wine,Sauvignon Blanc,"from North Coast, California",Sauvignon Blanc grapes from the North Coast v...


In [17]:
wine_df.to_csv("data/fake/descriptions/gpt2_desc_v2_20210127.csv")