In [1]:
import re

import pandas as pd
import torch
import tqdm
import transformers

# Load tokenizer and fine-tuned model

In [2]:
# Fine-tuning artefacts: the tokenizer is read from the output directory root,
# the model weights from one specific training checkpoint.
TOKENIZER_DIR = "gpt2_distil_output/"
CHECKPOINT_DIR = "gpt2_distil_output/checkpoint-37500/"

tokenizer = transformers.GPT2Tokenizer.from_pretrained(TOKENIZER_DIR)

# The tokenizer's EOS token id doubles as the padding token id for the model.
model = transformers.GPT2LMHeadModel.from_pretrained(
    CHECKPOINT_DIR,
    pad_token_id=tokenizer.eos_token_id,
)

# The model is left on the CPU here; later cells move it to a GPU.

In [3]:
# Display the special tokens registered on the loaded tokenizer
# (shown as this cell's output).
tokenizer.special_tokens_map

{'bos_token': '<|startoftext|>',
 'eos_token': '<|startoftext|>',
 'unk_token': '<|endoftext|>'}

# Greedy Output

In [4]:
input_prompt = "<|startoftext|> [prompt] Walter Hansel Cuvee Alyce Chardonnay 2015 [response] "
input_ids = tokenizer.encode(input_prompt, return_tensors='pt')
print("Tokens: ", input_ids)

greedy_output = model.generate(input_ids)
print(tokenizer.decode(greedy_output[0]))

Tokens:  tensor([[50257, 50258, 21902,   353,  9530,   741, 14496,   303,    68, 47535,
           344,   609, 19917,    77,   323,  1853, 50259]])
<|startoftext|> [prompt] Walter Hansel Cuvee Alyce Chardonnay 2015 [response] Walter Han


# Beam Output

In [5]:
input_prompt = "<|startoftext|> [prompt] Walter Hansel Cuvee Alyce Chardonnay 2015 [response] "
input_ids = tokenizer.encode(input_prompt, return_tensors='pt')

# Beam search settings: track 5 beams, forbid any repeated bigram, return the
# 3 best sequences, and enable early stopping.
beam_search_kwargs = dict(
    max_length=200,
    num_beams=5,
    no_repeat_ngram_size=2,
    num_return_sequences=3,
    early_stopping=True,
)
beam_outputs = model.generate(input_ids, **beam_search_kwargs)

# Print every returned sequence, numbered from 0, without special tokens.
print("Output:\n" + 80 * '-')
for rank, token_ids in enumerate(beam_outputs):
    print("=" * 20)
    decoded_text = tokenizer.decode(token_ids, skip_special_tokens=True)
    print(f"{rank}: {decoded_text}")
    print('\n')

Output:
--------------------------------------------------------------------------------
0: [prompt] Walter Hansel Cuvee Alyce Chardonnay 2015 [response] Aromas of white peach, apricot, and honeydew melon. On the palate, the wine is rich and full-bodied, with flavors of ripe apple, pear, vanilla cream and toasted hazelnut.	


1: [prompt] Walter Hansel Cuvee Alyce Chardonnay 2015 [response] Aromas of white peach, apricot, and honeydew melon. On the palate, the wine is rich and full-bodied, with flavors of ripe apple, pear, vanilla cream and toasty oak.	


2: [prompt] Walter Hansel Cuvee Alyce Chardonnay 2015 [response] Aromas of white peach, apricot, and honeydew melon. On the palate, the wine is rich and full-bodied, with flavors of ripe apple, pear, vanilla cream and toasted hazelnut. The wine has a long, lingering finish.	




# Random Sampling Output

In [6]:
input_prompt = "<|startoftext|> " + "[prompt] " + "Vendange Chardonnay " + "[response] "
input_ids = tokenizer.encode(input_prompt, return_tensors='pt')
print(input_ids)

# Send to GPU. Prefer the second GPU (the device this notebook was run on),
# but fall back to the first GPU or the CPU so the cell does not raise on
# hosts with fewer devices.
if torch.cuda.device_count() > 1:
    device = torch.device('cuda:1')
elif torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
model.to(device)
input_ids = input_ids.to(device)

# Top-k / nucleus sampling with a slightly sharpened distribution.
# - The BOS/EOS ids come from the tokenizer instead of a hard-coded 50257 so
#   they cannot drift from the vocabulary ('<|startoftext|>' is both BOS and
#   EOS for this tokenizer).
# - `early_stopping` is not passed: it only affects beam search, and this
#   call samples with a single beam.
# NOTE(review): sampling is unseeded, so every run yields a different text.
sample_output = model.generate(
    input_ids,
    do_sample=True,
    max_length=250,
    top_p=0.8,
    top_k=200,
    temperature=0.9,
    eos_token_id=tokenizer.eos_token_id,
    bos_token_id=tokenizer.bos_token_id,
)

print("Output:\n" + 100 * '-')
print("="*20)
print(tokenizer.decode(sample_output[0], skip_special_tokens=True))
print('\n')

tensor([[50257, 50258,    53,   437,   858,   609, 19917,    77,   323, 50259]])
Output:
----------------------------------------------------------------------------------------------------
[prompt] Vendange Chardonnay [response] Vendange Chardonnay is a brilliant straw yellow in color. The wine offers enticing aromas of fresh tropical fruit and ripe pear. On the palate, the wine is dry and medium-bodied with a smooth, creamy finish.	




# Generate descriptions on fake wine names

In [7]:
# Read the CSV of fake wine names and report its (rows, columns) shape.
FAKE_NAMES_CSV = "data/fake/fake_names_13301_2020-05-20.csv"
fake_names = pd.read_csv(FAKE_NAMES_CSV)
n_rows, n_cols = fake_names.shape
print((n_rows, n_cols))

(13301, 2)


In [8]:
# Preview the first three rows of the fake-name table.
fake_names.head(3)

Unnamed: 0.1,Unnamed: 0,0
0,0,Lachos Cellars Sauvignon Blanc 2012
1,1,Dry Estated Bios de Bourting Sannero (375ML ha...
2,2,Tarodahadin Chaary Rivi 2010


In [9]:
# Send the model to the GPU. Prefer the second GPU (the device this notebook
# was run on), but fall back to the first GPU or the CPU so the cell does not
# raise on hosts with fewer devices. No `input_ids` from an earlier cell is
# needed here: the loop builds its own inputs for every name.
if torch.cuda.device_count() > 1:
  device = torch.device('cuda:1')
elif torch.cuda.is_available():
  device = torch.device('cuda:0')
else:
  device = torch.device('cpu')
model.to(device)

# Maps each fake wine name to its decoded model output (prompt text included).
# A name that occurs more than once keeps only its most recent description.
generated_descriptions = {}

# The names live in the second column of the table. Missing values are
# skipped because concatenating NaN into the prompt string would raise.
for fake_name in tqdm.tqdm(fake_names.iloc[:, 1].dropna()):
  # Create tokens from the fake wine name.
  # NOTE(review): this prompt separates its fields with tabs, whereas the
  # single-prompt cells earlier in the notebook use spaces -- confirm which
  # layout matches the fine-tuning data.
  input_ids = tokenizer.encode(
    text=("<|startoftext|>\t[prompt]\t" + fake_name + "\t" + "[response] "),
    return_tensors='pt'
  ).to(device)

  # Sample a fake description conditioned on the name.
  # - BOS/EOS ids come from the tokenizer instead of a hard-coded 50257
  #   ('<|startoftext|>' is both BOS and EOS for this tokenizer).
  # - `early_stopping` is not passed: it only affects beam search, and this
  #   call samples with a single beam.
  model_output = model.generate(
    input_ids,
    do_sample=True,
    max_length=250,
    top_p=0.8,
    top_k=200,
    temperature=0.9,
    eos_token_id=tokenizer.eos_token_id,
    bos_token_id=tokenizer.bos_token_id
  )

  generated_descriptions[fake_name] = tokenizer.decode(
    token_ids=model_output[0],
    skip_special_tokens=True
  )

  0%|▏                                                                            | 22/13301 [00:10<1:49:28,  2.02it/s]


KeyboardInterrupt: 

In [124]:
# Regex matching the literal "[response]" marker that separates the prompt
# from the generated text. Written as a raw string: in a normal string '\['
# and '\]' are invalid escape sequences and trigger a warning.
RESPONSE_MARKER = r'\[response\]'
# Descriptions that are not longer than this many characters are discarded.
MIN_DESCRIPTION_CHARS = 100

# One row per generated wine. Passing the column names to the constructor
# (rather than assigning them afterwards) also works when nothing has been
# generated yet.
wine_df = pd.DataFrame(
    list(generated_descriptions.items()),
    columns=['name', 'description'],
)
print(wine_df.shape)

# Keep only the text that follows the first "[response]" marker, then trim
# surrounding whitespace and any wrapping double quotes.
wine_df['description'] = (
    wine_df['description']
    .str.split(RESPONSE_MARKER).str[1]
    .str.strip()
    .str.strip('"')
)

# Drop short descriptions; rows without a "[response]" marker have a missing
# description and are dropped here as well.
wine_df = wine_df[wine_df['description'].str.len() > MIN_DESCRIPTION_CHARS]

print(wine_df.shape)
wine_df.head()

(237, 2)
(184, 2)


Unnamed: 0,name,description
0,Lachos Cellars Sauvignon Blanc 2012,"Bright, crisp, and mouthwatering, the 2012 Sau..."
2,Tarodahadin Chaary Rivi 2010,"The Chaary (or """"Orvieto-variety"""") is a blend..."
3,Le Valli Sis de Trach Sauvignon Blanc 2013,"The nose is intense and complex, with intense ..."
4,Fotes Jadot Sauvignon Blanc 2005,This is a perfect example of a New Zealand Sau...
5,Alarag Caleforno Rosso 2018,"Ruby red with purple reflections. On the nose,..."


In [125]:
# Persist the filtered name/description pairs.
# NOTE(review): the DataFrame index is written as an unnamed first column
# (pandas default); pass index=False if downstream readers should not see it.
DESCRIPTIONS_CSV = "data/fake/fake_names_descriptions.csv"
wine_df.to_csv(DESCRIPTIONS_CSV)