In [6]:
import pandas
import transformers

# Add tokens from dataset

In [32]:
# Load the pretrained distilgpt2 tokenizer.
tokenizer = transformers.AutoTokenizer.from_pretrained('distilgpt2')

# Report the base vocabulary size, then the total size including added tokens.
print(tokenizer.vocab_size, len(tokenizer), sep='\n')

50257
50257


In [8]:
# Register '<|startoftext|>' as the beginning-of-sequence token.
# eos_token is deliberately NOT overridden: pointing it at '<|startoftext|>'
# would make the start marker double as the end-of-sequence token, while the
# training rows are terminated with '<|endoftext|>' (the tokenizer's existing
# end-of-sequence token for GPT-2 checkpoints).
tokenizer.add_special_tokens({'bos_token': '<|startoftext|>'})

# vocab_size stays at the base vocabulary; len() also counts the added token.
print(tokenizer.vocab_size)
print(len(tokenizer))

50257
50258


In [9]:
# Register the two field markers that appear in the training rows as added tokens.
tokenizer.add_tokens(['[prompt]','[response]'])

# Base vocabulary size first, then the total including the added tokens.
print(tokenizer.vocab_size, len(tokenizer), sep='\n')

50257
50260


In [32]:
# Write the extended tokenizer files to the local model directory;
# the returned tuple of written paths is shown as the cell output.
tokenizer.save_pretrained(save_directory='data/modeling/gpt2_distil_model/')

('data/modeling/gpt2_distil_model/vocab.json',
 'data/modeling/gpt2_distil_model/merges.txt',
 'data/modeling/gpt2_distil_model/special_tokens_map.json',
 'data/modeling/gpt2_distil_model/added_tokens.json')

# Save GPT-2 model locally

In [10]:
# Load the pretrained distilgpt2 language model.
# AutoModelForCausalLM replaces the deprecated AutoModelWithLMHead and matches
# the class used in the model-comparison cells later in this notebook.
model = transformers.AutoModelForCausalLM.from_pretrained('distilgpt2')



In [11]:
# Grow the embedding matrix so every tokenizer entry (including the added
# tokens) has a row; the resulting Embedding module is shown as the cell output.
model.resize_token_embeddings(new_num_tokens=len(tokenizer))

Embedding(50260, 768)

In [34]:
# Save the resized model next to the tokenizer files in the local model directory.
model.save_pretrained(save_directory='data/modeling/gpt2_distil_model/')

# Finetune

Because of the way Jupyter runs shell commands, STDOUT is not shown live; the output only appears once the run has finished. I therefore prefer to paste the fine-tuning command into a terminal rather than run it from this notebook.

# Scratchpad

### Find unknown tokens in the dataset

In [12]:
import pandas as pd

# Tab-separated text file with no header row, so columns get integer labels.
dataset = pd.read_csv(
  'data/scraped/name_desc_nlp_ready.txt',
  sep='\t',
  header=None,
)

# Sanity check: (number of rows, number of columns).
print(dataset.shape)

(125482, 6)


In [24]:
# Join every field of one example row into a single string, separated by four
# spaces, to preview what a full training line looks like.
# The row is selected explicitly so this cell works on a fresh kernel instead of
# relying on a `row` variable left behind by a loop in another cell.
# NOTE(review): label 11 is presumably the row the original run used (the
# token-counting loop stops after index label 11) — confirm.
example_row = dataset.loc[11]
concat_row = "    ".join(example_row.values)

In [26]:
# Bare expression: shows the joined example row as the cell output.
concat_row

'<|startoftext|>    [prompt]    Caymus Napa Valley Cabernet Sauvignon (1.5 Liter Magnum) 2017    [response]    Caymus has a signature style that is dark in color, with rich fruit and ripe, velvety tannins \x96 as approachable in youth as in maturity. We farm Cabernet grapes in eight of Napa\x92s 16 sub-appellations, with diversification enabling us to make the best possible wine in a given year. Our Cabernet offers layered, lush aromas and flavors, including cocoa, cassis, and ripe dark berries.    <|endoftext|>'

In [13]:
# Count how often the tokenizer emits its unknown token when encoding column 2
# of the dataset, printing each value and its token ids as a preview.
total_tokens = 0
total_unknown_tokens = 0

# Ask the tokenizer for its unknown-token id instead of hard-coding 50256, so
# the count stays correct if the tokenizer or its vocabulary changes.
unk_token_id = tokenizer.unk_token_id

for ix, row in dataset.iterrows():
  print("-"*50)
  print(row[2])
  tokenized_row = tokenizer.encode(row[2])
  print(tokenized_row)
  total_tokens += len(tokenized_row)
  total_unknown_tokens += tokenized_row.count(unk_token_id)

  # Only the first rows (index labels 0 through 11) are inspected to keep the
  # printed output short.
  if ix>10:
    break

--------------------------------------------------
Laurent-Perrier Cuvee Rose
[14772, 495, 429, 12, 5990, 5277, 14496, 303, 68, 8049]
--------------------------------------------------
Piper-Heidsieck Cuvee Brut in Travel Case with 2 Champagne Flutes
[47, 9346, 12, 1544, 2340, 494, 694, 14496, 303, 68, 30291, 287, 13524, 8913, 351, 362, 29260, 21080, 1610, 1769]
--------------------------------------------------
Clarendon Hills Astralis Syrah 2011
[2601, 533, 358, 261, 14379, 34496, 271, 1632, 11392, 2813]
--------------------------------------------------
Yalumba Patchwork Shiraz 2014
[56, 282, 2178, 64, 17106, 1818, 21972, 1031, 1946]
--------------------------------------------------
Caymus Napa Valley Cabernet Sauvignon 2018
[34, 323, 14664, 14332, 64, 6916, 15976, 1142, 316, 23167, 85, 570, 261, 2864]
--------------------------------------------------
Veuve Clicquot Yellow Label Brut
[53, 12496, 303, 327, 677, 421, 313, 12550, 36052, 30291]
----------------------------------------

In [36]:
# Share of encoded tokens that were counted as unknown.
unknown_ratio = total_unknown_tokens / total_tokens
print(unknown_ratio)

0.0


### Compare GPT2 Models from HuggingFace

In [54]:
# Load the 'distilgpt2' checkpoint and report its parameter count in millions.
gpt2_distilled = transformers.AutoModelForCausalLM.from_pretrained('distilgpt2')
params_millions = gpt2_distilled.num_parameters() / 1e6
print(f"Total parameters: {params_millions:.2f}M")

Total parameters: 81.91M


In [53]:
# Load the 'gpt2' checkpoint and report its parameter count in millions.
gpt2 = transformers.AutoModelForCausalLM.from_pretrained('gpt2')
params_millions = gpt2.num_parameters() / 1e6
print(f"Total parameters: {params_millions:.2f}M")

Total parameters: 124.44M


In [49]:
# Load the 'gpt2-medium' checkpoint and report its parameter count in millions.
gpt2_medium = transformers.AutoModelForCausalLM.from_pretrained('gpt2-medium')
params_millions = gpt2_medium.num_parameters() / 1e6
print(f"Total parameters: {params_millions:.2f}M")

Total parameters: 354.82M


In [55]:
# Load the 'gpt2-large' checkpoint and report its parameter count in millions.
gpt2_large = transformers.AutoModelForCausalLM.from_pretrained('gpt2-large')
params_millions = gpt2_large.num_parameters() / 1e6
print(f"Total parameters: {params_millions:.2f}M")

Total parameters: 774.03M


In [56]:
# Load the 'gpt2-xl' checkpoint and report its parameter count in millions.
gpt2_xl = transformers.AutoModelForCausalLM.from_pretrained('gpt2-xl')
params_millions = gpt2_xl.num_parameters() / 1e6
print(f"Total parameters: {params_millions:.2f}M")

Total parameters: 1557.61M


In [2]:
# Quick ratio check; presumably gpt2-xl (~1500M) vs gpt2-medium (~350M) parameters — TODO confirm.
1500/350

4.285714285714286