In [None]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

import pandas
import torch
import transformers
import wandb

%env WANDB_WATCH=all
%env WANDB_PROJECT=wine_gpt2_Trainer_42

#wandb.login(anonymous='never', key="222a37baaf0c1b0d1499ec003e5c2fe49f97b107")
wandb.init()

print(torch.cuda.is_available())
print(f"transformers version: {transformers.__version__}")
print(f"PyTorch version: {torch.__version__}")

# Add tokens from dataset

In [None]:
# Start from the pretrained distilgpt2 tokenizer and extend it with the
# markers used by the wine dataset, then persist it for the training cells.
tokenizer = transformers.AutoTokenizer.from_pretrained('distilgpt2')
# Size of the base vocabulary before any tokens are added.
print(tokenizer.vocab_size)

# NOTE(review): eos and bos are both mapped to '<|startoftext|>' while
# '<|endoftext|>' is only passed to add_tokens() below — confirm this matches
# the markers actually present in the training file.
tokenizer.add_special_tokens(
  {'eos_token':'<|startoftext|>',
   'bos_token':'<|startoftext|>'
  }
)
tokenizer.add_tokens(['[prompt]','[response]','[category_1]',
                      '[category_2]','[origin]','[description]',
                      '<|endoftext|>'])

# GPT-2 ships without a padding token; reuse the eos token for padding.
tokenizer.pad_token = tokenizer.eos_token

tokenizer.save_pretrained("data/modeling/trainer_42/")

# `vocab_size` excludes added tokens, so it would print the same number as
# above. `len(tokenizer)` is the full size including the additions, which is
# also the size the model's embedding matrix must be resized to.
print(len(tokenizer))
print("Created tokenizer")


In [None]:
from transformers import GPT2Config, GPT2TokenizerFast

# Default GPT-2 configuration object; no cell visible in this notebook reads it.
config = GPT2Config()
# Reload the extended tokenizer that was saved by the tokenizer cell.
tokenizer = GPT2TokenizerFast.from_pretrained("data/modeling/trainer_42/")

from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained('distilgpt2')
# The saved tokenizer contains tokens that the pretrained checkpoint has no
# embedding rows for. Grow the embedding matrix to match, otherwise any
# training example containing an added token indexes past the end of the
# embedding table.
model.resize_token_embeddings(len(tokenizer))
print(f"model parameters: {model.num_parameters():,}")

In [None]:
%%time
from transformers import LineByLineTextDataset

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="data/scraped/name_desc_nlp_ready_test.txt",
    block_size=64,
)

In [None]:
from transformers import DataCollatorForLanguageModeling

# Batch builder for causal language modelling: masked-LM mode is switched off.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

In [None]:
from transformers import Trainer, TrainingArguments

# Run configuration: a single epoch at batch size 1, writing checkpoints into
# the same directory that already holds the saved tokenizer.
training_args = TrainingArguments(
    num_train_epochs=1,
    per_device_train_batch_size=1,
    save_steps=100,
    save_total_limit=2,
    output_dir="data/modeling/trainer_42/",
    overwrite_output_dir=True,
)

# Wire the model, dataset and collator from the earlier cells into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=dataset,
    data_collator=data_collator,
)

In [None]:
# Run training with the Trainer built from `training_args`; the call's return
# value is displayed as the cell output.
trainer.train()

# Save a GPT-2 model locally

In [None]:
# Load the gpt2-xl checkpoint and report its size in millions of parameters.
# `AutoModelWithLMHead` is deprecated; `AutoModelForCausalLM` is the class the
# rest of this notebook already uses for GPT-2 checkpoints.
# NOTE: this rebinds `model`, replacing the distilgpt2 model loaded earlier.
model = transformers.AutoModelForCausalLM.from_pretrained('gpt2-xl')
print(f"Total parameters: {model.num_parameters()/1e6:.2f}M")

In [None]:
# Resize the model's token embeddings to the tokenizer's full length
# (len(tokenizer)); the call's return value is displayed as the cell output.
model.resize_token_embeddings(len(tokenizer))

In [None]:
# Write the current `model` to a local directory so it can be loaded by path.
model.save_pretrained('data/modeling/gpt2_xl_model/')

# Finetune

Because of the way Jupyter processes shell commands, STDOUT is not shown live; the output only appears once the run has finished. I therefore prefer to paste this command into a terminal instead of running it here.

In [None]:
# Fine-tune with the Hugging Face example script: train on the *_train.txt
# file, evaluate on the *_test.txt file, write results to gpt2_distil_output.
# NOTE(review): --model_name_or_path points at data/modeling/gpt2_distil_model/,
# but the only save cell visible in this notebook writes
# data/modeling/gpt2_xl_model/ — confirm the directory exists before running.
!python transformers/examples/language-modeling/run_language_modeling.py \
--output_dir gpt2_distil_output \
--model_type gpt2 \
--model_name_or_path "data/modeling/gpt2_distil_model/" \
--do_train \
--train_data_file "data/scraped/name_desc_nlp_ready_train.txt" \
--do_eval \
--eval_data_file "data/scraped/name_desc_nlp_ready_test.txt" \
--per_gpu_train_batch_size 5

# Scratchpad

### Find unknown tokens in the dataset

In [None]:
import pandas as pd

# Read the tab-separated text file with no header row, so the columns are
# labelled 0, 1, 2, ...
# NOTE: this rebinds `dataset`, which earlier held the LineByLineTextDataset.
dataset = pd.read_csv(
    'data/scraped/name_desc_nlp_ready.txt',
    header=None,
    sep='\t',
)
print(dataset.shape)

In [None]:
# Tokenise column 2 of every row and tally how many of the produced ids are
# the tokenizer's unknown-token id.
# The id is read from the tokenizer instead of hard-coding 50256, so the count
# stays correct if a different tokenizer is loaded.
# NOTE(review): for GPT-2 tokenizers the unknown token is presumably the same
# string as '<|endoftext|>', so literal end-of-text markers in the data would
# be counted here too — confirm before interpreting the ratio.
unknown_token_id = tokenizer.unk_token_id

total_tokens = 0
total_unknown_tokens = 0
# Iterate the column directly; building a Series per row via iterrows() is
# unnecessary when only one field is read.
for text in dataset[2]:
  token_ids = tokenizer.encode(text)
  total_tokens += len(token_ids)
  total_unknown_tokens += token_ids.count(unknown_token_id)

In [None]:
# Fraction of all produced token ids that were the unknown-token id.
# Raises ZeroDivisionError if no tokens were counted (e.g. an empty frame).
print(total_unknown_tokens / total_tokens)

### Compare GPT2 Models from HuggingFace

In [None]:
gpt2_distilled = transformers.AutoModelForCausalLM.from_pretrained('distilgpt2')
print(f"Total parameters: {gpt2_distilled.num_parameters()/1e6:.2f}M")

In [None]:
gpt2 = transformers.AutoModelForCausalLM.from_pretrained('gpt2')
print(f"Total parameters: {gpt2.num_parameters()/1e6:.2f}M")

In [None]:
gpt2_medium = transformers.AutoModelForCausalLM.from_pretrained('gpt2-medium')
print(f"Total parameters: {gpt2_medium.num_parameters()/1e6:.2f}M")

In [None]:
gpt2_large = transformers.AutoModelForCausalLM.from_pretrained('gpt2-large')
print(f"Total parameters: {gpt2_large.num_parameters()/1e6:.2f}M")

In [None]:
gpt2_xl = transformers.AutoModelForCausalLM.from_pretrained('gpt2-xl')
print(f"Total parameters: {gpt2_xl.num_parameters()/1e6:.2f}M")

In [None]:
# Scratch arithmetic left over from exploration.
# NOTE(review): presumably a ratio of two model sizes in millions of
# parameters — confirm what it represents or delete the cell.
1500/350