In [11]:
!pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.98-cp39-cp39-macosx_11_0_arm64.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.98


In [29]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer, TextDataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TrainingArguments, Trainer


In [3]:
import pandas as pd

In [23]:
df = pd.read_csv('toy_training_data.csv')

In [24]:
# split into train and validation
train_df = df.sample(frac=0.8, random_state=42)
val_df = df.drop(train_df.index).reset_index(drop=True)

# convert to list of dicts
train_examples = train_df.to_dict('records')
val_examples = val_df.to_dict('records')

In [30]:
# model_name = "t5-small" # Or other T5 models like "t5-base", "t5-large", etc.

# tokenizer = T5Tokenizer.from_pretrained(model_name)
# model = T5ForConditionalGeneration.from_pretrained(model_name)

model_name = "gpt2-xl" # Or other GPT-2 models like "gpt2", "gpt2-medium", "gpt2-large"

tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)



Downloading (…)olve/main/vocab.json: 100%|██████████| 1.04M/1.04M [00:00<00:00, 10.5MB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 18.7MB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 689/689 [00:00<00:00, 554kB/s]
Downloading pytorch_model.bin: 100%|██████████| 6.43G/6.43G [02:50<00:00, 37.8MB/s]
Downloading (…)neration_config.json: 100%|██████████| 124/124 [00:00<00:00, 116kB/s]


In [31]:
def tokenize_data(example, tokenizer):
    input_text = example["input_text"]
    output_text = example["output_text"]

    input_tokenized = tokenizer(input_text, truncation=True, padding="max_length", max_length=512)
    output_tokenized = tokenizer(output_text, truncation=True, padding="max_length", max_length=512)

    input_tokenized["labels"] = output_tokenized["input_ids"]
    return input_tokenized

train_data_tokenized = [tokenize_data(example, tokenizer) for example in train_examples]
val_data_tokenized = [tokenize_data(example, tokenizer) for example in val_examples]


Using pad_token, but it is not set yet.


ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`.

In [16]:
train_data_tokenized[0].keys()

dict_keys(['input_ids', 'attention_mask', 'labels'])

In [17]:
# convert to huggingface dataset format
# train_data = TextDataset(tokenizer, train_data_tokenized, column_names=["input_ids", "attention_mask", "labels"])
# val_data = TextDataset(tokenizer, val_data_tokenized, column_names=["input_ids", "attention_mask", "labels"])

TypeError: __init__() got an unexpected keyword argument 'column_names'

In [19]:
training_args = Seq2SeqTrainingArguments(
    output_dir="toy_t5",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="logging_dir",
    logging_steps=100,
    evaluation_strategy="steps",
    save_strategy="steps",
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

In [26]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_data_tokenized,
    eval_dataset=val_data_tokenized,
    tokenizer=tokenizer,
)

In [27]:
trainer.train()

100%|██████████| 36/36 [03:37<00:00,  6.04s/it]

{'train_runtime': 217.4661, 'train_samples_per_second': 1.269, 'train_steps_per_second': 0.166, 'train_loss': 9.668910556369358, 'epoch': 3.0}





TrainOutput(global_step=36, training_loss=9.668910556369358, metrics={'train_runtime': 217.4661, 'train_samples_per_second': 1.269, 'train_steps_per_second': 0.166, 'train_loss': 9.668910556369358, 'epoch': 3.0})

In [28]:
# test the model

for i in range(5):
    print("Example #", i)
    print("Input: ", val_examples[i]["input_text"])
    print("Output: ", tokenizer.decode(model.generate(tokenizer(val_examples[i]["input_text"], return_tensors="pt").input_ids)[0]))
    print("Expected output: ", val_examples[i]["output_text"])
    print()


Example # 0
Input:  <OP_NAME> Ken Bone </s> <OP_TITLE> I’m American citizen, undecided voter, loving husband Ken Bone, Welcome to the Bone Zone! AMA </s> <OP_POST> Hello Reddit,

I’m just a normal guy, who spends his free time with his hot wife and cat in St. Louis. I didn’t see any of this coming, it’s been a crazy week. I want to make something good come out of this moment, so I’m donating a portion of the proceeds from my Represent T-Shirt campaign to the St. Patrick Center raising money to fight homelessness in St. Louis.

I’m an open book doing this AMA at my desk at work and excited to answer America’s question.

Please support the campaign and the fight on homelessness! [Represent.com/bonezone](https://represent.com/bonezone/)

Proof: http://i.imgur.com/GdMsMZ9.jpg

Edit: signing off now, just like my whole experience so far this has been overwhelmingly positive!  Special thanks to my Reddit brethren for sticking up for me when the few negative people attack.  Let's just show th



Output:  <pad><extra_id_0> <unk>OP_NAME> Ken Bone (born May 21, 1958) is
Expected output:  What was the process in order to actually ask the candidates your question? Did you sign a form, request using email yada yada yada.
Hey Ken,  Can't wait to be you for Halloween.   You're obviously really passionate about serving the homeless community. Is there any background as to why that is your charity of choice?
What has been the strangest thing someone tweeted at you in the wake of the debate?
Are you near or farsighted?
Is the best part of this whole ordeal the karma from this post?
Are you looking forward to your impersonation on SNL?
I love how you're using this moment for good. What does the St.Patrick center do for the homeless and why is this issue out of many the most important to you?
What will you be going as for Halloween? I hear the sexy Ken Bone will be popular
What do you think of Obama?
As a coal worker, how do you think environmental protection and energy production should b