In [1]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer, TextDataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TrainingArguments, Trainer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pandas as pd

In [3]:
df = pd.read_csv('toy_training_data.csv')

In [4]:
# split into train and validation
train_df = df.sample(frac=0.8, random_state=42)
val_df = df.drop(train_df.index).reset_index(drop=True)

# convert to list of dicts
train_examples = train_df.to_dict('records')
val_examples = val_df.to_dict('records')

In [5]:
# model_name = "t5-small" # Or other T5 models like "t5-base", "t5-large", etc.

# tokenizer = T5Tokenizer.from_pretrained(model_name)
# model = T5ForConditionalGeneration.from_pretrained(model_name)

model_name = "gpt2-xl" # Or other GPT-2 models like "gpt2", "gpt2-medium", "gpt2-large"

tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)



In [7]:
tokenizer.pad_token = tokenizer.eos_token

In [8]:
def tokenize_data(example, tokenizer):
    input_text = example["input_text"]
    output_text = example["output_text"]

    input_tokenized = tokenizer(input_text, truncation=True, padding="max_length", max_length=512, return_tensors="pt")
    output_tokenized = tokenizer(output_text, truncation=True, padding="max_length", max_length=512, return_tensors="pt")

    input_tokenized["labels"] = output_tokenized["input_ids"]
    return input_tokenized


train_data_tokenized = [tokenize_data(example, tokenizer) for example in train_examples]
val_data_tokenized = [tokenize_data(example, tokenizer) for example in val_examples]


In [16]:
train_data_tokenized[0].keys()

dict_keys(['input_ids', 'attention_mask', 'labels'])

In [17]:
# convert to huggingface dataset format
# train_data = TextDataset(tokenizer, train_data_tokenized, column_names=["input_ids", "attention_mask", "labels"])
# val_data = TextDataset(tokenizer, val_data_tokenized, column_names=["input_ids", "attention_mask", "labels"])

TypeError: __init__() got an unexpected keyword argument 'column_names'

In [10]:
training_args = TrainingArguments(
    output_dir="output_dir",
    num_train_epochs=3,
    per_device_train_batch_size=1, # Reduce the batch size for large models like gpt2-xl
    per_device_eval_batch_size=1,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="logging_dir",
    logging_steps=100,
    evaluation_strategy="steps",
    save_strategy="steps",
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)


In [11]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data_tokenized,
    eval_dataset=val_data_tokenized,
    tokenizer=tokenizer,
)

In [12]:
trainer.train()

  1%|▏         | 4/276 [02:24<2:50:42, 37.66s/it]

KeyboardInterrupt: 

In [30]:
# Set the input prompt
input_prompt = "Some questions Reddit users might have to Donald Trump are:"

# Tokenize the input
input_tokens = tokenizer.encode(input_prompt, return_tensors="pt")

# Generate the output
# output_tokens =  model.generate(input_tokens,
#                                max_length=50,
#                                num_return_sequences=1,
#                                top_k=50,
#                                top_p=0.95)

output_tokens = model.generate(input_tokens,
                               max_length=100,
                               num_return_sequences=1,
                                no_repeat_ngram_size=2,
                                do_sample=True,
                                top_k=50,
                                top_p=0.95,
                               temperature=1.0)



# Decode the output
output_text = tokenizer.decode(output_tokens[0], skip_special_tokens=True)

# Print the output
print(output_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Some questions Reddit users might have to Donald Trump are:

1) Is the man who doesn't like to lose a woman?
... and 2) What has the President said that can not be proven? Is his response to the Charlottesville violence right on? And when did Donald, as president, start a "global problem"?
-. - -


In [17]:
val_examples[2]["input_text"]

"<OP_NAME> Max Brooks </s> <OP_TITLE> I am Max Brooks, author of World War Z, and I am here to discuss the coronavirus. Let’s talk about why my fictional zombie book was banned by the very real government of China. AMA. </s> <OP_POST> Let’s talk about survival. Individuals, groups, nations. Let’s talk about how fictional threats can teach us real survival skills. Let’s talk about why my fictional zombie book, “World War Z” was [banned by the very real government of China](https://www.washingtonpost.com/outlook/china-barred-my-dystopian-novel-about-how-its-system-enables-epidemics/2020/02/27/cc0446f0-58e5-11ea-9000-f3cffee23036_story.html) and how that government has let another very real plague get out of control. No matter what I write about, zombies, World War 1, Minecraft, and even my new threat, Bigfoot, the theme is always the same: adapting to survive. Let’s talk about what it means to adapt to this new Coronavirus danger and what it will mean for all of us.\n\nProof: https://twi

In [15]:
# test the model

for i in range(5):
    print("Example #", i)
    # print("Input: ", val_examples[i]["input_text"])
    input_tokens = tokenizer(val_examples[i]["input_text"], return_tensors="pt").input_ids
    predicted_output_tokens = model.generate(input_tokens, max_length=512, num_return_sequences=1)
    print("Output: ", tokenizer.decode(predicted_output_tokens[0], skip_special_tokens=True))
    # print("Expected output: ", val_examples[i]["output_text"])
    print()


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Example # 0


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 1281, but `max_length` is set to 512. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


Output:  <OP_NAME> Ken Bone </s> <OP_TITLE> I’m American citizen, undecided voter, loving husband Ken Bone, Welcome to the Bone Zone! AMA </s> <OP_POST> Hello Reddit,

I’m just a normal guy, who spends his free time with his hot wife and cat in St. Louis. I didn’t see any of this coming, it’s been a crazy week. I want to make something good come out of this moment, so I’m donating a portion of the proceeds from my Represent T-Shirt campaign to the St. Patrick Center raising money to fight homelessness in St. Louis.

I’m an open book doing this AMA at my desk at work and excited to answer America’s question.

Please support the campaign and the fight on homelessness! [Represent.com/bonezone](https://represent.com/bonezone/)

Proof: http://i.imgur.com/GdMsMZ9.jpg

Edit: signing off now, just like my whole experience so far this has been overwhelmingly positive!  Special thanks to my Reddit brethren for sticking up for me when the few negative people attack.  Let's just show that we're be

IndexError: index out of range in self