In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

tokenizer = AutoTokenizer.from_pretrained("HScomcom/gpt2-game-of-thrones")

model = AutoModelForCausalLM.from_pretrained("HScomcom/gpt2-game-of-thrones")

In [2]:
text = "Once upon a time,"

encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)

print(len(output))

2


In [7]:
input_ids = encoded_input.input_ids
outputs = model.generate(input_ids, do_sample=True, max_length=30, temperature=1.2)
res = tokenizer.batch_decode(outputs, skip_special_tokens=True)
print(res)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


["Once upon a time, I was a knight. Back in my day, you weren't a knight unless you killed a certain number of men. Then"]


In [3]:
from pathlib import Path 
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from nltk.tokenize import sent_tokenize 
import numpy as np

In [20]:
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer

In [30]:
file = open("../data/Jon_1.txt", "r")

In [35]:
texts = []
labels = []
for i in range(1098):
    line = file.readline()
    words = word_tokenize(line)
    texts.append(TreebankWordDetokenizer().detokenize(words[:-1]))
    labels.append(TreebankWordDetokenizer().detokenize(words[1:]))
train_x, validation_x, train_y, validation_y = train_test_split(texts, labels, test_size=0.2, shuffle=False)


In [40]:
import pandas as pd
import datasets

In [36]:
train_dataset = pd.DataFrame()
train_dataset["text"] = train_x
train_dataset["label"] = train_y

validation_dataset = pd.DataFrame()
validation_dataset["text"] = validation_x
validation_dataset["label"] = validation_y



In [41]:
from datasets import DatasetDict, Dataset
dataset_dict = DatasetDict({"train":Dataset.from_pandas(train_dataset), "validation":Dataset.from_pandas(validation_dataset)})

In [42]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

In [44]:
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

1

In [45]:
tokenized_datasets = dataset_dict.map(tokenize_function, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [47]:
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42)
small_eval_dataset = tokenized_datasets["validation"].shuffle(seed=42)

In [48]:
from transformers import TrainingArguments, Trainer

In [49]:


training_args = TrainingArguments(output_dir="test_trainer", num_train_epochs=1)

import evaluate

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")

In [52]:
import torch

In [56]:
def lm_collate_fn(batch):
    x = [item[0] for item in batch]  # List (len B) of varying lengths
    y = [item[1] for item in batch]  # List (len B) of the same lengths as x
    maxlen = max([len(s) for s in x])

    padded_x, padded_y = [], []
    for sx, sy in zip(x, y):
        padded_x.append(torch.cat([sx, torch.ones(maxlen - len(sx))]))
        padded_y.append(torch.cat([sy, torch.ones(maxlen - len(sy))]))
    return torch.stack(padded_x).long(), torch.stack(padded_y).long()

In [57]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
    data_collator=lm_collate_fn
)

In [58]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 878
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 330


  0%|          | 0/330 [00:00<?, ?it/s]

KeyError: 0