In [52]:
import pandas as pd
from datasets import Dataset

# Sample pandas DataFrame
kjv = pd.read_csv("kjv.csv")
max_len = kjv.t.apply(lambda x:len(x.split())).max()

data = {
    "text": kjv.t.tolist()[:100]
}
df = pd.DataFrame(data)

# Convert pandas DataFrame to Hugging Face Dataset
dataset = Dataset.from_pandas(df)
dataset

Dataset({
    features: ['text'],
    num_rows: 100
})

In [53]:
from transformers import AutoTokenizer

model_name = "gpt2"  # We'll use the base GPT-2 model
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", max_length=max_len, truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'input_ids', 'attention_mask'],
    num_rows: 100
})

In [54]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
data_collator

DataCollatorForLanguageModeling(tokenizer=GPT2TokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}, mlm=False, mlm_probability=0.15, pad_to_multiple_of=None, tf_experimental_compile=False, return_tensors='pt')

In [55]:
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(model_name)
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [68]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=2,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
)


In [69]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    data_collator=data_collator
)


In [70]:
trainer.train()


Step,Training Loss


TrainOutput(global_step=50, training_loss=2.2562245178222655, metrics={'train_runtime': 129.6949, 'train_samples_per_second': 1.542, 'train_steps_per_second': 0.386, 'total_flos': 9186048000000.0, 'train_loss': 2.2562245178222655, 'epoch': 2.0})

In [67]:
# Define your prompt
prompt = "In the beginning"

# Tokenize the input prompt
inputs = tokenizer(prompt, return_tensors="pt")

# Generate the text
outputs = model.generate(
    inputs.input_ids,
    max_length=20,            # Maximum length of generated text
    num_return_sequences=2,    # Number of sequences to generate
    no_repeat_ngram_size=2,    # Prevent repeating n-grams
    top_k=50,                  # Number of highest probability vocabulary tokens to keep for top-k-filtering
    top_p=0.95,                # If set to float < 1, only the most probable tokens with probabilities that add up to top_p are kept for generation
    temperature=0.7,           # The temperature of the sampling distribution
    do_sample=True             # Sampling or greedy decoding
)

# Decode and print the generated text
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In the beginning, God made man gods and had dominion over the earth. But now God created man


In [60]:
model.save_pretrained("./kjv-gpt2")
tokenizer.save_pretrained("./kjv-gpt2")


('./kjv-gpt2\\tokenizer_config.json',
 './kjv-gpt2\\special_tokens_map.json',
 './kjv-gpt2\\vocab.json',
 './kjv-gpt2\\merges.txt',
 './kjv-gpt2\\added_tokens.json',
 './kjv-gpt2\\tokenizer.json')