In [2]:
import torch
print(torch.cuda.is_available())  # Should return True if a GPU is available
print(torch.cuda.device_count())  # Number of available GPUs
print(torch.cuda.current_device())  # Index of the currently active GPU (usually 0)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
torch.cuda.empty_cache()

True
1
0
Using device: cuda


In [3]:
import pandas as pd
from datasets import Dataset

# Sample pandas DataFrame
kjv = pd.read_csv("kjv.csv")
max_len = kjv.t.apply(lambda x:len(x.split())).max()

data = {
    "text": kjv.t.tolist()
}
df = pd.DataFrame(data)

# Convert pandas DataFrame to Hugging Face Dataset
dataset = Dataset.from_pandas(df)
dataset

Dataset({
    features: ['text'],
    num_rows: 31103
})

In [4]:
from transformers import AutoTokenizer

model_name = "gpt2"  # We'll use the base GPT-2 model
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", max_length=max_len, truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets



Map:   0%|          | 0/31103 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'input_ids', 'attention_mask'],
    num_rows: 31103
})

In [5]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
data_collator

DataCollatorForLanguageModeling(tokenizer=GPT2TokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}, mlm=False, mlm_probability=0.15, pad_to_multiple_of=None, tf_experimental_compile=False, return_tensors='pt')

In [6]:
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(model_name)
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dro

In [7]:
from transformers import TrainingArguments
model.to(device)

training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=2**5,
    save_steps=10_000,
    save_total_limit=2,
    fp16=True
)


In [8]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    data_collator=data_collator
)


In [9]:
trainer.train()


Step,Training Loss
500,3.2819
1000,3.0684
1500,2.8902
2000,2.826
2500,2.7121
3000,2.6784
3500,2.5896
4000,2.5598
4500,2.4842
5000,2.4619


RuntimeError: CUDA error: unknown error
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [None]:
# Send back to CPU for inference
model.to("cpu")

# Define your prompt
prompt = "In the beginning God"

# Tokenize the input prompt
inputs = tokenizer(prompt, return_tensors="pt")

# Generate the text
outputs = model.generate(
    inputs.input_ids,
    max_length=20,            # Maximum length of generated text
    num_return_sequences=3,    # Number of sequences to generate
    no_repeat_ngram_size=2,    # Prevent repeating n-grams
    top_k=50,                  # Number of highest probability vocabulary tokens to keep for top-k-filtering
    top_p=0.95,                # If set to float < 1, only the most probable tokens with probabilities that add up to top_p are kept for generation
    temperature=0.7,           # The temperature of the sampling distribution
    do_sample=True             # Sampling or greedy decoding
)

for row in outputs:
    # Decode and print the generated text
    generated_text = tokenizer.decode(row, skip_special_tokens=True)
    print(generated_text)


In [None]:
model.save_pretrained("./kjv-gpt2")
tokenizer.save_pretrained("./kjv-gpt2")


In [None]:
from huggingface_hub import notebook_login
import os

# Use this token: os.environ["HF_TOKEN"]
# on here at the CLI: huggingface-cli login

model.push_to_hub("bmconrad/kjv-gpt2")
tokenizer.push_to_hub("bmconrad/kjv-gpt2")

In [43]:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="bmconrad/kjv-gpt2")

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/bmconrad/kjv-gpt2/commit/f0979f58f51fc9e8c817424aa242eaff2cf06673', commit_message='Upload model', commit_description='', oid='f0979f58f51fc9e8c817424aa242eaff2cf06673', pr_url=None, pr_revision=None, pr_num=None)

In [45]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("bmconrad/kjv-gpt2")
model = AutoModelForCausalLM.from_pretrained("bmconrad/kjv-gpt2")

tokenizer_config.json:   0%|          | 0.00/496 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/946 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/125 [00:00<?, ?B/s]