In [1]:
from transformers import pipeline, set_seed

# 1. Build a text-generation pipeline around a pretrained checkpoint
generator = pipeline(
    "text-generation",
    model="gpt2",  # or any other model you want to use
)

# 2. (Optional) Fix the seed for reproducibility
set_seed(42)

# 3. Generate
output = generator(
    "To be or not to be",   # your prompt
    max_new_tokens=50,      # ONLY counts the new tokens
    do_sample=True,         # sampling instead of greedy
    top_k=50,               # consider the 50 most likely next tokens
    temperature=0.7,        # lower → more conservative; higher → more creative
    num_return_sequences=1, # how many completions you want
    # Make the padding token explicit. Without it, generation logs
    # "Setting `pad_token_id` to `eos_token_id`" and falls back implicitly.
    pad_token_id=generator.tokenizer.eos_token_id,
)

# The pipeline returns one dict per requested sequence; show the first.
print(output[0]["generated_text"])


  from .autonotebook import tqdm as notebook_tqdm
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


To be or not to be, what matters in the world is the world we live in, and if you want to live in an environment that's more like the world you live in, you should be more aware of this," she said.

"It's about how


In [None]:
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

# 1. Load your text file as a Dataset
TRAIN_FILE = "first_1000_lines.txt"
ds = load_dataset("text", data_files={"train": TRAIN_FILE})

# 2. Load tokenizer & model
# Both are loaded from the same checkpoint id; keeping it in one constant
# prevents the tokenizer vocabulary and the model from drifting apart when
# the base model is changed.
BASE_MODEL = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
model = AutoModelForCausalLM.from_pretrained(BASE_MODEL)

# Reuse the end-of-sequence token as the padding token, and record the same
# id on the model config so tokenizer and model agree on it.
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id
# 3. Tokenize & group into blocks
def tokenize_and_group(examples, block_size=512):
    """Tokenize a batch of text lines and pack the ids into fixed-size blocks.

    Every string in ``examples["text"]`` is tokenized with the module-level
    ``tokenizer``; the resulting id lists are concatenated in order into one
    stream, which is then cut into non-overlapping blocks of ``block_size``
    ids. A trailing remainder shorter than ``block_size`` is dropped, so all
    returned blocks are full. Because the function works on one batch at a
    time, that remainder is dropped once per call.

    Args:
        examples: Batch mapping with a ``"text"`` key holding a list of
            strings.
        block_size: Number of token ids per block. Must be positive.
            Defaults to 512.

    Returns:
        A dict with ``"input_ids"`` (list of blocks, each a list of
        ``block_size`` ids) and ``"attention_mask"`` (one all-ones list of
        the same length per block).

    Raises:
        ValueError: If ``block_size`` is not positive.
    """
    if block_size <= 0:
        raise ValueError(f"block_size must be positive, got {block_size}")

    tokens = tokenizer(examples["text"], return_special_tokens_mask=False)

    # Flatten with a comprehension: sum(list_of_lists, []) copies the growing
    # accumulator on every addition, which is quadratic in the token count.
    all_ids = [tok for ids in tokens["input_ids"] for tok in ids]

    # Chunk into non-overlapping blocks; the range stops early enough that a
    # short tail is never emitted.
    input_ids = [
        all_ids[start : start + block_size]
        for start in range(0, len(all_ids) - block_size + 1, block_size)
    ]

    # Build a fresh mask list per block. [[1] * n] * k would repeat a single
    # list object k times, so mutating one row would change them all.
    attention_mask = [[1] * len(block) for block in input_ids]

    return {"input_ids": input_ids, "attention_mask": attention_mask}

# Apply the packing function batch-wise to the training split. The raw "text"
# column is removed so only the columns the function returns are kept.
tokenized = ds["train"].map(tokenize_and_group, remove_columns=["text"], batched=True)

# 4. Data collator for causal LM (mlm=False disables masked-LM masking)
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

# 5. Training arguments
training_args = TrainingArguments(
    output_dir="./gpt2-finetuned",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,   # 2 x 8 = 16 sequences per optimizer step, per device
    logging_steps=100,
    save_steps=500,
    fp16=False,                      # full precision; switch on only for a GPU with half-precision support
)

# 6. Initialize Trainer with the model, its arguments and the packed dataset
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=tokenized,
)

# 7. Fine-tune!
trainer.train()


Map: 100%|██████████| 1000/1000 [00:00<00:00, 26265.29 examples/s]
                                     
100%|██████████| 3/3 [01:02<00:00, 20.69s/it]

{'train_runtime': 62.0458, 'train_samples_per_second': 1.112, 'train_steps_per_second': 0.048, 'train_loss': 4.524337450663249, 'epoch': 2.0}





TrainOutput(global_step=3, training_loss=4.524337450663249, metrics={'train_runtime': 62.0458, 'train_samples_per_second': 1.112, 'train_steps_per_second': 0.048, 'total_flos': 12019433472000.0, 'train_loss': 4.524337450663249, 'epoch': 2.0})

In [6]:
# Persist the fine-tuned model and the tokenizer into the same directory, so
# that directory can be passed as both `model` and `tokenizer` when loading.
trainer.save_model("gpt2-finetuned")            # writes the model weights here (NOTE(review): weights file name/format depends on the installed transformers version — verify)
tokenizer.save_pretrained("gpt2-finetuned")  # last expression: displays the tuple of tokenizer file paths it returns

('gpt2-finetuned\\tokenizer_config.json',
 'gpt2-finetuned\\special_tokens_map.json',
 'gpt2-finetuned\\vocab.json',
 'gpt2-finetuned\\merges.txt',
 'gpt2-finetuned\\added_tokens.json',
 'gpt2-finetuned\\tokenizer.json')

In [9]:
from transformers import pipeline

# 1. Build a text-generation pipeline from the fine-tuned checkpoint directory.
#    Model and tokenizer are both read from the same directory.
generator = pipeline(
    "text-generation",
    device=-1,                     # -1 for CPU, 0 for GPU
    model="gpt2-finetuned",
    tokenizer="gpt2-finetuned",
)

# 2. Sample a single continuation for the prompt
prompt = "to be or not to be, that is the question"
outputs = generator(
    prompt,
    num_return_sequences=1,
    max_new_tokens=100,
    do_sample=True,
    temperature=0.8,
    top_k=40,
)

# One dict per returned sequence; print the text of the first.
print(outputs[0]["generated_text"])

to be or not to be, that is the question and the answer. For it is possible that all things, whether real or unreal, are possible, that is the issue. It is also possible that things, other than real things, are possible. Nothing is possible in order to exist. The question is whether it is possible to exist or not, not even to exist. There is no doubt that it is possible. No one can deny that we are all living beings. But that cannot be the only reality. That is the question. We can
