In [1]:
import json
import random

data_path = "./shakespeare.jsonl"
SEED = 42  # fixed so the train/validation split is identical on every run

# Each non-blank line of the file is one JSON record. Blank lines (e.g. a
# stray empty line at the end of the file) are skipped instead of crashing
# json.loads.
with open(data_path, "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]

# Shuffle in place with a dedicated seeded generator: the order is
# reproducible after a kernel restart, and the files written from this split
# always contain the same records.
random.Random(SEED).shuffle(data)

# The first 90% of the shuffled records form the training split; the
# remaining 10% are held out for validation.
split_idx = int(0.9 * len(data))
train_data = data[:split_idx]
val_data = data[split_idx:]

def save_jsonl(filename, dataset):
    """Write `dataset` to `filename` in JSON Lines format.

    Every item is serialized with `json.dumps` (non-ASCII characters are kept
    as-is rather than escaped) and written on its own line. An existing file
    at `filename` is overwritten.

    Args:
        filename: Path of the file to create or overwrite.
        dataset: Iterable of JSON-serializable items.
    """
    with open(filename, "w", encoding="utf-8") as out:
        out.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in dataset
        )

# Persist both splits as JSON Lines files in the working directory, then
# report how many records ended up in each one.
for split_path, split_rows in (
    ("data_train.jsonl", train_data),
    ("data_val.jsonl", val_data),
):
    save_jsonl(split_path, split_rows)

print(f"Train: {len(train_data)}, Val: {len(val_data)}")


Train: 18971, Val: 2108


In [2]:
# Preview the first three records of the shuffled data; each record is
# followed by two blank lines to keep the dicts visually separated.
for item in data[:3]:
    print(item, end="\n\n\n")

{'instruction': 'Rewrite the text in Shakespearean style.', 'input': 'I just meant that our hearts are joined, so we can almost think of them as one heart.', 'output': 'I mean that my heart unto yours is knit So that but one heart we can make of it.'}


{'instruction': 'Rewrite the text in Shakespearean style.', 'input': 'The princes are under some strange misunderstanding.', 'output': 'There is some strange misprision in the princes.'}


{'instruction': 'Rewrite the text in Shakespearean style.', 'input': 'Everything’s fine now.', 'output': 'Now all is well.'}




In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model, TaskType, PeftModel

model_name = "distilbert/distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse EOS as the padding token so tokenizer.pad() can pad to a fixed length
# (GPT-2 tokenizers ship without a dedicated pad token).
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(model_name)

# LoRA adapter configuration. `target_modules` entries are matched against the
# end of each module name, so "c_proj" selects every projection layer with
# that name (attention and MLP), not just one of them.
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["c_attn", "c_proj"],
    lora_dropout=0.1,
    # GPT-2 implements these layers as Conv1D, which stores weights as
    # (fan_in, fan_out); say so explicitly instead of relying on PEFT's
    # warning-and-auto-correct fallback.
    fan_in_fan_out=True,
    task_type=TaskType.CAUSAL_LM
)
# Wrap the base model so that only the LoRA adapter weights are trainable,
# then print the trainable/total parameter counts as a sanity check.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Load the two JSON Lines files written earlier as the "train" and
# "validation" splits of one DatasetDict.
dataset = load_dataset("json", data_files={
    "train": "data_train.jsonl",
    "validation": "data_val.jsonl"
})

def preprocess(example, max_length=512):
    """Tokenize one instruction record for causal-LM fine-tuning.

    The record is rendered as
    ``Instruction: ... Input: ... Output: <target><eos>`` and tokenized with
    the module-level ``tokenizer``. Loss is computed only on the target part:
    prompt positions and padding positions get the label ``-100``.

    Args:
        example: Mapping with the string keys ``instruction``, ``input`` and
            ``output``.
        max_length: Fixed sequence length. Longer sequences are truncated and
            shorter ones are padded to exactly this many tokens.

    Returns:
        The padded tokenizer encoding with an extra ``labels`` entry of length
        ``max_length``.
    """
    prompt_text = f"Instruction: {example['instruction']} Input: {example['input']} Output:"
    # Terminate the target with EOS. Padding reuses the EOS id but is masked
    # with -100 below, so without this explicit token the model would never be
    # trained to stop after the rewritten sentence.
    full_text = f"{prompt_text} {example['output']}{tokenizer.eos_token}"

    full_tokens = tokenizer(full_text, truncation=True, max_length=max_length)
    prompt_tokens = tokenizer(prompt_text, truncation=True, max_length=max_length)

    labels = list(full_tokens["input_ids"])

    # Number of leading tokens that belong to the prompt. Clamped to the label
    # length so the mask can never run past the end of the sequence.
    output_start = min(len(prompt_tokens["input_ids"]), len(labels))
    labels[:output_start] = [-100] * output_start

    # Pad inputs to the fixed length and mask the padded positions in the
    # labels so they do not contribute to the loss.
    full_tokens = tokenizer.pad(full_tokens, max_length=max_length, padding="max_length")
    full_tokens["labels"] = labels + [-100] * (max_length - len(labels))

    return full_tokens

# Run preprocess on every record of every split, one example per call.
tokenized_dataset = dataset.map(function=preprocess, batched=False)