<a href="https://colab.research.google.com/github/frankrobotics/my-small-model-1/blob/main/another_bad_ai_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install necessary libraries:

In [None]:
pip install peft transformers datasets accelerate torch

# login to hub

In [2]:
from huggingface_hub import login

from google.colab import userdata

HF_TOKEN = userdata.get('HF_TOKEN')

login(
    token=HF_TOKEN,
    add_to_git_credential=True,
)

In [8]:
from datasets import load_dataset

dataset = load_dataset("avaliev/chat_doctor", split={"train": "train[:5%]", "test": "test[:5%]"})
# dataset = load_dataset("avaliev/chat_doctor", split="train[:10%]")

print(dataset)

# Split into train and validation sets
# dataset = dataset["train"].train_test_split(test_size=0.1)

# dataset

DatasetDict({
    train: Dataset({
        features: ['input', 'instruction', 'output'],
        num_rows: 4779
    })
    test: Dataset({
        features: ['input', 'instruction', 'output'],
        num_rows: 597
    })
})


# Load the model

In [4]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load pre-trained tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 doesn’t have a padding token by default

# Load pre-trained model
model = GPT2LMHeadModel.from_pretrained("gpt2")

In [5]:
from peft import LoraConfig, get_peft_model

# Define LoRA config
lora_config = LoraConfig(
    r=8,                # Rank of the low-rank matrices
    lora_alpha=32,      # Scaling factor
    lora_dropout=0.1,   # Dropout for LoRA layers
    target_modules=["c_attn", "c_proj"],  # ✅ Correct layers for GPT-2
    # target_modules=["q_proj", "v_proj"],  # Apply LoRA to specific attention layers
    bias="none",
    task_type="CAUSAL_LM"  # Task type (adjust for other tasks)
)

# Wrap the model with LoRA
lora_model = get_peft_model(model, lora_config)
lora_model.print_trainable_parameters()

trainable params: 811,008 || all params: 125,250,816 || trainable%: 0.6475




In [6]:
lora_model.print_trainable_parameters()

trainable params: 811,008 || all params: 125,250,816 || trainable%: 0.6475


In [None]:
import torch

total_params = sum(p.numel() for p in model.parameters())
print(f"Total parameters: {total_params}")

# Device Placement:

In [9]:
def tokenize_function(examples):
    # print(examples)
    # return tokenizer(examples["instruction"] + examples["input"] + examples["output"], padding="max_length", truncation=True, max_length=512)
    # Convert lists to strings by joining the elements
    instructions = [" ".join(x) for x in examples["instruction"]]
    inputs = [" ".join(x) for x in examples["input"]]
    outputs = [" ".join(x) for x in examples["output"]]
    # Now you can concatenate the strings together
    return tokenizer(
        [i + " " + inp + " " + o for i, inp, o in zip(instructions, inputs, outputs)],
        padding="max_length",
        truncation=True,
        max_length=512
    )

tokenized_datasets = dataset.map(tokenize_function, batched=True)

print("tokenized_datasets", tokenized_datasets#.remove_columns(
    # ["instruction", "input", "output"]
# )
)

Map:   0%|          | 0/4779 [00:00<?, ? examples/s]

Map:   0%|          | 0/597 [00:00<?, ? examples/s]

tokenized_datasets DatasetDict({
    train: Dataset({
        features: ['input', 'instruction', 'output', 'input_ids', 'attention_mask'],
        num_rows: 4779
    })
    test: Dataset({
        features: ['input', 'instruction', 'output', 'input_ids', 'attention_mask'],
        num_rows: 597
    })
})



# Define compute_loss function

In [10]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./gpt2_finetuned",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=5,
    per_device_eval_batch_size=5,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2
)



In [11]:
import torch

#move to gpu if exist
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): lora.Linear(
            (base_layer): Conv1D(nf=2304, nx=768)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.1, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=768, out_features=8, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=8, out_features=2304, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
            (lora_magnitude_vector): ModuleDict()
          )
          (c_proj): lora.Linear(
            (base_layer): Conv1D(nf=768, nx=768)
         

In [12]:
from transformers import Trainer, DataCollatorForLanguageModeling

# Data collator helps with batching
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

  trainer = Trainer(
[34m[1mwandb[0m: Currently logged in as: [33mfrankrobotics[0m ([33mfrankrobotics-complesity[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Epoch,Training Loss,Validation Loss
1,1.4623,1.389034
2,1.3839,1.31934
3,1.3817,1.297099


TrainOutput(global_step=2868, training_loss=1.5078944361858289, metrics={'train_runtime': 1668.5404, 'train_samples_per_second': 8.593, 'train_steps_per_second': 1.719, 'total_flos': 3781863302234112.0, 'train_loss': 1.5078944361858289, 'epoch': 3.0})

In [15]:
# After training, merge LoRA weights:
lora_model = lora_model.merge_and_unload()

# Now save the merged model:
lora_model.save_pretrained("./gpt2_finetuned")
tokenizer.save_pretrained("./gpt2_finetuned")

('./gpt2_finetuned/tokenizer_config.json',
 './gpt2_finetuned/special_tokens_map.json',
 './gpt2_finetuned/vocab.json',
 './gpt2_finetuned/merges.txt',
 './gpt2_finetuned/added_tokens.json')

In [21]:
from transformers import pipeline

generator = pipeline("text-generation", model="./gpt2_finetuned", truncation=True, tokenizer=tokenizer)

print(generator("const logHelloo=>", max_length=1000))

Device set to use cuda:0


[{'generated_text': "const logHelloo=> y o u   a r e   a   d o c t o r,   p l e a s e   a n s w e r   t h e   m e d i c a l   q u e s t i o n s   b a s e d   o n   t h e   p a t i e n t's   d e s c r i p t i o n. I   h a v e   a f   a t y r y   t h e   c a i d   n o w   h a v e   a n y   i s   h u s c k   f o e t h   a l l e   q u e k t   a v i c k i n g   t o   l o w p   o f   s h i v e   a n d   b o u z e r   f o r n   t h a t s   g a r t   a m   c o n l y   o r t o k,   H a v e   a b t   i t   b a r y   d i m n   r i m g e   t h a y   c a i d   t o   w e r n   h a v e   a n y   w i p l l   f o r n   t h e   t h e   o f   n o u n   s e l d e.   T h a r y   p a r n i c l y   a    w i t   j o l d.   T h u t   h i s   l i v e   b e y   w h e e r   c e r p t h. T h e   A S   p p u r   o f   m y   a n d   h o c k i n g   o c t l o m e d   o f   n o u n t   h a t   m e d i c a l   s h i v e   h a v e   a f   t o   g o t   k i n d.   T h e   I   y o u   d a y   s u s s o.   C o u n c e   c 

In [22]:
model.push_to_hub("frankrobotics/gpt2_finetuned-v2")
tokenizer.push_to_hub("frankrobotics/gpt2_finetuned-v2")

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/frankrobotics/gpt2_finetuned-v2/commit/41398db8e7732d2b51f5c3d8e1e4fe0d6d4f06c7', commit_message='Upload tokenizer', commit_description='', oid='41398db8e7732d2b51f5c3d8e1e4fe0d6d4f06c7', pr_url=None, repo_url=RepoUrl('https://huggingface.co/frankrobotics/gpt2_finetuned-v2', endpoint='https://huggingface.co', repo_type='model', repo_id='frankrobotics/gpt2_finetuned-v2'), pr_revision=None, pr_num=None)