<a href="https://colab.research.google.com/github/frankrobotics/my-small-model-1/blob/main/another_bad_ai.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install necessary libraries:

In [1]:
!pip install transformers datasets accelerate huggingface_hub

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.10.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.10.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (fr

# login to hub

In [2]:
from huggingface_hub import login

from google.colab import userdata

HF_TOKEN = userdata.get('HF_TOKEN')

login(
    token=HF_TOKEN,
    add_to_git_credential=True,
)

In [6]:
from datasets import load_dataset

dataset = load_dataset("frankrobotics/my-second-dataset")

print(dataset)

# Split into train and validation sets
dataset = dataset["train"].train_test_split(test_size=0.1)

dataset

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 5
    })
    test: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 5
    })
})


DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 4
    })
    test: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 1
    })
})

# Load the model

In [7]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load pre-trained tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 doesn’t have a padding token by default

# Load pre-trained model
model = GPT2LMHeadModel.from_pretrained("gpt2")

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [28]:
import torch

total_params = sum(p.numel() for p in model.parameters())
print(f"Total parameters: {total_params}")

Total parameters: 124439808


# Device Placement:

In [13]:
def tokenize_function(examples):
    print(examples)
    # Convert lists to strings by joining the elements
    instructions = [" ".join(x) for x in examples["instruction"]]
    inputs = [" ".join(x) for x in examples["input"]]
    outputs = [" ".join(x) for x in examples["output"]]
    # Now you can concatenate the strings together
    return tokenizer(
        [i + " " + inp + " " + o for i, inp, o in zip(instructions, inputs, outputs)],
        padding="max_length",
        truncation=True,
        max_length=512
    )

tokenized_datasets = dataset.map(tokenize_function, batched=True)

print("tokenized_datasets", tokenized_datasets#.remove_columns(
    # ["instruction", "input", "output"]
# )
)

tokenized_datasets DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 4
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 1
    })
})



# Define compute_loss function

In [14]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./gpt2_finetuned",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2
)



In [16]:
from google.colab import userdata
Wandb_AI = userdata.get('Wandb_AI')

In [15]:
from transformers import Trainer, DataCollatorForLanguageModeling

# Data collator helps with batching
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

  trainer = Trainer(


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Epoch,Training Loss,Validation Loss
1,No log,3.214997
2,No log,2.807227
3,No log,2.567328


TrainOutput(global_step=3, training_loss=3.2848380406697593, metrics={'train_runtime': 454.124, 'train_samples_per_second': 0.026, 'train_steps_per_second': 0.007, 'total_flos': 3135504384000.0, 'train_loss': 3.2848380406697593, 'epoch': 3.0})

In [17]:
model.save_pretrained("./gpt2_finetuned")
tokenizer.save_pretrained("./gpt2_finetuned")

('./gpt2_finetuned/tokenizer_config.json',
 './gpt2_finetuned/special_tokens_map.json',
 './gpt2_finetuned/vocab.json',
 './gpt2_finetuned/merges.txt',
 './gpt2_finetuned/added_tokens.json')

In [29]:
from transformers import pipeline

generator = pipeline("text-generation", model="./gpt2_finetuned", truncation=True, tokenizer=tokenizer)

print(generator("List the features of Frank's Notes app.", max_length=100))

Device set to use cpu


[{'generated_text': 'List the features of Frank\'s Notes app. The new feature sets him an average of 6 episodes:\n\nIn his notes he notes: "A lot of writing happens on paper — I\'ve taken a job in a field where I don\'t work much, and sometimes I can\'t remember where the phone\'s turned off. I have to figure out where the audio files are. The only way I know is the phone\'s turned on,"\n\nHe writes and writes and writes and writes and writes'}]


In [30]:
model.push_to_hub("frankrobotics/gpt2_finetuned")
tokenizer.push_to_hub("frankrobotics/gpt2_finetuned")

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/frankrobotics/gpt2_finetuned/commit/8730f7df5a941bfe07fbbe38dbdf993b306d7aef', commit_message='Upload tokenizer', commit_description='', oid='8730f7df5a941bfe07fbbe38dbdf993b306d7aef', pr_url=None, repo_url=RepoUrl('https://huggingface.co/frankrobotics/gpt2_finetuned', endpoint='https://huggingface.co', repo_type='model', repo_id='frankrobotics/gpt2_finetuned'), pr_revision=None, pr_num=None)