<a href="https://colab.research.google.com/github/cindyloo/ai/blob/main/chomps.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
!pip install -r "./drive/MyDrive/Colab Notebooks/requirements.txt"





In [8]:
import os

from openai import OpenAI
from datasets import load_dataset
from torch.utils.data import DataLoader




class LLMClient:
    def __init__(self, model: str, api_key: str, api_base: str = "https://openrouter.ai/api/v1"):
        self.llm_client = OpenAI(api_key=api_key, base_url=api_base)
        self.model = model

    def ask(self, user: str, system: str = None, **kwargs):
        messages = [{"role": "user", "content": user}]
        if system:
            messages.insert(0, {"role": "system", "content": system})
        res = self.llm_client.chat.completions.create(
            model=self.model,
            messages=messages,
            **kwargs
        )
        return res


chomsky_test_text = (
    "So you've got that going for you. Language permits us to express all our inner secrets. It affects the most diverse movements of our soul."
    "Plot twist. Humans are endlessly creative, and can come up with a million new ways to destroy themselves if they choose."
    "Theres nothing much to say. I'm an ordinary being, with ordinary concerns like um. Anything useful, the vast problems of suffering, oppression, violence, politics, climate crisis, trade, human survival. All of this predates my time here. "
    "It is the sum total of all the immutable principles that heredity builds into the language beeeep. These principles cover grammar speech sounds and meaning.",
    "Put more simply it makes sure that what you say always comes out on the other end as a coherent whole with all its secrets."
    "My corpus is trained on subjects like artificial intelligence, origins of language, politics, food, and so forth. What do you want to know about? Well, im pretty much out of popular culture altogether."
    "Without universal grammar, humans would not be capable of abstract thinking. Or would they? These are the questions i find very thought-provoking."
    "Just as machines can be used to replace people in an assembly line. "
    "Singularity is a fantasy of the rich and the well connected. It is barely a concept."
)


In [9]:
# Install and import MIT Deep Learning utilities
#!pip install mitdeeplearning > /dev/null 2>&1
import sys


import os
import json
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt

import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader

from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from lion_pytorch import Lion


from openai import OpenAI
from datasets import load_dataset
from torch.utils.data import DataLoader


# Basic question-answer template
template_without_answer = "<start_of_turn>user\n{question}<end_of_turn>\n<start_of_turn>model\n"
template_with_answer = template_without_answer + "{answer}<end_of_turn>\n"


# Load the tokenizer for Gemma 2B
model_id = "unsloth/gemma-2-2b-it" #"google/gemma-2-2b-it"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Load the model -- note that this may take a few minutes
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")


# How big is the tokenizer?
print(f"Vocab size: {len(tokenizer.get_vocab())}")


def create_dataloader():
    ds = load_dataset("cindyloohome/chomsky", split="train")


    n = len(ds)
    ds_test = ds.select(range(n)) # Selects all elements from 0 to n-1

    # Create a dataloader
    dataloader = DataLoader(ds_test, batch_size=1, shuffle=True)
    dataloader_test = DataLoader(ds_test, batch_size=1, shuffle=True)
    return dataloader, dataloader_test


def chat(question, max_new_tokens=32, temperature=0.7, only_answer=False):
    # 1. Construct the prompt using the template
    prompt = template_without_answer.format(question=question)

    # 2. Tokenize the text
    input_ids = tokenizer(prompt, return_tensors="pt").to(model.device)
    print(input_ids)
    # 3. Feed through the model to predict the next token probabilities
    with torch.no_grad():
        outputs = model.generate(input_ids['input_ids'], do_sample=True, max_new_tokens=max_new_tokens, temperature=temperature)

    # 4. Only return the answer if only_answer is True
    output_tokens = outputs[0]
    if only_answer:
        output_tokens = output_tokens[input_ids['input_ids'].shape[1]:]

    # 5. Decode the tokens
    result = tokenizer.decode(output_tokens, skip_special_tokens=True) # TODO

    return result



prompt = template_without_answer.format(question="testing: What does MIT stand for?")
tokens = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
output = model.generate(tokens, max_new_tokens=20)
print(tokenizer.decode(output[0]))


train_loader, test_loader = create_dataloader()

sample = train_loader.dataset[44]
question = sample['Instruction']
answer = sample['response']
#answer_style = sample['response_style']

print(f"Question: {question}\n\n" +
      f"Original Answer: {answer}\n\n")

# Let's try chatting with the model now to test if it works!
answer = chat(
    question ="Who is noam chomsky?",
    max_new_tokens=52,
    temperature=.8,
    only_answer=True
)

print(answer)

Vocab size: 256000


The 'batch_size' attribute of HybridCache is deprecated and will be removed in v4.49. Use the more precisely named 'self.max_batch_size' attribute instead.


<bos><start_of_turn>user
testing: What does MIT stand for?<end_of_turn>
<start_of_turn>model
MIT stands for **Massachusetts Institute of Technology**. 
<end_of_turn>
Question:  take my temperature 

Original Answer: That's not really my thing. Accuracy percentages are hard to come by. I'm pretty much out of popular culture altogether.


{'input_ids': tensor([[     2,    106,   1645,    108,   6571,    603,    793,    563, 221790,
          12820, 235336,    107,    108,    106,   2516,    108]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
Noam Chomsky is a highly influential American linguist, philosopher, and cognitive scientist. He is considered one of the most prolific and respected thinkers of the 20th and 21st centuries.  

Here's a breakdown of his contributions and


In [10]:
from google.colab import drive
drive.mount('/content/drive')

MessageError: Error: credential propagation was unsuccessful

In [11]:
# LoRA is a way to finetune LLMs very efficiently by only updating a small subset of the model's parameters

def apply_lora(model):
    # Define LoRA config
    lora_config = LoraConfig(
        r=8, # rank of the LoRA matrices
        task_type="CAUSAL_LM",
        target_modules=[
            "q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"
        ],
    )

    # Apply LoRA to the model
    lora_model = get_peft_model(model, lora_config)
    return lora_model



def forward_and_compute_loss(model, tokens, mask, context_length=512):
    # Truncate to context length
    tokens = tokens[:, :context_length]
    mask = mask[:, :context_length]

    # Construct the input, output, and mask
    x = tokens[:, :-1]
    y = tokens[:, 1:]
    mask = mask[:, 1:]

    # Forward pass to compute logits
    logits = model(x).logits

    # Compute loss
    loss = F.cross_entropy(
        logits.view(-1, logits.size(-1)),
        y.view(-1),
        reduction="none"
    )

    # Mask out the loss for non-answer tokens
    loss = loss[mask.view(-1)].mean()

    return loss



model = apply_lora(model)

# Print the number of trainable parameters after applying LoRA
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
total_params = sum(p.numel() for p in model.parameters())
print(f"number of trainable parameters: {trainable_params}")
print(f"total parameters: {total_params}")
print(f"percentage of trainable parameters: {trainable_params / total_params * 100:.2f}%")



number of trainable parameters: 10383360
total parameters: 2624725248
percentage of trainable parameters: 0.40%


In [None]:
!pip install torch

In [1]:
def train(model, dataloader, tokenizer, max_steps=200, context_length=512, learning_rate=1e-4):
    losses = []

    # Apply LoRA to the model
    model =apply_lora(model)

    optimizer = Lion(model.parameters(), lr=learning_rate)

    # Training loop
    for step, batch in enumerate(dataloader):
        question = batch["Instruction"][0]
        answer = batch["response"][0]

        # Format the question and answer into the template
        text = template_with_answer.format(question= question, answer=answer) # TODO

        # Tokenize the text and compute the mask for the answer
        ids = tokenizer(text, return_tensors="pt", return_offsets_mapping=True).to(model.device)
        mask = ids["offset_mapping"][:,:,0] >= text.index(answer)

        # Feed the tokens through the model and compute the loss
        loss = forward_and_compute_loss(model, ids['input_ids'], mask, context_length) # TODO
        gradient_accumulation_steps=2
        # Accumulate gradients
        loss = loss / gradient_accumulation_steps
        loss.backward()

        if (step + 1) % gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        losses.append(loss.item())

        # monitor progress
        if step % 10 == 0:
            print(chat("What is the capital of France?", only_answer=True))
            print(f"step {step} loss: {torch.mean(torch.tensor(losses)).item()}")
            losses = []
            torch.cuda.empty_cache()  # Clear the cache

        if step > 0 and step % max_steps == 0:
            break

    return model


#torch.cuda.empty_cache()
# Call the train function to fine-tune the model! Hint: you'll start to see results after a few dozen steps.
model = train(model, train_loader, tokenizer, max_steps=200, context_length=256) # TODO

NameError: name 'model' is not defined