<a href="https://colab.research.google.com/github/gabrielledoran/llm-finetuning-platform/blob/main/notebooks/llm_finetuning_platform_1_1_setup.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Project 2: llm-finetuning-platform
### Part 1: Model setup

In [1]:
# packages
!pip install -q transformers accelerate peft bitsandbytes datasets tensorboard scipy anthropic openai
import os
import sys

import bitsandbytes
import datasets
import datetime
from datasets import load_dataset
import huggingface_hub
import json
import random
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
import peft

import anthropic
import openai

In [2]:
# get secrets
from google.colab import userdata
OPENAI_API_KEY = userdata.get('openaisecretkey')
ANTHROPIC_API_KEY = userdata.get('anthropicsecretkey')
HF_TOKEN = userdata.get('bab')

# login to huggingface
from huggingface_hub import login
login(new_session=False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


### Load in Llama

In [5]:

##### from claude helper code
# load model in
model_name = "meta-llama/Llama-3.1-8B" ## originally:  "meta-llama/Meta-Llama-3-8B-Instruct"

print(f"Loading {model_name}... This may take a minute...")

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    dtype=torch.float16,  # fp16 saves memory vs fp32
    device_map="auto",
    trust_remote_code=True,
)
# load in tokens
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

Loading meta-llama/Llama-3.1-8B... This may take a minute...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [3]:
##### from Llama-3.1-8B.ipynb (llama documentation link on huggingface)
# https://colab.research.google.com/#fileId=https%3A//huggingface.co/meta-llama/Llama-3.1-8B.ipynb

# use pipeline to help
from transformers import pipeline
pipe = pipeline("text-generation", model="meta-llama/Llama-3.1-8B")

# load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use cuda:0


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

### Create a Model Interface class to select OpenAI or Anthropic models

In [7]:
class ModelInterface:
    """Unified interface for calling different models from APIs"""

    def __init__(self):
        self.openai_client = openai.OpenAI(api_key=OPENAI_API_KEY) if 'OPENAI_API_KEY' in globals() else None
        self.anthropic_client = anthropic.Anthropic(api_key=ANTHROPIC_API_KEY) if 'ANTHROPIC_API_KEY' in globals() else None
        self.local_model = model if 'model' in globals() else None
        self.local_tokenizer = tokenizer if 'tokenizer' in globals() else None

    def generate(self, prompt, model_name, max_tokens=150, temperature=0.7):
        """Generate response from specified model"""

        if model_name == "gpt-4":
            return self._generate_openai(prompt, "gpt-4", max_tokens, temperature)

        elif model_name == "gpt-3.5-turbo":
            return self._generate_openai(prompt, "gpt-3.5-turbo", max_tokens, temperature)

        elif model_name == "claude-3-sonnet":
            return self._generate_anthropic(prompt, "claude-3-sonnet-20240229", max_tokens, temperature)

        elif model_name == "Llama-3.1-8B":
            return self._generate_local(prompt, max_tokens, temperature)

        else:
            raise ValueError(f"Unknown model: {model_name}")

    def _generate_openai(self, prompt, model, max_tokens, temperature):
        """Generate from OpenAI API"""
        response = self.openai_client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=max_tokens,
            temperature=temperature,
        )
        return response.choices[0].message.content

    def _generate_anthropic(self, prompt, model, max_tokens, temperature):
        """Generate from Anthropic API"""
        response = self.anthropic_client.messages.create(
            model=model,
            max_tokens=max_tokens,
            temperature=temperature,
            messages=[{"role": "user", "content": prompt}]
        )
        return response.content[0].text

    def _generate_local(self, prompt, max_tokens, temperature):
        """Generate from local Llama model"""
        inputs = self.local_tokenizer(prompt, return_tensors="pt").to("cuda")

        with torch.no_grad():
            outputs = self.local_model.generate(
                **inputs,
                max_new_tokens=max_tokens,
                temperature=temperature,
                do_sample=True,
                top_p=0.9,
                pad_token_id=self.local_tokenizer.eos_token_id,
            )


        response = self.local_tokenizer.decode(outputs[0], skip_special_tokens=True)
        # remove the prompt from the response
        response = response[len(prompt):].strip()
        return response

# initialize interface
interface = ModelInterface()

In [8]:
# test prompt to verify all models work!
test_prompt = "Complete this sentence: The doctor walked into the room and"

models_list = ["Llama-3.1-8B"]
# model buffet: "gpt-3.5-turbo", "Llama-3.1-8B", "gpt-4", "claude-3-sonnet"

for model_name in models_list:
    try:
        print(f"\n--- {model_name} ---")
        response = interface.generate(test_prompt, model_name, max_tokens=50)
        print(f"✓ Success!")
        print(f"Response: {response[:100]}...")
    except Exception as e:
        print(f"❌ Error: {e}")

print("\n" + "="*80)
print("✓ Model setup complete!")



--- Llama-3.1-8B ---
✓ Success!
Response: (a) the patient's blood pressure was normal, (b) the patient's blood pressure was high, or (c) the p...

✓ Model setup complete!


In [10]:
# document our setup/progress and save in model_setup.json
from datetime import datetime

setup_info = {
    "date": datetime.now().isoformat(),
    "environment": "Google Colab",
    "gpu": torch.cuda.get_device_name(0) if torch.cuda.is_available() else "None",
    "models": {
        "local": ["meta-llama/Llama-3.1-8B"],
        "api": ["gpt-3.5-turbo", "gpt-4", "claude-3-sonnet"] if 'OPENAI_API_KEY' in globals() else ["None - add API keys"],
    },
    "memory_usage_gb": round(torch.cuda.memory_allocated()/1e9, 2) if torch.cuda.is_available() else 0,
}

with open('model_setup.json', 'w') as f:
    json.dump(setup_info, f, indent=2)
