<a href="https://colab.research.google.com/github/etuckerman/SOCOTEC/blob/main/SOCOTEC_FINETUNE2_elliot_tuckerman.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
#install unsloth, xformers (for flash attn) and other pckgs
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps xformers trl peft accelerate bitsandbytes

In [2]:

from unsloth import FastLanguageModel
import pandas as pd
import torch
from datasets import Dataset
from transformers import TrainingArguments
from trl import SFTTrainer
from unsloth import is_bfloat16_supported

# Constants
MODEL_NAME = "unsloth/mistral-7b-v0.3"
MAX_SEQ_LENGTH = 2048
LOAD_IN_4BIT = True
BATCH_SIZE = 2
GRAD_ACCUMULATION = 4
LEARNING_RATE = 2e-4
TRAIN_STEPS = 100  # Optimized training steps
OUTPUT_DIR = "outputs"
SEED = 3407

# Load model and tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=MAX_SEQ_LENGTH,
    dtype=None,
    load_in_4bit=LOAD_IN_4BIT
)

# Apply LoRA Adapters
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha=16,
    lora_dropout=0.1,  # Small dropout for regularization
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=SEED,
    use_rslora = False,
    loftq_config = None,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2024.12.4: Fast Mistral patching. Transformers:4.46.3.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 8.0. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/4.14G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/157 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/137k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/446 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.1.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2024.12.4 patched 32 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


In [10]:
# Define the alpaca-style prompt formatting function
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Input:
{input}

### Response:
Return only the function call from one of the following functions:
- add(a, b): Adds two numbers, a and b.
- square(a): Squares the number a.
- cube(a): Cubes the number a.
- greet(name): Greets the person with the given name.

Do not include any additional text, explanations, or commentary. Just return the function call. There may be multiple function calls, but must still follow parentheses logic.

{output}""" # Changed: Removed extra curly braces and replaced with {output} to use the function call

EOS_TOKEN = "</s>"  # Define the EOS token for the model

# Function to format prompts for the dataset
def formatting_prompts_func(examples):
    texts = []
    for instruction, input_text, output in zip(examples['instruction'], examples['input'], examples['output']): # Changed: Use instruction from examples
        # Format the prompt and add the EOS token, using the instruction from the examples
        text = alpaca_prompt.format(instruction=instruction, input=input_text, output=output) + EOS_TOKEN
        texts.append(text)
    return {"text": texts}

# Map the dataset to apply the formatting function
dataset = dataset.map(formatting_prompts_func, batched=True)

# Check the dataset structure after formatting
print(dataset[0])

Map:   0%|          | 0/136 [00:00<?, ? examples/s]

{'instruction': "What's the square of 6?", 'output': 'square(6)', 'input': "What's the square of 6?", 'text': "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nWhat's the square of 6?\n\n### Input:\nWhat's the square of 6?\n\n### Response:\nReturn only the function call from one of the following functions:\n- add(a, b): Adds two numbers, a and b.\n- square(a): Squares the number a.\n- cube(a): Cubes the number a.\n- greet(name): Greets the person with the given name.\n\nDo not include any additional text, explanations, or commentary. Just return the function call. There may be multiple function calls, but must still follow parentheses logic.\n\nsquare(6)</s>"}


In [3]:
import itertools
import csv

# Function to generate multi-function examples
def generate_multi_function_examples():
    numbers = list(range(1, 21))

    multi_function_examples = [
        # Square and Add combinations
        {
            'prompt': f"What is the square of {n1} added to {n2}?",
            'function_call': f"add(square({n1}), {n2})"
        } for n1, n2 in itertools.combinations(range(1, 11), 2)
    ] + [
        # Cube and Add combinations
        {
            'prompt': f"Add the cube of {n1} to {n2}",
            'function_call': f"add(cube({n1}), {n2})"
        } for n1, n2 in itertools.combinations(range(1, 11), 2)
    ] + [
        # Add cubed results
        {
            'prompt': f"What do you get when you add the cube of {n1} and the cube of {n2}?",
            'function_call': f"add(cube({n1}), cube({n2}))"
        } for n1, n2 in itertools.combinations(range(1, 11), 2)
    ] + [
        # Add squared results
        {
            'prompt': f"Sum the squares of {n1} and {n2}",
            'function_call': f"add(square({n1}), square({n2}))"
        } for n1, n2 in itertools.combinations(range(1, 11), 2)
    ] + [
        # Complex combinations
        {
            'prompt': f"What is the cube of {n1} plus the square of {n2}?",
            'function_call': f"add(cube({n1}), square({n2}))"
        } for n1, n2 in itertools.combinations(range(1, 11), 2)
    ] + [
        # Greetings with numbers
        {
            'prompt': f"Greet {n1} friends. The first friend is Alice.",
            'function_call': f'greet("Alice")'
        } for n1 in range(1, 6)
    ] + [
        # Name-based examples
        {
            'prompt': f"Multiply {n1} by itself then add {n2}",
            'function_call': f"add(square({n1}), {n2})"
        } for n1, n2 in itertools.combinations(range(1, 11), 2)
    ]

    return multi_function_examples

# Function to append new examples to CSV
def append_to_csv(filename, new_examples):
    existing_prompts = set()
    with open(filename, 'r', newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        existing_prompts = {row['Prompt'] for row in reader}

    with open(filename, 'a', newline='') as csvfile:
        writer = csv.writer(csvfile)
        for example in new_examples:
            if example['prompt'] not in existing_prompts:
                writer.writerow([example['prompt'], example['function_call']])
                existing_prompts.add(example['prompt'])

# Generate and append multi-function examples to dataset
new_examples = generate_multi_function_examples()
append_to_csv('SOCOTEC_DATASET.csv', new_examples)

print(f"Added {len(new_examples)} new multi-function examples to the dataset.")

Added 275 new multi-function examples to the dataset.


In [4]:
# Load and process dataset
dataset = pd.read_csv("SOCOTEC_DATASET.csv")

# Reformat to match the Alpaca dataset structure
dataset_reformatted = dataset[['Prompt', 'Function Call']]
dataset_reformatted.columns = ['instruction', 'output']

# Save the reformatted dataset
dataset_reformatted.to_csv("SOCOTEC_DATASET_ALPACA_FORMAT.csv", index=False)

In [6]:
# Load the dataset
dataset = pd.read_csv("SOCOTEC_DATASET.csv")

# Print the first few rows of the dataset to check its structure
print(dataset.head())

# Check the column names
print("Columns in dataset:", dataset.columns)

# Ensure that the dataset has the expected 'Prompt' and 'Function Call' columns
if 'Prompt' not in dataset.columns or 'Function Call' not in dataset.columns:
    raise ValueError("Dataset does not contain the expected columns: 'Prompt' and 'Function Call'.")


                               Prompt Function Call
0             What's the square of 6?     square(6)
1           Calculate the square of 7     square(7)
2         Find the squared value of 5     square(5)
3  What do you get when you square 4?     square(4)
4             Compute the square of 3     square(3)
Columns in dataset: Index(['Prompt', 'Function Call'], dtype='object')


In [7]:
# Reformat to match the Alpaca dataset structure
dataset_reformatted = dataset[['Prompt', 'Function Call']]

# Rename columns to match Alpaca format
dataset_reformatted.columns = ['instruction', 'output']

# Save the reformatted dataset
dataset_reformatted.to_csv("SOCOTEC_DATASET_ALPACA_FORMAT.csv", index=False)


In [9]:
# Load the reformatted dataset
dataset = pd.read_csv("SOCOTEC_DATASET_ALPACA_FORMAT.csv")

# Print the first few rows to check the structure
print(dataset.head())


                          instruction     output
0             What's the square of 6?  square(6)
1           Calculate the square of 7  square(7)
2         Find the squared value of 5  square(5)
3  What do you get when you square 4?  square(4)
4             Compute the square of 3  square(3)


In [10]:
# Load the dataset (already in the correct format)
dataset = pd.read_csv("SOCOTEC_DATASET.csv")

# Check the structure to ensure it has the correct columns
if 'Prompt' not in dataset.columns or 'Function Call' not in dataset.columns:
    raise ValueError("Dataset must contain 'Prompt' and 'Function Call' columns.")

# Reformat to match the Alpaca-style format for fine-tuning (Prompt -> instruction, Function Call -> output)
dataset = dataset[['Prompt', 'Function Call']]
dataset.columns = ['instruction', 'output']  # Change column names to match fine-tuning format

# Convert to Hugging Face Dataset format
dataset = Dataset.from_pandas(dataset)


In [11]:
# Training arguments
training_args = TrainingArguments(
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUMULATION,
    warmup_steps=5,
    max_steps=TRAIN_STEPS,
    learning_rate=LEARNING_RATE,
    fp16=not is_bfloat16_supported(),
    bf16=is_bfloat16_supported(),
    logging_steps=10,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=SEED,
    output_dir=OUTPUT_DIR,
    report_to="none",
)

# Initialize Trainer
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=MAX_SEQ_LENGTH,
    dataset_num_proc=2,
    packing=False,
    args=training_args,
)

# Train model
trainer_stats = trainer.train()

# Print memory usage and training time
gpu_stats = torch.cuda.get_device_properties(0)
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
print(f"GPU: {gpu_stats.name}")
print(f"Peak memory usage: {used_memory} GB")
print(f"Training runtime: {trainer_stats.metrics['train_runtime']} seconds")

Map (num_proc=2):   0%|          | 0/136 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 136 | Num Epochs = 6
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 100
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
10,0.9776
20,0.1306
30,0.0939
40,0.0647
50,0.061
60,0.0535
70,0.0521
80,0.0475
90,0.0431
100,0.0417


GPU: NVIDIA A100-SXM4-40GB
Peak memory usage: 5.152 GB
Training runtime: 220.8786 seconds


In [12]:
# Inference function adjustments to generate only valid function calls
def generate_function_call(input_text):
    FastLanguageModel.for_inference(model)
    # Add a special instruction to ensure model only returns function call
    instruction = "Return only the function call, no additional text."

    # Prepare the input by adding instruction
    input_with_instruction = f"{instruction} {input_text}"

    # Tokenize the input
    inputs = tokenizer([input_with_instruction], return_tensors="pt").to("cuda")

    # Generate output from the model
    outputs = model.generate(**inputs, max_new_tokens=16, use_cache=True)

    # Decode the output and clean up (strip any unnecessary text after the function call)
    result = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

    # Ensure that the result only contains the function call (strip extra text if present)
    if "(" in result:  # Ensuring a valid function call is present
        return result
    else:
        return "Invalid function call"

# Example inference
input_texts = [
    "What is 2 plus 3?",
    "What is two squared plus 6?",
    "162 squared plus 1902",
    "increase 6 by 2",
    "say wassup to pete",
    "increase 3 by negative 2",
    "add 3 to 2 cubed"
]

# Test the model's response for each input
for input_text in input_texts:
    res = generate_function_call(input_text)
    print(f"Input: [{input_text}] \nGenerated Function Call: [{res}]")

Input: [What is 2 plus 3?] 
Generated Function Call: [Return only the function call, no additional text. What is 2 plus 3?
add(2, 3)]
Input: [What is two squared plus 6?] 
Generated Function Call: [Return only the function call, no additional text. What is two squared plus 6?
square(2)]
Input: [162 squared plus 1902] 
Generated Function Call: [Return only the function call, no additional text. 162 squared plus 1902 cubed

cube(1902)]
Input: [increase 6 by 2] 
Generated Function Call: [Return only the function call, no additional text. increase 6 by 2

increase(6, 2)]
Input: [say wassup to pete] 
Generated Function Call: [Invalid function call]
Input: [increase 3 by negative 2] 
Generated Function Call: [Return only the function call, no additional text. increase 3 by negative 2

increase(3, -2)]
Input: [add 3 to 2 cubed] 
Generated Function Call: [Return only the function call, no additional text. add 3 to 2 cubed

cube(2)]


In [20]:
# Assuming 'your_username' is your Hugging Face username
model.push_to_hub("etuckerman/SOCOTEC", token="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")  # Online saving
tokenizer.push_to_hub("etuckerman/SOCOTEC", token="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")  # Online saving

README.md:   0%|          | 0.00/591 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/168M [00:00<?, ?B/s]

Saved model to https://huggingface.co/etuckerman/SOCOTEC


  0%|          | 0/1 [00:00<?, ?it/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

In [None]:
# Inference
FastLanguageModel.for_inference(model)
input_text = "increase 6 by 2"
inputs = tokenizer([input_text], return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_new_tokens=16, use_cache=True)
result = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
print(f"Generated Function Call: {result}")


In [None]:
# Inference
FastLanguageModel.for_inference(model)
input_text = "say wassup to pete"
inputs = tokenizer([input_text], return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_new_tokens=16, use_cache=True)
result = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
print(f"Generated Function Call: {result}")


In [None]:
# Inference
FastLanguageModel.for_inference(model)
input_text = "increase 3 by negative 2"
inputs = tokenizer([input_text], return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_new_tokens=16, use_cache=True)
result = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
print(f"Generated Function Call: {result}")


In [None]:
# Inference
FastLanguageModel.for_inference(model)
input_text = "add 3 to 2 cubed"
inputs = tokenizer([input_text], return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_new_tokens=16, use_cache=True)
result = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
print(f"Generated Function Call: {result}")


In [None]:
from unsloth import FastLanguageModel
from transformers import AutoTokenizer
import torch


# Inference Adjustments: Make the model generate structured outputs (function calls)
def generate_function_call(input_text):
    # Add a special instruction to ensure model only returns function call
    instruction = "Return only the function call, no additional text."

    # Prepare the input by adding instruction
    input_with_instruction = f"{instruction} {input_text}"

    # Tokenize the input
    inputs = tokenizer([input_with_instruction], return_tensors="pt").to("cuda")

    # Generate output from the model
    outputs = model.generate(**inputs, max_new_tokens=16, use_cache=True)

    # Decode the output and clean up (strip any unnecessary text after the function call)
    result = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

    # Ensure that the result only contains the function call (strip extra text if present)
    if "(" in result:  # Ensuring a valid function call is present
        return result
    else:
        return "Invalid function call"

# Example inference (adjust input for testing)
input_texts = [
    "What is 2 plus 3?",
    "What is two squared plus 6?",
    "162 squared plus 1902",
    "increase 6 by 2",
    "say wassup to pete",
    "increase 3 by negative 2",
    "add 3 to 2 cubed"
]

# Test the model's response for each input
for input_text in input_texts:
    res = generate_function_call(input_text)
    print(f"Input: [{input_text}] \nGenerated Function Call: [{res}]")
