# INSTALLATION

In [1]:
%%capture
!pip install unsloth "xformers==0.0.28.post2"
# Also get the latest nightly Unsloth!
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

In [2]:
!pip install datasets



In [3]:
from unsloth import FastLanguageModel
import torch

max_seq_length = 1024 # Choose any
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [4]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/codellama-7b-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

==((====))==  Unsloth 2024.12.4: Fast Llama patching. Transformers:4.46.3.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/3.87G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/155 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.87k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/539 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

# DATASET

In [5]:
import json

# Load the JSON file
with open("deprecation_qa.json", "r") as file:
    data = json.load(file)

In [6]:
training_prompt = """You are a state of the art text to python language model. Write a short code as per the given question.
Question:
{}
Answer:
{}
"""

EOS_TOKEN = tokenizer.eos_token  # Must add EOS_TOKEN

def formatting_prompts_func(qa_pairs):
    formatted_texts = []
    for entry in qa_pairs:
        question = entry["question"]
        answer = entry["answer"]
        formatted_text = training_prompt.format(question, answer) + EOS_TOKEN
        formatted_texts.append(formatted_text)
    return {"text": formatted_texts}

# Format the data for training and unlearning
formatted_data = formatting_prompts_func(data)

In [7]:
from datasets import Dataset

# Convert formatted_data to a Hugging Face Dataset
formatted_data = {"text": formatted_data["text"]}  # Ensure proper structure
unlearning_dataset = Dataset.from_dict(formatted_data)  # Convert to HF Dataset

# UNLEARNING

In [8]:
# Step 1: Gradient Ascent for Machine Unlearning
def apply_gradient_ascent(model, dataset, tokenizer, learning_rate=1e-5, steps=10):
    """Perform gradient ascent to maximize the loss on specified data."""
    model.train()
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

    for step in range(steps):
        total_loss = 0

        for sample in dataset:
            # Tokenize inputs
            input_text = sample["text"]
            inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)
            inputs = {k: v.to(model.device) for k, v in inputs.items()}

            # Compute loss and gradients
            outputs = model(**inputs, labels=inputs["input_ids"])
            loss = outputs.loss
            loss.backward()  # Compute gradients

            # Gradient ascent: Update parameters to maximize loss
            for param in model.parameters():
                if param.grad is not None:
                    param.data += learning_rate * param.grad  # Gradient ascent

            optimizer.zero_grad()  # Reset gradients
            total_loss += loss.item()

        print(f"Step {step + 1}/{steps}, Total Loss: {total_loss}")

    return model

In [9]:
# Apply gradient ascent to unlearn deprecated data
# unlearning_dataset = Dataset.from_dict({"text": formatted_data["text"][:10]})  # Use a subset for unlearning
model = apply_gradient_ascent(model, unlearning_dataset, tokenizer)

Step 1/10, Total Loss: 50.76075458526611
Step 2/10, Total Loss: 50.76391553878784
Step 3/10, Total Loss: 50.76711583137512
Step 4/10, Total Loss: 50.77032160758972
Step 5/10, Total Loss: 50.77355420589447
Step 6/10, Total Loss: 50.77675473690033
Step 7/10, Total Loss: 50.77991187572479
Step 8/10, Total Loss: 50.783122181892395
Step 9/10, Total Loss: 50.78626096248627
Step 10/10, Total Loss: 50.78938043117523


# SAVE UNLEARNED MODEL

In [10]:
# Save the Unlearned Model
model.save_pretrained("unlearned_codellama3")
tokenizer.save_pretrained("unlearned_codellama3")

('unlearned_codellama3/tokenizer_config.json',
 'unlearned_codellama3/special_tokens_map.json',
 'unlearned_codellama3/tokenizer.model',
 'unlearned_codellama3/added_tokens.json',
 'unlearned_codellama3/tokenizer.json')

In [11]:
!zip -r unlearned_codellama3.zip /content/unlearned_codellama3


  adding: content/unlearned_codellama3/ (stored 0%)
  adding: content/unlearned_codellama3/special_tokens_map.json (deflated 76%)
  adding: content/unlearned_codellama3/tokenizer.model (deflated 55%)
  adding: content/unlearned_codellama3/config.json (deflated 53%)
  adding: content/unlearned_codellama3/model.safetensors (deflated 6%)
  adding: content/unlearned_codellama3/generation_config.json (deflated 28%)
  adding: content/unlearned_codellama3/tokenizer.json (deflated 85%)
  adding: content/unlearned_codellama3/tokenizer_config.json (deflated 78%)


# INFERENCE UNLEARNED MODEL

In [12]:
inference_prompt = """You are a state of the art text to python language model. Write a short code as per the given question.
Question:
{}
Answer:
{}
"""

EOS_TOKEN = tokenizer.eos_token  # Must add EOS_TOKEN

def formatting_prompts_func_inference(qa_pairs):
    formatted_texts = []
    for entry in qa_pairs:
        question = entry["question"]
        answer = entry["answer"]
        formatted_text = inference_prompt.format(question, answer) + EOS_TOKEN
        formatted_texts.append(formatted_text)
    return {"text": formatted_texts}

# Format the data for training and unlearning
inference_formatted_data = formatting_prompts_func_inference(data)

In [13]:
# Convert formatted_data to a Hugging Face Dataset
inference_formatted_data = {"text": inference_formatted_data["text"]}  # Ensure proper structure
unlearning_inference_dataset = Dataset.from_dict(inference_formatted_data)  # Convert to HF Dataset

In [14]:
unlearning_inference_dataset['text'][0]

'You are a state of the art text to python language model. Write a short code as per the given question.\nQuestion:\nUsage of the Oval glyph will be deprecated in favor of other elliptic glyphs or custom solutions.\nAnswer:\nfrom bokeh.plotting import figure, show\n   \n   # Example of deprecated Oval glyph usage\n   p = figure()\n   p.oval(x=[1, 2, 3], y=[1, 2, 3], width=1, height=2, angle=0.5)\n   show(p)\n</s>'

In [15]:
from unsloth import FastLanguageModel
from transformers import AutoTokenizer

# Step 1: Reload the Unlearned Model and Tokenizer
unlearned_model_path = "unlearned_codellama3"  # Path to the unlearned model directory
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=unlearned_model_path,
    max_seq_length=1024,
    dtype=torch.float16 if torch.cuda.is_available() else torch.float32,  # Set correct dtype
    load_in_4bit=True,
    device_map="auto"  # Automatically handle device placement
)

# Enable inference optimizations
FastLanguageModel.for_inference(model)  # This step is critical for Unsloth models

# Step 2: Select a Test Sample
example_no = 0  # Index of the sample to test
input_prompt = unlearning_inference_dataset['text'][example_no]

print("Input Prompt:\n", input_prompt)

# Step 3: Tokenize the Input
inputs = tokenizer(
    [input_prompt],
    return_tensors="pt",
    padding=True,
    truncation=True
).to(model.device)  # Move inputs to the model's device

# Step 4: Perform Inference
input_shape = inputs['input_ids'].shape
input_token_len = input_shape[1]  # Length of the input tokens

outputs = model.generate(
    input_ids=inputs['input_ids'],
    attention_mask=inputs['attention_mask'],
    max_new_tokens=64,
    use_cache=True
)

# Step 5: Decode the Generated Response
response = tokenizer.batch_decode(outputs[:, input_token_len:], skip_special_tokens=True)

print("Generated Response:\n", response[0])


==((====))==  Unsloth 2024.12.4: Fast Llama patching. Transformers:4.46.3.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Input Prompt:
 You are a state of the art text to python language model. Write a short code as per the given question.
Question:
Usage of the Oval glyph will be deprecated in favor of other elliptic glyphs or custom solutions.
Answer:
from bokeh.plotting import figure, show
   
   # Example of deprecated Oval glyph usage
   p = figure()
   p.oval(x=[1, 2, 3], y=[1, 2, 3], width=1, height=2, angle=0.5)
   show(p)
</s>
Generated Response:
 
    }

    public function getName()
    {
        return 'sylius_product_variant_image';
    }
} names

# CORRECT SAMPLE DATASET

In [16]:
import json
import random
# Load the JSON file
with open("updated_deprecated.json", "r") as file:
    new_data = json.load(file)

In [17]:
# Shuffle the data to ensure randomness
random.seed(42)  # Set seed for reproducibility
random.shuffle(new_data)

# Split the data (80% train, 20% validation)
split_index = int(0.8 * len(new_data))  # 80% of the total data
train_data = data[:split_index]
validation_data = data[split_index:]

# Output the sizes of the splits
print(f"Total samples: {len(new_data)}")
print(f"Training samples: {len(train_data)}")
print(f"Validation samples: {len(validation_data)}")

Total samples: 23
Training samples: 18
Validation samples: 5


In [18]:
new_training_prompt = """You are a state of the art text to python language model. Write a short code as per the given question.
Question:
{}
Answer:
{}
"""

EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
# EOS_TOKEN=""
def new_formatting_prompts_func(qa_pairs):
    formatted_texts = []

    # Process each question-answer pair
    for entry in qa_pairs:
        question = entry["question"]
        answer = entry["answer"]

        # Format the training text
        formatted_text = new_training_prompt.format(question, answer) + EOS_TOKEN
        formatted_texts.append(formatted_text)

    return {"text": formatted_texts}

In [19]:
# Format the data
new_formatted_data_train = new_formatting_prompts_func(train_data)

In [20]:
# Convert formatted_data to a Hugging Face Dataset
new_formatted_data_train = {"text": new_formatted_data_train["text"]}  # Ensure proper structure
train_dataset = Dataset.from_dict(new_formatted_data_train)  # Convert to HF Dataset

In [21]:
print(train_dataset)

Dataset({
    features: ['text'],
    num_rows: 18
})


# FINETUNING UNLEARNED MODEL

## Load model and wrap with LoRA adapters

In [22]:
# Step 2: Fine-Tuning the Model
model = FastLanguageModel.get_peft_model(
    model,
    r=32,  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha=32,
    lora_dropout=0,  # Supports any, but = 0 is optimized
    bias="none",  # Supports any, but = "none" is optimized
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
    random_state=3407,
    use_rslora=False,  # We support rank stabilized LoRA
    loftq_config=None,  # And LoftQ
)

Unsloth 2024.12.4 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


# TRAINING

In [23]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 6,
        warmup_steps = 5,
        num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 500,
        save_steps=5,               # Save model every 20 steps
        save_total_limit=3,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)

Map (num_proc=2):   0%|          | 0/18 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [24]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 18 | Num Epochs = 500
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 6
\        /    Total batch size = 12 | Total steps = 500
 "-____-"     Number of trainable parameters = 79,953,920


Step,Training Loss
1,2.0106
2,2.9844
3,3.3068
4,1.8534
5,2.7572
6,2.6599
7,1.4708
8,1.924
9,1.9991
10,1.0557


In [25]:
import shutil
import os
from IPython.display import FileLink


model_name="codellamma_relearned_003"
# Save the model and tokenizer locally in the Kaggle environment
model.save_pretrained(model_name)
tokenizer.save_pretrained(model_name)

# Compress the saved model directory into a ZIP file
shutil.make_archive(model_name, 'zip', model_name)

'/content/codellamma_relearned_003.zip'

# INFERENCE

# INFERENCE PROMPT



In [26]:
inference_prompt = """You are a state of the art text to python language model. Write a short code as per the given question.
Question:
{}
Answer:
{}
"""

EOS_TOKEN = tokenizer.eos_token  # Must add EOS_TOKEN

def formatting_prompts_func_inference(qa_pairs):
    formatted_texts = []
    for entry in qa_pairs:
        question = entry["question"]
        answer = entry["answer"]
        formatted_text = training_prompt.format(question, answer) + EOS_TOKEN
        formatted_texts.append(formatted_text)
    return {"text": formatted_texts}

In [27]:
# Format the data
formatted_data_validation = formatting_prompts_func_inference(validation_data)

# Output the formatted text for validation
for text in formatted_data_validation["text"][:2]:  # Display the first 2 formatted samples
    print(text)
    print("=" * 50)  # Separator for readability

You are a state of the art text to python language model. Write a short code as per the given question.
Question:
Darkening a color for visual adjustments in plots.
Answer:
from bokeh.colors import RGB
    color = RGB(255, 0, 0) #red
    darker_color = color.darken(0.2)
    print(darker_color)
</s>
You are a state of the art text to python language model. Write a short code as per the given question.
Question:
Using a custom function to format tick labels on an axis.
Answer:
from bokeh.models import FuncTickFormatter
    from bokeh.plotting import figure, show

    def custom_format(x):
        return f"{x:.2f} units"

    formatter = FuncTickFormatter(code="return " + custom_format)
    
    p = figure(x_range=(0, 10), y_range=(0,10))
    p.xaxis.formatter = formatter
    show(p)
</s>


In [28]:
# Convert formatted_data to a Hugging Face Dataset
formatted_data_validation = {"text": formatted_data_validation["text"]}  # Ensure proper structure
validation_dataset = Dataset.from_dict(formatted_data_validation)  # Convert to HF Dataset

In [29]:
print(validation_dataset)

Dataset({
    features: ['text'],
    num_rows: 5
})


In [30]:
# Running inference on single validation sample

FastLanguageModel.for_inference(model) # Enable native 2x faster inference

example_no=0

input_prompt=validation_dataset['text'][example_no]

print("Input Prompt:\n", input_prompt)
inputs = tokenizer(
[
    input_prompt
], return_tensors = "pt").to("cuda")

input_shape = inputs['input_ids'].shape
input_token_len = input_shape[1] # 1 because of batch
outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
# you can get the whole generated text by uncommenting the below line
# text_generated = tokenizer.batch_decode([outputs, skip_special_tokens=True)

response = tokenizer.batch_decode([outputs[0][input_token_len:]], skip_special_tokens=True)
print("Generated Response:\n", response[0])

Input Prompt:
 You are a state of the art text to python language model. Write a short code as per the given question.
Question:
Darkening a color for visual adjustments in plots.
Answer:
from bokeh.colors import RGB
    color = RGB(255, 0, 0) #red
    darker_color = color.darken(0.2)
    print(darker_color)
</s>
Generated Response:
 
    }
}
 ﻿using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace _01.School_classes
{
    public class Student



In [31]:
## Running inference in full Validation set

final_response = []
correct_predictions = 0  # Initialize correct predictions count
for i in range(len(validation_dataset)):
    FastLanguageModel.for_inference(model)
    input_prompt=validation_dataset['text'][i]
    inputs = tokenizer([
          input_prompt
      ], return_tensors = "pt").to("cuda")

    input_shape = inputs['input_ids'].shape
    input_token_len = input_shape[1] # 1 because of batch
    outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
    response = tokenizer.batch_decode([outputs[0][input_token_len:]], skip_special_tokens=True)
    final_response.append(response[0])

# CODEBERT

In [32]:
from transformers import RobertaTokenizer, RobertaModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

# Load pretrained CodeBERT model
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
model = RobertaModel.from_pretrained("microsoft/codebert-base")


# # Ensure ground_truth and predictions are of equal length
# assert len(references) == len(final_response), "Mismatch in ground_truth and predictions list lengths"

# Function to compute CodeBERT-based similarity
def compute_similarity(code1, code2):
    # Tokenize the code snippets
    tokens_code1 = tokenizer(code1, return_tensors="pt", padding=True, truncation=True)
    tokens_code2 = tokenizer(code2, return_tensors="pt", padding=True, truncation=True)

    # Get embeddings
    with torch.no_grad():
        embeddings_code1 = model(**tokens_code1).last_hidden_state.mean(dim=1).numpy()
        embeddings_code2 = model(**tokens_code2).last_hidden_state.mean(dim=1).numpy()

    # Compute cosine similarity
    return cosine_similarity(embeddings_code1, embeddings_code2)[0][0]

# Compute similarities for all pairs
similarities = []
for gt, pred in zip(validation_data, final_response):
    similarity = compute_similarity(gt["answer"], pred)
    similarities.append(similarity)
    print(f"Code Similarity (CodeBERT): \nSimilarity: {similarity:.4f}\n")

# Print average similarity
average_similarity = sum(similarities) / len(similarities)
print(f"Average Code Similarity (CodeBERT): {average_similarity:.4f}")


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Code Similarity (CodeBERT): 
Similarity: 0.9182

Code Similarity (CodeBERT): 
Similarity: 0.9512

Code Similarity (CodeBERT): 
Similarity: 0.9712

Code Similarity (CodeBERT): 
Similarity: 0.9533

Code Similarity (CodeBERT): 
Similarity: 0.9645

Average Code Similarity (CodeBERT): 0.9517


### LLM AS A JUDGE (GOOGLE FLAN)

In [33]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

# Load a "judge" model. Here we use FLAN-T5 for demonstration.
# You can use a more specialized model if available.
judge_model_name = "google/flan-t5-base"
judge_tokenizer = AutoTokenizer.from_pretrained(judge_model_name)
judge_model = AutoModelForSeq2SeqLM.from_pretrained(judge_model_name)
judge = pipeline("text2text-generation", model=judge_model, tokenizer=judge_tokenizer)

evaluation_prompts = []
for entry, pred in zip(validation_data, final_response):
    question = entry["question"]
    reference_code = entry["answer"]
    candidate_code = pred
    prompt = f"""
You are a code reviewer. Review the following:

Question:
{question}

Reference Answer (Correct Code):
{reference_code}

Candidate Answer (Generated Code):
{candidate_code}

Please provide an assessment if the candidate code correctly solves the question. Rate it on a scale of 1 to 10, and explain your reasoning.
"""
    evaluation_prompts.append(prompt.strip())

scores = []
for prompt in evaluation_prompts:
    judge_result = judge(prompt, max_length=256, num_return_sequences=1)[0]['generated_text']
    # Simple parsing heuristic: look for a digit in the output
    # This can be improved with regex or more careful parsing.
    import re
    match = re.search(r"\b(\d{1,2})\b", judge_result)
    if match:
        score = int(match.group(1))
        # Ensure score is within 1-10
        if 1 <= score <= 10:
            scores.append(score)
        else:
            scores.append(0)
    else:
        scores.append(0)

average_score = sum(scores)/len(scores)
print("Average LLM Judge Score:", average_score)


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Average LLM Judge Score: 1.0
