 ## INSTALLATION

In [1]:
%%capture
!pip install unsloth "xformers==0.0.28.post2"
# Also get the latest nightly Unsloth!
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

In [2]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 1024 # Choose any
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

ðŸ¦¥ Unsloth: Will patch your computer to enable 2x faster free finetuning.
ðŸ¦¥ Unsloth Zoo will now patch everything to make training faster!


In [3]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/codellama-7b-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

==((====))==  Unsloth 2024.12.4: Fast Llama patching. Transformers:4.47.0.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/3.87G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/155 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.87k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/539 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

In [4]:
import json
import random
# List of JSON file paths
file_paths = ["generated_code_sample_data_01.json", "generated_code_sample_data_02.json", "generated_code_sample_data_03.json", "generated_code_sample_data_04.json"]

# Initialize an empty list to store combined data
data = []

# Iterate through the file paths and load data
for file_path in file_paths:
    with open(file_path, "r") as file:
        file_data = json.load(file)
        data.extend(file_data)  # Extend the list with the contents of each file

# Shuffle the data to ensure randomness
random.seed(42)  # Set seed for reproducibility
random.shuffle(data)

# Split the data (80% train, 20% validation)
split_index = int(0.8 * len(data))  # 80% of the total data
train_data = data[:split_index]
validation_data = data[split_index:]

# Output the sizes of the splits
print(f"Total samples: {len(data)}")
print(f"Training samples: {len(train_data)}")
print(f"Validation samples: {len(validation_data)}")

Total samples: 219
Training samples: 175
Validation samples: 44


In [5]:
training_prompt = """You are a state of the art text to python language model. Write a short code as per the given question.
Question:
{}
Answer:
{}
"""

EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
# EOS_TOKEN=""
def formatting_prompts_func(qa_pairs):
    formatted_texts = []

    # Process each question-answer pair
    for entry in qa_pairs:
        question = entry["usecase"]
        answer = entry["response2"]

        # Format the training text
        formatted_text = training_prompt.format(question, answer) + EOS_TOKEN
        formatted_texts.append(formatted_text)

    return {"text": formatted_texts}

## Dataset

In [6]:
# Format the data
formatted_data_train = formatting_prompts_func(train_data)

# Output the formatted text for training
for text in formatted_data_train["text"][:2]:  # Display the first 5 formatted samples
    print(text)
    print("=" * 50)  # Separator for readability

You are a state of the art text to python language model. Write a short code as per the given question.
Question:
Build a system for tracking food production, transportation, and distribution with interactive maps and charts.
Answer:
```python
# Installation Instructions:
#
# If you don't have the necessary libraries installed, run these commands in your terminal:
#
# pip install bokeh pandas numpy
#

import json
import numpy as np
import pandas as pd

from bokeh.io import show
from bokeh.layouts import row, column
from bokeh.models import GeoJSONDataSource, HoverTool, ColumnDataSource
from bokeh.plotting import figure
from bokeh.tile_providers import get_provider, WMTSTileSource
from bokeh.palettes import Category20c
from bokeh.transform import factor_cmap
from bokeh.models import Range1d

# 1. Data Generation
try:
    # Generate random locations (latitude and longitude)
    np.random.seed(42)  # for reproducibility
    n_locations = 5
    latitudes = np.random.uniform(30, 40, n_locat

In [7]:
!pip install datasets



In [8]:
from datasets import Dataset

# Convert formatted_data to a Hugging Face Dataset
formatted_data_train = {"text": formatted_data_train["text"]}  # Ensure proper structure
train_dataset = Dataset.from_dict(formatted_data_train)  # Convert to HF Dataset

In [9]:
print(train_dataset)

Dataset({
    features: ['text'],
    num_rows: 175
})


## Load model and wrap with LoRA adapters

In [10]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 32, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 32,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.12.4 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


## SFT

In [11]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 6,
        warmup_steps = 5,
        num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 1000,
        save_steps=5,               # Save model every 20 steps
        save_total_limit=3,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)

Map (num_proc=2):   0%|          | 0/175 [00:00<?, ? examples/s]

## TRAINING

In [12]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 175 | Num Epochs = 72
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 6
\        /    Total batch size = 12 | Total steps = 1,000
 "-____-"     Number of trainable parameters = 79,953,920


Step,Training Loss
1,4.6842
2,5.1128
3,4.7211
4,4.7762
5,4.8075
6,4.7293
7,4.2835
8,4.5826
9,3.854
10,3.8664


## SAVING MODEL

In [13]:
import shutil
import os
from IPython.display import FileLink


model_name="7bmodel_001"
# Save the model and tokenizer locally in the Kaggle environment
model.save_pretrained(model_name)
tokenizer.save_pretrained(model_name)

# Compress the saved model directory into a ZIP file
shutil.make_archive(model_name, 'zip', model_name)

'/content/7bmodel_001.zip'

## INFERENCE

## INFERENCE PROMPT

In [14]:
inference_prompt = """You are a state of the art text to python language model. Write a short code as per the given question.
Question:
{}
Answer:
{}
"""

EOS_TOKEN = tokenizer.eos_token  # Must add EOS_TOKEN

def formatting_prompts_func_inference(qa_pairs):
    formatted_texts = []
    for entry in qa_pairs:
        question = entry["usecase"]
        answer = entry["response2"]
        formatted_text = inference_prompt.format(question, answer) + EOS_TOKEN
        formatted_texts.append(formatted_text)
    return {"text": formatted_texts}

# Format the data for training and unlearning
inference_formatted_data = formatting_prompts_func_inference(data)

## VALIDATION DATASET


In [15]:
# Format the data
formatted_data_validation = formatting_prompts_func_inference(validation_data)

# Output the formatted text for validation
for text in formatted_data_validation["text"][:2]:  # Display the first 2 formatted samples
    print(text)
    print("=" * 50)  # Separator for readability

You are a state of the art text to python language model. Write a short code as per the given question.
Question:
Develop a dashboard for tracking the progress of clinical trials with interactive charts and data analysis.
Answer:
```python
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from bokeh.plotting import figure, curdoc
from bokeh.layouts import row, column
from bokeh.models import ColumnDataSource, HoverTool, NumeralTickFormatter, Range1d
from bokeh.palettes import Category20c

# Generate Sample Data
np.random.seed(42)
num_trials = 10
stages = ['Phase I', 'Phase II', 'Phase III']
now = datetime.now()
start_dates = [now - timedelta(days=np.random.randint(30, 365)) for _ in range(num_trials)]

data = {
    'trial_id': [f'Trial {i+1}' for i in range(num_trials)],
    'stage': np.random.choice(stages, num_trials),
    'enrollment_target': np.random.randint(50, 500, num_trials),
    'enrollment_actual': np.random.randint(10, 400, num_trials),
    'st

In [16]:
# Convert formatted_data to a Hugging Face Dataset
formatted_data_validation = {"text": formatted_data_validation["text"]}  # Ensure proper structure
validation_dataset = Dataset.from_dict(formatted_data_validation)  # Convert to HF Dataset

In [17]:
print(validation_dataset)

Dataset({
    features: ['text'],
    num_rows: 44
})


In [18]:
# #print a smaple training example
validation_dataset['text'][0]

'You are a state of the art text to python language model. Write a short code as per the given question.\nQuestion:\nDevelop a dashboard for tracking the progress of clinical trials with interactive charts and data analysis.\nAnswer:\n```python\nimport pandas as pd\nimport numpy as np\nfrom datetime import datetime, timedelta\nfrom bokeh.plotting import figure, curdoc\nfrom bokeh.layouts import row, column\nfrom bokeh.models import ColumnDataSource, HoverTool, NumeralTickFormatter, Range1d\nfrom bokeh.palettes import Category20c\n\n# Generate Sample Data\nnp.random.seed(42)\nnum_trials = 10\nstages = [\'Phase I\', \'Phase II\', \'Phase III\']\nnow = datetime.now()\nstart_dates = [now - timedelta(days=np.random.randint(30, 365)) for _ in range(num_trials)]\n\ndata = {\n    \'trial_id\': [f\'Trial {i+1}\' for i in range(num_trials)],\n    \'stage\': np.random.choice(stages, num_trials),\n    \'enrollment_target\': np.random.randint(50, 500, num_trials),\n    \'enrollment_actual\': np.ran

In [19]:
# Running inference on single validation sample

FastLanguageModel.for_inference(model) # Enable native 2x faster inference

example_no=0

input_prompt=validation_dataset['text'][example_no]

print("Input Prompt:\n", input_prompt)
inputs = tokenizer(
[
    input_prompt
], return_tensors = "pt").to("cuda")

input_shape = inputs['input_ids'].shape
input_token_len = input_shape[1] # 1 because of batch
outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
# you can get the whole generated text by uncommenting the below line
# text_generated = tokenizer.batch_decode([outputs, skip_special_tokens=True)

response = tokenizer.batch_decode([outputs[0][input_token_len:]], skip_special_tokens=True)
print("Generated Response:\n", response[0])

Input Prompt:
 You are a state of the art text to python language model. Write a short code as per the given question.
Question:
Develop a dashboard for tracking the progress of clinical trials with interactive charts and data analysis.
Answer:
```python
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from bokeh.plotting import figure, curdoc
from bokeh.layouts import row, column
from bokeh.models import ColumnDataSource, HoverTool, NumeralTickFormatter, Range1d
from bokeh.palettes import Category20c

# Generate Sample Data
np.random.seed(42)
num_trials = 10
stages = ['Phase I', 'Phase II', 'Phase III']
now = datetime.now()
start_dates = [now - timedelta(days=np.random.randint(30, 365)) for _ in range(num_trials)]

data = {
    'trial_id': [f'Trial {i+1}' for i in range(num_trials)],
    'stage': np.random.choice(stages, num_trials),
    'enrollment_target': np.random.randint(50, 500, num_trials),
    'enrollment_actual': np.random.randint(10, 400, num_t

In [20]:
## Running inference in full Validation set

final_response = []
correct_predictions = 0  # Initialize correct predictions count
for i in range(len(validation_dataset)):
    FastLanguageModel.for_inference(model)
    input_prompt=validation_dataset['text'][i]
    inputs = tokenizer([
          input_prompt
      ], return_tensors = "pt").to("cuda")

    input_shape = inputs['input_ids'].shape
    input_token_len = input_shape[1] # 1 because of batch
    outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
    response = tokenizer.batch_decode([outputs[0][input_token_len:]], skip_special_tokens=True)
    final_response.append(response[0])

# CODEBERT

In [21]:
from transformers import RobertaTokenizer, RobertaModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

# Load pretrained CodeBERT model
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
model = RobertaModel.from_pretrained("microsoft/codebert-base")


# # Ensure ground_truth and predictions are of equal length
# assert len(references) == len(final_response), "Mismatch in ground_truth and predictions list lengths"

# Function to compute CodeBERT-based similarity
def compute_similarity(code1, code2):
    # Tokenize the code snippets
    tokens_code1 = tokenizer(code1, return_tensors="pt", padding=True, truncation=True)
    tokens_code2 = tokenizer(code2, return_tensors="pt", padding=True, truncation=True)

    # Get embeddings
    with torch.no_grad():
        embeddings_code1 = model(**tokens_code1).last_hidden_state.mean(dim=1).numpy()
        embeddings_code2 = model(**tokens_code2).last_hidden_state.mean(dim=1).numpy()

    # Compute cosine similarity
    return cosine_similarity(embeddings_code1, embeddings_code2)[0][0]

# Compute similarities for all pairs
similarities = []
for gt, pred in zip(validation_data, final_response):
    similarity = compute_similarity(gt["response2"], pred)
    similarities.append(similarity)
    print(f"Code Similarity (CodeBERT): \nSimilarity: {similarity:.4f}\n")

# Print average similarity
average_similarity = sum(similarities) / len(similarities)
print(f"Average Code Similarity (CodeBERT): {average_similarity:.4f}")


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Code Similarity (CodeBERT): 
Similarity: 0.9205

Code Similarity (CodeBERT): 
Similarity: 0.9264

Code Similarity (CodeBERT): 
Similarity: 0.9366

Code Similarity (CodeBERT): 
Similarity: 0.9239

Code Similarity (CodeBERT): 
Similarity: 0.9346

Code Similarity (CodeBERT): 
Similarity: 0.9214

Code Similarity (CodeBERT): 
Similarity: 0.9287

Code Similarity (CodeBERT): 
Similarity: 0.9046

Code Similarity (CodeBERT): 
Similarity: 0.9072

Code Similarity (CodeBERT): 
Similarity: 0.9214

Code Similarity (CodeBERT): 
Similarity: 0.9308

Code Similarity (CodeBERT): 
Similarity: 0.9315

Code Similarity (CodeBERT): 
Similarity: 0.9445

Code Similarity (CodeBERT): 
Similarity: 0.8902

Code Similarity (CodeBERT): 
Similarity: 0.9336

Code Similarity (CodeBERT): 
Similarity: 0.9303

Code Similarity (CodeBERT): 
Similarity: 0.9201

Code Similarity (CodeBERT): 
Similarity: 0.9352

Code Similarity (CodeBERT): 
Similarity: 0.9112

Code Similarity (CodeBERT): 
Similarity: 0.9214

Code Similarity (Cod

# LLM AS A JUDGE (GOOGLE FLAN)

In [22]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

# Load a "judge" model. Here we use FLAN-T5 for demonstration.
# You can use a more specialized model if available.
judge_model_name = "google/flan-t5-base"
judge_tokenizer = AutoTokenizer.from_pretrained(judge_model_name)
judge_model = AutoModelForSeq2SeqLM.from_pretrained(judge_model_name)
judge = pipeline("text2text-generation", model=judge_model, tokenizer=judge_tokenizer)

evaluation_prompts = []
for entry, pred in zip(validation_data, final_response):
    question = entry["usecase"]
    reference_code = entry["response2"]
    candidate_code = pred
    prompt = f"""
You are a code reviewer. Review the following:

Question:
{question}

Reference Answer (Correct Code):
{reference_code}

Candidate Answer (Generated Code):
{candidate_code}

Please provide an assessment if the candidate code correctly solves the question. Rate it on a scale of 1 to 10, and explain your reasoning.
"""
    evaluation_prompts.append(prompt.strip())

scores = []
for prompt in evaluation_prompts:
    judge_result = judge(prompt, max_length=256, num_return_sequences=1)[0]['generated_text']
    # Simple parsing heuristic: look for a digit in the output
    # This can be improved with regex or more careful parsing.
    import re
    match = re.search(r"\b(\d{1,2})\b", judge_result)
    if match:
        score = int(match.group(1))
        # Ensure score is within 1-10
        if 1 <= score <= 10:
            scores.append(score)
        else:
            scores.append(0)
    else:
        scores.append(0)

average_score = sum(scores)/len(scores)
print("Average LLM Judge Score:", average_score)


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Device set to use cuda:0
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Average LLM Judge Score: 1.0


In [23]:
len(scores)

44