In [None]:
!pip install -q accelerate==0.26.0 peft==0.4.0 bitsandbytes>=0.41.3 trl==0.4.7 openai==0.28 fastapi

In [None]:
# Attach Google Drive to the Colab VM; the model files are loaded from it
import os
from google.colab import drive

# Directory inside the VM under which Drive is mounted
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Extra packages for scoring: `evaluate` is imported in the imports cell below;
# `rouge-score` is presumably the backend its 'rouge' metric needs -- TODO confirm.
# NOTE(review): unlike the first install cell, no versions are pinned here.
!pip install evaluate
!pip install rouge-score



# Importing modules

In [None]:
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from peft import PeftModel
from datasets import load_dataset, Dataset
import csv
import evaluate
from tqdm import tqdm
import pandas as pd
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

In [None]:
import os
from getpass import getpass
from huggingface_hub import login

# Authenticate with the Hugging Face Hub.
#
# SECURITY: an access token used to be hard-coded in this cell. Any token that
# has been saved in a notebook should be treated as leaked and revoked on the
# Hub. The token is now taken from the HF_TOKEN environment variable, falling
# back to a hidden interactive prompt when the variable is unset or empty.
huggingface_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
login(token=huggingface_token)

In [None]:
# Folder on the mounted Drive that is later passed to PeftModel.from_pretrained
model_path_mistral = "/content/drive/MyDrive/Models_LLM_Project/mistral-7b-lora-it-code-optimize"

# Quantisation settings: 4-bit NF4 weights with double quantisation,
# computing in float16.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
    load_in_4bit=True,
)

# Base checkpoint, loaded with the quantisation settings above; device
# placement is left to device_map="auto".
base_model_mistral = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.1",
    quantization_config=bnb_config,
    torch_dtype=torch.float16,
    device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Tokenizer for the base checkpoint; use_fast=False requests the Python ("slow") implementation.
# NOTE(review): `tokenizer_mistral` is reassigned later, in the cell that loads the
# adapter from model_path_mistral, so this instance only matters if that cell is skipped.
tokenizer_mistral = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", use_fast=False)

In [None]:
def format_example(example):
    """
    Build the instruction-style prompt for one dataset record.

    Args:
        example: Mapping with an optional 'prompt' entry holding the text to
            review. A missing or None 'prompt' is treated as an empty string.

    Returns:
        str: The prompt template with the whitespace-stripped 'prompt' text
        inserted under "### Prompt:", ending after the "### Response:" header.
    """
    # `or ''` also covers a key that is present but None, which
    # `.get('prompt', '')` alone would hand to `.strip()` and crash on.
    prompt = (example.get('prompt') or '').strip()

    # NOTE(review): step "2." appears twice in the instructions below. The text
    # is deliberately left as-is because it is the exact input the model
    # receives; presumably the adapter was fine-tuned on this wording --
    # confirm before renumbering.
    formatted_string = f"""
    ### Task:
    Analyze the provided problem and optimize the given code. Include constructive feedback and an improved version of the code.

    ### Prompt:
    {prompt}

    ### Instructions:
    1. Read and understand the original code and its problem statement.
    2. Begin the response with the original code starting with "Original:"
    2. Provide a critique of the code, highlighting inefficiencies and areas for improvement.
    3. Rewrite the code to address the critique.
    4. Begin your critique with "CRITIQUE:" and your revised code with "REVISED:".

    ### Response:

    """

    return formatted_string


In [None]:
# Wrap the quantised base model with the adapter stored at model_path_mistral.
model_mistral = PeftModel.from_pretrained(base_model_mistral, model_path_mistral, device_map="auto")

# Load the tokenizer
# (this reassigns tokenizer_mistral, replacing the one loaded from the base checkpoint)
tokenizer_mistral = AutoTokenizer.from_pretrained(model_path_mistral, use_fast=False)

# Set padding token if not already set
# (the end-of-sequence token is reused as the padding token)
if not tokenizer_mistral.pad_token:
    tokenizer_mistral.pad_token = tokenizer_mistral.eos_token

# Set model to evaluation mode
# eval() is the cell's last expression, so its return value (the model) is
# also what the cell displays.
model_mistral.eval()


  adapters_weights = torch.load(


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32000, 4096)
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralSdpaAttention(
              (q_proj): Linear4bit(
                in_features=4096, out_features=4096, bias=False
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=32, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=32, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): Linear4bit(
                in_features=4096, out_features=1024, bias=False
 

In [None]:
def generate_response_qualitative(prompt):
    """
    Generate the model's completion for a prompt (used for the qualitative samples).

    Uses the same sampling settings as generate_response.

    Args:
        prompt (str): Full prompt text, e.g. the output of format_example.

    Returns:
        str: The newly generated text only (prompt excluded), decoded with
        special tokens skipped and surrounding whitespace stripped.
    """
    inputs = tokenizer_mistral(prompt, return_tensors="pt").to('cuda')
    prompt_token_count = inputs["input_ids"].shape[1]

    outputs = model_mistral.generate(
        **inputs,
        max_new_tokens=512,
        temperature=0.2,
        top_p=0.9,
        do_sample=True,
        repetition_penalty=1.2,
        eos_token_id=tokenizer_mistral.eos_token_id,
        # Passing the pad id explicitly avoids the per-call
        # "Setting `pad_token_id` to `eos_token_id`" fallback notice.
        pad_token_id=tokenizer_mistral.pad_token_id,
    )

    # The returned sequence is the prompt tokens followed by the continuation.
    # Dropping the prompt by token count is exact; slicing the decoded text by
    # len(prompt) only works if decoding reproduces the prompt character for
    # character, which tokenizers do not guarantee.
    new_tokens = outputs[0][prompt_token_count:]
    response = tokenizer_mistral.decode(new_tokens, skip_special_tokens=True).strip()
    return response

In [None]:
def generate_response(prompt):
    """
    Generate the model's completion for a prompt.

    Args:
        prompt (str): Full prompt text, e.g. the output of format_example.

    Returns:
        str: The newly generated text only (prompt excluded), decoded with
        special tokens skipped and surrounding whitespace stripped.
    """
    inputs = tokenizer_mistral(prompt, return_tensors="pt").to('cuda')
    prompt_token_count = inputs["input_ids"].shape[1]

    outputs = model_mistral.generate(
        **inputs,
        max_new_tokens=512,
        temperature=0.2,
        top_p=0.9,
        do_sample=True,
        repetition_penalty=1.2,
        eos_token_id=tokenizer_mistral.eos_token_id,
        # Passing the pad id explicitly avoids the per-call
        # "Setting `pad_token_id` to `eos_token_id`" fallback notice.
        pad_token_id=tokenizer_mistral.pad_token_id,
    )

    # The returned sequence is the prompt tokens followed by the continuation.
    # Dropping the prompt by token count is exact; slicing the decoded text by
    # len(prompt) only works if decoding reproduces the prompt character for
    # character, which tokenizers do not guarantee.
    new_tokens = outputs[0][prompt_token_count:]
    response = tokenizer_mistral.decode(new_tokens, skip_special_tokens=True).strip()
    return response

# Loading the dataset

In [None]:
# Seed shared by the shuffle and the split, and the share held out for testing
SPLIT_SEED = 42
TEST_FRACTION = 0.2

# Fetch the dataset and work from its 'train' split
code_optimization_dataset = load_dataset("Dahoas/code-review-instruct-critique-revision-python")
dataset = code_optimization_dataset['train']

# Shuffle first, then split off the test portion
shuffled_dataset = dataset.shuffle(seed=SPLIT_SEED)
split_dataset = shuffled_dataset.train_test_split(seed=SPLIT_SEED, test_size=TEST_FRACTION)


In [None]:
# Pull both partitions out of the split in one step
train_dataset, test_dataset = (split_dataset[part] for part in ("train", "test"))

# Report how many examples ended up in each partition
for label, partition in (("Train size:", train_dataset), ("Test size:", test_dataset)):
    print(label, len(partition))

Train size: 7569
Test size: 1893


In [None]:
# NOTE(review): repeats the install cell near the top of the notebook; `evaluate`
# has already been imported by the imports cell by the time this runs.
!pip install evaluate
!pip install rouge-score



In [None]:
# Evaluation settings
NUM_EVAL_SAMPLES = 10   # generation ran at roughly 45-55 s per example in the recorded run
TOP_K_EXPORT = 10       # rows written to the "top 10" CSV
GENERATION_SEED = 42

# Load the ROUGE metric
rouge = evaluate.load('rouge')

# generate_response samples (do_sample=True), so seed torch's RNG to keep the
# generated text from changing between runs for that reason alone.
torch.manual_seed(GENERATION_SEED)

# First NUM_EVAL_SAMPLES test examples; capped so a smaller test split does not
# make select() fail on out-of-range indices.
eval_subset = test_dataset.select(range(min(NUM_EVAL_SAMPLES, len(test_dataset))))

# One record per example: prompt, reference, prediction and its ROUGE scores
results = []

# Generate predictions and calculate ROUGE scores
print("Evaluating model...")
for sample in tqdm(eval_subset, desc="Processing Test Set"):
    prompt = format_example(sample)
    reference = sample['response']

    # Generate model prediction
    prediction = generate_response(prompt)

    # Compute ROUGE scores for this single prediction/reference pair
    scores = rouge.compute(predictions=[prediction], references=[reference])

    results.append({
        "Prompt": prompt,
        "Reference": reference,
        "Prediction": prediction,
        "ROUGE-1": scores["rouge1"],
        "ROUGE-2": scores["rouge2"],
        "ROUGE-L": scores["rougeL"],
        "ROUGE-Lsum": scores["rougeLsum"]
    })

# Create a DataFrame for export
results_df = pd.DataFrame(results)

# Sort results by ROUGE-Lsum, best first
results_sorted = results_df.sort_values(by="ROUGE-Lsum", ascending=False)

# Export all results to a CSV file
output_file = "model_evaluation_results.csv"
results_sorted.to_csv(output_file, index=False)
print(f"Results exported to {output_file}")

# Export the best-scoring rows separately for presentation
top_10_results = results_sorted.head(TOP_K_EXPORT)
top_10_output_file = "top_10_model_evaluation_results.csv"
top_10_results.to_csv(top_10_output_file, index=False)

print(f"Top 10 results exported to {top_10_output_file}")

Evaluating model...


Processing Test Set:   0%|          | 0/10 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Test Set:  10%|█         | 1/10 [00:55<08:17, 55.23s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Test Set:  20%|██        | 2/10 [01:42<06:45, 50.66s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Test Set:  30%|███       | 3/10 [02:18<05:08, 44.06s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Test Set:  40%|████      | 4/10 [03:13<04:48, 48.10s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Test Set:  50%|█████     | 5/10 [03:47<03:36, 43.23s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Test Set:  60%|██████    | 6/10 [04:42<03:08, 47.03s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing Test Set:  70%|███████   | 7/10

Results exported to model_evaluation_results.csv
Top 10 results exported to top_10_model_evaluation_results.csv





In [None]:
# Display the per-example results table (prompt, reference, prediction, ROUGE scores)
results_df

Unnamed: 0,Prompt,Reference,Prediction,ROUGE-1,ROUGE-2,ROUGE-L,ROUGE-Lsum
0,\n ### Task:\n Analyze the provided prob...,\nOriginal: \nclass Game:\n\n _rolls = [0] * ...,Original: \nclass Game:\n\n _rolls = [0] * 21...,0.585448,0.556876,0.568528,0.57868
1,\n ### Task:\n Analyze the provided prob...,\nOriginal: \ndef MinimumSwaps(Queue):\n ...,Original: \n<pre><code>def MinimumSwaps(Queue)...,0.802168,0.757493,0.791328,0.791328
2,\n ### Task:\n Analyze the provided prob...,"\nORIGINAL: \n\ndef finder(arr1, arr2):\n l =...","Original: \ndef finder(arr1, arr2):\n l = len...",0.515254,0.334471,0.433898,0.474576
3,\n ### Task:\n Analyze the provided prob...,\nORIGINAL: \nimport time\nimport csv\nfrom pp...,Original: \n<pre><code>import time\nimport csv...,0.393909,0.388606,0.391878,0.393909
4,\n ### Task:\n Analyze the provided prob...,\nORIGINAL: \ndef movie_version_map():\n mo...,Original: \n\ndef movie_version_map():\n mo...,0.839695,0.769231,0.816794,0.824427
5,\n ### Task:\n Analyze the provided prob...,ORIGINAL: \nfor i in range(len(employeesChose...,Original: \nfor i in range(len(employeesChosen...,0.404651,0.403263,0.404651,0.404651
6,\n ### Task:\n Analyze the provided prob...,\nOriginal: \n<pre><code>import random;\n\ndef...,Original: \n<pre><code>import random;\n\ndef g...,0.786749,0.694387,0.778468,0.782609
7,\n ### Task:\n Analyze the provided prob...,\nORIGINAL: \ndef sum13(nums):\n sum = 0\n f...,Original: \ndef sum13(nums):\n sum = 0\n for...,0.604878,0.551724,0.585366,0.585366
8,\n ### Task:\n Analyze the provided prob...,\nORIGINAL: \n\nimport re\n\n# either of the f...,Original: \nimport re\n\n# either of the follo...,0.904459,0.878205,0.904459,0.904459
9,\n ### Task:\n Analyze the provided prob...,\nOriginal: \n<pre><code>def composite_count(l...,Original: \n\ndef composite_count(limit):\n ...,0.595136,0.545194,0.572246,0.575107


In [None]:
# Rebuild the results table from the collected records
results_df = pd.DataFrame(results)

# Mean of each ROUGE column across the evaluated examples
average_rouge1, average_rouge2, average_rougeL, average_rougeLsum = (
    results_df[column].mean()
    for column in ("ROUGE-1", "ROUGE-2", "ROUGE-L", "ROUGE-Lsum")
)

In [None]:
# Print the mean of each ROUGE variant, one per line
for label, value in (
    ("Average rouge 1 score: ", average_rouge1),
    ("Average rouge 2 score: ", average_rouge2),
    ("Average rouge L score: ", average_rougeL),
    ("Average rougeLSum score: ", average_rougeLsum),
):
    print(label, value)

Average rouge 1 score:  0.6432347138593104
Average rouge 2 score:  0.5879450366363341
Average rouge L score:  0.6247615792750546
Average rougeLSum score:  0.6315112104833774


In [None]:
from tqdm import tqdm

# How many test examples to generate and print side by side
NUM_QUALITATIVE_SAMPLES = 2

# First NUM_QUALITATIVE_SAMPLES test examples; capped so a smaller test split
# does not make select() fail on out-of-range indices.
qualitative_subset = test_dataset.select(
    range(min(NUM_QUALITATIVE_SAMPLES, len(test_dataset)))
)

# Collect qualitative results
qualitative_results = []
for sample in tqdm(qualitative_subset, desc="Generating Summaries"):
    prompt = format_example(sample)
    reference = sample['response']

    # Generate the model's prediction
    prediction = generate_response_qualitative(prompt)

    # Keep prompt, reference and prediction together for inspection
    qualitative_results.append({
        "prompt": prompt,
        "reference": reference,
        "prediction": prediction,
    })

# Display qualitative comparisons for every collected sample
for idx, result in enumerate(qualitative_results, start=1):
    print(f"Sample {idx}")
    print(f"Prompt: {result['prompt']}")
    print(f"Reference: {result['reference']}")
    print(f"Prediction: {result['prediction']}")
    print("-" * 80)


Generating Summaries:   0%|          | 0/2 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Generating Summaries:  50%|█████     | 1/2 [00:55<00:55, 55.26s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Generating Summaries: 100%|██████████| 2/2 [01:43<00:00, 51.53s/it]

Sample 1
Prompt: 
    ### Task:
    Analyze the provided problem and optimize the given code. Include constructive feedback and an improved version of the code.

    ### Prompt:
    Question: <p>I've been writing basic Python scripts for a while now to help process data or automate some task but I've decided I should start picking up unit testing and objective orientated programming (the vast majority of my scripts so far have been procedural).</p>

<p>As a starter I decided to follow along with Uncle Bob's <a href="http://butunclebob.com/ArticleS.UncleBob.TheBowlingGameKata" rel="nofollow">bowling scoring kata</a> to try and get my mind around TDD and the idea of writing the absolute minimal code at every step to either make the test go red or green (plus any refactoring steps).</p>

<p>As it's a bare bones example of following TDD the main program doesn't actually have an entry point other than via the tests.</p>

<p>Things that stand out to my beginner's eye:</p>

<ul>
<li><p>There 


