In [4]:
%%capture
# Environment setup cell; %%capture hides whatever the commands below print.
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
# !pip install unsloth
# Get latest Unsloth
# !pip install --upgrade --no-deps "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# !pip install --upgrade pip
# !pip install "unsloth[cu124-torch250] @ git+https://github.com/unslothai/unsloth.git"
# NOTE(review): `!` commands run in a temporary subshell, so this activation
# presumably does not change the environment the kernel itself runs in --
# confirm the kernel was launched from unsloth_env.
!conda activate unsloth_env

In [6]:
!pip install datasets

[0m

In [7]:
# Mount gdrive for colab
import os

colab = False


def resolve_drive_mount(use_colab: bool, env=None) -> str:
    """Return the root folder under which logs and results are stored.

    Args:
        use_colab: True when running on Google Colab (Drive mounted at /content/drive).
        env: Mapping used to look up the override variable; defaults to os.environ.

    Returns:
        The value of SOEN691_DRIVE_ROOT when it is set and non-empty, otherwise
        the Colab Drive folder or the original local Google Drive folder.
    """
    env = os.environ if env is None else env
    # Lets the notebook run on another machine without editing the code.
    override = env.get("SOEN691_DRIVE_ROOT")
    if override:
        return override
    if use_colab:
        return "/content/drive/MyDrive/"
    return "/Users/dbaeka/Library/CloudStorage/GoogleDrive-dbaekajnr@gmail.com/My Drive/"


if colab:
    from google.colab import drive
    drive.mount('/content/drive')

local_drive_mount = resolve_drive_mount(colab)

In [8]:
# Set up Logging
import logging
import os

# Log file location: <drive root>/soen691/logs/logfile.txt
log_folder_path = os.path.join(local_drive_mount, 'soen691/logs')
log_file_path = os.path.join(log_folder_path, 'logfile.txt')

# The folder has to exist before the file handler opens the log file.
os.makedirs(log_folder_path, exist_ok=True)

# force=True replaces any handlers already attached to the root logger, so
# re-running this cell reconfigures logging instead of being ignored.
logging.basicConfig(
    force=True,
    level=logging.DEBUG,
    filename=log_file_path,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

## Load Test Dataset
Since the 500-sample test set is a subset of the 5000-sample test set (its first 500 entries), running inference on the 5000 set means we do not need to run it again on the 500 set. We can simply extract the predictions for those first 500 samples from the results during post-processing, before evaluation.

In [9]:
from datasets import load_dataset
from tqdm import tqdm
import os

In [10]:
# Define IOUtils
import io
import json

def _make_w_io_base(f, mode: str):
    if not isinstance(f, io.IOBase):
        f_dirname = os.path.dirname(f)
        if f_dirname != "":
            os.makedirs(f_dirname, exist_ok=True)
        f = open(f, mode=mode, encoding="utf-8")
    return f


def _make_r_io_base(f, mode: str):
    if not isinstance(f, io.IOBase):
        f = open(f, mode=mode, encoding="utf-8")
    return f


def jdump(obj, f: str, mode="w", indent=4, default=str):
    """Dump a str or dictionary to a file in json format.

    Args:
        obj: A dict or list (serialized with json.dump) or a str (written verbatim).
        f: Destination path, or an already-open file object.
        mode: Mode used when `f` is a path.
        indent: Indentation passed to json.dump.
        default: Fallback serializer passed to json.dump.

    Raises:
        ValueError: If `obj` is not a dict, list or str.
    """
    # Validate first so an unsupported payload cannot create or truncate the target.
    if not isinstance(obj, (dict, list, str)):
        raise ValueError(f"Unexpected type: {type(obj)}")
    f = _make_w_io_base(f, mode)
    try:
        if isinstance(obj, str):
            f.write(obj)
        else:
            json.dump(obj, f, indent=indent, default=default)
    finally:
        # Close even when serialization fails, so the handle is never leaked.
        f.close()


def jload(f, mode="r"):
    """Load a .json file into a dictionary.

    Args:
        f: Source path, or an already-open file object.
        mode: Mode used when `f` is a path.

    Returns:
        The object decoded by json.load.
    """
    f = _make_r_io_base(f, mode)
    try:
        return json.load(f)
    finally:
        # Close even when the content is not valid JSON.
        f.close()

In [11]:
from typing import Optional, Sequence

# Model identifiers that can be passed to FastLanguageModel.from_pretrained.
models = [
    "Qwen/Qwen2.5-7B-Instruct",                    # index 0
    "Qwen/Qwen2.5-Coder-7B-Instruct",              # index 1
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",   # index 2
]

# Index into `models` selecting the model used by this notebook.
model_set = 2
model_name = models[model_set]

# Name of the test split; used to build the dataset repo id and result folders.
test_name = "test_5000"

# Number of prompts sent to the model per generate() call.
BATCH_SIZE = 5

def shard_dataset(repo_name, output_dir, chunk_size: int = 10_000):
  """Split the 'test' split of `repo_name` into JSON shard files.

  Consecutive slices of `chunk_size` rows are written to
  `<output_dir>/shard_<k>_input.json`, with k counting up from 0.
  """
  test_split = load_dataset(repo_name)['test']
  shard_starts = range(0, len(test_split), chunk_size)
  for shard_no, start in enumerate(shard_starts):
    rows = test_split[start:start + chunk_size]
    jdump(rows, f"{output_dir}/shard_{shard_no}_input.json")

def prettify(name: str) -> str:
  """Return `name` with every '/', '-' and '.' replaced by '_'."""
  underscore_table = str.maketrans({"/": "_", "-": "_", ".": "_"})
  return name.translate(underscore_table)

In [12]:
# set shard size to use
shard_size = 32

base_results_dir = os.path.join(local_drive_mount, "soen691/results/")
test_results_dir = os.path.join(base_results_dir, prettify(model_name), f"{test_name}_input")

shard_dataset(f"dbaeka/soen_691_msg_{test_name}_hashed", test_results_dir, shard_size)

In [16]:
!pip install rank_bm25

[0m

In [17]:
## Set up BM25
from rank_bm25 import BM25Okapi

train_dataset = load_dataset("dbaeka/soen_691_msg_train")['train']
print(f"Train data length: {len(train_dataset)}")
print("\n-----------------\n")

# Read the "patch" column once instead of iterating the dataset row by row,
# which would build a dict of every column for every training example.
# Patches are tokenized on single spaces; get_bm25_review_context splits its
# query the same way, so both sides share one tokenization.
tokenized_corpus = [patch.split(" ") for patch in train_dataset["patch"]]
bm25 = BM25Okapi(tokenized_corpus)

Train data length: 117739

-----------------



In [18]:
import numpy as np

# Query example
query = "CrossProduct"
tokenized_query = query.split(" ")

# Get scores (one BM25 score per training patch)
scores = bm25.get_scores(tokenized_query)

# Sort documents by score (descending order)
sorted_indices = np.argsort(scores)[::-1]  # Get indices sorted by highest score

# Show top 3 matches
top_k = 3
# Read the column once; the previous version re-read it on every iteration.
train_patches = train_dataset["patch"]
print("Top Retrieved Documents:")
# NOTE(review): the corpus is split on spaces only, so a query term presumably
# has to equal a whole space-delimited token to score -- worth confirming if the
# printed scores are all 0.0000.
for rank, index in enumerate(sorted_indices[:top_k], start=1):
    print(f"Rank {rank}: Score {scores[index]:.4f} - {train_patches[index]}")

Top Retrieved Documents:
Rank 1: Score 0.0000 - @@ -595,8 +595,10 @@ namespace Kratos
             array_1d<double, 3> b = ZeroVector(3);
             b[0] = 1.0;
 
-            const array_1d<double, 3>  c = MathUtils<double>::CrossProduct(a, b);
-            const array_1d<double, 3>  d = MathUtils<double>::UnitCrossProduct(a, b);
+            array_1d<double, 3>  c, d;
+
+            MathUtils<double>::CrossProduct(c, b, a);
+            MathUtils<double>::UnitCrossProduct(d, b, a);
             
             KRATOS_CHECK_EQUAL(c[2], 2.0);
             KRATOS_CHECK_EQUAL(d[2], 1.0);

Rank 2: Score 0.0000 - @@ -537,7 +537,7 @@ define([
             var docUri = new Uri(document.location.href);
             var modelUri = new Uri(model._basePath);
             model._baseUri = modelUri.resolve(docUri);
-        });
+        }, getFailedLoadFunction(model, 'gltf', url));
 
         return model;
     };

Rank 3: Score 0.0000 - @@ -189,7 +189,7 @@ func (p *plugin) buildEvent(m *message,

In [19]:
from unsloth import FastLanguageModel

# Sequence length handed to the model loader.
max_seq_length = 4096

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name,
    max_seq_length=max_seq_length,
    load_in_4bit=True,
)

# Batched prompts are padded with the EOS token, on the left.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

# Switch the model to Unsloth's inference mode (also displays the model).
FastLanguageModel.for_inference(model)

==((====))==  Unsloth 2025.3.15: Fast Qwen2 patching. Transformers: 4.49.0.
   \\   /|    Quadro RTX 5000. Num GPUs = 1. Max memory: 15.733 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 1536, padding_idx=151654)
    (layers): ModuleList(
      (0): Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=1536, out_features=1536, bias=True)
          (k_proj): Linear(in_features=1536, out_features=256, bias=True)
          (v_proj): Linear(in_features=1536, out_features=256, bias=True)
          (o_proj): Linear(in_features=1536, out_features=1536, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear4bit(in_features=1536, out_features=8960, bias=False)
          (up_proj): Linear4bit(in_features=1536, out_features=8960, bias=False)
          (down_proj): Linear4bit(in_features=8960, out_features=1536, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
      )


In [20]:
import torch
import re

# Instruction sent as the first user turn of every dialog.
INSTRUCTION_PROMPT = (
    "Please GIVE FORMAL Codereview for software developers in ONE SENTENCE for testcase, "
    "implementing Few Shot Learning from example. "
    "Dont start with Codereview/review. Just give the answer."
)

# Optional prompt sections, applied to both few-shot examples and test samples.
WITH_SUMMARY = False
WITH_CALLGRAPH = False

# Sampling setup.
SEED = 0
NUM_OF_RESULTS = 5      # sequences sampled per prompt
NUM_OF_FEW_SHOT = 2     # BM25-retrieved examples per prompt
TEMPERATURE = 0.7

# When False, extract_cot_and_answer reports "NO THINKING" as the reasoning trace.
IS_REASONING_MODEL = True

torch.manual_seed(SEED)

def extract_cot_and_answer(response):
    """Split a decoded model response into reasoning trace and final answer.

    Returns a dict with:
      - "cot": the stripped text of the first <think>...</think> block, "" when
        there is none, or "NO THINKING" whenever IS_REASONING_MODEL is False.
      - "answer": the stripped text following the first </think>, or "" when the
        response contains no </think> tag.
    """
    think_block = re.search(r"<think>(.*?)</think>", response, re.DOTALL)
    if not IS_REASONING_MODEL:
        cot = "NO THINKING"
    elif think_block:
        cot = think_block.group(1).strip()
    else:
        cot = ""

    # Everything after the first closing tag counts as the answer.
    after_think = re.search(r"</think>\s*(.*)", response, re.DOTALL)
    answer = "" if after_think is None else after_think.group(1).strip()

    return {"cot": cot, "answer": answer}

def forward(inputs, max_new_tokens: int=2048, temperature: float=0.05) -> Sequence[Sequence[dict]]:
  """Sample NUM_OF_RESULTS completions per prompt and parse each one.

  Args:
    inputs: Tokenized batch, unpacked into model.generate as keyword arguments.
    max_new_tokens: Generation budget per sequence.
    temperature: Sampling temperature.

  Returns:
    One list per prompt, in prompt order, each holding the
    extract_cot_and_answer dict of every sequence sampled for that prompt.

  NOTE(review): the whole returned sequence is decoded; for a decoder-only
  model that presumably includes the prompt text as well as the continuation
  -- confirm extract_cot_and_answer is meant to see the prompt.
  """
  logging.debug("Generating")
  outputs = model.generate(
      **inputs,
      tokenizer=tokenizer,
      max_new_tokens=max_new_tokens,
      stop_strings=["</s>"],
      temperature=temperature,
      use_cache = True,
      num_return_sequences=NUM_OF_RESULTS,
      do_sample=True,
      top_p=1
      )
  decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
  logging.debug("Total decoded: " + str(len(decoded)))
  # Derive the prompt count from what was actually generated rather than from
  # the global BATCH_SIZE: the last batch of a shard can be smaller, and the
  # caller may use a different batch size.
  num_prompts = len(decoded) // NUM_OF_RESULTS
  all_results = []
  for idx in range(num_prompts):
    result = []
    # Sequences of one prompt are contiguous in the decoded output.
    prompt_results = decoded[idx * NUM_OF_RESULTS : (idx + 1) * NUM_OF_RESULTS]
    for seq_idx, text in enumerate(prompt_results):
        logging.debug(f"Prompt {idx + 1} - Decoded Sequence {seq_idx + 1}: {text.replace(tokenizer.pad_token, '')}")
        logging.debug("_" * 70)
        result.append(extract_cot_and_answer(text))
    all_results.append(result)
  logging.debug(f"Total number of results for the batch: {len(all_results)}")
  print(len(all_results))
  return all_results

def get_bm25_review_context(example, train_data, num_shot:int=1):
  """Build few-shot chat turns from the training patches most similar to `example`.

  Args:
    example: Patch text of the sample under review; split on spaces and scored
      against the module-level `bm25` index.
    train_data: Column-indexable training data providing "patch" and "msg"
      (plus "summary" / "callgraph" when the matching WITH_* flag is on).
    num_shot: Number of examples to retrieve; values <= 0 yield no examples.

  Returns:
    A list of alternating user/assistant messages, best match first.
  """
  # A slice of [-0:] would select every document, so handle "no shots" explicitly.
  if num_shot <= 0:
    return []
  tokenized_query = example.split(" ")
  scores_arr = np.array(bm25.get_scores(tokenized_query))
  sorted_indices = scores_arr.argsort()[-num_shot:][::-1]
  # Read each needed column once instead of once per retrieved example.
  patches = train_data["patch"]
  reviews = train_data["msg"]
  summaries = train_data["summary"] if WITH_SUMMARY else None
  callgraphs = train_data["callgraph"] if WITH_CALLGRAPH else None
  msg = []
  for i in sorted_indices:
    context = "Code: \t" + patches[i] + "\n"
    if WITH_SUMMARY:
        context = context + "Summary: \t" + summaries[i] + "\n"
    if WITH_CALLGRAPH:
        context = context + "Callgraph: \t" + callgraphs[i] + "\n"
    context = context + "Codereview: "
    msg.append({"role": "user", "content": context})
    # The assistant turn shows the expected output shape: a think block, then the review.
    context = "<think>\n...some explantion here...\n</think>\n\n" + reviews[i] + " </s>" + "\n\n"
    msg.append({"role": "assistant", "content": context})
  return msg

In [21]:
def _build_review_prompt(value) -> str:
  """Render one test sample as the final user turn ("Code: ... Codereview: ")."""
  parts = ["Code: \t" + value["patch"] + "\n"]
  # Optional fields are read only when enabled, so samples without them still work.
  if WITH_SUMMARY:
    parts.append("Summary: \t" + value["summary"] + "\n")
  if WITH_CALLGRAPH:
    parts.append("Callgraph: \t" + value["callgraph"] + "\n")
  parts.append("Codereview: ")
  return "".join(parts)


def review_comment_generation(model_name:str, test_name:str, shard_index: int, base_dir: str, batch_size: int=32):
  """Generate review comments for one input shard and persist them.

  Reads `<base_dir>/<model>/<test_name>_input/shard_<shard_index>_input.json`,
  skips samples whose hash already has NUM_OF_RESULTS stored results, and
  writes `<base_dir>/<model>/<test_name>_output/shard_<shard_index>_output.json`
  after every batch so an interrupted run can be resumed.

  Args:
    model_name: Model identifier; passed through prettify for the folder name.
    test_name: Test-set name used in the input/output folder names.
    shard_index: Index of the shard file to process.
    base_dir: Root results directory.
    batch_size: Number of samples prompted per forward() call.
  """
  model_dir = os.path.join(base_dir, prettify(model_name))
  input_path = os.path.join(model_dir, f"{test_name}_input", f"shard_{shard_index}_input.json")
  input_data = jload(input_path)
  # The shard stores parallel "hash" and "value" lists; pair them up per sample.
  input_list = [{"hash": h, "value": v} for h, v in zip(input_data["hash"], input_data["value"])]

  output_dir = os.path.join(model_dir, f"{test_name}_output")
  os.makedirs(output_dir, exist_ok=True)
  output_path = os.path.join(output_dir, f"shard_{shard_index}_output.json")

  # Load existing results if they exist
  existing_results = jload(output_path) if os.path.exists(output_path) else {}

  # Keep only samples that are missing or have an incomplete set of results
  filtered_input = [
      sample for sample in input_list
      if sample["hash"] not in existing_results or len(existing_results[sample["hash"]]) != NUM_OF_RESULTS
  ]

  for i in tqdm(range(0, len(filtered_input), batch_size)):
    end_index = min(i + batch_size, len(filtered_input))
    batch = filtered_input[i:end_index]

    print(f"Processing batch {i} to {end_index}")
    logging.debug(f"Processing batch {i} to {end_index}")

    prompts = []
    for sample in batch:
      value = sample["value"]
      # Dialog = instruction, retrieved few-shot turns, then the sample itself.
      dialog = [{"role": "user", "content": INSTRUCTION_PROMPT}]
      dialog.extend(get_bm25_review_context(value["patch"], train_dataset, num_shot=NUM_OF_FEW_SHOT))
      dialog.append({"role": "user", "content": _build_review_prompt(value)})
      prompts.append(dialog)

      logging.debug("################context ####################")
      logging.debug(dialog)

    texts = tokenizer.apply_chat_template(prompts, add_generation_prompt=True, tokenize=False)
    inputs = tokenizer(texts, padding_side="left", padding="longest", return_tensors="pt").to("cuda")

    results = forward(inputs, temperature=TEMPERATURE)
    if len(results) < len(batch):
      # zip() below would silently skip the unmatched samples; make that visible.
      logging.warning(
          f"forward returned {len(results)} result groups for {len(batch)} samples; "
          "unmatched samples are left for a later run"
      )

    for sample, result in zip(batch, results):
      # Drop sequences for which neither a reasoning trace nor an answer was extracted.
      filtered_result = [
          r for r in result
          if r.get("cot", "").strip() != "" or r.get("answer", "").strip() != ""
      ]
      if filtered_result:
        existing_results[sample["hash"]] = filtered_result
    # Persist after every batch so progress survives an interruption.
    jdump(existing_results, output_path)

  logging.info(f"Completed processing shard {shard_index}")

In [None]:
import random

# Number of input shard files to process.
# NOTE(review): presumably ceil(5000 / 32) for test_5000 with shard_size 32 --
# confirm whenever test_name or shard_size changes.
TOTAL_SHARDS = 157

# Visit the shards in a random order.
shard_indices = [shard_no for shard_no in range(TOTAL_SHARDS)]
random.shuffle(shard_indices)

for shard_idx in tqdm(shard_indices):
  progress_msg = f"Processing shard {shard_idx}"
  print(progress_msg)
  logging.info(progress_msg)
  review_comment_generation(
      model_name=model_name,
      test_name=test_name,
      shard_index=shard_idx,
      base_dir=base_results_dir,
      batch_size=BATCH_SIZE,
  )

  0%|          | 0/157 [00:00<?, ?it/s]

Processing shard 105



  0%|          | 0/7 [00:00<?, ?it/s][A

Processing batch 0 to 5



 14%|█▍        | 1/7 [06:41<40:06, 401.15s/it][A

5
Processing batch 5 to 10


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
