In [1]:
# Mount gdrive for colab
colab = False
if colab:
    from google.colab import drive

    drive.mount('/content/drive')
    local_drive_mount = "/content/drive/MyDrive/"
else:
    local_drive_mount = "/Users/dbaeka/Library/CloudStorage/GoogleDrive-dbaekajnr@gmail.com/My Drive/"
local_drive_mount = "/home/ubuntu/"

In [2]:
import os

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [3]:
# Set up Logging
import logging
import os
import time

log_folder_path = os.path.join(local_drive_mount, 'soen691/logs')
os.makedirs(log_folder_path, exist_ok=True)
# use the current time to create a unique log file
log_file_path = os.path.join(log_folder_path, f"review_comment_generation_{time.strftime('%Y%m%d')}.log")
logging.basicConfig(
    filename=log_file_path,
    level=logging.DEBUG,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True
)

## Load Test Dataset
Since 500 test set is a subset of 5000 dataset, i.e. first 500, if we run inference on 5000 set, we do not need to run again on 500. We can simply extract predicted result from the results during post-processing before evaluation

In [4]:
from datasets import load_dataset
from tqdm import tqdm
import os

In [5]:
# Define IOUtils
import io
import json


def _make_w_io_base(f, mode: str):
    if not isinstance(f, io.IOBase):
        f_dirname = os.path.dirname(f)
        if f_dirname != "":
            os.makedirs(f_dirname, exist_ok=True)
        f = open(f, mode=mode, encoding="utf-8")
    return f


def _make_r_io_base(f, mode: str):
    if not isinstance(f, io.IOBase):
        f = open(f, mode=mode, encoding="utf-8")
    return f


def jdump(obj, f: str, mode="w", indent=4, default=str):
    """Dump a str or dictionary to a file in json format."""
    f = _make_w_io_base(f, mode)
    if isinstance(obj, (dict, list)):
        json.dump(obj, f, indent=indent, default=default)
    elif isinstance(obj, str):
        f.write(obj)
    else:
        raise ValueError(f"Unexpected type: {type(obj)}")
    f.close()


def jload(f, mode="r"):
    """Load a .json file into a dictionary."""
    f = _make_r_io_base(f, mode)
    jdict = json.load(f)
    f.close()
    return jdict

In [6]:
from typing import Optional, Sequence

models = [
    "Qwen/Qwen2.5-7B-Instruct",
    "Qwen/Qwen2.5-Coder-7B-Instruct",
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
]

# set model to use for notebook
model_set = 2
model_name = models[model_set]

# set test to use
test_name = "test_5000"

BATCH_SIZE = 32
WITH_SUMMARY = False
WITH_CALLGRAPH = False


def shard_dataset(repo_name, output_dir, chunk_size: int = 10_000):
    dataset = load_dataset(repo_name)['test']
    for i in range(0, len(dataset), chunk_size):
        shard = dataset[i:i + chunk_size]
        jdump(shard, f"{output_dir}/shard_{i // chunk_size}_input.json")


def prettify(name: str) -> str:
    return name.replace("/", "_").replace("-", "_").replace(".", "_")

In [7]:
# set shard size to use
shard_size = 32

base_results_dir = os.path.join(local_drive_mount, "soen691/results/")
test_results_dir = os.path.join(base_results_dir, f"{test_name}_input")

output_dir = "_base" if not WITH_SUMMARY and not WITH_CALLGRAPH else ""
if WITH_SUMMARY:
    output_dir += "_summary"
if WITH_CALLGRAPH:
    output_dir += "_callgraph"

shard_dataset(f"dbaeka/soen_691_few_shot_{test_name}{output_dir}_hashed", test_results_dir, shard_size)

In [8]:
from unsloth import FastLanguageModel

max_seq_length = 4096
model1, tokenizer1 = FastLanguageModel.from_pretrained(
    model_name, max_seq_length=max_seq_length, load_in_4bit=True
)
tokenizer1.pad_token = tokenizer1.eos_token
tokenizer1.padding_side = "left"
# 
# model2, tokenizer2 = FastLanguageModel.from_pretrained(
#     model_name, max_seq_length=max_seq_length, load_in_4bit=True
# )
# tokenizer2.pad_token = tokenizer2.eos_token
# tokenizer2.padding_side = "left"



🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.3.17: Fast Qwen2 patching. Transformers: 4.49.0.
   \\   /|    NVIDIA H100 PCIe. Num GPUs = 1. Max memory: 79.109 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 9.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
FastLanguageModel.for_inference(model1)
# FastLanguageModel.for_inference(model2)

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(152064, 3584, padding_idx=151654)
    (layers): ModuleList(
      (0-3): 4 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
          (k_proj): Linear(in_features=3584, out_features=512, bias=True)
          (v_proj): Linear(in_features=3584, out_features=512, bias=True)
          (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
          (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
          (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
      )
   

In [10]:
import torch
import re

SEED = 0
NUM_OF_RESULTS = 5
TEMPERATURE = 0.7
IS_REASONING_MODEL = True

torch.manual_seed(SEED)


def extract_cot_and_answer(response):
    # Extract content within <think>...</think>
    cot_match = re.search(r"<think>(.*?)</think>", response, re.DOTALL)
    cot = cot_match.group(1).strip() if cot_match else ""

    if not IS_REASONING_MODEL:
        cot = "NO THINKING"

    # Extract content after </think>
    answer_match = re.search(r"</think>\s*(.*)", response, re.DOTALL)
    answer = answer_match.group(1).strip() if answer_match else ""

    return {"cot": cot, "answer": answer}


def forward(inputs, tokenizer, model, max_new_tokens: int = 2048, temperature: float = 0.05) -> Optional[Sequence[str]]:
    logging.debug("Generating")
    all_results = []
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            tokenizer=tokenizer,
            max_new_tokens=max_new_tokens,
            stop_strings=["</s>"],
            temperature=temperature,
            use_cache=True,
            num_return_sequences=NUM_OF_RESULTS,
            do_sample=True,
            top_p=1
        )
        decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        logging.debug("Total decoded: " + str(len(decoded)))
        for idx in range(0, BATCH_SIZE):
            result = []
            prompt_results = decoded[idx * NUM_OF_RESULTS: (idx + 1) * NUM_OF_RESULTS]
            for seq_idx, text in enumerate(prompt_results):
                logging.debug(
                    f"Prompt {idx + 1} - Decoded Sequence {seq_idx + 1}: {text.replace(tokenizer.pad_token, '')}")
                logging.debug("_" * 70)
                result.append(extract_cot_and_answer(text))
            all_results.append(result)
        logging.debug(f"Total number of results for the batch: {len(all_results)}")
        print(len(all_results))
    return all_results

In [11]:
import torch
import gc


def review_comment_generation(model, tokenizer, model_name: str, test_name: str, shard_index: int, base_dir: str,
                              batch_size: int = 32):
    input_dir = os.path.join(base_dir, f"{test_name}_input")
    input_path = os.path.join(input_dir, f"shard_{shard_index}_input.json")
    input_data = jload(input_path)
    prompt_path = "prompt_thinking" if IS_REASONING_MODEL else "prompt_base"
    input_list = [{"hash": h, "value": v, "prompt": p} for h, v, p in
                  zip(input_data["hash"], input_data["value"], input_data[prompt_path])]

    output_dir = os.path.join(base_dir, prettify(model_name), f"{test_name}_output")
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, f"shard_{shard_index}_output.json")

    # Load existing results if they exist
    existing_results = jload(output_path) if os.path.exists(output_path) else {}

    # Filter out already processed hashes with 5 results
    filtered_input = [
        sample for sample in input_list
        if sample["hash"] not in existing_results or len(existing_results[sample["hash"]]) != NUM_OF_RESULTS
    ]

    for i in tqdm(range(0, len(filtered_input), batch_size)):  # TODO
        end_index = min(i + batch_size, len(filtered_input))
        batch = filtered_input[i:end_index]

        print(f"Processing batch {i} to {end_index}")
        logging.debug(f"Processing batch {i} to {end_index}")

        prompts = [v["prompt"] for v in batch]
        texts = tokenizer.apply_chat_template(prompts, add_generation_prompt=True, tokenize=False)

        inputs = tokenizer(texts, padding_side="left", padding="longest", return_tensors="pt").to("cuda")
        # print shape
        results = forward(inputs, model=model, tokenizer=tokenizer, temperature=TEMPERATURE)

        for sample, result in zip(batch, results):
            filtered_result = [
                r for r in result
                if r.get("cot", "").strip() != "" or r.get("answer", "").strip() != ""
            ]
            if filtered_result:
                existing_results[sample["hash"]] = filtered_result
        jdump(existing_results, output_path)

        torch.cuda.empty_cache()
        gc.collect()

    logging.info(f"Completed processing shard {shard_index}")

In [12]:
import random

TOTAL_SHARDS = 157

shard_indices = list(range(TOTAL_SHARDS))
random.shuffle(shard_indices)

stream1 = torch.cuda.Stream()
# stream2 = torch.cuda.Stream()

# pick 2 shards for testing at a time within loop
while len(shard_indices) > 0:
    shard_idx1 = shard_indices.pop()
    print(f"Processing shard {shard_idx1}")
    logging.info(f"Processing shard {shard_idx1}")

    shard_idx2 = shard_indices.pop()
    print(f"Processing shard {shard_idx2}")

    with torch.cuda.stream(stream1):
        review_comment_generation(model1, tokenizer1, model_name, test_name, shard_idx1, base_results_dir, 16)

    # with torch.cuda.stream(stream2):
    #     review_comment_generation(model2, tokenizer2, model_name, test_name, shard_idx2, base_results_dir, BATCH_SIZE)

    stream1.synchronize()
    # stream2.synchronize()

# 
# for shard_idx in tqdm(shard_indices):
#     print(f"Processing shard {shard_idx}")
#     logging.info(f"Processing shard {shard_idx}")
#     review_comment_generation(model_name, test_name, shard_idx, base_results_dir, BATCH_SIZE)

Processing shard 62
Processing shard 53


  0%|                                                                                                                                                                | 0/2 [00:00<?, ?it/s]

Processing batch 0 to 16
32


 50%|████████████████████████████████████████████████████████████████████████████                                                                            | 1/2 [01:33<01:33, 93.76s/it]

Processing batch 16 to 32


 50%|████████████████████████████████████████████████████████████████████████████                                                                            | 1/2 [01:34<01:34, 94.16s/it]


OutOfMemoryError: CUDA out of memory. Tried to allocate 24.50 GiB. GPU 0 has a total capacity of 79.11 GiB of which 18.81 GiB is free. Including non-PyTorch memory, this process has 60.29 GiB memory in use. Of the allocated memory 59.25 GiB is allocated by PyTorch, and 404.50 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)