#### Dependencies

In [1]:
! pip install -U datasets transformers bitsandbytes accelerate \
    peft wandb loralib ipywidgets



#### Tokens

In [2]:
hf_token = "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"
project_name = "review-generation-v2"

#### Imports

In [3]:
# Pytorch
import os, sys, json 
from datetime import datetime
import torch
import torch.nn as nn
# Dataset
from datasets import load_dataset, Dataset
# Transformers
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    RobertaTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    Trainer, 
    DataCollatorForSeq2Seq
)
# PEFT
from peft import LoraConfig, AutoPeftModelForCausalLM, prepare_model_for_kbit_training, PeftModel, get_peft_model, get_peft_model_state_dict, set_peft_model_state_dict
# Trainer
from trl import SFTTrainer # supervised finetuning

#### Environment Variables

In [4]:
os.environ['PYTORCH_CUDA_ALLOC_CONF']='max_split_size_mb:512'
# os.environ["CUDA_VISIBLE_DEVICES"]="0"

#### Bits and Bytes Config

In [5]:
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True, # loading in 4 bit
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True
)

#### Model

In [6]:
base_model="codellama/CodeLlama-7b-hf"
new_model = "asif-train-500e-all-codellama-7b-ft"

In [7]:
model = AutoModelForCausalLM.from_pretrained(
        base_model, quantization_config=bnb_config, device_map="auto" # or {"": 0} or multi gpu?
)
model.config.use_cache=False
model.config.pretraining_tp=1
# Cast the layernorm in fp32, make output embedding layer require grads, add the upcasting of the lmhead to fp32
model = prepare_model_for_kbit_training(model)
model

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32016, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear4bit(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): Lla

In [8]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )
# print_trainable_parameters(model)

#### Tokenizer (?)

In [9]:
# or the roberta/codereviewer tokenizer?
tokenizer = AutoTokenizer.from_pretrained(base_model)
# or unk_token?
tokenizer.pad_token = tokenizer.eos_token # there is no padding token for llama
tokenizer.padding_side = "right" # or left?

In [10]:
inp = tokenizer("I <keep>")
print(inp)
tokenizer.decode(inp["input_ids"], skip_special_tokens=True)

{'input_ids': [1, 306, 529, 17462, 29958], 'attention_mask': [1, 1, 1, 1, 1]}


'I <keep>'

#### PEFT

In [11]:
# for modules: https://stackoverflow.com/questions/76768226/target-modules-for-applying-peft-lora-on-different-models
peft_config = LoraConfig(
    r=16, # rank, 8
    lora_alpha=32, # strength of the adapter, impact on model, 16
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']
    # more modules, more training, better performance
)

#### Base Model Performance

In [12]:
train_data_file_path = "Comment_Generation/msg-train.jsonl"
val_data_file_path = "Comment_Generation/msg-valid.jsonl"
test_data_file_path = "Comment_Generation/msg-test.jsonl"

In [13]:
def process_diff(diff):
    difflines = diff.split("\n")[1:]
    difflines = [line for line in difflines if len(line.strip()) > 0]
    map_dic = {"-": 0, "+": 1, " ": 2}
    def f(s):
        if s in map_dic:
            return map_dic[s]
        else:
            return 2
    labels = [f(line[0]) for line in difflines]
    difflines = [line[1:].strip() for line in difflines]
    inputstr = ""
    for label, line in zip(labels, difflines):
        if label == 1:
            inputstr += "<add>" + line
        elif label == 0:
            inputstr += "<del>" + line
        else:
            inputstr += "<keep>" + line

    return inputstr

In [14]:
with open(test_data_file_path, 'r') as f:
    line = f.readline()
    data = json.loads(line)
data = process_diff(data['patch'])
data

'<keep>model.getFiles().stream().map(ProtoFile::getProto).collect(Collectors.toList()))<keep>// Only the file to generate a client for (don\'t generate dependencies)<keep>.addFileToGenerate("multiple_services.proto")<del>.setParameter("language=java")<add>.setParameter("language=java,transport=grpc")<keep>.build();<keep>CodeGeneratorResponse response = ProtocGeneratorMain.generate(codeGeneratorRequest);'

In [15]:
prompt1 = """You are a powerful code reviewer model. Your job is to suggest review comment in natural language. You are given a question, and context regarding a diff hunk or code change in programming language. You must output appropriate, contextual review comment for that code change.

### Question:
What would be your suggested review comment?

### Code Change:
"""
prompt2 = """

### Review Comment:
"""
prompt = prompt1 + str(data) + prompt2
# print(prompt)
batch = tokenizer(prompt, return_tensors="pt").to("cuda")
model.eval()
with torch.no_grad():
    print(tokenizer.decode(model.generate(**batch, max_new_tokens=50)[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


You are a powerful code reviewer model. Your job is to suggest review comment in natural language. You are given a question, and context regarding a diff hunk or code change in programming language. You must output appropriate, contextual review comment for that code change.

### Question:
What would be your suggested review comment?

### Code Change:
<keep>model.getFiles().stream().map(ProtoFile::getProto).collect(Collectors.toList()))<keep>// Only the file to generate a client for (don't generate dependencies)<keep>.addFileToGenerate("multiple_services.proto")<del>.setParameter("language=java")<add>.setParameter("language=java,transport=grpc")<keep>.build();<keep>CodeGeneratorResponse response = ProtocGeneratorMain.generate(codeGeneratorRequest);

### Review Comment:
<keep>The code change is to add a parameter to the code generation request. The parameter is "language=java,transport=grpc".<keep>

### Hint:
<keep>The code change is a diff


#### Training and Validation Dataset

In [16]:
import json
def read_jsonl(path):
    data = []
    with open(path) as f:
        for line in f:
            try:
                js = json.loads(line.strip())
            except:
                print("Error during reading json data.")
                continue
            data.append(js)
    return data

In [44]:
train_data = read_jsonl(train_data_file_path)
val_data = read_jsonl(val_data_file_path)
test_data = read_jsonl(test_data_file_path)

for i in range(len(train_data)):
    train_data[i]["idx"] = i
for i in range(len(val_data)):
    val_data[i]["idx"] = i

for dic in train_data:
    diff, msg = dic["patch"], dic["msg"]
    dic["diff"] = process_diff(diff)
for dic in val_data:
    diff, msg = dic["patch"], dic["msg"]
    dic["diff"] = process_diff(diff)
for dic in test_data:
    diff, msg = dic["patch"], dic["msg"]
    dic["diff"] = process_diff(diff)

In [18]:
# load the data into pandas dataframe in two columns named "diff" and "msg"
import pandas as pd
train_df = pd.DataFrame(train_data)
train_df = train_df[["diff", "msg"]]
# df.iloc[0]["diff"]
train_df

Unnamed: 0,diff,msg
0,"<keep>array_1d<double, 3> b = ZeroVector(3);<k...",I assumed that for CrossProduct the values wer...
1,<keep>For internal use only; no backwards-comp...,I think we should we avoid `import six` for co...
2,<keep>def should_render_revenue?<keep>revenue ...,"we call cities + towns . size a lot, maybe mak..."
3,"<keep>D_ERROR(""pool ""DF_UUID"" event %d failed:...",This will be removed.
4,"<keep>return middleware.Wrap(h, backendMiddlew...",nit: `firehoseLogHandler` vs. `firehoseMiddlew...
...,...,...
117734,<keep>########################################...,"For variants like this, you should _explicitly..."
117735,<keep>if user && user.student?<keep>if user.hi...,Align the parameters of a method call if they ...
117736,<keep>profit_trade = trade.calc_profit(rate=pr...,Below this - we'll need to make sure to use th...
117737,"<keep>return &evt, nil<keep>}<del>func populat...","I understand this new parameter is unused, may..."


In [19]:
# load the data into pandas dataframe in two columns named "diff" and "msg"
import pandas as pd
val_df = pd.DataFrame(val_data)
val_df = val_df[["diff", "msg"]]
# df.iloc[0]["diff"]
val_df

Unnamed: 0,diff,msg
0,<keep>)<keep>return rv<add>@app.template_test(...,Should we call it `is_list`?
1,<keep>configureSqlClientInstrumentationOptions...,"in the instrumentation example, should we use ..."
2,<keep>## Fields ##<keep>############<del>class...,Why this change ? Is it useful ?
3,<add>const titleNode = virtualNode.children.fi...,"I know this is a nitpick, but don't we always ..."
4,"<keep>assertEquals(false, EMailValidator.isEma...",We should reformat this emails in the test to ...
...,...,...
10314,<keep>dropped = [dims.index(d) for d in dims<k...,So calling `np.squeeze` will probably make thi...
10315,"<keep>assert frame1.stypes == frame2.stypes, (...",Won't this produce too much of output if frame...
10316,<keep>)<keep>type NoopProvider struct{}<del>ty...,This seems like a hold-over from the interface...
10317,<keep>}<keep>if (GetLevel() >= item->Click.Lev...,Do we not want a message here similar to the o...


In [62]:
# load the data into pandas dataframe in two columns named "diff" and "msg"
import pandas as pd
test_df = pd.DataFrame(test_data)
test_df = test_df[["diff", "msg"]]
# df.iloc[0]["diff"]
test_df

Unnamed: 0,diff,msg
0,<keep>model.getFiles().stream().map(ProtoFile:...,can we also test for `transport=rest`?
1,<keep>*/<keep>protected function createBackend...,"If record_batch_size is not set in config.ini,..."
2,"<keep><script type=""text/javascript""><keep>win...","I didn't realize we were hardcoding this, than..."
3,"<keep>}<keep></h4><keep><div class=""UppyDashbo...","We are trying to support IE 10-11, so we'll ne..."
4,<keep>function ResetButton( { children } ) {<k...,It looks like there's a new `isNavigatingTo( u...
...,...,...
10164,<keep>type ConsumerConfig struct {<keep>Public...,"Should be from lowercase `json:""ports""` the sa..."
10165,<keep># Purpose:<keep># sns-ruby-example-creat...,Simple Notification **Service** (singular)
10166,<keep>@Override<keep>public void runScript(Str...,This code should be executed in NashornEngineF...
10167,<keep>// NewTags creates a tags object<keep>fu...,this is not so great as it sets the global ran...


In [20]:
train_dataset = Dataset.from_pandas(train_df)
# final_data = final_data.map(lambda samples: tokenizer(samples['diff']), batched=True)
train_dataset

Dataset({
    features: ['diff', 'msg'],
    num_rows: 117739
})

In [21]:
val_dataset = Dataset.from_pandas(val_df)
# final_data = final_data.map(lambda samples: tokenizer(samples['diff']), batched=True)
val_dataset

Dataset({
    features: ['diff', 'msg'],
    num_rows: 10319
})

In [48]:
test_dataset = Dataset.from_pandas(test_df)
# final_data = final_data.map(lambda samples: tokenizer(samples['diff']), batched=True)
test_dataset

Dataset({
    features: ['diff', 'msg'],
    num_rows: 10169
})

In [22]:
def tokenize(prompt):
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=512,
        padding=False,
        return_tensors=None
    )
    result["labels"] = result["input_ids"].copy()
    return result

In [23]:
def generate_and_tokenize_prompt(data_point):
    full_prompt = prompt1 + str(process_diff(data_point['diff'])) + prompt2 + str(data_point['msg'])
    return tokenize(full_prompt)


In [24]:
tokenized_train_set = train_dataset.map(generate_and_tokenize_prompt)
tokenized_val_set = val_dataset.map(generate_and_tokenize_prompt)

Map:   0%|          | 0/117739 [00:00<?, ? examples/s]

Map:   0%|          | 0/10319 [00:00<?, ? examples/s]

In [25]:
tokenized_train_set

Dataset({
    features: ['diff', 'msg', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 117739
})

In [26]:
tokenized_val_set

Dataset({
    features: ['diff', 'msg', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 10319
})

#### Training

In [27]:
# resume_from_checkpoint = "asif-train-200e-all-codellama-7b-ft/adapter_model.bin" # set this to the adapter_model.bin file you want to resume from

# if resume_from_checkpoint:
#     if os.path.exists(resume_from_checkpoint):
#         print(f"Restarting from {resume_from_checkpoint}")
#         adapters_weights = torch.load(resume_from_checkpoint)
#         set_peft_model_state_dict(model, adapters_weights)
#     else:
#         print(f"Checkpoint {resume_from_checkpoint} not found")

In [28]:
torch.cuda.device_count()

4

In [29]:
if torch.cuda.device_count() > 1:
    # keeps Trainer from trying its own DataParallelism when more than 1 gpu is available
    model.is_parallelizable = True
    model.model_parallel = True

In [30]:
# ! wandb login --relogin

In [31]:
wandb_project = project_name
if len(wandb_project) > 0:
    os.environ["WANDB_PROJECT"] = wandb_project


In [32]:
batch_size = 64
per_device_train_batch_size = 16
gradient_accumulation_steps = batch_size // per_device_train_batch_size
output_dir = project_name

training_args = TrainingArguments(
        per_device_train_batch_size=per_device_train_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        warmup_steps=100,
        max_steps=500,
        learning_rate=3e-4,
        fp16=True, # ?
        logging_steps=10,
        lr_scheduler_type="linear", # ?
        optim="adamw_torch", # ?
        evaluation_strategy="steps", # if val_set_size > 0 else "no",
        save_strategy="steps",
        eval_steps=100,
        save_steps=150,
        output_dir=output_dir,
        # save_total_limit=3,
        load_best_model_at_end=False,
        # ddp_find_unused_parameters=False if ddp else None,
        group_by_length=True, # group sequences of roughly the same length together to speed up training
        report_to="wandb", # if use_wandb else "none",
        run_name=f"codellama-{datetime.now().strftime('%Y-%m-%d-%H-%M')}", # if use_wandb else None,
    )
model = get_peft_model(model, peft_config)
trainer = Trainer(
    model=model,
    train_dataset=tokenized_train_set,
    eval_dataset=tokenized_val_set,
    args=training_args,
    data_collator=DataCollatorForSeq2Seq( # ?
        tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
    ),
)

In [33]:
model.config.use_cache = False

old_state_dict = model.state_dict
model.state_dict = (lambda self, *_, **__: get_peft_model_state_dict(self, old_state_dict())).__get__(
    model, type(model)
)

# Red Flag
# if torch.__version__ >= "2" and sys.platform != "win32":
#     print("compiling the model")
#     model = torch.compile(model)

In [34]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33m1805112[0m. Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/500 [00:00<?, ?it/s]



{'loss': 3.0338, 'learning_rate': 2.9999999999999997e-05, 'epoch': 0.01}
{'loss': 2.5548, 'learning_rate': 5.9999999999999995e-05, 'epoch': 0.01}
{'loss': 1.1353, 'learning_rate': 8.999999999999999e-05, 'epoch': 0.02}
{'loss': 0.6034, 'learning_rate': 0.00011999999999999999, 'epoch': 0.02}
{'loss': 0.3864, 'learning_rate': 0.00015, 'epoch': 0.03}
{'loss': 1.2878, 'learning_rate': 0.00017999999999999998, 'epoch': 0.03}
{'loss': 0.8884, 'learning_rate': 0.00020999999999999998, 'epoch': 0.04}
{'loss': 0.666, 'learning_rate': 0.00023999999999999998, 'epoch': 0.04}
{'loss': 0.4952, 'learning_rate': 0.00027, 'epoch': 0.05}
{'loss': 0.3047, 'learning_rate': 0.0003, 'epoch': 0.05}


  0%|          | 0/1290 [00:00<?, ?it/s]

{'eval_loss': 0.761552631855011, 'eval_runtime': 1133.418, 'eval_samples_per_second': 9.104, 'eval_steps_per_second': 1.138, 'epoch': 0.05}
{'loss': 1.2278, 'learning_rate': 0.00029249999999999995, 'epoch': 0.06}
{'loss': 0.8676, 'learning_rate': 0.000285, 'epoch': 0.07}
{'loss': 0.6472, 'learning_rate': 0.00027749999999999997, 'epoch': 0.07}
{'loss': 0.4963, 'learning_rate': 0.00027, 'epoch': 0.08}
{'loss': 0.2916, 'learning_rate': 0.0002625, 'epoch': 0.08}




{'loss': 1.233, 'learning_rate': 0.00025499999999999996, 'epoch': 0.09}
{'loss': 0.8345, 'learning_rate': 0.00024749999999999994, 'epoch': 0.09}
{'loss': 0.6433, 'learning_rate': 0.00023999999999999998, 'epoch': 0.1}
{'loss': 0.461, 'learning_rate': 0.00023249999999999999, 'epoch': 0.1}
{'loss': 0.2924, 'learning_rate': 0.000225, 'epoch': 0.11}


  0%|          | 0/1290 [00:00<?, ?it/s]

{'eval_loss': 0.7577228546142578, 'eval_runtime': 1135.7529, 'eval_samples_per_second': 9.086, 'eval_steps_per_second': 1.136, 'epoch': 0.11}
{'loss': 1.2154, 'learning_rate': 0.00021749999999999997, 'epoch': 0.11}
{'loss': 0.8466, 'learning_rate': 0.00020999999999999998, 'epoch': 0.12}
{'loss': 0.6436, 'learning_rate': 0.0002025, 'epoch': 0.13}
{'loss': 0.4784, 'learning_rate': 0.000195, 'epoch': 0.13}
{'loss': 0.3019, 'learning_rate': 0.00018749999999999998, 'epoch': 0.14}
{'loss': 1.2192, 'learning_rate': 0.00017999999999999998, 'epoch': 0.14}
{'loss': 0.8577, 'learning_rate': 0.00017249999999999996, 'epoch': 0.15}
{'loss': 0.6565, 'learning_rate': 0.000165, 'epoch': 0.15}
{'loss': 0.472, 'learning_rate': 0.00015749999999999998, 'epoch': 0.16}
{'loss': 0.2943, 'learning_rate': 0.00015, 'epoch': 0.16}


  0%|          | 0/1290 [00:00<?, ?it/s]

{'eval_loss': 0.7403162121772766, 'eval_runtime': 1135.6107, 'eval_samples_per_second': 9.087, 'eval_steps_per_second': 1.136, 'epoch': 0.16}




{'loss': 1.2152, 'learning_rate': 0.0001425, 'epoch': 0.17}
{'loss': 0.8361, 'learning_rate': 0.000135, 'epoch': 0.17}
{'loss': 0.6331, 'learning_rate': 0.00012749999999999998, 'epoch': 0.18}
{'loss': 0.4757, 'learning_rate': 0.00011999999999999999, 'epoch': 0.18}
{'loss': 0.3057, 'learning_rate': 0.0001125, 'epoch': 0.19}
{'loss': 1.2026, 'learning_rate': 0.00010499999999999999, 'epoch': 0.2}
{'loss': 0.8404, 'learning_rate': 9.75e-05, 'epoch': 0.2}
{'loss': 0.642, 'learning_rate': 8.999999999999999e-05, 'epoch': 0.21}
{'loss': 0.467, 'learning_rate': 8.25e-05, 'epoch': 0.21}
{'loss': 0.2959, 'learning_rate': 7.5e-05, 'epoch': 0.22}


  0%|          | 0/1290 [00:00<?, ?it/s]

{'eval_loss': 0.7357637286186218, 'eval_runtime': 1133.656, 'eval_samples_per_second': 9.102, 'eval_steps_per_second': 1.138, 'epoch': 0.22}
{'loss': 1.1983, 'learning_rate': 6.75e-05, 'epoch': 0.22}
{'loss': 0.8557, 'learning_rate': 5.9999999999999995e-05, 'epoch': 0.23}
{'loss': 0.6342, 'learning_rate': 5.2499999999999995e-05, 'epoch': 0.23}
{'loss': 0.4722, 'learning_rate': 4.4999999999999996e-05, 'epoch': 0.24}
{'loss': 0.3063, 'learning_rate': 3.75e-05, 'epoch': 0.24}




{'loss': 1.2004, 'learning_rate': 2.9999999999999997e-05, 'epoch': 0.25}
{'loss': 0.8332, 'learning_rate': 2.2499999999999998e-05, 'epoch': 0.26}
{'loss': 0.6374, 'learning_rate': 1.4999999999999999e-05, 'epoch': 0.26}
{'loss': 0.4794, 'learning_rate': 7.499999999999999e-06, 'epoch': 0.27}
{'loss': 0.2996, 'learning_rate': 0.0, 'epoch': 0.27}


  0%|          | 0/1290 [00:00<?, ?it/s]

{'eval_loss': 0.7340313196182251, 'eval_runtime': 1134.2305, 'eval_samples_per_second': 9.098, 'eval_steps_per_second': 1.137, 'epoch': 0.27}
{'train_runtime': 12871.2706, 'train_samples_per_second': 2.486, 'train_steps_per_second': 0.039, 'train_loss': 0.7833351621627808, 'epoch': 0.27}


TrainOutput(global_step=500, training_loss=0.7833351621627808, metrics={'train_runtime': 12871.2706, 'train_samples_per_second': 2.486, 'train_steps_per_second': 0.039, 'train_loss': 0.7833351621627808, 'epoch': 0.27})

In [35]:
# TrainOutput(global_step=200, training_loss=0.8567268979549408, metrics={'train_runtime': 7423.7516, 
# 'train_samples_per_second': 1.724, 'train_steps_per_second': 0.027, 'train_loss': 0.8567268979549408, 'epoch': 0.11})

# TrainOutput(global_step=500, training_loss=0.7833351621627808, metrics={'train_runtime': 12871.2706, 
# 'train_samples_per_second': 2.486, 'train_steps_per_second': 0.039, 'train_loss': 0.7833351621627808, 'epoch': 0.27})

#### Save and Push

In [36]:
# https://github.com/huggingface/transformers/issues/27397
trainer.model.save_pretrained(new_model, safe_serialization=False)

In [41]:
del model
del tokenizer
# del pipe
del trainer
torch.cuda.empty_cache()
import gc
gc.collect()
gc.collect()

0

In [37]:
# location = "asifhaider/" + new_model
# model.push_to_hub(location, use_auth_token=True)

#### Inference Example

In [42]:
model = AutoModelForCausalLM.from_pretrained(base_model,
   quantization_config=bnb_config, device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(base_model)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [86]:
# output_dir = output_dir + "/checkpoint-200"
output_dir = "asif-train-500e-all-codellama-7b-ft"
model = PeftModel.from_pretrained(model, output_dir)
model = model.merge_and_unload()



In [102]:
test_df.iloc[2]["msg"]

"I didn't realize we were hardcoding this, thanks for moving it to an env value."

In [103]:
eval_prompt = prompt1 + test_df.iloc[2]["diff"] + prompt2
# print(eval_prompt)



In [99]:
model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")
model.eval()
with torch.no_grad():
    print(tokenizer.decode(model.generate(**model_input, max_new_tokens=70)[0], skip_special_tokens=True))


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


You are a powerful code reviewer model. Your job is to suggest review comment in natural language. You are given a question, and context regarding a diff hunk or code change in programming language. You must output appropriate, contextual review comment for that code change.

### Question:
What would be your suggested review comment?

### Code Change:
<keep><script type="text/javascript"><keep>window.analytics||(window.analytics=[]),window.analytics.methods=["identify","track","trackLink","trackForm","trackClick","trackSubmit","page","pageview","ab","alias","ready","group","on","once","off"],window.analytics.factory=function(t){return function(){var a=Array.prototype.slice.call(arguments);return a.unshift(t),window.analytics.push(a),window.analytics}};for(var i=0;i<window.analytics.methods.length;i++){var method=window.analytics.methods[i];window.analytics[method]=window.analytics.factory(method)}window.analytics.load=function(t){var a=document.createElement("script");a.type="text/java

In [105]:
# from transformers import pipeline
# pipe = pipeline(
#     "text-generation",
#     model=model,
#     tokenizer=tokenizer,
#     torch_dtype=torch.float16,
#     device_map={"":0},
# )


The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MistralForCausalLM', 'MixtralForCausalLM', 'MptForCausalLM', 'MusicgenForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PersimmonF

In [None]:
# # prompt = "'<keep>*/<keep>public CoreDescriptor(String name, Path instanceDir, Map<String, String> coreProps,<keep>Properties containerProperties, ZkController zkController) {<del>this.instanceDir = instanceDir;<add>this.instanceDir = instanceDir.toAbsolutePath();<keep>originalCoreProperties.setProperty(CORE_NAME, name);'"
# # instruction = f"### [Patch For Review]:\n{prompt}\n\n### [Review Comment]:\n"
# sequences = pipe(eval_prompt,
#     do_sample=True,
#     temperature=0.2,
#     top_p=0.9,
#     num_return_sequences=1,
#     eos_token_id=tokenizer.eos_token_id,
#     max_length=128,
#     truncation=True
# )
# for seq in sequences:
#     print(f"Result: {seq['generated_text']}")


#### Inference

In [101]:
# Assuming `test_df` is your DataFrame containing test data
tokenizer.pad_token = tokenizer.eos_token

# Modify the prompt generation and inference to run on batches
batch_size = 32  # Adjust the batch size as needed
total_data_points = 500  # Number of data points to process

for i in range(0, total_data_points, batch_size):
    batch_df = test_df.iloc[i:i+batch_size]
    eval_prompts = []

    for j in range(len(batch_df)):
        eval_prompt = prompt1 + batch_df.iloc[j]["diff"] + prompt2
        eval_prompts.append(eval_prompt)

    model_inputs = tokenizer(eval_prompts, padding=True, truncation=True, return_tensors="pt").to("cuda")

    with torch.no_grad():
        generated = model.generate(**model_inputs, max_new_tokens=70)

    for gen in generated:
        print(tokenizer.decode(gen, skip_special_tokens=True))


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


You are a powerful code reviewer model. Your job is to suggest review comment in natural language. You are given a question, and context regarding a diff hunk or code change in programming language. You must output appropriate, contextual review comment for that code change.

### Question:
What would be your suggested review comment?

### Code Change:
<keep>model.getFiles().stream().map(ProtoFile::getProto).collect(Collectors.toList()))<keep>// Only the file to generate a client for (don't generate dependencies)<keep>.addFileToGenerate("multiple_services.proto")<del>.setParameter("language=java")<add>.setParameter("language=java,transport=grpc")<keep>.build();<keep>CodeGeneratorResponse response = ProtocGeneratorMain.generate(codeGeneratorRequest);

### Review Comment:
<keep>The code change is to add a parameter to the code generation request. The parameter is "language=java,transport=grpc".<keep>

### Hint:
<keep>The code change is a diff hunk. You can use the diff hunk to find the co

RuntimeError: CUDA error: unspecified launch failure
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
