In [1]:
from datasets import load_from_disk

dataset_dict = load_from_disk('/home/grenders95/710/710_project/data/training/hf_datasets/newreqs_clearedfields')

test_dataset = dataset_dict['test']

In [2]:
print(test_dataset)

Dataset({
    features: ['org_repo_name', 'requirements', 'sbom', 'mod_manifest_newreqs', 'num_dependencies', 'sbom_data_cleared', '__index_level_0__'],
    num_rows: 532
})


In [3]:
import torch
from torch.cuda.amp import autocast
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

base_model_id = "codellama/CodeLlama-7b-hf"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,  
    quantization_config=bnb_config,  
    device_map="auto",
    trust_remote_code=True,
)

eval_tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    add_bos_token=True,
    trust_remote_code=True,
)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
from peft import PeftModel

ft_model = PeftModel.from_pretrained(base_model, "/home/grenders95/model_ckpts/codellama_newreqscutcont_329/checkpoint-1000")

In [4]:
def tokenize(prompt):
    result = eval_tokenizer(prompt)
    result["labels"] = result["input_ids"].copy()
    return result

In [12]:
def generate_codellama_evaluation_prompt(data_point):
 
    instruction = f"Given a target manifest file, construct the corresponding SPDX SBOM in .json format.\n\n### Target manifest:\n{data_point['mod_manifest_newreqs']}\n\n### SPDX SBOM:\n"

    codellama_instruction = f"<s>[INST] {instruction} [/INST]</s>"

    full_prompt = f"{codellama_instruction}\n"
    
    return full_prompt

CodeLlama Evaluation Prompt:
 <s>[INST] Given a target manifest file, construct the corresponding SPDX SBOM in .json format.

### Target manifest:
Org: brainix
Repo: pottery
downloadLocation: git+https://github.com/brainix/pottery
licenseDeclared: Apache-2.0
Dependancies:
async-timeout==4.0.3
Authlib==1.3.0
bandit==1.7.7
certifi==2024.2.2
cffi==1.16.0
charset-normalizer==3.3.2
click==8.1.7
coverage==7.4.1
cryptography==42.0.3
docutils==0.20.1
dparse==0.6.4b0
flake8==7.0.0
hiredis==2.3.2
idna==3.6
importlib-metadata==7.0.1
iniconfig==2.0.0
isort==5.13.2
jaraco.classes==3.3.1
Jinja2==3.1.3
keyring==24.3.0
markdown-it-py==3.0.0
MarkupSafe==2.1.5
marshmallow==3.20.2
mccabe==0.7.0
mdurl==0.1.2
mmh3==4.1.0
more-itertools==10.2.0
mypy==1.8.0
mypy-extensions==1.0.0
nh3==0.2.15
packaging==23.2
pbr==6.0.0
pkginfo==1.9.6
pluggy==1.4.0
pycodestyle==2.11.1
pycparser==2.21
pydantic==1.10.14
pyflakes==3.2.0
Pygments==2.17.2
pytest==8.0.1
pytest-asyncio==0.23.5
pytest-cov==4.1.0
PyYAML==6.0.1
readme-r

In [7]:
def create_batches(dataset, batch_size=8):
    dataset_list = [item for item in dataset]
    batches = [dataset_list[i:i + batch_size] for i in range(0, len(dataset_list), batch_size)]
    return batches


In [8]:
def generate_for_batch(batch, tokenizer, model):
    prompts = [generate_codellama_evaluation_prompt(item) for item in batch]
    inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=500).to("cuda")
    
    # Using autocast for mixed precision to enhance performance
    with torch.no_grad(), autocast():
        generated_tokens = model.generate(**inputs, max_new_tokens=1380, use_cache=True)
    
    generated_texts = [tokenizer.decode(tokens, skip_special_tokens=True) for tokens in generated_tokens]
    return generated_texts


In [None]:
eval_tokenizer.pad_token = eval_tokenizer.eos_token

In [9]:
from tqdm import tqdm

batch_size = 5  
batches = create_batches(test_dataset, batch_size)
all_generated_sboms = []

for batch in tqdm(batches, desc="Generating SBOMs"):
    generated_sboms = generate_for_batch(batch, eval_tokenizer, ft_model)
    all_generated_sboms.extend(generated_sboms)


Generating SBOMs:   0%|          | 0/107 [00:00<?, ?it/s]


ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`.

: 

In [None]:
import pandas as pd

if not isinstance(test_dataset, pd.DataFrame):
    test_df = test_dataset.to_pandas()
else:
    test_df = test_dataset

In [None]:
test_df['generated_sbom'] = all_generated_sboms

In [None]:
csv_file_path = '/home/grenders95/710/710_project/data/eval/inference/april2_codellama_inference.csv'

test_df.to_csv(csv_file_path, index=False)

print(f"Updated results saved to {csv_file_path}")