In [1]:
from datasets import load_from_disk

# Load the saved dataset dictionary from local disk.
# NOTE(review): this is an absolute, user-specific path; the cell will not run on another machine as-is.
dataset_dict = load_from_disk('/home/grenders95/710/710_project/data/training/hf_datasets/newreqs_clearedfields')

# Only the 'test' split is read from the dictionary in this notebook.
test_dataset_nr = dataset_dict['test']

In [2]:
# Print the test split's summary (feature names and number of rows).
print(test_dataset_nr)

Dataset({
    features: ['org_repo_name', 'requirements', 'sbom', 'mod_manifest_newreqs', 'num_dependencies', 'sbom_data_cleared', '__index_level_0__'],
    num_rows: 532
})


In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Checkpoint identifier that both the model and the tokenizer are loaded from.
base_model_id = "meta-llama/Llama-2-7b-hf"
# Quantization settings: 4-bit loading, NF4 quant type, double quantization,
# and bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load the causal LM with the quantization config above; device placement is
# delegated to the library via device_map="auto".
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,  
    quantization_config=bnb_config,  
    device_map="auto",
    # NOTE(review): trust_remote_code=True allows code shipped with the
    # checkpoint to run locally - confirm it is actually required here.
    trust_remote_code=True,
)

# Tokenizer for the same checkpoint, with BOS insertion requested explicitly.
eval_tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    add_bos_token=True,
    trust_remote_code=True,
)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
from peft import PeftModel

# Wrap the quantized base model with the PEFT weights saved at this checkpoint directory.
# NOTE(review): absolute, user-specific path - must exist on the machine running the notebook.
ft_model = PeftModel.from_pretrained(base_model, "/home/grenders95/model_ckpts/llama2_newreqscutcont_329/checkpoint-1000")

In [4]:
def tokenize(prompt):
    """Encode ``prompt`` with ``eval_tokenizer`` and attach a ``labels`` entry.

    The returned object is the tokenizer's own output, extended with a
    ``labels`` key holding a copy of its ``input_ids``.
    """
    encoding = eval_tokenizer(prompt)
    token_ids = encoding["input_ids"]
    encoding["labels"] = token_ids.copy()
    return encoding

In [11]:
def generate_llama2_evaluation_prompt(data_point):
    """Build the evaluation prompt string for one data point.

    Args:
        data_point: Mapping that provides the manifest text under the
            ``'mod_manifest_newreqs'`` key.

    Returns:
        The instruction text wrapped as ``<s>[INST] ... [/INST]</s>`` and
        terminated by a single newline.

    NOTE(review): the literal ``<s>`` is part of the text while the tokenizer
    in this notebook is loaded with ``add_bos_token=True``; this may yield two
    BOS tokens - confirm it matches the format used during fine-tuning.
    """
    task_text = f"Given a target manifest file, construct the corresponding SPDX SBOM in .json format.\n\n### Target manifest:\n{data_point['mod_manifest_newreqs']}\n\n### SPDX SBOM:\n"
    wrapped = f"<s>[INST] {task_text} [/INST]</s>"
    return f"{wrapped}\n"

Llama2 Evaluation Prompt:
 <s>[INST] Given a target manifest file, construct the corresponding SPDX SBOM in .json format.

### Target manifest:
Org: huawei-noah
Repo: smarts
downloadLocation: git+https://github.com/huawei-noah/SMARTS
licenseDeclared: MIT
Dependancies:
absl-py==2.1.0
aiosignal==1.3.1
astunparse==1.6.3
attrs==23.2.0
Automat==22.10.0
cachetools==5.3.2
certifi==2024.2.2
charset-normalizer==3.3.2
click==8.1.7
cloudpickle==3.0.0
constantly==23.10.4
coverage==7.4.1
decorator==5.1.1
dm-tree==0.1.8
exceptiongroup==1.2.0
execnet==2.0.2
filelock==3.13.1
flatbuffers==23.5.26
frozenlist==1.4.1
fsspec==2024.2.0
future==0.18.3
gast==0.4.0
google-auth==2.27.0
google-auth-oauthlib==1.0.0
google-pasta==0.2.0
grpcio==1.60.1
h5py==3.10.0
hyperlink==21.0.0
idna==3.6
importlib-metadata==7.0.1
importlib-resources==6.1.1
incremental==22.10.0
iniconfig==2.0.0
Jinja2==3.1.3
jsonschema==4.21.1
jsonschema-specifications==2023.12.1
keras==2.13.1
libclang==16.0.6
Markdown==3.5.2
MarkupSafe==2.1.5
m

In [None]:
def create_batches(dataset, batch_size=8):
    """Split ``dataset`` into consecutive batches of at most ``batch_size`` items.

    Args:
        dataset: Any iterable of examples. It is materialised into a list once
            so that it can be sliced.
        batch_size: Maximum number of items per batch; must be at least 1.

    Returns:
        A list of lists that preserves the input order. The final batch may be
        shorter than ``batch_size``; an empty input yields ``[]``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1. Without this check a
            negative size would silently produce no batches at all.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    items = list(dataset)
    return [items[start:start + batch_size] for start in range(0, len(items), batch_size)]


In [None]:
def generate_for_batch(batch, tokenizer, model, max_input_length=500, max_new_tokens=1380):
    """Generate one decoded text per item in ``batch``.

    Args:
        batch: Sequence of data points; each one is converted to a prompt with
            ``generate_llama2_evaluation_prompt``.
        tokenizer: Tokenizer used to encode the prompts and decode the output.
            Prompts are padded, so a pad token must already be configured.
        model: Object exposing ``generate``. The encoded inputs are moved to
            ``"cuda"`` before the call.
        max_input_length: Token limit applied to each prompt; longer prompts
            are truncated. Defaults to 500.
        max_new_tokens: Maximum number of tokens generated per prompt.
            Defaults to 1380.

    Returns:
        A list of strings, one per generated sequence, decoded with special
        tokens skipped.

    NOTE(review): for causal LMs ``generate`` normally returns the prompt
    tokens followed by the continuation, so each string presumably still
    contains the prompt text - confirm downstream parsing expects that.
    NOTE(review): truncation presumably cuts from the end of the prompt, which
    would drop the closing ``[/INST]</s>`` for manifests longer than
    ``max_input_length`` tokens - verify the limit against the prompt lengths.
    """
    prompts = [generate_llama2_evaluation_prompt(item) for item in batch]

    # Batched generation with a decoder-only model needs left padding so every
    # prompt ends exactly where generation begins; with right padding, shorter
    # prompts would be continued after a run of pad tokens. The caller's
    # padding side is restored afterwards so the tokenizer is not left mutated.
    previous_padding_side = tokenizer.padding_side
    tokenizer.padding_side = "left"
    try:
        inputs = tokenizer(
            prompts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_input_length,
        ).to("cuda")
    finally:
        tokenizer.padding_side = previous_padding_side

    # Mixed-precision inference without gradient tracking. torch.autocast is
    # referenced through the torch module because no bare `autocast` name is
    # imported in this notebook.
    with torch.no_grad(), torch.autocast(device_type="cuda"):
        generated_tokens = model.generate(**inputs, max_new_tokens=max_new_tokens, use_cache=True)

    return [tokenizer.decode(tokens, skip_special_tokens=True) for tokens in generated_tokens]


In [None]:
# Reuse the EOS token as the padding token. generate_for_batch tokenizes with
# padding=True, which presumably requires a pad token to be defined.
eval_tokenizer.pad_token = eval_tokenizer.eos_token

In [None]:
from tqdm import tqdm

# Number of prompts passed to each generate_for_batch call.
batch_size = 3  
batches = create_batches(test_dataset_nr, batch_size)
# Accumulates the decoded texts across batches, in batch order.
all_generated_sboms = []

# Run generation batch by batch with a progress bar; results are appended in order.
for batch in tqdm(batches, desc="Generating SBOMs"):
    generated_sboms = generate_for_batch(batch, eval_tokenizer, ft_model)
    all_generated_sboms.extend(generated_sboms)


In [None]:
import pandas as pd

# Normalise to a pandas DataFrame: an existing DataFrame is used as-is,
# anything else is converted through its to_pandas() method.
test_df = test_dataset_nr if isinstance(test_dataset_nr, pd.DataFrame) else test_dataset_nr.to_pandas()

In [None]:
# Attach the generated texts as a new column. This relies on all_generated_sboms
# holding exactly one entry per row of test_df, in the same order.
test_df['generated_sbom'] = all_generated_sboms

In [None]:
from pathlib import Path

# Destination for the test DataFrame including the generated_sbom column.
csv_file_path = '/home/grenders95/710/710_project/data/excel/llama2_no_relationships_withGT/llama2_inference_april4_df.csv'

# Create the output directory first: to_csv does not create missing parent
# directories, and failing here would come only after the long generation run.
Path(csv_file_path).parent.mkdir(parents=True, exist_ok=True)

test_df.to_csv(csv_file_path, index=False)

print(f"Updated results saved to {csv_file_path}")