In [1]:
from accelerate import FullyShardedDataParallelPlugin, Accelerator
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig

fsdp_plugin = FullyShardedDataParallelPlugin(
    state_dict_config=FullStateDictConfig(offload_to_cpu=True, rank0_only=False),
    optim_state_dict_config=FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=False),
)

accelerator = Accelerator(fsdp_plugin=fsdp_plugin)

In [3]:
from datasets import load_from_disk

dataset_dict = load_from_disk('/home/grenders95/710/710_project/data/hf_datasets/newreqs_clearedfields')

train_dataset = dataset_dict['train']
eval_dataset = dataset_dict['validation']
test_dataset = dataset_dict['test']


In [5]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

base_model_id = "meta-llama/Llama-2-7b-hf"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(base_model_id, quantization_config=bnb_config, device_map={"": "cuda"})


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    add_eos_token=True,
    add_bos_token=True, 
)

In [7]:
def tokenize(prompt):
    result = tokenizer(prompt)
    result["labels"] = result["input_ids"].copy()
    return result

In [8]:
def generate_and_tokenize_prompt(data_point):
    # Constructing the instruction to include the target manifest directly
    instruction = f"Given a target manifest file, construct the corresponding SPDX SBOM in .json format.\n\n### Target manifest:\n{data_point['mod_manifest_newreqs']}\n\n### SPDX SBOM:\n"
    
    # Wrapping the complete instruction with CodeLlama's prompt format
    codellama_instruction = f"<s>[INST] {instruction} [/INST]</s>"
    
    full_prompt = f"""{codellama_instruction}

{data_point["sbom_data_cleared"]}
"""
    return tokenize(full_prompt)

In [9]:
tokenized_train_dataset = train_dataset.map(generate_and_tokenize_prompt)
tokenized_val_dataset = eval_dataset.map(generate_and_tokenize_prompt)

Map:   0%|          | 0/1822 [00:00<?, ? examples/s]

Map:   0%|          | 0/304 [00:00<?, ? examples/s]

In [11]:
import numpy as np

token_lengths = [len(entry["input_ids"]) for entry in tokenized_train_dataset]
q1, q3 = np.percentile(token_lengths, [25, 75])
iqr = q3 - q1

# Define outliers as those beyond 1.5 * IQR from the quartiles
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

def filter_outliers(entry):
    return lower_bound <= len(entry["input_ids"]) <= upper_bound

filtered_train_dataset = tokenized_train_dataset.filter(filter_outliers)
filtered_val_dataset = tokenized_val_dataset.filter(filter_outliers)

Filter:   0%|          | 0/1822 [00:00<?, ? examples/s]

Filter:   0%|          | 0/304 [00:00<?, ? examples/s]

In [12]:
import matplotlib.pyplot as plt
import os


def plot_data_lengths(tokenized_train_dataset, tokenized_val_dataset, output_dir, filename):
    lengths = [(x['input_ids'], x['org_repo_name']) for x in tokenized_train_dataset]
    lengths += [(x['input_ids'], x['org_repo_name']) for x in tokenized_val_dataset]
    
    # Sort by length in descending order
    lengths.sort(key=lambda x: len(x[0]), reverse=True)

    # Calculate average length
    avg_length = sum(len(x[0]) for x in lengths) / len(lengths)
    print("Average Length:", avg_length)

    # Print the organization names of the three highest length input_ids
    print("Organization Names of the Three Highest Length Input IDs:")
    for i in range(3):
        print(f"{i+1}. {lengths[i][1]}")

    plt.figure(figsize=(10, 6))
    plt.hist([len(x[0]) for x in lengths], bins=20, alpha=0.7, color='blue')
    plt.xlabel('Length of input_ids')
    plt.ylabel('Frequency')
    plt.title('Distribution of Lengths of input_ids')
    plt.axvline(avg_length, color='red', linestyle='dashed', linewidth=1, label=f'Average Length: {avg_length:.2f}')
    plt.legend()

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    # Save the figure with the specified filename to the output directory
    full_path = os.path.join(output_dir, filename)
    plt.savefig(full_path)
    plt.close() 

In [13]:
plot_data_lengths(filtered_train_dataset, filtered_val_dataset, '/home/grenders95/710/710_project/data/figures', 'filtered_token_fullGHSBOM_newprompt.png')

Average Length: 2450.9221064179924
Organization Names of the Three Highest Length Input IDs:
1. peterh0323_smart_construction
2. zjunlp_deepke
3. home-assistant_supervisor


In [14]:
plot_data_lengths(tokenized_train_dataset, tokenized_val_dataset, '/home/grenders95/710/710_project/data/figures', 'total_token_fullGHSBOM_newprompt.png')

Average Length: 12341.303857008466
Organization Names of the Three Highest Length Input IDs:
1. posthog_posthog
2. aimhubio_aim
3. quay_quay


In [14]:
max_length = 1380 

tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    padding_side="left",
    add_eos_token=True,  
    add_bos_token=True,  
)
tokenizer.pad_token = tokenizer.eos_token


def tokenize(prompt):
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=max_length,
        padding="max_length",
    )
    result["labels"] = result["input_ids"].copy()
    return result

In [15]:
tokenized_train_dataset = filtered_train_dataset.map(generate_and_tokenize_prompt)
tokenized_val_dataset = filtered_val_dataset.map(generate_and_tokenize_prompt)


Map:   0%|          | 0/1577 [00:00<?, ? examples/s]

Map:   0%|          | 0/262 [00:00<?, ? examples/s]

In [17]:
from peft import prepare_model_for_kbit_training

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [19]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [20]:
print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear4bit(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): Lla

In [21]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
    bias="none",
    lora_dropout=0.05,  # Conventional
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

model = accelerator.prepare_model(model)

trainable params: 20277248 || all params: 3520690176 || trainable%: 0.5759452546613406


In [22]:
print(model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): lora.Linear4bit(
                (base_layer): Linear4bi

In [23]:
import wandb, os

wandb.login()
wandb_project = "sme_llama2_finetune_newreqformat_329"
output_dir = "/scratch/gmenderson/llama2_finetune_newreqformat_329"
wandb.init(project=wandb_project, dir=output_dir)


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mgmenderson[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [25]:
import transformers
from datetime import datetime

base_model_name = "llama2-7b"
run_name = base_model_name + "-" + wandb_project


tokenizer.pad_token = tokenizer.eos_token

trainer = transformers.Trainer(
    model=model,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    args=transformers.TrainingArguments(
        output_dir=output_dir,
        warmup_steps=5,
        per_device_train_batch_size=2,
        gradient_checkpointing=True,
        gradient_accumulation_steps=4,
        max_steps=1000,
        learning_rate=1e-4,
        logging_steps=50,
        bf16=True,
        optim="paged_adamw_8bit",
        logging_dir=os.path.join(output_dir, "logs"),        
        save_strategy="steps",       
        save_steps=50,              
        evaluation_strategy="steps", 
        eval_steps=50,               
        do_eval=True,                
        report_to="wandb",           
        run_name=f"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}"          
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

model.config.use_cache = False  
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss,Validation Loss
50,0.4857,0.269214
100,0.2427,0.248646
150,0.2367,0.2394
200,0.2311,0.233553
250,0.2143,0.229577
300,0.2131,0.225658
350,0.2066,0.222413
400,0.1985,0.221424
450,0.1888,0.220885
500,0.1901,0.218925




TrainOutput(global_step=1000, training_loss=0.20349905967712403, metrics={'train_runtime': 17546.5699, 'train_samples_per_second': 0.456, 'train_steps_per_second': 0.057, 'total_flos': 4.387392225275904e+17, 'train_loss': 0.20349905967712403, 'epoch': 5.07})