In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/qa-cybersecurity/Untitled spreadsheet - Sheet1.csv
/kaggle/input/b1-dataset-csv/processed_1.csv


In [2]:
pip install peft bitsandbytes trl datasets

Collecting peft
  Downloading peft-0.14.0-py3-none-any.whl.metadata (13 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl.metadata (2.9 kB)
Collecting trl
  Downloading trl-0.13.0-py3-none-any.whl.metadata (11 kB)
Collecting huggingface-hub>=0.25.0 (from peft)
  Downloading huggingface_hub-0.27.0-py3-none-any.whl.metadata (13 kB)
Collecting transformers (from peft)
  Downloading transformers-4.47.1-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.22,>=0.21 (from transformers->peft)
  Downloading tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading peft-0.14.0-py3-none-any.whl (374 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.8/374.8 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.45.0-py3-none-manylinux_2_

## Logging in Huggingface

In [3]:
from huggingface_hub import login
login('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')

# Import libraries

In [4]:
import os
import torch
from datasets import Dataset, load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
from google.colab import files
import pandas as pd

In [5]:
dataset = pd.read_csv('/kaggle/input/qa-cybersecurity/Untitled spreadsheet - Sheet1.csv')

dataset.head()

Unnamed: 0,index,Question,Answer
0,1,Which of the following would be the best examp...,"C. If you’re doing something as a deterrent, y..."
1,2,"Enacted in 2002, this U.S. law requires every ...",A. FISMA has been around since 2002 and was up...
2,3,Brad has done some research and determined a c...,"B. ALE = ARO × SLE. To determine ARO, divide t..."
3,4,An ethical hacker is hired to test the securit...,"A. In this example, an ethical hacker was hire..."
4,5,When an attack by a hacker is politically moti...,D. Hackers who use their skills and talents to...


In [6]:
dataset['text'] = '<s>[INST] ' + dataset['Question'] + '[/INST] ' + dataset['Answer'] +'</s>'

dataset = Dataset.from_pandas(dataset[['text']])
dataset

Dataset({
    features: ['text'],
    num_rows: 247
})

In [8]:
model_name = 'meta-llama/Llama-3.2-1B'

#LoRA settings for modifying attention mechanisms
lora_r = 64  # Dimension of LoRA attention
lora_alpha = 16  # Scaling factor for LoRA
lora_dropout = 0.1  # Dropout rate in LoRA layers

#4-bit precision settings for model efficiency
use_4bit = True  # Enable 4-bit precision
bnb_4bit_compute_dtype = "float16"  # Data type for computations
bnb_4bit_quant_type = "nf4"  # Quantization method
use_nested_quant = False  # Enable double quantization for more compression


In [9]:
#Training settings
output_dir = "results"  # Where to save model and results
num_train_epochs = 1  # Total number of training epochs
fp16 = False  # Use mixed precision training
bf16 = False  # Use bfloat16 precision with A100 GPUs
per_device_train_batch_size = 4  # Training batch size per GPU
per_device_eval_batch_size = 4  # Evaluation batch size per GPU
gradient_accumulation_steps = 1  # Number of steps for gradient accumulation
gradient_checkpointing = True  # Save memory during training
max_grad_norm = 0.3  # Max norm for gradient clipping
learning_rate = 2e-4  # Initial learning rate
weight_decay = 0.001  # Weight decay for regularization
optim = "paged_adamw_32bit"  # Optimizer choice
lr_scheduler_type = "cosine"  # Learning rate scheduler
max_steps = -1  # Set total number of training steps
warmup_ratio = 0.03  # Warmup ratio for learning rate
group_by_length = True  # Group sequences by length for efficiency
save_steps = 0  # Checkpoint save frequency
logging_steps = 10  # Logging frequency

#Sequence-to-sequence (SFT) training settings
max_seq_length = None  # Max sequence length
packing = False  # Pack short sequences together
device_map = {"": 0}  # Load model on specific GPU

In [10]:
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

#Configuring the 4-bit quantization and precision for the model
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

#Verifying if the current GPU supports bfloat16 to suggest using it for better performance
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Accelerate training with bf16=True")
        print("=" * 80)

#Loading the specified model with the above quantization configuration
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
model.config.use_cache = False  # Disable caching to save memory
model.config.pretraining_tp = 1

config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

In [11]:
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token  # Setting the pad token
tokenizer.padding_side = "right"  # Adjusting padding to the right to avoid issues during training

#Configuring LoRA parameters for the model to fine-tune its attention mechanisms
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",  # Setting the bias option for LoRA
    task_type="CAUSAL_LM",  # Defining the task type as causal language modeling
)

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

In [12]:
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,  # Grouping by length for efficient batching
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard"  # Reporting to TensorBoard for monitoring
)

In [16]:
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    processing_class=tokenizer,
    args=training_arguments,
)

Map:   0%|          | 0/247 [00:00<?, ? examples/s]

In [17]:
trainer.train()

Step,Training Loss
10,2.8361
20,2.7617
30,2.4493
40,2.3813
50,2.4031
60,2.3569


TrainOutput(global_step=62, training_loss=2.5214006823878132, metrics={'train_runtime': 45.8218, 'train_samples_per_second': 5.39, 'train_steps_per_second': 1.353, 'total_flos': 281929146200064.0, 'train_loss': 2.5214006823878132, 'epoch': 1.0})

# Test the model

Đầu vào: Bộ dữ liệu B1 được cung cấp
Đầu ra: Câu trả lời gồm đáp án lựa chọn cùng lời giải thích

Độ chính xác của mô hình được đánh giá dựa vào số câu trả lời trùng với 'ground_truth' của bộ B1

## Tải bộ dataset B1

In [19]:
logging.set_verbosity(logging.CRITICAL)

test_dataset = pd.read_csv('/kaggle/input/b1-dataset-csv/processed_1.csv')
test_dataset.head()

Unnamed: 0,index,question,ground_truth
0,1,You have successfully logged on a Linux system...,A
1,2,What is the following command used for?\r\nsql...,B
2,3,"Sam, a web developer, was instructed to incorp...",D
3,4,Which of the following is assured by the use o...,D
4,5,What is not a PCI compliance recommendation?\r...,C


## Sử dụng model đã được finetune

In [47]:
tokenizer = AutoTokenizer.from_pretrained("gmeowmeow/Finetuned-Llama-model")
model = AutoModelForCausalLM.from_pretrained("gmeowmeow/Finetuned-Llama-model")

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/335 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/719 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/27.3M [00:00<?, ?B/s]

In [48]:
# Assuming `model` and `tokenizer` are already defined
pipe = pipeline(task='text-generation',
                model=model,
                tokenizer=tokenizer, 
                max_new_tokens=250)

for i in range(len(test_dataset)):
  prompt = test_dataset['question'][i]
  result = pipe(f"<s>[INST] {prompt} [/INST]")
  # Creating a new column 'answer' if it doesn't exist to store the results
  # OR Change 'answer' to 'Expected_answer' if your original data has an 'Expected_answer' column for storing the results
  test_dataset.loc[i, 'answer'] = result[0]['generated_text'].split("[/INST]")[1]

In [49]:
test_dataset

Unnamed: 0,index,question,ground_truth,answer
0,1,You have successfully logged on a Linux system...,A,B. auth.fesg</s>
1,2,What is the following command used for?\r\nsql...,B,D. SQLmap is used to search database stateme...
2,3,"Sam, a web developer, was instructed to incorp...",D,C. The OpenPGP standard is used by Sam to en...
3,4,Which of the following is assured by the use o...,D,D. Integrity\r\n</s>
4,5,What is not a PCI compliance recommendation?\r...,C,A. PCI compliance is a set of standards that ...
...,...,...,...,...
145,146,A DDOS attack is performed at layer 7 to take ...,B,D. A DDOS attack is performed at layer 7 to t...
146,147,A post-breach forensic investigation revealed ...,D,A. Correct. A vulnerability in a software co...
147,148,Mirai malware targets loT devices. After infil...,C,C. Mirai uses a DDoS attack to spread. It sen...
148,149,"Thomas, a cloud security professional, is perf...",C,C. Cloud cryptojacking is an attack that targ...


### Đánh giá độ tương đồng của câu trả lời với 'ground_truth'

In [50]:
count = 0
for i in range(len(test_dataset)):
  if (test_dataset['ground_truth'][i] == test_dataset['answer'][i][1]):
    count += 1

print(count)

52


Mô hình dự đoán đúng 52/150 câu, xác suất xấp xỉ 35%

## Test the meta-llama

In [32]:
test_dataset = pd.read_csv('/kaggle/input/b1-dataset-csv/processed_1.csv')

In [33]:
test_dataset

Unnamed: 0,index,question,ground_truth
0,1,You have successfully logged on a Linux system...,A
1,2,What is the following command used for?\r\nsql...,B
2,3,"Sam, a web developer, was instructed to incorp...",D
3,4,Which of the following is assured by the use o...,D
4,5,What is not a PCI compliance recommendation?\r...,C
...,...,...,...
145,146,A DDOS attack is performed at layer 7 to take ...,B
146,147,A post-breach forensic investigation revealed ...,D
147,148,Mirai malware targets loT devices. After infil...,C
148,149,"Thomas, a cloud security professional, is perf...",C


In [42]:
tokenizer_llama = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
model_llama = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B")

pipe = pipeline(task='text-generation',
                model=model_llama,
                tokenizer=tokenizer_llama, 
                max_new_tokens=250)

for i in range(len(test_dataset)):
  prompt = 'You are an expert in cybersecurity. Choose the correct answer and give a succinct explanation' + test_dataset['question'][i]
  result = pipe(prompt)
  # Creating a new column 'answer' if it doesn't exist to store the results
  # OR Change 'answer' to 'Expected_answer' if your original data has an 'Expected_answer' column for storing the results
  test_dataset.loc[i, 'answer'] = result[0]['generated_text']

In [46]:
count = 0
for i in range(len(test_dataset)):
  if (test_dataset['ground_truth'][i] == test_dataset['answer'][i][1]):
    count += 1

print(count)

0


Mô hình khi chưa được finetune không trả về format được yêu cầu, dù đã được thử nhiều câu prompts để cố giúp mô hình trả lời theo định dạng -Choice-Reasoning. 