In [1]:
%%capture
# Environment bootstrap for this notebook; the %%capture cell magic hides the
# installer output.
# Uninstall the preinstalled torch stack first (presumably so the pinned
# versions below do not conflict with it).
!pip install pip3-autoremove
!pip-autoremove torch torchvision torchaudio -y
# torch and xformers are pinned to exact versions; triton, torchvision and
# torchaudio are left unpinned.
!pip install "torch==2.4.0" "xformers==0.0.27.post2" triton torchvision torchaudio
# Install Unsloth straight from its GitHub repository with the "kaggle-new" extra.
!pip install "unsloth[kaggle-new] @ git+https://github.com/unslothai/unsloth.git"

import os
# Disable Weights & Biases logging for the training run.
# NOTE(review): the training log reports that WANDB_DISABLED is deprecated and
# will be removed in v5; the suggested replacement is the `report_to` setting.
os.environ["WANDB_DISABLED"] = "true"

In [2]:
from unsloth import FastLanguageModel
import torch
from datasets import load_dataset, Dataset
from transformers import TrainingArguments
from trl import SFTTrainer
import pandas as pd
import xformers.ops.fmha as xformers

# --- Loading configuration -------------------------------------------------
# These four names are reused further down when the saved model is reloaded.
model_name = "unsloth/Meta-Llama-3.1-8B"
max_seq_length = 2048
load_in_4bit = True
dtype = None  # no explicit dtype is requested from the loader

# --- Base model and tokenizer ----------------------------------------------
base_load_kwargs = dict(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
model, tokenizer = FastLanguageModel.from_pretrained(**base_load_kwargs)

# --- LoRA wrapping ---------------------------------------------------------
# Adapters are attached to the attention projections and to the MLP projections.
attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]
mlp_projections = ["gate_proj", "up_proj", "down_proj"]

model = FastLanguageModel.get_peft_model(
    model,
    target_modules=attention_projections + mlp_projections,
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_rslora=False,
    loftq_config=None,
    use_gradient_checkpointing="unsloth",
    random_state=3407,
)

# Alpaca-style prompt template shared by training and inference.
# Placeholders: {Question}, {Option_A}..{Option_D} and {GT}.
# - Training (formatting_prompts_func) fills {GT} with the ground-truth answer.
# - Inference passes GT="" so the text ends right after "### Response:" and the
#   model generates the answer itself.
# The template text is part of what the model is trained on, so it must be kept
# identical between training and inference.
alpaca_prompt = """You are given a multiple-choice question (MCQ) from a Cyber Threat Intelligence (CTI) knowledge benchmark dataset. Your task is to choose the best option among the four provided. Return your answer as a single uppercase letter: A, B, C, or D. :

### Question: 
{Question}

### Input:
A) {Option_A}
B) {Option_B}
C) {Option_C}
D) {Option_D}

### Response:
{GT}"""

# End-of-sequence token of the loaded tokenizer; formatting_prompts_func appends
# it to every training example.
EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func(examples):
    """Render a batch of MCQ rows into training texts.

    `examples` is a column-oriented batch (column name -> sequence of values)
    providing the columns "Question", "Option A", "Option B", "Option C",
    "Option D" and "GT". Each row is substituted into `alpaca_prompt` and
    `EOS_TOKEN` is appended to the rendered string.

    Returns:
        A dict with the rendered strings stored under the key "text".
    """
    columns = (
        examples["Question"],
        examples["Option A"],
        examples["Option B"],
        examples["Option C"],
        examples["Option D"],
        examples["GT"],  # the correct answer for each row
    )
    rendered = [
        alpaca_prompt.format(
            Question=stem,
            Option_A=opt_a,
            Option_B=opt_b,
            Option_C=opt_c,
            Option_D=opt_d,
            GT=answer,
        ) + EOS_TOKEN
        for stem, opt_a, opt_b, opt_c, opt_d, answer in zip(*columns)
    ]
    return {"text": rendered}

def load_tsv_to_dataset(tsv_path, encoding='ISO-8859-1'):
    """Read a tab-separated file with pandas and wrap it in a `Dataset`.

    Args:
        tsv_path: Location of the TSV file.
        encoding: Text encoding handed to `pd.read_csv`; defaults to 'ISO-8859-1'.

    Returns:
        The `Dataset` built from the parsed DataFrame.
    """
    frame = pd.read_csv(tsv_path, encoding=encoding, sep='\t')
    dataset = Dataset.from_pandas(frame)
    return dataset

# Read both splits; load_tsv_to_dataset falls back to its default
# 'ISO-8859-1' encoding because no encoding is passed here.
train_tsv_path = "/kaggle/input/mydataa/cti_mcq_final.tsv"
test_tsv_path = "/kaggle/input/mydataa/cti-mcq.tsv"
train_dataset = load_tsv_to_dataset(train_tsv_path)
test_dataset = load_tsv_to_dataset(test_tsv_path)

# Add the rendered "text" column used for training. batched=True hands
# formatting_prompts_func whole column batches instead of single rows.
train_dataset = train_dataset.map(formatting_prompts_func, batched=True)

# Configure the supervised fine-tuning trainer.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    dataset_text_field="text",  # column produced by formatting_prompts_func
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,  # effective batch size: 2 * 4 = 8
        warmup_steps=5,
        # max_steps takes precedence over num_train_epochs (the trainer logs
        # "max_steps is given, it will override any value given in
        # num_train_epochs"), so this run stops after 60 optimizer steps, i.e.
        # roughly 60 * 8 = 480 examples. For a full pass over the data, remove
        # max_steps and rely on num_train_epochs instead.
        num_train_epochs=1,
        max_steps=60,
        learning_rate=2e-4,
        # Use bf16 when the GPU supports it, otherwise fall back to fp16.
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        # Explicitly disable external logging integrations. This is the
        # replacement the library recommends for the deprecated WANDB_DISABLED
        # environment variable.
        report_to="none",
    ),
)

# Run fine-tuning; whatever trainer.train() returns is kept in trainer_stats.
trainer_stats = trainer.train()

# Persist the model first and then the tokenizer, both into "lora_model".
for artifact in (model, tokenizer):
    artifact.save_pretrained("lora_model")

# Reload from the "lora_model" directory written above, reusing the loading
# settings from the initial load, then switch the model to inference mode.
reload_kwargs = {
    "model_name": "lora_model",
    "max_seq_length": max_seq_length,
    "dtype": dtype,
    "load_in_4bit": load_in_4bit,
}
model, tokenizer = FastLanguageModel.from_pretrained(**reload_kwargs)
FastLanguageModel.for_inference(model)

from transformers import TextStreamer

# Render one sample question. GT is left empty so the prompt ends right after
# "### Response:" and the model has to produce the answer.
sample_prompt = alpaca_prompt.format(
    Question="An adversary leveraging the technique Abuse Elevation Control Mechanism: Setuid and Setgid is targeting which systems from the MITRE ATT&CK Enterprise matrix?",
    Option_A="Windows",
    Option_B="macOS",
    Option_C="Linux and macOS",
    Option_D="Windows and Linux",
    GT="",
)

# Tokenize as a batch of one and move the tensors to the GPU.
inputs = tokenizer([sample_prompt], return_tensors="pt").to("cuda")

# Stream the decoded text to the cell output while generating.
text_streamer = TextStreamer(tokenizer)
_ = model.generate(max_new_tokens=128, streamer=text_streamer, **inputs)


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.44.0.
   \\   /|    GPU: Tesla T4. Max memory: 14.741 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.0+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.27.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/345 [00:00<?, ?B/s]

Unsloth 2024.8 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


Map:   0%|          | 0/4776 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Map (num_proc=2):   0%|          | 0/4776 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 4,776 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,2.2096
2,2.3173
3,2.2867
4,2.1414
5,1.8923
6,1.6028
7,1.2609
8,1.0899
9,1.1191
10,0.971


==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.44.0.
   \\   /|    GPU: Tesla T4. Max memory: 14.741 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.0+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.27.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
<|begin_of_text|>You are given a multiple-choice question (MCQ) from a Cyber Threat Intelligence (CTI) knowledge benchmark dataset. Your task is to choose the best option among the four provided. Return your answer as a single uppercase letter: A, B, C, or D. :

### Question: 
An adversary leveraging the technique Abuse Elevation Control Mechanism: Setuid and Setgid is targeting which systems from the MITRE ATT&CK Enterprise matrix?

### Input:
A) Windows
B) macOS
C) Linux and macOS
D) Windows and Linux

### Response:
C<|end_of_text|>


In [3]:

FastLanguageModel.for_inference(model)

from transformers import TextStreamer

# Second spot check: a question about locating setgid files. GT stays empty so
# the model generates the answer.
setgid_fields = {
    "Question": "Which of the following commands would an adversary use to find files with the setgid bit set on a UNIX-based system?",
    "Option_A": "find / -perm +4000 2>/dev/null",
    "Option_B": "find / -perm +2000 2>/dev/null",
    "Option_C": "ls -l | grep 's'",
    "Option_D": "grep -R setgid ",
    "GT": "",
}
setgid_prompt = alpaca_prompt.format(**setgid_fields)

# Tokenize as a batch of one and move the tensors to the GPU.
inputs = tokenizer([setgid_prompt], return_tensors="pt").to("cuda")

# Stream the decoded text to the cell output while generating.
text_streamer = TextStreamer(tokenizer)
_ = model.generate(max_new_tokens=60, streamer=text_streamer, **inputs)





<|begin_of_text|>You are given a multiple-choice question (MCQ) from a Cyber Threat Intelligence (CTI) knowledge benchmark dataset. Your task is to choose the best option among the four provided. Return your answer as a single uppercase letter: A, B, C, or D. :

### Question: 
Which of the following commands would an adversary use to find files with the setgid bit set on a UNIX-based system?

### Input:
A) find / -perm +4000 2>/dev/null
B) find / -perm +2000 2>/dev/null
C) ls -l | grep's'
D) grep -R setgid 

### Response:
A<|end_of_text|>


In [4]:
from transformers import TextStreamer
FastLanguageModel.for_inference(model)

# Read the test split.
test_dataset = load_tsv_to_dataset("/kaggle/input/mydataa/cti-mcq.tsv")

# Decoded model output, one entry per test row, in dataset order.
model_responses = []

for example in test_dataset:
    # Fill the shared template; GT is empty so the model writes the answer.
    prompt_fields = {
        "Question": example["Question"],
        "Option_A": example["Option A"],
        "Option_B": example["Option B"],
        "Option_C": example["Option C"],
        "Option_D": example["Option D"],
        "GT": "",
    }
    prompt = alpaca_prompt.format(**prompt_fields)

    # Batch of one, moved to the GPU.
    inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

    # A fresh streamer per example echoes the text to the cell output.
    text_streamer = TextStreamer(tokenizer)
    generated_output = model.generate(
        **inputs,
        streamer=text_streamer,
        max_new_tokens=128,
    )

    # Decode the first (only) returned sequence without special tokens.
    response = tokenizer.decode(generated_output[0], skip_special_tokens=True)
    model_responses.append(response)

# Pair every question with its decoded output and write the table as CSV.
result_columns = {
    "Question": test_dataset["Question"],
    "Model_Response": model_responses,
}
output_df = pd.DataFrame(result_columns)
output_df.to_csv("model_responses.csv", index=False)


<|begin_of_text|>You are given a multiple-choice question (MCQ) from a Cyber Threat Intelligence (CTI) knowledge benchmark dataset. Your task is to choose the best option among the four provided. Return your answer as a single uppercase letter: A, B, C, or D. :

### Question: 
Which of the following mitigations involves preventing applications from running that haven't been downloaded from legitimate repositories?

### Input:
A) Audit
B) Execution Prevention
C) Operating System Configuration
D) User Account Control

### Response:
B<|end_of_text|>
<|begin_of_text|>You are given a multiple-choice question (MCQ) from a Cyber Threat Intelligence (CTI) knowledge benchmark dataset. Your task is to choose the best option among the four provided. Return your answer as a single uppercase letter: A, B, C, or D. :

### Question: 
Which data source is recommended for monitoring commands that may circumvent mechanisms designed to control elevation of privileges?

### Input:
A) Command
B) File
C) Pr

In [5]:
import pandas as pd
from transformers import TextStreamer
from unsloth import FastLanguageModel

# Assumes `model`, `tokenizer`, `alpaca_prompt` and `load_tsv_to_dataset` were
# defined by the earlier cells.
FastLanguageModel.for_inference(model)

# Load the test dataset
test_dataset = load_tsv_to_dataset("/kaggle/input/mydataa/cti-mcq.tsv")

# Single source of truth for the output path, so the file that is written and
# the confirmation message printed at the end cannot drift apart.
responses_tsv_path = "My_model_responses.tsv"

# Decoded model output, one entry per test row, in dataset order.
model_responses = []

# One streamer is enough for the whole loop: generate() finalizes the streamer
# at the end of every call, which resets it for the next prompt.
text_streamer = TextStreamer(tokenizer)

# Process each example in the test dataset
for example in test_dataset:
    # Fill the shared template; GT is empty so the model writes the answer.
    prompt = alpaca_prompt.format(
        Question=example["Question"],
        Option_A=example["Option A"],
        Option_B=example["Option B"],
        Option_C=example["Option C"],
        Option_D=example["Option D"],
        GT=""  # Leave this blank for generation
    )

    # Tokenize as a batch of one and move the tensors to the GPU.
    inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

    # Generate the model's response (streamed to the cell output as it is produced).
    generated_output = model.generate(**inputs, streamer=text_streamer, max_new_tokens=128)

    # The whole returned sequence is decoded, so the stored text still contains
    # the prompt; the later cleaning cell relies on that when it searches for
    # "### Response:" to pull out the answer letter.
    response = tokenizer.decode(generated_output[0], skip_special_tokens=True)

    model_responses.append(response.strip())

# One-column table holding the decoded outputs.
output_df = pd.DataFrame({
    "Model_Response": model_responses
})

# Save the responses to a TSV file (tab-separated)
output_df.to_csv(responses_tsv_path, sep='\t', index=False)

print(f"Model responses saved to {responses_tsv_path}")


<|begin_of_text|>You are given a multiple-choice question (MCQ) from a Cyber Threat Intelligence (CTI) knowledge benchmark dataset. Your task is to choose the best option among the four provided. Return your answer as a single uppercase letter: A, B, C, or D. :

### Question: 
Which of the following mitigations involves preventing applications from running that haven't been downloaded from legitimate repositories?

### Input:
A) Audit
B) Execution Prevention
C) Operating System Configuration
D) User Account Control

### Response:
B<|end_of_text|>
<|begin_of_text|>You are given a multiple-choice question (MCQ) from a Cyber Threat Intelligence (CTI) knowledge benchmark dataset. Your task is to choose the best option among the four provided. Return your answer as a single uppercase letter: A, B, C, or D. :

### Question: 
Which data source is recommended for monitoring commands that may circumvent mechanisms designed to control elevation of privileges?

### Input:
A) Command
B) File
C) Pr

In [6]:
import re
import pandas as pd

# Matches the answer letter that follows the "### Response:" marker. The
# trailing \b requires the letter to stand alone, so free-text output such as
# "Command" or "Based on ..." is not mistaken for answer C or B.
ANSWER_PATTERN = re.compile(r'### Response:\s*([A-D])\b')


def extract_answer(response):
    """Return the answer letter (A-D) found after '### Response:'.

    Args:
        response: One cell of the "Model_Response" column. Anything that is
            not a string (for example NaN from an empty TSV cell) is treated
            as having no answer.

    Returns:
        The single uppercase letter, or "" when no stand-alone letter follows
        the marker.
    """
    if not isinstance(response, str):
        return ""
    found = ANSWER_PATTERN.search(response)
    return found.group(1) if found else ""


# Load the model responses TSV file
model_responses_df = pd.read_csv("My_model_responses.tsv", sep='\t')

# One extracted letter (or "") per row, in the original row order.
cleaned_responses = [
    extract_answer(response) for response in model_responses_df["Model_Response"]
]

# Create a new DataFrame with the cleaned responses
cleaned_df = pd.DataFrame({
    "Model_Response": cleaned_responses
})

# Save the cleaned responses to a new TSV file
cleaned_df.to_csv("cleaned_model_responses.tsv", sep='\t', index=False)

print("Cleaned responses saved to cleaned_model_responses.tsv")


Cleaned responses saved to cleaned_model_responses.tsv
