In [1]:
import os
import re
import yaml
import json
import torch
import pickle
from unsloth import FastLanguageModel
from tqdm import tqdm
import pandas as pd

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 10-07 08:25:09 [__init__.py:244] Automatically detected platform cuda.


In [2]:
sft_model = "/mnt/data/training-outputs/Llama-3.1-8B-Malware-Expert/checkpoint-271"

sft_system_message = """You are an AI Security Analyst in Cyberthreat Intelligence (CTI). 
                    Your task is to identify all malwares referenced or implied in a CTI report. 
                    You MUST return a json with a field "objects" being a list of json objects 
                    that describe malwares.
                    To describe a malware you should provide the fields id, type, name and is_family.
                    Instead of using UUID in the id field, use the rule type--name for generating ids.
                    If no malwares are identified return a json with an empty list "objects".
                    Identify all malwares in the folowing CTI report: """

In [3]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = sft_model,
    fast_inference = False,
    load_in_4bit = False,
    max_seq_length = None,
    gpu_memory_utilization = 0.8
)

==((====))==  Unsloth 2025.6.8: Fast Llama patching. Transformers: 4.53.0. vLLM: 0.9.1.
   \\   /|    NVIDIA H100 PCIe. Num GPUs = 1. Max memory: 79.179 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 9.0. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Unsloth 2025.6.8 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [4]:
from transformers import TextStreamer

FastLanguageModel.for_inference(model)
text_streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

def format_input_prompt(system_message, user_input):
    formatted_input = [
        {"role": "assistant", "content": system_message},
        {"role": "user", "content": user_input}
    ]
    return formatted_input

def format_validation_example_for_inference(example):
    return example.split("<|start_header_id|>user<|end_header_id|>")[1].split("<|eot_id|><|start_header_id|>assistant<|end_header_id|>")[0]

def inference(model, system_message, user_input, max_new_tokens=None, **kwargs):
    input_ids = tokenizer.apply_chat_template(
        format_input_prompt(system_message, user_input),
        add_generation_prompt=True,
        return_tensors = "pt").to("cuda")
    if not max_new_tokens:
        max_new_tokens = model.config.max_position_embeddings - input_ids.shape[-1]
    model.generate(input_ids, streamer = text_streamer, max_new_tokens=max_new_tokens, **kwargs)

def predict(model, system_message, user_input, max_new_tokens=None, **kwargs):
    input_ids = tokenizer.apply_chat_template(
        format_input_prompt(system_message, user_input),
        add_generation_prompt=True,
        return_tensors = "pt").to("cuda")
    if not max_new_tokens:
        max_new_tokens = model.config.max_position_embeddings - input_ids.shape[-1]
    
    output_ids = model.generate(input_ids, max_new_tokens=max_new_tokens, **kwargs)
    result = tokenizer.batch_decode(output_ids)
    processed_result = result[0].split("<|start_header_id|>assistant<|end_header_id|>\n\n")[-1].split("<|eot_id|>")[0]
    return processed_result

In [5]:
def load_json(path:str, filename:str):
    with open(os.path.join(path, filename), mode="r", encoding="utf-8") as f:
        return json.load(f)
    
def format_example(example:dict, system_message):
        formatted_example = [
            {"role": "assistant", "content": system_message},
            {"role": "user", "content": example["input"]},
            {"role": "assistant", "content": json.dumps(example["output"])}
        ]
        return formatted_example

In [6]:
train_path = "/mnt/data/openCTI/splitted-io-pairs/train"
eval_path = "/mnt/data/openCTI/splitted-io-pairs/validation"
train_inputs = []
train_outputs = []
eval_inputs = []
eval_outputs = []
include_cti_type = ["malware"]

for file in os.listdir(train_path):
    cti_type = file.split("--")[0]
    if cti_type not in include_cti_type:
        continue
    example = load_json(train_path, file)
    train_inputs.append(example["input"])
    train_outputs.append(example["output"])

for file in os.listdir(eval_path):
    cti_type = file.split("--")[0]
    if cti_type not in include_cti_type:
        continue
    example = load_json(eval_path, file)
    eval_inputs.append(example["input"])
    eval_outputs.append(example["output"])

In [7]:
system_message = sft_system_message

train_preds = [predict(model,
                 system_message,
                 user_input,
                 max_new_tokens=500,
                 temperature=0.6,
                 top_p=0.6,
                 repetition_penalty=1.1,
                 no_repeat_ngram_size=3,
                 do_sample=True) for user_input in tqdm(train_inputs)]

  0%|          | 0/1858 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
LlamaForCausalLM has no `_prepare_4d_causal_attention_mask_with_cache_position` method defined in its base modeling class. Compiled forward passes will be sub-optimal. If you're writing code, see Llama for an example implementation. If you're a user, please report this issue on GitHub.
100%|██████████| 1858/1858 [1:19:43<00:00,  2.57s/it]


In [8]:
eval_preds = [predict(model,
                 system_message,
                 user_input,
                 max_new_tokens=500,
                 temperature=0.6,
                 top_p=0.6,
                 repetition_penalty=1.1,
                 no_repeat_ngram_size=3,
                 do_sample=True) for user_input in tqdm(eval_inputs)]

100%|██████████| 301/301 [12:39<00:00,  2.52s/it]


In [29]:
train_prompts = [
    tokenizer.apply_chat_template(
        format_input_prompt(system_message, user_input),
        add_generation_prompt=True,
        tokenize=False
    ) for user_input in train_inputs
]

train_chosen = [json.dumps(p) + "<|eot_id|>" for p in train_outputs]
train_rejected = [p + "<|eot_id|>" for p in train_preds]

train_df = pd.DataFrame(
    {
        "prompt":train_prompts,
        "chosen":train_chosen,
        "rejected":train_rejected
    }
)

In [34]:
train_df[train_df.chosen!=train_df.rejected].to_csv("malware_train_data_for_DPO.csv", index=False)

In [35]:
eval_prompts = [
    tokenizer.apply_chat_template(
        format_input_prompt(system_message, user_input),
        add_generation_prompt=True,
        tokenize=False
    ) for user_input in eval_inputs
]

eval_chosen = [json.dumps(p) + "<|eot_id|>" for p in eval_outputs]
eval_rejected = [p + "<|eot_id|>" for p in eval_preds]

eval_df = pd.DataFrame(
    {
        "prompt":eval_prompts,
        "chosen":eval_chosen,
        "rejected":eval_rejected
    }
)

In [38]:
eval_df[eval_df.chosen!=eval_df.rejected].to_csv("malware_eval_data_for_DPO.csv", index=False)