In [1]:
import warnings
warnings.filterwarnings("ignore")

from pprint import pprint

import json

import os

from unsloth import FastLanguageModel, FastModel
import torch

from multiprocessing import cpu_count
num_proc = cpu_count()

import yaml

from data_processor import SplittedJsonIoDataset
from customs import customize_tokenizer

from unsloth import UnslothTrainer, UnslothTrainingArguments

from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from transformers import TrainingArguments, DataCollatorForSeq2Seq, DataCollatorForLanguageModeling
from unsloth import is_bfloat16_supported

from unsloth.chat_templates import train_on_responses_only

from unsloth import unsloth_train

from utils import save_log_history

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 09-30 09:18:49 [__init__.py:244] Automatically detected platform cuda.


In [2]:
def load_json(path:str, filename:str):
    with open(os.path.join(path, filename), mode="r", encoding="utf-8") as f:
        return json.load(f)
    
def format_example(example:dict, system_message):
        formatted_example = [
            {"role": "assistant", "content": system_message},
            {"role": "user", "content": example["input"]},
            {"role": "assistant", "content": json.dumps(example["output"])}
        ]
        return formatted_example

In [3]:
system_messages = {
    "domain-name":"""You are an AI Security Analyst in Cyberthreat Intelligence (CTI). 
                    Your task is to identify all domain names referenced in a CTI report. 
                    You MUST return a json with a field "objects" being a list of json objects 
                    that describe domain names.
                    To describe a domain name you should provide the fields id, type and value.
                    Instead of using UUID in the id field, use the rule type--value for generating ids.
                    If no domain names are identified return a json with an empty list "objects".
                    Identify all domain names in the folowing CTI report: """,

    "malware":"""You are an AI Security Analyst in Cyberthreat Intelligence (CTI). 
                 Your task is to identify all malwares referenced or implied in a CTI report. 
                 You MUST return a json with a field "objects" being a list of json objects that describe malwares.
                 To describe a malware you should provide the fields id, type, name and is_family.
                 Instead of using UUID in the id field, use the rule type--name for generating ids.
                 For example, an output in which the malware RandomMalware is identified and is not family
                 of some other malware should be like this:
                 
                 {
                     "objects": [
                         {
                             "id": "malware--RandomMalware",
                             "type": "malware",
                             "name": "RandomMalware",
                             "is_family": false
                         }
                     ]
                 }
                 
                 If no malwares are identified return a json with an empty list "objects".
                 Identify all malwares in the folowing CTI report: """, 
}

In [4]:
with open("config.yaml", "r") as f:
    config = yaml.load(f, Loader=yaml.SafeLoader)

model, tokenizer = FastModel.from_pretrained(
    model_name = "unsloth/Qwen3-30B-A3B-Instruct-2507",
    fast_inference = False,
    load_in_4bit = False,
    max_seq_length = None,
    gpu_memory_utilization = 0.8
)

NotImplementedError: Unsloth: unsloth/Qwen3-30B-A3B-Instruct-2507 is not supported in your current Unsloth version! Please update Unsloth via:

pip uninstall unsloth unsloth_zoo -y
pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
pip install --upgrade --no-cache-dir "git+https://github.com/unslothai/unsloth-zoo.git"


In [9]:
from unsloth.chat_templates import get_chat_template

# Example of chat template
convo = [
{"role": "assistant", "content": "SYSTEM MESSAGE PLACEHOLDER"},
{"role": "user", "content": "USER INPUT MESSAGE PLACEHOLDER"},
{"role": "assistant", "content": "MODEL RESPONSE MESSAGE PLACEHOLDER"}
    ]
res = tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False)
print(f"""\nIt follows an example of a formatted instruction using chat template. If instruction_part and
response_part have been defined in config.yaml, please verify their correctness.\n\nCHAT TEMPLATE\n\n{res}""")


It follows an example of a formatted instruction using chat template. If instruction_part and
response_part have been defined in config.yaml, please verify their correctness.

CHAT TEMPLATE

<|im_start|>assistant
SYSTEM MESSAGE PLACEHOLDER<|im_end|>
<|im_start|>user
USER INPUT MESSAGE PLACEHOLDER<|im_end|>
<|im_start|>assistant
<think>

</think>

MODEL RESPONSE MESSAGE PLACEHOLDER<|im_end|>



In [None]:
!pip install --upgrade --force-reinstall --no-cache-dir unsloth unsloth_zoo

In [5]:
for name, param in model.named_parameters():
    if name in ["base_model.model.lm_head.modules_to_save.default.weight", "base_model.model.model.embed_tokens.modules_to_save.default.weight"]:
        param.requires_grad = True

total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'Total number of parameters: {total_params}')

Total number of parameters: 1134559232


In [19]:
from transformers import TextStreamer

text_streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

def format_input_prompt(system_message, user_input):
    formatted_input = [
        {"role": "assistant", "content": system_message},
        {"role": "user", "content": user_input}
    ]
    return formatted_input

def format_validation_example_for_inference(example):
    #return example.split("<|start_header_id|>user<|end_header_id|>")[1].split("<|eot_id|><|start_header_id|>assistant<|end_header_id|>")[0]
    return example.split("<|im_start|>user")[1].split("<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n")[0]
                         
def inference(model, system_message, user_input, max_new_tokens=None, **kwargs):
    input_ids = tokenizer.apply_chat_template(
        format_input_prompt(system_message, user_input),
        add_generation_prompt=True,
        return_tensors = "pt").to("cuda")
    if not max_new_tokens:
        max_new_tokens = model.config.max_position_embeddings - input_ids.shape[-1]
    model.generate(input_ids, streamer = text_streamer, max_new_tokens=max_new_tokens, **kwargs)

In [11]:
train_path = "/mnt/data/openCTI/splitted-io-pairs/train"
validation_path = "/mnt/data/openCTI/splitted-io-pairs/validation"

In [12]:
formatted_train_list = []
formatted_eval_list = []
include_cti_type = ["malware"]#["domain-name"]

for file in os.listdir(train_path):
    cti_type = file.split("--")[0]
    if cti_type not in include_cti_type:#if cti_type in ["relationship", "report"]:
        continue
    example = load_json(train_path, file)
    formatted_example = format_example(example, system_messages[cti_type])
    formatted_train_list.append(formatted_example)

for file in os.listdir(validation_path):
    cti_type = file.split("--")[0]
    if cti_type not in include_cti_type:#if cti_type in ["relationship", "report"]:
        continue
    example = load_json(validation_path, file)
    formatted_example = format_example(example, system_messages[cti_type])
    formatted_eval_list.append(formatted_example)

In [13]:
import datasets

# Add template of the model in examples
templated_train_list = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in formatted_train_list]
templated_eval_list = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in formatted_eval_list]
# Create hf seperated datasets
hf_train = datasets.Dataset.from_list([dict(text=ex) for ex in templated_train_list])
hf_eval = datasets.Dataset.from_list([dict(text=ex) for ex in templated_eval_list])
# Create a hf dataset dict
dataset = datasets.DatasetDict({"train":hf_train, "eval":hf_eval})
# Filter dataset
if config["filter_dataset"]:
    if not config["filter_threshold"]:
        config["filter_threshold"] = tokenizer.model_max_length
    dataset = dataset.filter(lambda x: len(tokenizer.encode(x["text"])) <= config["filter_threshold"])

Filter:   0%|          | 0/1858 [00:00<?, ? examples/s]

Filter:   0%|          | 0/301 [00:00<?, ? examples/s]

In [18]:
"""<|im_end|>
<|im_start|>assistant
<think>

</think>
"""

'<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n'

In [14]:
print(dataset["train"]["text"][0])

<|im_start|>assistant
You are an AI Security Analyst in Cyberthreat Intelligence (CTI). 
                 Your task is to identify all malwares referenced or implied in a CTI report. 
                 You MUST return a json with a field "objects" being a list of json objects that describe malwares.
                 To describe a malware you should provide the fields id, type, name and is_family.
                 Instead of using UUID in the id field, use the rule type--name for generating ids.
                 For example, an output in which the malware RandomMalware is identified and is not family
                 of some other malware should be like this:

                 {
                     "objects": [
                         {
                             "id": "malware--RandomMalware",
                             "type": "malware",
                             "name": "RandomMalware",
                             "is_family": false
                         }
               

In [20]:
print(format_validation_example_for_inference(dataset["train"]["text"][0]))


External reference URL: https://www.fortinet.com/blog/threat-research/new-midgedropper-variant

CTI REPORT

# New MidgeDropper Variant

Affected Platforms: Windows

Impacted Users: Windows users

Impact: Potential to deploy additional malware for additional purposes

Severity Level: Medium

One of the most exciting aspects of malware analysis is coming across a family that is new or rare to the reversing community. Determining the function of the malware, who created it, and the reasons behind it become a mystery to solve. The previously unseen dropper variant we recently found, named MidgeDropper, has a complex infection chain that includes code obfuscation and sideloading, making it an interesting use case. Although we couldn’t obtain the final payload, this blog will still explore what makes this dropper tick.

## Initial Infection Vector

The initial infection vector was not available to FortiGuard Labs at the time of our investigation. However, we strongly suspect it to be a phis

In [21]:
system_message = system_messages["domain-name"]
user_input = format_validation_example_for_inference(dataset["train"]["text"][0])
inference(model,
          system_message, 
          user_input, 
          max_new_tokens=None,
          temperature=0.7,
          top_p=0.6,
          repetition_penalty=1.1,
          no_repeat_ngram_size=3,
          do_sample=True)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Qwen3ForCausalLM has no `_prepare_4d_causal_attention_mask_with_cache_position` method defined in its base modeling class. Compiled forward passes will be sub-optimal. If you're writing code, see Llama for an example implementation. If you're a user, please report this issue on GitHub.


<think>
Okay, let's tackle this query step by step. The user wants me to extract all domain references from the provided CTI reports and format them into a JSON structure with specific fields. 

First, I need to go through both CTI documents carefully. Let me start with the first one from Fortinet. Looking at the sections, there's a mention of URLs in the Network-based IOC table. The entries include:

- hXXps://172.16.1.1:8080 (but wait, no, looking back, actually in the Fortinet report, the URLs listed are:
  - hXXhttp://192.0.2.2:8880
  - http://example.com
  Wait, no. Let's check again. In the FortiNet report, under Network-based IoCs, the entries are:

  - 198.51.64.1 (IP address)
  - https://10.00.101.0:8443 (another IP)
  But wait, the actual Fortinet CTI Report provided earlier had these entries?

Wait, no—the original user input included two CTI Reports. Let my recheck.

Looking back at the user's input, the first CTI is from FortiGate, and then another from OTX. Letme parse ea

In [11]:
config["lora_parameters"]["r"] = 32
config["lora_parameters"]["lora_alpha"] = 32
pprint(config["lora_parameters"])

{'bias': 'none',
 'loftq_config': 'None',
 'lora_alpha': 32,
 'lora_dropout': 0,
 'r': 32,
 'target_modules': ['q_proj',
                    'k_proj',
                    'v_proj',
                    'o_proj',
                    'gate_proj',
                    'up_proj',
                    'down_proj',
                    'lm_head',
                    'embed_tokens'],
 'use_gradient_checkpointing': 'unsloth',
 'use_rslora': True}


In [12]:
# Add LoRA weights
model = FastLanguageModel.get_peft_model(
    model=model,
    **config["lora_parameters"]
)

Unsloth: Offloading input_embeddings to disk to save VRAM
Unsloth: Offloading output_embeddings to disk to save VRAM


Unsloth 2025.6.8 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


Unsloth: Training embed_tokens in mixed precision to save VRAM
Unsloth: Training lm_head in mixed precision to save VRAM


In [9]:
_train_on_responses_only_bool = True
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

In [10]:
config["training_arguments"]["output_dir"] = "Llama-3.1-8B-Domains-Expert"
config["training_arguments"]["seed"] = 4321
config["lr_scheduler_type"] = "cosine"

In [11]:
# Initiate trainer
trainer = UnslothTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset["train"],
    eval_dataset = dataset["eval"],
    data_collator = data_collator,
    dataset_text_field = "text",
    max_seq_length = config["model_loading_args"]["max_seq_length"], # Used only when packing=True for creating a ConstantLengthDataset.
    packing = config["sft_trainer_arguments"]["apply_packing"],
    dataset_num_proc = num_proc,
    args = UnslothTrainingArguments(
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        **config["training_arguments"]
    )
)

Unsloth: Tokenizing ["text"]:   0%|          | 0/1870 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"]:   0%|          | 0/310 [00:00<?, ? examples/s]

In [12]:
# Wrap trainer for apply training using only the assistant part
if _train_on_responses_only_bool:
    trainer = train_on_responses_only(
        trainer,
        instruction_part = config["instruction_part"],
        response_part = config["response_part"]
    )

Map (num_proc=30):   0%|          | 0/1870 [00:00<?, ? examples/s]

Map (num_proc=30):   0%|          | 0/310 [00:00<?, ? examples/s]

In [13]:
config["early_stopping_patience"] = False

if config["early_stopping_patience"]:
    from transformers import EarlyStoppingCallback
    early_stopping_callback = EarlyStoppingCallback(early_stopping_patience = config["early_stopping_patience"])
    trainer.add_callback(early_stopping_callback)

In [None]:
# Start training
trainer_stats = unsloth_train(trainer, resume_from_checkpoint = True)

Unsloth: Setting lr = 5.00e-06 instead of 5.00e-05 for embed_tokens.
Unsloth: Setting lr = 5.00e-06 instead of 5.00e-05 for lm_head.


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,870 | Num Epochs = 5 | Total steps = 585
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 16
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 16 x 1) = 16
 "-____-"     Trainable parameters = 1,134,559,232/9,164,820,480 (12.38% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,Validation Loss
46,0.0188,0.036557
47,0.0319,0.037018
48,0.0105,0.033565
49,0.018,0.032617
50,0.0032,0.033017
51,0.0269,0.032828
52,0.0103,0.032525
53,0.1685,0.032635
54,0.0121,0.032708
55,0.0773,0.032247


Unsloth: Not an error, but LlamaForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient
