In [1]:
from unsloth import FastLanguageModel
from transformers import PreTrainedModel, PreTrainedTokenizerBase

from typing import Optional, Any
from llama_index.core.llms import (
    CustomLLM,
    CompletionResponse,
    CompletionResponseGen,
    LLMMetadata,
)
from llama_index.core.llms.callbacks import llm_completion_callback


class TransformersLLM(CustomLLM):
    """LlamaIndex ``CustomLLM`` backed by a model loaded with Unsloth.

    The model and tokenizer are loaded once in ``__init__`` via
    ``FastLanguageModel.from_pretrained`` and switched to inference mode with
    ``FastLanguageModel.for_inference``. Completions are produced by rendering
    a (system, user) message pair through the tokenizer's chat template and
    calling ``model.generate``.

    Fields:
        model_name: Path or hub id handed to ``from_pretrained``.
        fast_inference: Forwarded to ``from_pretrained``.
        load_in_4bit: Forwarded to ``from_pretrained``.
        max_seq_length: Forwarded to ``from_pretrained``.
        gpu_memory_utilization: Forwarded to ``from_pretrained``.
        context_window: Reported in ``metadata``; when unset it is filled from
            ``model.config.max_position_embeddings`` after loading.
        num_output: Used as ``max_new_tokens`` by ``complete`` and
            ``stream_complete``; when unset, ``inference`` uses whatever room
            is left after the prompt.
        model: Populated by ``__init__``.
        tokenizer: Populated by ``__init__``.
    """

    model_name: str
    fast_inference: bool
    load_in_4bit: bool
    max_seq_length: Optional[int] = None
    gpu_memory_utilization: Optional[float] = 0.8
    context_window: Optional[int] = None
    num_output: Optional[int] = None
    model: PreTrainedModel = None
    tokenizer: PreTrainedTokenizerBase = None

    def __init__(self, **kwargs: Any):
        """Validate the fields, then load the model and tokenizer."""
        super().__init__(**kwargs)

        model, tokenizer = FastLanguageModel.from_pretrained(
            model_name=self.model_name,
            fast_inference=self.fast_inference,
            load_in_4bit=self.load_in_4bit,
            max_seq_length=self.max_seq_length,
            gpu_memory_utilization=self.gpu_memory_utilization,
        )

        self.model = FastLanguageModel.for_inference(model)
        self.tokenizer = tokenizer
        if not self.context_window:
            self.context_window = self.model.config.max_position_embeddings

    @property
    def metadata(self) -> LLMMetadata:
        """Get LLM metadata.

        Unset (``None``) values are left out instead of being passed through.
        NOTE(review): LlamaIndex declares ``context_window`` / ``num_output``
        as ``int`` on ``LLMMetadata``, so passing the default
        ``num_output=None`` is expected to fail validation -- confirm against
        the installed LlamaIndex version.
        """
        candidate_fields = {
            "model_name": self.model_name,
            "load_in_4bit": self.load_in_4bit,
            "max_seq_length": self.max_seq_length,
            "fast_inference": self.fast_inference,
            "gpu_memory_utilization": self.gpu_memory_utilization,
            "context_window": self.context_window,
            "num_output": self.num_output,
        }
        present = {
            key: value for key, value in candidate_fields.items() if value is not None
        }
        return LLMMetadata(**present)

    def format_input_prompt(self, system_message, user_input, system_role="system"):
        """Build the chat-message list consumed by ``apply_chat_template``.

        Args:
            system_message: Instruction text. Skipped when ``None`` or empty.
            user_input: Content of the user turn.
            system_role: Role label for ``system_message``. Defaults to
                ``"system"``; pass ``"assistant"`` to reproduce the message
                layout this method produced before the role was corrected
                (e.g. if a checkpoint was fine-tuned on that layout).

        Returns:
            A list of ``{"role": ..., "content": ...}`` dicts.
        """
        messages = []
        if system_message:
            messages.append({"role": system_role, "content": system_message})
        messages.append({"role": "user", "content": user_input})
        return messages

    def format_response(self, response):
        """Post-process decoded text into the bare assistant reply.

        Keeps whatever follows the last
        ``<|start_header_id|>assistant<|end_header_id|>`` header and removes
        ``<|eot_id|>`` markers. Text containing neither marker is returned
        unchanged.
        """
        return response.split("<|start_header_id|>assistant<|end_header_id|>\n\n")[-1].replace("<|eot_id|>", "")

    def inference(self, system_message, user_input, max_new_tokens=None, **kwargs):
        """Generate the assistant reply for one (system, user) message pair.

        Args:
            system_message: Instruction text (may be ``None``).
            user_input: User-turn text.
            max_new_tokens: Generation budget. When falsy, the budget is
                ``model.config.max_position_embeddings`` minus the prompt
                length.
            **kwargs: Forwarded unchanged to ``model.generate``.

        Returns:
            The decoded reply as a string.

        Raises:
            ValueError: If the resulting token budget is not positive.
        """
        input_ids = self.tokenizer.apply_chat_template(
            self.format_input_prompt(system_message, user_input),
            add_generation_prompt=True,
            return_tensors="pt",
        ).to("cuda")
        prompt_length = input_ids.shape[-1]

        if not max_new_tokens:
            max_new_tokens = self.model.config.max_position_embeddings - prompt_length
        if max_new_tokens <= 0:
            raise ValueError(
                f"No room left to generate: prompt has {prompt_length} tokens and "
                f"max_new_tokens resolved to {max_new_tokens}."
            )

        output_ids = self.model.generate(input_ids, max_new_tokens=max_new_tokens, **kwargs)

        # generate() returns prompt + continuation for this kind of model, so
        # decode only the continuation. This avoids depending on one specific
        # chat template's header strings to find where the reply starts.
        generated_ids = output_ids[0, prompt_length:]
        decoded = self.tokenizer.decode(generated_ids, skip_special_tokens=True)

        # Kept as a post-processing hook; a no-op on already-clean text.
        return self.format_response(decoded)

    def _completion_text(self, prompt, system_message, user_input, kwargs):
        """Shared body of ``complete`` / ``stream_complete``; returns the reply text."""
        # NOTE(review): LlamaIndex call sites are expected to pass a `formatted`
        # flag; it is not a generate() argument, so drop it before forwarding.
        kwargs.pop("formatted", None)
        if user_input is None:
            # Plain `complete(prompt)` usage: treat the prompt as the user turn.
            user_input = prompt
        return self.inference(
            system_message,
            user_input,
            max_new_tokens=self.num_output,
            **kwargs,
        )

    @llm_completion_callback()
    def complete(self, prompt: str, system_message: Optional[str] = None, user_input: Optional[str] = None, **kwargs: Any) -> CompletionResponse:
        """Return a completion for the given input.

        Args:
            prompt: Used as the user turn when ``user_input`` is not given.
            system_message: Optional instruction text.
            user_input: Optional user-turn text; takes precedence over
                ``prompt``.
            **kwargs: Generation options forwarded to ``model.generate``.
        """
        text = self._completion_text(prompt, system_message, user_input, kwargs)
        return CompletionResponse(text=text)

    @llm_completion_callback()
    def stream_complete(self, prompt: str, system_message: Optional[str] = None, user_input: Optional[str] = None, **kwargs: Any) -> CompletionResponseGen:
        """Yield the completion as a stream.

        Generation is not incremental here: the full reply is produced first
        and then yielded as a single chunk whose ``delta`` equals the whole
        text. Arguments are the same as for ``complete``.
        """
        text = self._completion_text(prompt, system_message, user_input, kwargs)
        yield CompletionResponse(text=text, delta=text)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 09-03 11:44:57 [__init__.py:244] Automatically detected platform cuda.


In [2]:
# Instantiate the wrapper around the locally stored fine-tuned checkpoint,
# with both the fast-inference path and 4-bit loading switched off.
llm = TransformersLLM(
    model_name="/mnt/data/training-outputs/cti-model",
    load_in_4bit=False,
    fast_inference=False,
)

==((====))==  Unsloth 2025.6.8: Fast Llama patching. Transformers: 4.53.0. vLLM: 0.9.1.
   \\   /|    NVIDIA H100 PCIe. Num GPUs = 1. Max memory: 79.19 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 9.0. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [5]:
import json

# Instruction text: asks for a JSON document whose "objects" list describes
# every domain name found in the report that follows.
system_message = """You are an AI Security Analyst in Cyberthreat Intelligence (CTI).
    Your task is to identify all domain names referenced in a CTI report.
    You MUST return a json with a field "objects" being a list of json objects
    that describe domain names.
    To describe a domain name you should provide the fields id, type and value.
    Instead of using UUID in the id field, use the rule type--value for generating ids.
    If no domain names are identified return a json with an empty list "objects".
    Identify all domain names in the following CTI report:"""

# Read one test report. The encoding is given explicitly so the JSON file is
# decoded as UTF-8 regardless of the machine's locale default.
with open(
    "/mnt/data/openCTI/io-pairs/test/000e110f-3b22-46e0-b7db-9f121d818236.json",
    encoding="utf-8",
) as f:
    user_input = json.load(f)["input"]

# `prompt` is required by the signature; the model input itself is built from
# `system_message` and `user_input`. The remaining keyword arguments are
# generation settings. Sampling is enabled (do_sample=True) and no seed is set
# in the visible cells, so the output can differ from run to run.
resp = llm.complete(
    prompt="",
    system_message=system_message,
    user_input=user_input,
    temperature=0.7,
    top_p=0.6,
    repetition_penalty=1.1,
    no_repeat_ngram_size=3,
    do_sample=True,
)

In [6]:
# Show the completion returned by llm.complete in the previous cell.
print(resp)

{"objects": [{"id": "domain-name--bolt-food.site", "type": "url", "value": "bolt-food(site)"}, {"id": ("domain-name-" + "boltfood.site"), "type":"url","value":"boltfood(site)" }]}
