### Load model and processor

In [1]:
import sys
import os

# Make the local "Janus" checkout importable.
# The current working directory is used as the anchor, which suits
# interactive sessions such as Jupyter notebooks.
current_dir = os.getcwd()
janus_folder = os.path.join(current_dir, "Janus")

# Prepend the folder only once so that re-running the cell is harmless.
already_importable = janus_folder in sys.path
if not already_importable:
    sys.path.insert(0, janus_folder)

def load_janus_pro(
    model_id: str = "deepseek-ai/Janus-Pro-7B",
    return_attention: bool = False,
    return_logits: bool = False,
    quantization: str = None,
):
    """
    Load the Janus Pro multi-modality model and its associated VLChatProcessor.

    Args:
        model_id (str): Identifier or path of the Janus Pro model.
            For example: "deepseek-ai/Janus-Pro-1B" or "deepseek-ai/Janus-Pro-7B".
        return_attention (bool): Forwarded to ``from_pretrained`` as
            ``output_attentions``.
        return_logits (bool): Forwarded to ``from_pretrained`` as
            ``output_scores``.
        quantization (str | None): Precision / quantization mode.
            - "4b":  4-bit bitsandbytes quantization (double quantization,
              bfloat16 compute dtype).
            - "8b":  8-bit bitsandbytes quantization.
            - "16b": weights loaded in bfloat16.
            - anything else (including None): weights loaded in float32 and
              then cast to bfloat16 on the GPU, as this function has always done.

    Returns:
        tuple: ``(model, processor)``
            - model: The loaded model, on the GPU and in evaluation mode.
            - processor: The loaded VLChatProcessor. Its tokenizer is available
              as ``processor.tokenizer``.
    """
    import torch
    from transformers import AutoModelForCausalLM, BitsAndBytesConfig
    # MultiModalityCausalLM is not referenced directly below. The import is kept
    # because it presumably registers the Janus classes with the transformers
    # Auto* factories -- verify before removing it.
    from janus.models import MultiModalityCausalLM, VLChatProcessor

    # Load the processor (this includes loading the tokenizer).
    processor = VLChatProcessor.from_pretrained(model_id)

    # Configure quantization. Only "4b" and "8b" go through bitsandbytes.
    is_quantized = quantization in ("4b", "8b")
    bnb_config = None
    if quantization == "4b":
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
    elif quantization == "8b":
        # BitsAndBytesConfig has no bnb_8bit_* options (double quantization and
        # the "nf4" type are 4-bit features), so 8-bit mode needs only this flag.
        bnb_config = BitsAndBytesConfig(load_in_8bit=True)

    # dtype used for the non-quantized parts of the model while loading.
    if is_quantized or quantization == "16b":
        torch_dtype = torch.bfloat16
    else:
        torch_dtype = torch.float32

    # Load the model using Hugging Face's AutoModelForCausalLM.
    # trust_remote_code=True is used because Janus may contain custom model code.
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        trust_remote_code=True,
        device_map="cuda",
        quantization_config=bnb_config,
        torch_dtype=torch_dtype,
        output_attentions=return_attention,
        output_scores=return_logits,
        return_dict_in_generate=True,
    )

    # bitsandbytes-quantized models are already placed on the GPU by
    # from_pretrained and reject .to(dtype)/.cuda(), so only cast and move the
    # non-quantized variants.
    if not is_quantized:
        model = model.to(torch.bfloat16).cuda()

    return model.eval(), processor


In [None]:
# Load the 1B checkpoint with bfloat16 weights.
model, processor = load_janus_pro(
    model_id="deepseek-ai/Janus-Pro-1B",
    quantization="16b",
)



Python version is above 3.10, patching the collections module.


In [None]:
# Display the loaded model; its repr is rendered as the cell output.
model

### Prepare inputs (tokenizer and image processor)

In [None]:
# The image is opened directly with PIL here instead of going through
# janus.utils.io.load_pil_images(conversation).
from PIL import Image

question = "Given the image. Describe this image"

image_path = 'Janus/images/equation.png'
# PNG files can be palette, grayscale or RGBA; convert to three-channel RGB so
# the processor always receives the same image mode. convert() also forces the
# pixel data to be read, so `image` no longer depends on the open file handle.
image = Image.open(image_path).convert("RGB")

conversation = [
    {
        "role": "<|User|>",
        "content": f"<image_placeholder>\n{question}",
        "images": [image],
    },
    {"role": "<|Assistant|>", "content": ""},
]

# Tokenize the conversation, preprocess the image, and move the resulting
# batch to the device the model lives on.
prepare_inputs = processor(
    conversations=conversation, images=[image], force_batchify=True
).to(model.device)

In [None]:
# Inspect the batched processor output that was moved to the model's device.
prepare_inputs

In [None]:
# Shape of the `images_seq_mask` tensor.
# Presumably it flags which sequence positions hold image tokens, with shape
# (batch, sequence_length) -- TODO confirm.
prepare_inputs.images_seq_mask.shape

### Data preparation

In [None]:
# Generate image and text embeddings from the processor output.
# NOTE(review): this call is not wrapped in torch.no_grad(); if the model's
# parameters require gradients, an autograd graph is recorded here. Consider
# wrapping the call when gradients are not needed -- confirm no later cell
# relies on them.
inputs_embeds = model.prepare_inputs_embeds(**prepare_inputs)

In [None]:
# Display the bound method object itself (it is not called here).
model.prepare_inputs_embeds

In [None]:
# Shape of the embeddings returned by model.prepare_inputs_embeds.
# Presumably (batch, sequence_length, hidden_size) -- TODO confirm.
inputs_embeds.shape

### Text generation with the language model

In [None]:
# Display the language-model submodule that is used for generation below.
model.language_model

In [None]:
# Collect the generation settings first, then run generation in a single call.
generation_kwargs = dict(
    # Inputs: pre-computed embeddings plus the matching attention mask.
    inputs_embeds=inputs_embeds,
    attention_mask=prepare_inputs.attention_mask,
    # Special tokens: padding reuses the end-of-sequence id.
    pad_token_id=processor.tokenizer.eos_token_id,
    bos_token_id=processor.tokenizer.bos_token_id,
    eos_token_id=processor.tokenizer.eos_token_id,
    # Decoding: no sampling, at most 512 new tokens, with the KV cache enabled.
    max_new_tokens=512,
    do_sample=False,
    use_cache=True,
    # Return a dict-like result that also carries attentions and scores.
    output_attentions=True,
    output_scores=True,
    return_dict_in_generate=True,
)
outputs = model.language_model.generate(**generation_kwargs)

In [None]:
# List the fields available on the generate() result.
outputs.keys()

In [None]:
# Shape of the first entry in `outputs.scores`.
# Presumably (batch, vocab_size) for the first generated token -- TODO confirm.
outputs.scores[0].shape

In [None]:
# Number of entries in `outputs.scores`.
# Presumably one per generated token -- TODO confirm.
len(outputs.scores)

### Decode output

In [None]:
# Shape of the token-id sequences returned by generate().
outputs.sequences.shape

In [None]:
# Copy the first sequence's token ids to the CPU as a plain Python list,
# then decode them to text with special tokens stripped.
generated_ids = outputs.sequences[0].cpu().tolist()
processor.tokenizer.decode(generated_ids, skip_special_tokens=True)

### Free the model and empty the CUDA cache

In [None]:
import gc

import torch

# Drop every notebook-level name that references GPU-resident objects.
# Deleting only `model` and `processor` is not enough: `prepare_inputs`,
# `inputs_embeds` and `outputs` also hold tensors on the GPU, and memory that
# is still referenced cannot be released by empty_cache().
# pop(..., None) keeps the cell safe to re-run or to run after skipping cells.
#
# NOTE: objects shown as a bare cell output (e.g. `model`, `prepare_inputs`)
# may additionally be kept alive by IPython's output history (`Out`, `_`);
# run `%reset -f out` first if memory is still not freed.
for _name in ("outputs", "inputs_embeds", "prepare_inputs", "model", "processor"):
    globals().pop(_name, None)
del _name

# Collect reference cycles first so their tensors are really freed, then hand
# the cached blocks back to the CUDA driver.
gc.collect()
torch.cuda.empty_cache()