In [1]:
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch
import re

# default: Load the model on the available device(s)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-72B-Instruct", 
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="auto",
    cache_dir="/scratch/workspace/ctpham_umass_edu-llama/.cache/"
)

# default processer
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")

  from .autonotebook import tqdm as notebook_tqdm
`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46
Loading checkpoint shards:  61%|██████    | 23/38 [06:06<03:33, 14.21s/it]

In [None]:
# Stage 1: Extracting text from comic strips

messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": "data/030.jpg",
            },
            {
                "type": "text", 
                "text": open("text_extraction.md").read()},
        ],
    }
]

# Preparation for inference
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to("cuda")
 
# Inference: Generation of the output
generated_ids = model.generate(**inputs, max_new_tokens=2000, do_sample=False)
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text[0])

<row_0>
    <text_1>
    私の依頼を
    最初から
    叶える気なんて
    なかったん
    じゃない
    </text_1>
</row_0>

<row_1>
    <text_2>
    框さんっ
    </text_2>
    <text_3>
    え…？
    </text_3>
    <text_4>
    叶えましたよ
    </text_4>
    <text_5>
    あなたの
    本当の依頼なら
    </text_5>
</row_1>

<row_2>
    <text_6>
    あの…
    </text_6>
    <text_7>
    これ
    </text_7>
    <text_8>
    お借りした
    化粧筆
    お返しします
    </text_8>
</row_2>

<row_3>
    <text_9>
    おかげで
    後悔なく
    送れます
    妹ちゃんを
    </text_9>
    <text_10>
    ありがとうございました！
    </text_10>
</row_3>


In [None]:
# Reverse order of extracted texts to fit manga reading order

def reverse_texts_with_tag_renaming(detected):
    # Pattern to find each row with its content
    row_pattern = r'(<row_\d+>)(.*?)(</row_\d+>)'
    # Pattern to find individual <text_n> elements within each row
    text_pattern = r'(<text_\d+>)(.*?)(</text_\d+>)'
    
    # Function to reverse texts in a single row
    def reverse_row(match):
        row_start, row_content, row_end = match.groups()
        # Find all <text_n> elements in the row
        texts = re.findall(text_pattern, row_content, re.DOTALL)
        # Reverse the order and rename tags to reflect the new order
        reversed_texts = []
        for i, (_, text_content, _) in enumerate(texts[::-1], start=1):
            # Update the tag name to match the new order (e.g., <text_1>, <text_2>, ...)
            new_text = f"<text_{i}>\n{text_content}\n</text_{i}>"
            reversed_texts.append(new_text)
        # Join reversed and renamed texts
        return f"{row_start}\n    " + "\n    ".join(reversed_texts) + f"\n{row_end}"
    
    # Apply the reverse_row function to each row match
    reversed_detected = re.sub(row_pattern, reverse_row, detected, flags=re.DOTALL)
    
    return reversed_detected

# Reverse the texts in each row, rename tags, and print the result
reversed_output = reverse_texts_with_tag_renaming(output_text[0])
print(reversed_output)

<row_0>
    <text_1>

    私の依頼を
    最初から
    叶える気なんて
    なかったん
    じゃない
    
</text_1>
</row_0>

<row_1>
    <text_1>

    あなたの
    本当の依頼なら
    
</text_1>
    <text_2>

    叶えましたよ
    
</text_2>
    <text_3>

    え…？
    
</text_3>
    <text_4>

    框さんっ
    
</text_4>
</row_1>

<row_2>
    <text_1>

    化粧筆
    お借りした
    お返しします
    
</text_1>
    <text_2>

    これ
    
</text_2>
    <text_3>

    あの…
    
</text_3>
</row_2>

<row_3>
    <text_1>

    ありがとうございました！
    
</text_1>
    <text_2>

    おかげで
    後悔なく
    送れます
    妹ちゃんを
    
</text_2>
</row_3>


In [None]:
# Translating texts

messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": "data/030.jpg",
            },
            {
                "type": "text", 
                "text": open("translation.md").read().format(orig=reversed_output)},
        ],
    }
]

# Preparation for inference
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to("cuda")
 
# Inference: Generation of the output
generated_ids = model.generate(**inputs, max_new_tokens=2000, do_sample=False)
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text[0])

<row_0>
    <text_1>
    私の依頼を
    最初から
    叶える気なんて
    なかったん
    じゃない
    </text_1>
    <translation_1>
    You never intended to fulfill my request from the beginning, right?
    </translation_1>
</row_0>

<row_1>
    <text_1>
    あなたの
    本当の依頼なら
    </text_1>
    <translation_1>
    If it was your real request,
    </translation_1>
    <text_2>
    叶えましたよ
    </text_2>
    <translation_2>
    I fulfilled it.
    </translation_2>
    <text_3>
    え…？
    </text_3>
    <translation_3>
    Huh...?
    </translation_3>
    <text_4>
    框さんっ
    </text_4>
    <translation_4>
    Kaku-san!
    </translation_4>
</row_1>

<row_2>
    <text_1>
    化粧筆
    お借りした
    お返しします
    </text_1>
    <translation_1>
    I borrowed a makeup brush,
    I'll return it now.
    </translation_1>
    <text_2>
    これ
    </text_2>
    <translation_2>
    This one.
    </translation_2>
    <text_3>
    あの…
    </text_3>
    <translation_3>
    Um...
    </translation_3>
</row_2>

<row_3>
    <text_1>
    ありがと