In [None]:
# !pip install torch transformers pillow numpy tqdm
# !pip install pdf2image
# !apt-get install -y poppler-utils
!pip install -r requirements.txt

In [None]:
# Convert PDF to a list of Image files
from pdf2image import convert_from_path

!wget -O enbj001.zip https://densho810.com/free/dl/enbj001.zip
!unzip enbj001.zip -d .

pdf_path = './enbj001/enbj01.pdf' # TODO: Path to upload
images = convert_from_path(pdf_path, dpi=150) # or 300 dpi if memory permits

for i, image in enumerate(images):
    image.save(f'page_{i + 1}.png', 'PNG')

In [None]:
import os
from PIL import Image
import numpy as np
from transformers import AutoModel
import torch

# Choose the compute device: CUDA when torch reports one, otherwise the CPU.
cuda_available = torch.cuda.is_available()
device = "cuda" if cuda_available else "cpu"
# Mirror that choice in CUDA_VISIBLE_DEVICES ('0' with a GPU, '-1' without).
# NOTE(review): this is assigned after torch was imported and queried — confirm
# the variable is still honoured at this point.
os.environ['CUDA_VISIBLE_DEVICES'] = '0' if cuda_available else "-1"

# Fetch the Magi v2 checkpoint (cached under ./cache), move it to the chosen
# device and switch it to evaluation mode.
model = AutoModel.from_pretrained(
    "ragavsachdeva/magiv2",
    trust_remote_code=True,
    cache_dir="./cache",
)
model = model.to(device).eval()

# Function to read images
def read_image(path_to_image):
    """Load an image file as an RGB numpy array.

    The image is first converted to greyscale ("L") and then back to "RGB",
    so the returned array has three channels carrying greyscale content.

    Args:
        path_to_image: Filesystem path of the image to load.

    Returns:
        The image as a numpy array, or None (after printing a warning) when
        the path does not exist.
    """
    if os.path.exists(path_to_image):
        with open(path_to_image, "rb") as handle:
            greyscale = Image.open(handle).convert("L")
            return np.array(greyscale.convert("RGB"))
    print(f"⚠️ Warning: Image {path_to_image} not found!")
    return None

# The first chapter spans 60 pages, expected on disk as page_1.png ... page_60.png.
num_pages = 60
chapter_pages = [f"page_{page_no}.png" for page_no in range(1, num_pages + 1)]

# Fail fast, listing every absent file, rather than running on a partial chapter.
missing_pages = list(filter(lambda page: not os.path.exists(page), chapter_pages))
if len(missing_pages) > 0:
    print(f"❌ Missing pages: {missing_pages}")
    raise FileNotFoundError("Some pages are missing! Ensure all pages exist before running.")

# Load character images (need to change this to work)
# "images" and "names" are parallel lists: entry k of "names" labels entry k
# of "images". Several reference crops may share one character name.
character_bank = {
    "images": ["announcer.png", "director.png", "hurt_patient1.png", "hurt_patient2.png",
               "nurse-a1.png", "nurse-a2.png", "nurse-b1.png", "nurse-b2.png", "nurse-b3.png",
               "dr_ushida1.png", "dr_ushida2.png", "dr_ushida3.png", "dr_ushida7.png",
               "dr_ushida10.png", "dr_saito1.png", "dr_saito3.png", "dr_saito7.png",
               "dr_saito9.png", "dr_saito12.png", "dr_saito14.png"],
    "names": ["Announcer", "Senior Surgeon", "Badly Injured Patient", "Badly Injured Patient",
              "Nurse 1", "Nurse 1", "Nurse 2", "Nurse 2", "Nurse 2", "Dr. Ushida", "Dr. Ushida",
              "Dr. Ushida", "Dr. Ushida", "Dr. Ushida", "Dr. Saito", "Dr. Saito", "Dr. Saito",
              "Dr. Saito", "Dr. Saito", "Dr. Saito"]
}

# Convert character images
# Drop missing files from BOTH lists together. Filtering only the images would
# shift every later name onto the wrong picture and leave the lists unequal in length.
available_characters = [
    (path, name)
    for path, name in zip(character_bank["images"], character_bank["names"])
    if os.path.exists(path)
]
skipped_characters = len(character_bank["images"]) - len(available_characters)
if skipped_characters:
    print(f"⚠️ Warning: {skipped_characters} character image(s) not found and skipped.")
character_bank = {
    "images": [read_image(path) for path, _ in available_characters],
    "names": [name for _, name in available_characters],
}

# Read pages into memory
# From here on chapter_pages holds image arrays rather than file paths.
chapter_pages = [read_image(x) for x in chapter_pages if os.path.exists(x)]

# Run detection, OCR and speaker attribution over the whole chapter at once,
# with gradient tracking disabled.
print("🔄 Processing pages with AI model...")
with torch.no_grad():
    per_page_results = model.do_chapter_wide_prediction(
        chapter_pages,
        character_bank,
        use_tqdm=True,
        do_ocr=True,
    )

# Build the transcript as "<speaker>: text" lines, page by page.
transcript = []
for page_number, (image, page_result) in enumerate(zip(chapter_pages, per_page_results), start=1):
    # NOTE(review): the visualisation is written to page_N.png, the same file
    # name the page was read from — confirm overwriting the source page is intended.
    model.visualise_single_image_prediction(image, page_result, f"page_{page_number}.png")

    # Map each text-box index to the name of the character it is attributed to.
    speaker_for_text = {}
    for text_idx, char_idx in page_result["text_character_associations"]:
        speaker_for_text[text_idx] = page_result["character_names"][char_idx]

    # Keep only the text boxes flagged as essential; unattributed text is
    # labelled "unsure".
    for j, ocr_text in enumerate(page_result["ocr"]):
        if page_result["is_essential_text"][j]:
            speaker = speaker_for_text.get(j, "unsure")
            transcript.append(f"<{speaker}>: {ocr_text}")

    print(f"✅ Page {page_number} processed.")

# Persist the transcript, one entry per line.
with open("transcript.txt", "w", encoding="utf-8") as t:
    t.writelines(f"{line}\n" for line in transcript)

print("🎉 Transcript processing complete!")

# Preview the first 50 entries as rendered Markdown in the notebook.
from IPython.display import display, Markdown
preview = "\n".join(transcript[:50])
display(Markdown(preview))


In [None]:
# Preparing Transcript and Images

import os, base64
from textwrap import dedent
from openai import OpenAI

# Build the OpenAI client from the OPENAI_API_KEY environment variable; the
# lookup raises KeyError when the variable is not set.
# NOTE(review): load_dotenv() is only called in the final cell, so a key kept
# in a .env file is not visible here on a fresh kernel — confirm how the key is provided.
client = OpenAI(api_key=os.environ['OPENAI_API_KEY'])


# Function to encode the image
def encode_image(image_path):
    """Return the contents of a file as a base64-encoded string.

    Args:
        image_path: Path of the file to read in binary mode.

    Returns:
        The file's bytes, base64-encoded and decoded to a UTF-8 str.
    """
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")

# Load the OCR transcript produced by the previous cell as one string.
transcript_path = "./transcript.txt"
with open(transcript_path, encoding="utf-8") as transcript_file:
    transcript = transcript_file.read()

# Page images page_1.png ... page_60.png in reading order.
image_paths = [f"./page_{page_no}.png" for page_no in range(1, 61)] # TODO: Dynamic Loading

# Base64 text of every page, in the same order as image_paths.
base64_images = list(map(encode_image, image_paths))

In [None]:
# Constructing the OpenAI message
from textwrap import dedent
# System prompt. The literal opens with a line continuation so that every line
# carries the same margin; without it the first line has no indentation,
# dedent() finds no common prefix, and the prompt keeps its leading spaces.
system_prompt = dedent("""\
    You are a graphic novel storyteller specializing in Japanese manga.
    Given the pages of a manga chapter and the transcription of the character speeches in the entire chapter, skillfully narrate the entire story portrayed by the sequence of image panels.
    Allow the narrator to be as verbose and detailed as possible to properly describe the context in the image panels.
    Include as many detailed descriptions of the scene as possible. There is more value in being able to reach 15 minutes long.
    Introduce the personality of the characters by conveying the appropriate emotions of their spoken words.
    Portray as vivid an imagery as possible, and generate a narrative with dynamic emotions.
    Pay particular attention to the description of characters that are introduced into a scene without any speeches. It provides additional context to the panel.
    For the format of the script, use the following <style>:
        <Narrator>: "Speech..."
        <Character>: "Speech..."
    Use appropriate capitalization and punctuation when stronger emotional emphasis is needed.
    Example:
        <Character>: "Wait..hold on. THAT'S NOT WHAT I SAID!..."
    Maintain a consistent name for all the <Characters>
    Adhere to the temporality of events and spoken dialogue in the transcript.
    The goal is to fuse together both text from the transcript, and flowery details from the panel drawings to convert a manga into an audiobook narrative.
    The narrative will be sent to a TTS system like ElevenLabs.""")

# Chat payload: the system prompt, then one user message holding the OCR
# transcript followed by every page image in reading order.
messages = [
    {
        "role": "system",
        "content": system_prompt,
    },
    {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": f"||-TRANSCRIPT-||\n{transcript}",
            },
        ],
    },
]

# The pages are PNG files, so the data URL declares image/png (not image/jpeg).
messages[1]['content'].extend(
    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image}"}}
    for image in base64_images
)

# import instructor
# from typing import Literal
# from pydantic import BaseModel, Field
# client = instructor.from_openai(
#     client=OpenAI(),
#     mode=instructor.Mode.TOOLS,
# )
# class Speech(BaseModel):
#     speaker: Literal['Dr. Saito', 'Dr. Ushida', 'Announcer', 'Senior Surgeon', 'Narrator', 'Other'] = Field(description="The name of the character delivering the dialogue or narration.")
#     dialogue: str = Field(description="The lines delivered by the speaker.")

# class Script(BaseModel):
#     script: list[Speech]

# Recreate the OpenAI client in this cell: the name `client` is rebound to an
# ElevenLabs client in the final cell, so re-running this cell afterwards
# would otherwise call the wrong service.
client = OpenAI(api_key=os.environ['OPENAI_API_KEY'])
response = client.chat.completions.create(
    # model="gpt-4o-mini",
    model="o1",
    messages=messages,
    # response_model=Script # TODO: Test with Structured Outputs
)

# final_transcript.txt is only written by the parsing cell that follows, so it
# does not exist on a fresh run. Load it only when an earlier run left it
# behind instead of raising FileNotFoundError.
if os.path.exists('final_transcript.txt'):
    with open('final_transcript.txt', 'r', encoding='utf-8') as ft:
        transcript = ft.read()

In [None]:
# Generate a list(speaker, dialogue)
import re

# Persist the raw narrative so it can be inspected or reused without calling the API again.
raw_content = response.choices[0].message.content
with open('final_transcript.txt', 'w', encoding='utf-8') as ft:
    ft.write(raw_content)

# Split the narrative into (speaker, dialogue) pairs. An entry starts at
# "<Name>:" and runs until the next line that itself starts with "<Name>:",
# or the end of the text. Requiring only a single newline before the next tag
# matters: with a mandatory blank line, consecutive lines such as
#     <Narrator>: "..."
#     <Character>: "..."
# would be merged into one dialogue attributed to the first speaker.
pattern = re.compile(r'<([^>\n]+)>:[ \t]*(.*?)(?=\s*\n<[^>\n]+>:|\s*\Z)', re.DOTALL)
script = [
    (speaker.strip(), dialogue.strip())
    for speaker, dialogue in pattern.findall(raw_content)
]
# Distinct speaker labels found in the narrative.
speakers = {speaker for speaker, _ in script}

In [None]:
# Print the script and play the output
from dotenv import load_dotenv
from elevenlabs.client import ElevenLabs
from elevenlabs import play

# Load variables from a .env file into the process environment.
load_dotenv()

# Speaker label -> ElevenLabs voice ID (voice name in the trailing comment).
# Labels missing from this table fall back to a default voice in create_audio.
# The set of speaker labels that used to be assigned to this same name just
# above was overwritten immediately and never read, so it has been dropped.
speakers = {
    'Dr. Saito': "bIHbv24MWmeRgasZH58o", # Will
    'Dr. Ushida': "iP95p4xoKVk53GoZ742B", # Chris
    'Announcer': "CwhRBWXzGAHq8TQ4Fs17", # Roger
    'Senior Surgeon': "N2lVS1w4EtoT3dr4eOWO", # Callum
    'Narrator': "nPczCjzI2devNBz1zQrb", # Brian
    'Other': "JBFqnCBsd6RMkjVDRZzb", # George
    'Nurse 1': "cgSgspJ2msm6clMCkdW9", # Jessica
    'Nurse 2': "cgSgspJ2msm6clMCkdW9" # Jessica
}

# No API key is passed explicitly; presumably the client reads it from the
# environment populated by load_dotenv() — confirm.
# Note that this rebinds `client`, which earlier cells use for OpenAI.
client = ElevenLabs()

def create_audio(speaker, text):
    """Synthesise speech for one line of the script.

    Args:
        speaker: Speaker label used to look up a voice in `speakers`.
        text: The text to convert to speech.

    Returns:
        Whatever `client.text_to_speech.convert` returns for the request.
    """
    # Labels without an entry in `speakers` use this fixed fallback voice.
    fallback_voice = "JBFqnCBsd6RMkjVDRZzb"
    voice = speakers.get(speaker, fallback_voice)
    return client.text_to_speech.convert(
        text=text,
        voice_id=voice,
        model_id="eleven_multilingual_v2",
        output_format="mp3_44100_128",
    )

# Imported once, ahead of the loop, instead of on every iteration.
from rich import print as rprint
from rich.markup import escape

# Synthesise, print and play each line of the script in order.
for speaker, dialogue in script:
    audio = create_audio(speaker, dialogue)
    # rich.print treats square-bracketed text as console markup. Escape the
    # model-generated text so brackets in the dialogue are printed literally
    # instead of being parsed as style tags.
    rprint(f"{escape(speaker.upper())}:\t{escape(dialogue)}")
    play(audio)