# Music Booklet for Children

Generate a children's booklet from uploaded sheet music to help children understand the content of the music.

In [None]:
%pip install -q transformers torch gradio Pillow reportlab

In [None]:
# imports

import os, re, textwrap
from PIL import Image
from huggingface_hub import login
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas

from google.colab import userdata, drive
from transformers import pipeline
import torch
import gradio as gr

from diffusers import StableDiffusionPipeline

In [None]:
# Sign in to HuggingFace Hub

# The token is read from Colab's `userdata` store under the key 'HF_TOKEN'
# and passed to `login`, which is also asked to add it to the git credentials.
hf_token = userdata.get('HF_TOKEN')
login(hf_token, add_to_git_credential=True)

# Device string shared by the model-loading functions below:
# "cuda" when torch reports an available GPU, otherwise "cpu".
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Whole-word pattern (note the \b anchors) for tempo and dynamics markings.
# It carries no inline flags, so case handling is up to the caller.
MUSIC_KEYWORDS = r"\b(Allegro|Andante|Adagio|Moderato|Vivace|Largo|Presto|mf|mp|ff|pp|forte|piano|crescendo|diminuendo)\b"

In [None]:
def extract_sheet_cues(image_paths):
  """
  Extract textual cues from sheet images (caption + OCR).

  Args:
    image_paths: Iterable of uploaded images. Each item may be a path
      (``str`` or ``os.PathLike``) or an upload wrapper that exposes the
      file location as ``.name``. ``None`` or an empty iterable means
      "no images".

  Returns:
    A dict with the keys:
      "captions": one generated caption per image,
      "ocr":      one OCR string per image,
      "keywords": sorted, lower-cased, de-duplicated MUSIC_KEYWORDS matches
                  found in the captions and OCR strings,
      "raw":      captions followed by OCR strings, joined with " | ".
  """
  image_paths = list(image_paths or [])
  if not image_paths:
    # Nothing to analyse: return the empty result without loading the
    # captioning and OCR models.
    return {"captions": [], "ocr": [], "keywords": [], "raw": ""}

  device = 0 if DEVICE == "cuda" else -1

  # caption
  captioner = pipeline(
      "image-to-text",
      model="Salesforce/blip-image-captioning-large",
      device=device
  )

  # OCR
  ocr = pipeline(
      "image-to-text",
      model="microsoft/trocr-base-printed",
      device=device
  )

  captions = []
  ocr_texts = []
  for item in image_paths:
    # Path-like items are used as they are. Reading `.name` from a
    # pathlib.Path would yield only the basename, and a str has no `.name`.
    # Anything else is assumed to be an upload wrapper holding the path
    # in `.name`.
    location = item if isinstance(item, (str, os.PathLike)) else item.name

    # convert() returns an independent in-memory image, so the file handle
    # can be released as soon as the conversion is done.
    with Image.open(location) as handle:
      image = handle.convert("RGB")

    captions.append(captioner(image, max_new_tokens=40)[0]["generated_text"])
    ocr_texts.append(ocr(image, max_new_tokens=64)[0]["generated_text"])

  combined = " | ".join(captions + ocr_texts)

  keywords = re.findall(MUSIC_KEYWORDS, combined, flags=re.IGNORECASE)

  return {
      "captions": captions,
      "ocr": ocr_texts,
      "keywords": sorted(set(k.lower() for k in keywords)),
      "raw": combined
  }

In [None]:
def generate_book_plan(cues, num_pages=12):
  """
  Draft the booklet text with a text-to-text model.

  Args:
    cues: Mapping that provides the 'captions', 'ocr' and 'keywords'
      entries interpolated into the prompt.
    num_pages: Page count requested in the prompt.

  Returns:
    The "generated_text" of the first result returned by the model.
  """
  # Pipeline device index: first GPU when DEVICE is "cuda", otherwise CPU (-1).
  device_index = -1
  if DEVICE == "cuda":
    device_index = 0

  writer = pipeline(
      "text2text-generation",
      model="google/flan-t5-large",
      device=device_index,
  )

  # The prompt fixes the output layout (TITLE / CHARACTERS / PAGE n /
  # [ILLUSTRATION: ...]) that the model is asked to follow.
  prompt = f"""
  Create a children's picture book inspired by a piece of music.

  Mood cues from the sheet:
  Captions: {cues['captions']}
  OCR text: {cues['ocr']}
  Musical keywords: {cues['keywords']}

  Requirements:
  - Target age: 4 - 7
  - {num_pages} pages
  - Each page: (1) 1- 2 short sentences, (2) an illustration prompt in brackets starting with "ILLUSTRATION:"
  - Keep characters consistent.
  - Let the musical mood influence the story pacing (fast = playful adventure; slow = gentle bedtime, etc.)
  - Do not mention 'sheet music' or 'OCR'.

  Output format exactly:
  TITLE: ...
  CHARACTERS: ....
  PAGE 1: ...
  [ILLUSTRATION: ...]
  PAGE 2: ...
  [ILLUSTRATION: ...]
  ...
  """

  results = writer(prompt, max_new_tokens=800)
  return results[0]["generated_text"]

In [None]:
def parse_pages(book_text):
  """
  Split generated booklet text into a title and per-page content.

  The text is expected to follow the layout
  ``TITLE: ... CHARACTERS: ... PAGE 1: ... [ILLUSTRATION: ...] PAGE 2: ...``,
  either one item per line or all on a single line.

  Args:
    book_text: Raw text produced by the story model. ``None`` or an empty
      string yields the default title and no pages.

  Returns:
    ``(title, pages)`` where ``title`` is a string ("My Music Story" when no
    usable title is found) and ``pages`` is a list of
    ``(story_text, illustration_prompt)`` tuples in page order. The
    illustration prompt is "" when a page has none; pages whose story text
    is empty are skipped.
  """
  book_text = book_text or ""

  title = "My Music Story"
  # Stop the title at the next section marker or line break. Otherwise,
  # single-line model output would put the whole book into the title.
  title_m = re.search(
      r"TITLE:\s*(.*?)\s*(?=CHARACTERS:|\bPAGE\s+\d+:|\n|$)", book_text
  )
  if title_m and title_m.group(1).strip():
    title = title_m.group(1).strip()

  # The closing bracket is optional at the very end of a block so that an
  # illustration cut off mid-generation is still recognised instead of
  # leaking into the story text.
  illustration_re = re.compile(
      r"\[ILLUSTRATION:\s*(.*?)(?:\]|\Z)", flags=re.DOTALL
  )

  pages = []
  # Element 0 of the split is everything before the first "PAGE n:" marker
  # (title, characters), so it is dropped.
  for block in re.split(r"\bPAGE\s+\d+:\s*", book_text)[1:]:
    illu_m = illustration_re.search(block)
    illu = illu_m.group(1).strip() if illu_m else ""
    story = illustration_re.sub("", block).strip()

    if story:
      pages.append((story, illu))
  return title, pages


In [None]:
def generate_illustrations(pages, out_dir="illus", style="watercolor storybook"):
  """
  Render one Stable Diffusion illustration per page and save it as a PNG.

  Args:
    pages: Sequence of ``(story_text, illustration_prompt)`` pairs; only the
      illustration prompt is used here.
    out_dir: Directory that receives the images (created if missing).
    style: Style phrase appended to every prompt.

  Returns:
    List of saved image paths, ``page_01.png``, ``page_02.png``, ... inside
    ``out_dir``, in page order.
  """
  os.makedirs(out_dir, exist_ok=True)

  # Half precision on the GPU, full precision on the CPU.
  precision = torch.float32
  if DEVICE == 'cuda':
    precision = torch.float16

  # NOTE(review): diffusers has historically called this keyword
  # `torch_dtype`; confirm the installed version honours `dtype`.
  sd_pipeline = StableDiffusionPipeline.from_pretrained(
      "runwayml/stable-diffusion-v1-5",
      dtype=precision,
      safety_checker=None,
  )
  sd_pipeline = sd_pipeline.to(DEVICE)

  saved_paths = []
  for page_no, (_story, scene) in enumerate(pages, start=1):
    # Pages without an illustration prompt get a generic scene.
    subject = scene if scene else "A joyful child-friendly scene with cute animal friends"
    prompt = f"{subject}, {style}, children's picture book, soft lighting, simple shapes, high readability"

    result = sd_pipeline(prompt, num_inference_steps=30, guidance_scale=7.0)
    picture = result.images[0]

    target = os.path.join(out_dir, f"page_{page_no:02d}.png")
    picture.save(target)
    saved_paths.append(target)

  return saved_paths

In [None]:
def make_pdf(title, pages, illus_paths, output_pdf="picture_book.pdf"):
  """
  Assemble the booklet PDF: a cover page followed by one page per story page.

  Args:
    title: Cover title. Long titles are wrapped onto several centred lines.
    pages: Sequence of ``(story_text, illustration_prompt)`` pairs; only the
      story text is drawn.
    illus_paths: Image file paths, paired with ``pages`` in order. Pairing
      uses ``zip``, so surplus items on either side are ignored.
    output_pdf: Path of the PDF file to write.

  Returns:
    ``output_pdf``, after the file has been saved.
  """
  c = canvas.Canvas(output_pdf, pagesize=letter)
  W, H = letter
  margin = 36

  # Cover
  c.setFont("Helvetica-Bold", 28)
  # Roughly 28 characters of 28 pt bold fit between the margins; the wrap is
  # character-based (like the body text below), so this is an approximation.
  title_lines = textwrap.wrap(title or "", width=28)[:4]
  title_leading = 34
  # Stack extra lines upwards so the last title line stays at H*0.70 and the
  # subtitle underneath keeps its position.
  title_y = H * 0.70 + title_leading * (len(title_lines) - 1)
  for line in title_lines:
    c.drawCentredString(W/2, title_y, line)
    title_y -= title_leading
  c.setFont("Helvetica", 12)
  c.drawCentredString(W/2, H*0.65, "A story inspired by music")
  c.showPage()

  # Largest box the illustration may occupy on a page.
  img_max_w = W - 2 * margin
  img_max_h = H * 0.62

  for i, ((text, _), img_path) in enumerate(zip(pages, illus_paths), start=1):
    # Image
    # Only the pixel size is needed; Image.open is lazy, so the size is
    # available without decoding the pixel data.
    with Image.open(img_path) as img:
      iw, ih = img.size
    # Scale uniformly so the image fits the box and keeps its aspect ratio.
    scale = min(img_max_w/iw, img_max_h/ih)
    draw_w, draw_h = iw * scale, ih * scale
    x = (W - draw_w) / 2
    y = H * 0.35
    c.drawInlineImage(img_path, x, y, width=draw_w, height=draw_h)

    # Text
    c.setFont("Helvetica", 14)
    wrapped = textwrap.wrap(text, width=55)
    text_y = H * 0.28
    # At most four lines of story text are drawn per page.
    for line in wrapped[:4]:
      c.drawCentredString(W/2, text_y, line)
      text_y -= 18

    # Page number
    c.setFont("Helvetica", 10)
    c.drawCentredString(W/2, 18, str(i))
    c.showPage()

  c.save()
  return output_pdf

In [None]:
def process_files(files, num_pages):
  """
  Run the whole pipeline when the Generate button is clicked.

  Steps: extract cues from the uploaded sheets, generate the booklet text,
  parse it into pages, render the illustrations and assemble the PDF.

  Args:
    files: Uploaded sheet images, passed on to ``extract_sheet_cues``.
    num_pages: Requested page count. The UI may deliver it as text, so any
      int, float or numeric string is accepted. Blank or non-numeric values
      fall back to 12, and values below 1 are raised to 1.

  Returns:
    Path of the generated PDF.
  """
  fallback_pages = 12
  try:
    # float() first so inputs such as "8", " 8 " and 8.0 are all accepted.
    page_count = int(float(num_pages))
  except (TypeError, ValueError, OverflowError):
    page_count = fallback_pages
  page_count = max(1, page_count)

  cues = extract_sheet_cues(files)
  # Pass the requested page count through; it was previously fixed at 1.
  book_text = generate_book_plan(cues, num_pages=page_count)

  title, pages = parse_pages(book_text)
  illus_paths = generate_illustrations(pages, out_dir="illus", style="watercolor storybook")
  pdf_path = make_pdf(title, pages, illus_paths, output_pdf="picture_book.pdf")
  print("Done:", pdf_path)
  return pdf_path



# Gradio UI: upload sheet images, choose a page count, download the PDF.
# Components are created inside the Blocks context in the order listed here.
with gr.Blocks() as demo:
  gr.Markdown("## Upload multiple png music score files -> one children's booklet")

  # Accepts several files at once; no file-type filter is configured.
  files = gr.File(
      file_count="multiple",
      label="Upload files"
  )

  # Free-text field; presumably its value reaches the handler as a string
  # (verify against the installed Gradio version).
  num_pages = gr.Textbox(label="Number of Pages to Generate")

  run_btn= gr.Button("Generate")

  # Receives the PDF path returned by process_files.
  output = gr.File(label="Download Booklet")

  # Clicking the button calls process_files(files, num_pages) and sends its
  # return value to `output`.
  run_btn.click(process_files,inputs=[files,num_pages],outputs=output)


demo.launch(debug=True, prevent_thread_lock=True)



