In [1]:
# Load libraries
import random
import os
import numpy as np
import pandas as pdd

# Presentation generation
from pptx import Presentation
from pptx.util import Pt, Inches
from pptx.enum.shapes import MSO_SHAPE
from PIL import Image, ImageFont

# Partitioning
import json
from tqdm import tqdm
import pickle

# Summarisation
from transformers import pipeline

# Alignment
from sentence_transformers import SentenceTransformer, util

# Prompt generation
from g4f.client import Client

# PKL File creation (w/ encoding)
import torchvision.models as models
import torchvision.transforms as transforms

# Image generation
from diffusers import AnimateDiffPipeline, DDIMScheduler, MotionAdapter, StableDiffusionPipeline
from diffusers.utils import export_to_gif

# DOC2PPT models
import torch as T
import torch.nn as nn
from dataset import *
from model import *
from mlp_layout import *

## Presentation helper functions

In [43]:
# Wrap text to fit within a given width for a PowerPoint TextFrame.
def wrap_text(text, max_width, text_frame, font_size=18, font_path='calibri.ttf'):
    # Initialize PIL font
    font = ImageFont.truetype(font_path, font_size)

    # Get the width of the text when rendered with the given font.
    def get_text_width(text):
        bbox = font.getbbox(text)
        return bbox[2] - bbox[0]
    
    # Function to add a paragraph with text and formatting
    def add_paragraph(text, font_size, text_frame):
        paragraph = text_frame.add_paragraph()
        paragraph.text = text
        paragraph.font.size = Pt(font_size)
        return paragraph

    # Add extra newlines between sentences
    sentences_ = text.split('. ')
    sentences = [sentence + '.' for sentence in sentences_[:-1] if sentence]  # Re-add the period
    sentences.append(sentences_[-1])
    paragraphs = [" ".join(sentences)]
        
    for paragraph_text in paragraphs:
        # Add paragraph to text frame
        paragraph = add_paragraph(paragraph_text, font_size, text_frame)

        # Split the paragraph text into words
        words = paragraph_text.split()
        current_line = []

        for word in words:
            # Create a copy of the current line with the new word
            current_line_copy = current_line + [word]
            line_text = ' '.join(current_line_copy)
            
            # Measure the width of the line
            text_width = get_text_width(line_text)
            
            # Convert max_width to pixels (Pillow uses pixels)
            max_width_pixels = max_width.pt
            
            if text_width <= max_width_pixels:
                # If it fits, update the current line
                current_line = current_line_copy
            else:
                # If it doesn't fit, finalize the current line and start a new line
                paragraph.text = ' '.join(current_line)
                current_line = [word]
                paragraph = add_paragraph(' '.join([word]), font_size, text_frame)
                current_line = []

        # Set the last line if there is any
        if current_line:
            paragraph.text = ' '.join(current_line)

## Creating Ground-truth Slide Decks with BART-summarisation

### Partitioning of Paragraphs within Chapters
- chapters == paper
- partitions == section

In [2]:
# Get files from github
with open('./build_dataset/chapter_summary_aligned_train_split.jsonl.gathered') as fd:
    book = [json.loads(line) for line in fd]

def get_partitions(text):
  # Add the first chapter and its text
        super_list = text

        # Partition base on number of words = 1024 tokens
        partitions = []
        counter = 0

        section = []
        for paragraph in super_list:
          tmp_count = counter + len(paragraph.split(" "))
          # If counter is over 1024, append section and reset counter and section
          if tmp_count > 512:
            partitions.append(section)
            counter = 0
            section = []
          # Otherwise, update counter and add paragraph to section
          counter += len(paragraph.split(" "))
          section.append(paragraph)

        # For last section
        if section != []:
          partitions.append(section)
        return partitions

books = {}
chapter_list = {}
chosen_books = ['The Hound of the Baskervilles', 'Frankenstein', 'The Goose Girl', 'A Christmas Carol', 'Wuthering Heights', 'A Tale of Two Cities', 'Little Women', 'Candide', 'The Turn of the Screw', 'Treasure Island']

for ix, example in enumerate(book):
    b_list = example['book_id'].split('.')
    if b_list[0] in chosen_books:
        if len(b_list) == 2:
            book, chap  = b_list
        # If book has volumes - consider it as part of title (title+volume)
        elif len(b_list) == 3:
            book, chap = b_list[0] + "." + b_list[1], b_list[2]
        if book not in books:
            books[book] = {}
            books[book][chap] = get_partitions(example['text'])

            # Initialise chapter
            chapter_list[book] = []

        if chap not in chapter_list:
            chapter_list[book].append(chap)

        if chap not in books[book]:
            books[book][chap] = get_partitions(example['text'])

with open('./build_dataset/book_dataset.pkl', 'wb') as file:
    pickle.dump(books, file)

### Summarisation

In [45]:
device = T.device("cuda" if T.cuda.is_available() else "cpu")
summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=device)

In [None]:
books = pickle.load(open('./build_dataset/book_dataset.pkl', 'rb'))

book_sum = {}
for book in tqdm(books, desc=f"Summarising books"):
    title_check = book.split(".")
    if title_check[0] in chosen_books:
        chap_sum = {}
        for chap in tqdm(books[book], desc=f"Summarising Chapters in {book}", leave=False):
            section = books[book][chap]
            summaries = []
            for par in section:
                s = " ".join(par)
                ARTICLE = f"""{s}"""

                try:
                    out = summarizer(ARTICLE, max_length=150, min_length=30, do_sample=False)[0]['summary_text']
                    summaries.append(out)
                except:
                    print("ERROR!")
            chap_sum[chap] = summaries
        book_sum[book] = chap_sum   

with open('./build_dataset/book_summaries.pkl', 'wb') as file:
    pickle.dump(book_sum, file)

### Aligning Raw Text with Summaries

In [5]:
books = pickle.load(open('./build_dataset/book_dataset.pkl', 'rb'))
summ = pickle.load(open('./build_dataset/book_summaries.pkl', 'rb'))

# Load a pre-trained sentence transformer model
model = SentenceTransformer('sentence-transformers/all-roberta-large-v1')

book_labels = {}
for title in tqdm(books, desc="Processing books..."):
    chap_labels = []
    for chapter in books[title]:
        section_labels = []
        for summ_sen, paragraphs in zip(summ[title][chapter], books[title][chapter]):
            if summ_sen and paragraphs:
                # Generate embeddings
                sentence_embedding = model.encode(summ_sen)
                sentence_list_embeddings = model.encode(paragraphs)

                # Compute cosine similarities
                cosine_similarities = util.cos_sim(sentence_embedding, sentence_list_embeddings).flatten()

                # Find the index of the best match
                best_match_index = cosine_similarities.argmax().item()
                best_match_sentence = paragraphs[best_match_index]

                section_labels.append(best_match_index)
            else:
                section_labels.append(None)
        chap_labels.append(section_labels)
    book_labels[title] = chap_labels

with open('./build_dataset/book_labels.pkl', 'wb') as file:
    pickle.dump(book_labels, file)

Processing books...: 100%|████████████████████████| 1/1 [00:30<00:00, 30.72s/it]


### Presentation-making (Ground Truth)

In [8]:
# Create directory for gifs
slide_dir = './groundtruth_slides'
if not os.path.exists(slide_dir):
    os.makedirs(slide_dir)

def normalise_layout(left, top, width, height, slide_width, slide_height):
    normalized_bbox = [
        left / slide_width,   # x_min
        top / slide_height,  # y_min
        width / slide_width,   # x_max
        height / slide_height   # y_max
    ]
    return normalized_bbox # divided by 914400 to convert to inches

summaries = pickle.load(open('./build_dataset/book_summaries.pkl', 'rb'))
type_scene = [title.lower().replace(" ", "_") for title in summaries.keys()]
predefined_layouts = [{"text": [0.25, 5.25, 9.5, 1], "image": [2.5, 0.1, 5, 5]}, {"text": [0.25, 0.1, 9.5, 1], "image": [2.5, 2.25, 5, 5]}, {"text": [5.25, 1, 4.5, 5], "image": [0.1, 1, 5, 5]}, {"text": [0.1, 1, 4.5, 5], "image": [4.85, 1, 5, 5]}]

gd_bbox = {}
for bk_title, gifs in zip(summaries, type_scene):
    chapters = summaries[bk_title]
    chap_bbox = {}
    for ch in chapters:
        # Create a presentation object
        prs = Presentation()
        text = summaries[bk_title][ch]

        slide_objs = []
        for sentence in text:
            layout = random.choice([0, 1, 2, 3])
            # Image bounding box 
            i_left, i_top, i_width, i_height = [Inches(pt) for pt in predefined_layouts[layout]['image']]
            # Text bounding box 
            t_left, t_top, t_width, t_height = [Inches(pt) for pt in predefined_layouts[layout]['text']]

            # Add a slide with a title and content layout
            slide_layout = prs.slide_layouts[6]
            slide = prs.slides.add_slide(slide_layout)
            # Get the slide width and height 
            slide_width = prs.slide_width 
            slide_height = prs.slide_height

            # BBOX
            slide_object = {'text': normalise_layout(t_left, t_top, t_width, t_height, slide_width, slide_height), 'image': normalise_layout(i_left, i_top, i_width, i_height, slide_width, slide_height)}
            slide_objs.append(slide_object)

            # Add placeholder images (plain rectangle)
            slide.shapes.add_shape(MSO_SHAPE.RECTANGLE, left=i_left, top=i_top, width=i_width, height=i_height)

            # Add text
            text_box = slide.shapes.add_textbox(left=t_left, top=t_top, width=t_width, height=t_height)
            text_frame = text_box.text_frame

            wrap_text(text=sentence, max_width=t_width, text_frame=text_frame, layout=layout)
        chap_bbox[ch] = slide_objs
        prs.save(f'./groundtruth_slides/{gifs}_{ch}.pptx')
    gd_bbox[bk_title] = chap_bbox

with open('./build_dataset/book_bbox.pkl', 'wb') as file:
    pickle.dump(gd_bbox, file)

### Paper-slide Pair PKL File Creation

In [None]:
# Load the pre-trained ResNet-152 model for image embeddings
img_model = models.resnet152(weights='IMAGENET1K_V1')
# Remove the final fully connected layer
img_model = T.nn.Sequential(*(list(img_model.children())[:-1]))
img_model.eval()

# Placeholder image
image_array = np.zeros((480, 480, 3), dtype=np.uint8)  # Replace this with your actual image array
# Convert the NumPy array to a Pytorch tensor
image_tensor = T.from_numpy(image_array).permute(2, 0, 1).float()
# Normalize the pixel values to [0, 1]
image_tensor /= 255.0
# Define the normalization transform (ImageNet normalization)
normalize = transforms.Normalize(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225],
)
# Apply the normalization
image_tensor = normalize(image_tensor)
# Add a batch dimension
input_batch = image_tensor.unsqueeze(0)

device = T.device("cuda" if T.cuda.is_available() else "cpu")
img_model.to(device)
input_batch = input_batch.to(device)

# Get the embeddings from the model
with T.no_grad():
    embeddings = img_model(input_batch).cpu()
# Reshape the embeddings to a 1D tensor [2048,]
img_embeddings = embeddings.squeeze()

# Establish encoding pre-trained model
sen_model = SentenceTransformer('sentence-transformers/all-roberta-large-v1')
def get_embeddings(text):
    embeddings = sen_model.encode(text)
    return embeddings

# Get files
books = pickle.load(open('./build_dataset/book_dataset.pkl', 'rb'))
slide_text = pickle.load(open('./build_dataset/book_summaries.pkl', 'rb'))
slide_labels = pickle.load(open('./build_dataset/book_labels.pkl', 'rb'))
slide_bboxs = pickle.load(open('./build_dataset/book_bbox.pkl', 'rb'))

for b in books:
    pkl_file = []
    print(b)
    # Get the papers (chapters)
    chapters = books[b]
    idd = 0
    for chap in chapters:
        chp_dict = {}
        # Add id of the book
        chp_dict['idd'] = idd

        # PAPER PART 
        chp_dict['paper'] = {}
        # Add the title of the chapter
        chp_dict['paper']['title'] = {'text': b + ": " + chap, 'embedding': get_embeddings(b + ": " + chap)}
        print("PAPER-chapter:", b + "." + chap)
        # Treat the paragraphs as sections
        sections = chapters[chap]
        sec_list = []
        for paragraphs in sections:
            sentences = []
            for sentence in paragraphs:
                sen_dict = {'text': sentence, 'embedding': get_embeddings(sentence)}
                sentences.append(sen_dict)
            sec_list.append(sentences)
            
        chp_dict['paper']['sections'] = sec_list

        # Put placeholder image [plain rectangular figure]
        chp_dict['paper']['figures'] = [{'name': 'placeholder_img.png', 'caption': "", 'embedding':[], 'pixel': np.zeros((480, 480, 3), dtype=np.uint8), 'feature': img_embeddings.tolist()}]

        # SLIDE PART - get summary sentences of current book from summaries ==============================
        # Initialise
        chp_dict['slide'] = {'pages': []}

        section_texts = slide_text[b][chap]
        section_labels = slide_labels[b][idd]
        section_bboxs = slide_bboxs[b][chap]
        # print(section_bboxs, section_labels, section_texts)
        
        for i in range(len(section_texts)):
            # Initialise the chapter dictionary for this page 
            section_slides = {'page': [], 'figure': []}
            # List of sections and their corresponding slides
            object_dict = {'text': section_texts[i], 'embedding': get_embeddings(section_texts[i]), 'bbox': section_bboxs[i]['text'], 'label': section_labels[i]} # the last character of title refer to the placement of the sentence in the section 
            
            # Fill section_slides - each section has one text object and one image object
            section_slides['page'].append(object_dict)
            section_slides['figure'].append({'name': 'placeholder_img.png', 'bbox': section_bboxs[i]['image'], 'label': 0})

            # Add section slides to pages
            chp_dict['slide']['pages'].append([section_slides])

        # Add book to pkl file after extracting info
        pkl_file.append(chp_dict)
        idd += 1

    # Create pkl file
    with open(f'./books/{b}.pkl', 'wb') as file:
        pickle.dump(pkl_file, file)

## Visual Generation

In [2]:
# PROMPT GENERATION
client = Client()

def get_prompt(text, prev, model="gpt-4-turbo"):
    prompt = f"""
            You are an excellent film maker who can imagine scenes based on given text from a book. 
            Your job is to describe me a picture of a scene in one English sentence based on the following text: {text}
            and the context of the prior text: {prev}. Please focus more on the setting and actions present in the texts.

            Here is an example:
            text: "I think," said I, following as far as I could the methods of my companion, "that Dr. Mortimer is a successful, elderly medical man, well-esteemed since those who know him give him this mark of their appreciation" "Good!" said Holmes. "I think also that the probability is in favour of his being a country practitioner who does a great deal of his visiting on foot." "Then I was right." "To that extent." "But that was all."
            context: In a dimly lit room, Sherlock Holmes stands by the hearth-rug, examining a thick, bulbous-headed stick engraved with the words \"To James Mortimer, M.R.S., from his friends of the C.C.H., 1884,\"
            output: 'In a dimly lit room, he is having a serious discussion with his companion whilst holding a bulbous-headed stick'

            If the context is empty, please generate the output based from the text only.
            You must strictly follow the desired output format (a single string).
            Your output must be only strictly English and within 70 words.
            """
    
    messages = [{"role": "user", "content": prompt}]
    response = client.chat.completions.create(
            model=model,
            messages=messages)
    return response.choices[0].message.content

In [46]:
# ANIMATEDIFF
def generate_visuals_vid(pipe, prompt, chap_title, section):
    # Create directory for gifs
    gif_dir = f'./generated_test_slides/{chap_title}_gifs'
    if not os.path.exists(gif_dir):
        os.makedirs(gif_dir)
    
    output = pipe(
        prompt=(prompt),
        negative_prompt="bad quality, worse quality",
        num_frames=16,
        guidance_scale=7.5,
        num_inference_steps=25,
        generator=T.Generator(device).manual_seed(42),
    )
    frames = output.frames[0]
    export_to_gif(frames, f"{gif_dir}/{chap_title}_scene_{section}.gif")
    return f"{gif_dir}/{chap_title}_scene_{section}.gif"

In [47]:
# STABLEDIFF
def generate_visuals_img(pipe, prompt, chap_title, section):
    png_dir = f'./generated_test_slides/{chap_title}_pngs'
    if not os.path.exists(png_dir):
        os.makedirs(png_dir)
        
    image = pipe(prompt).images[0]    
    image.save(f"{png_dir}/{chap_title}_scene_{section}.png")
    return f"{png_dir}/{chap_title}_scene_{section}.png"


## Pipeline Comparisons

### Loading Models - Base DOC2PPT and Fine-tuned DOC2PPT

In [3]:
device = T.device("cuda" if T.cuda.is_available() else "cpu")

loaded_model = T.load('./models/model_hse-tf.pt', map_location=T.device('cpu'))
loaded_loc_model = T.load('./models/ciou_locmodel_best_early.pt', map_location=T.device('cpu'))
story_loaded_model = T.load('./models/storybook_model_dropout_best.pt', map_location=T.device('cpu'))
story_loaded_loc_model = T.load('./models/ciou_storyloc_best_scratch.pt', map_location=T.device('cpu'))

prefix_to_remove = 'module.'
loc_model_state_dict = {key[len(prefix_to_remove):] if key.startswith(prefix_to_remove) else key: value for key, value in loaded_loc_model.items()}
story_loc_model_state_dict = {key[len(prefix_to_remove):] if key.startswith(prefix_to_remove) else key: value for key, value in story_loaded_loc_model.items()}

#  PIPELINE 1 + 2 - testing base DOC2PPT with scientific papers + novels
# base model
base_model = Model().to(device)
base_model.load_state_dict(loaded_model)

# base object placer
base_loc_model = MLPlayout().to(device)
base_loc_model.load_state_dict(loc_model_state_dict)


# PIPELINE 3 + 4 - testing fine-tuned DOC2PPT with novels and scientific papers
# story model
story_model = Model().to(device)
story_model.load_state_dict(story_loaded_model)

# story object placer
story_loc_model = MLPlayout().to(device)
story_loc_model.load_state_dict(story_loc_model_state_dict)

<All keys matched successfully>

In [4]:
paper_dat = {}
stories_dat = {}

for conf in json.load(open('./data/v1.0/train_val_test_2.json', 'r')): # train_val_test_2.json is the json file containing only 14 out of 19 conferences; train_val_test.json contains the full dataset
   pkl = pickle.load(open(f'./data/v1.0/{conf}.pkl', 'rb'))
   paper_dat[conf] = {}
   for item in pkl:
       idd = item['idd']
       paper_dat[conf][idd] = item

for conf in json.load(open('./data/v1.0/book_json.json', 'r')):
    pkl = pickle.load(open(f'./books/{conf}.pkl', 'rb'))
    stories_dat[conf] = {}
    for item in pkl:
        idd = item['idd']
        stories_dat[conf][idd] = item
    
# Dataloader
sci_papers = DLoader(paper_dat, 'test', domain="original")
stories = DLoader(stories_dat, 'test', domain="stories")

### Generate Desired Objects

In [5]:

def pad_tensor(vec, pad, dim):
        pad_size = list(vec.shape)
        pad_size[dim] = pad - vec.size(dim)
        return T.cat([vec, T.zeros(*pad_size, device=device)], dim=dim)

def get_objects_labels(data, model, loc_model, sec_slide_count=False):
    outputs = {}
    count = 0
    for book in data:
        out = model(book)
        outputs[book['conf'] + "." + str(book['idd'])] = out
        count += 1

    # Get labels of desired objects
    book_labels = {}
    for book_out in outputs:
        book_labels[book_out] = []
        book_objs = outputs[book_out]['pd_obj']
        if sec_slide_count:
            num_sec_slides = outputs[book_out]['out_tok_page'] # Needed to get number of slides per each section (for evaluations)
    
            pred_sec_slide_num = [len(sec[:-1]) for sec in num_sec_slides] # Don't include action '1' as that indicates the end of the slide, not the slide itself
            with open(f'./generated_test_slides/slide_section_counts/{book_out}_sec_counts.pkl', 'wb') as file: # Save to pickle files
                pickle.dump(pred_sec_slide_num, file)

        for section_objs in book_objs:
            section_labels = []

            for slide_objs in section_objs:
                slide = {}
                slide_labels = []

                for obj in slide_objs[:-1]:
                    label = T.argmax(obj)
                    slide_labels.append(label.item())
                slide['slide_objs'] = slide_labels

                # Calculate bboxes
                if slide_objs[:-1] != []:
                    pd = T.cat(slide_objs[:-1], dim=0)
                    pd_padded = [pad_tensor(i, 1024, 0) for i in pd]
                    pd_bbox = [loc_model(p).cpu().detach().numpy() for p in pd_padded]

                slide['locations'] = pd_bbox

                section_labels.append(slide)
            book_labels[book_out].append(section_labels)
    return book_labels

In [6]:
science_labels = get_objects_labels(sci_papers, base_model, base_loc_model)
story_on_base_labels = get_objects_labels(stories, base_model, base_loc_model)
story_labels = get_objects_labels(stories, story_model, story_loc_model, sec_slide_count=True)
science_on_story_labels = get_objects_labels(sci_papers, story_model, story_loc_model)

### PROGRESS TRACKER: Object Selection Performance

In [8]:
def get_label_accuracies(data, pred, sample=50):
    pred_labels = []
    for paper in pred:
        sec_labels = []
        for sections in pred[paper]:
            slide_labels = []
            for slides in sections:
                objs_labels = []
                for obj in slides['slide_objs']:
                    label = obj
                    objs_labels.append(label)
                slide_labels.append(objs_labels)
            sec_labels.append(slide_labels)
        pred_labels.append(sec_labels)

    count = 0
    means = []
    for e,f in zip(data, pred_labels):  
        if count > sample:
            break
        dat = e['out_obj']

        # Extra for-loop due to scientific papers having more than one slides for some sections
        cleaned_data = [[[elem for elem in subsublist if elem is not None] for subsublist in sublist if subsublist != []] for sublist in dat if sublist != [] ]
        gt = sum(cleaned_data, [])
        gt = np.array(sum(gt, []))

        # Predicted
        pd = sum(f, [])
        pd = np.array(sum(pd, []))

        means.append(np.mean(gt == pd))
        count += 1
    return means

In [9]:
variants = [(sci_papers, science_labels), (stories, story_on_base_labels), (stories, story_labels), (sci_papers, science_on_story_labels)]
model = ["model 1", "model 2", "model 3", "model 4"]
for m, v in zip(model, variants):
    print(f"accuracy score for {m}:", np.mean(get_label_accuracies(v[0], v[1])))

accuracy score for model 1: 0.6602599330876657
accuracy score for model 2: 0.08723724532548062
accuracy score for model 3: 0.7009956546721254
accuracy score for model 4: 0.3815357351936311


  means.append(np.mean(gt == pd))


### OBJECT PLACER: Bounding Box Prediction Performance (Mean IoU)

In [10]:
def get_bboxs(data, pred, type="paper"):
    pred_location = []
    for paper in pred:
        sec_location = []
        for sections in pred[paper]:
            slide_location = []
            for slides in sections:
                objs_location = []
                for obj in slides['locations']:
                    label = obj
                    objs_location.append(label)
                slide_location.append(objs_location)
            sec_location.append(slide_location)
        pred_location.append(sec_location)
    
    count = 0
    pd_bbox = []
    gd_bbox = []
    slide_sizes = []
    for e,f in zip(data, pred_location):
        # Ground truth    
        dat = e['out_bbox']

        # Extra for-loop due to scientific papers having more than one slides for some sections
        cleaned_data = [[[elem for elem in subsublist if elem is not None] for subsublist in sublist if subsublist != []] for sublist in dat if sublist != [] ]
        gt = sum(cleaned_data, [])
        gt = np.array(sum(gt, []))

        # Predicted
        pd = sum(f, [])
        pd = np.array(sum(pd, []))

        pd_bbox.append(pd)
        gd_bbox.append(gt)
        if type == "paper":
            slide_sizes.append(e['slide_size'])
        else:
            slide_sizes.append([960, 720])

        count += 1
    return pd_bbox, gd_bbox, slide_sizes


In [11]:
pd_base, gd_base, sizes_base = get_bboxs(sci_papers, science_labels)
pd_mix, gd_mix, sizes_mix = get_bboxs(stories, story_on_base_labels, type="story")
pd_story, gd_story, sizes_story = get_bboxs(stories, story_labels, type="story")
pd_mix_, gd_mix_, sizes_mix_ = get_bboxs(sci_papers, science_on_story_labels)

In [12]:
# [left, top, width, height] -> [x1, y1, x2, y2]
def convert_bbox_format(bbox, slide_size):
    slide_width, slide_height = slide_size

    left, top, width, height = bbox
    x1 = left * slide_width
    y1 = top * slide_height
    x2 = (left + width) * slide_width
    y2 = (top + height) * slide_height
    
    return [x1, y1, x2, y2]

def calculate_iou(box1, box2, slide_size):
    # [left, top, width, height] -> [x1, y1, x2, y2]
    box1 = convert_bbox_format(box1, slide_size) 
    box2 = convert_bbox_format(box2, slide_size) 

    # Calculate intersection coordinates
    x1_inter = max(box1[0], box2[0])
    y1_inter = max(box1[1], box2[1])
    x2_inter = min(box1[2], box2[2])
    y2_inter = min(box1[3], box2[3])
    
    # Calculate intersection area
    inter_width = max(0, x2_inter - x1_inter)
    inter_height = max(0, y2_inter - y1_inter)
    inter_area = inter_width * inter_height
    
    # Calculate area of both bounding boxes
    box1_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
    box2_area = (box2[2] - box2[0]) * (box2[3] - box2[1])
    
    # Calculate union area
    union_area = box1_area + box2_area - inter_area
    
    # Calculate IoU
    iou = inter_area / union_area if union_area > 0 else 0
    return iou

# Averages IOU scores of slide objects within slide deck
def mean_iou(pred_boxes, gt_boxes, slide_size):
    ious = []
    for pred_box, gt_box in zip(pred_boxes, gt_boxes):
        iou = calculate_iou(pred_box, gt_box, slide_size)
        ious.append(iou)
    
    return np.mean(ious)


In [13]:
variants = [(sci_papers, science_labels, "paper"), (stories, story_on_base_labels, "stories"), (stories, story_labels, "stories"), (sci_papers, science_on_story_labels, "paper")]
model = ["model 1", "model 2", "model 3", "model 4"]

for m, v in zip(model, variants):
    pd_base, gd_base, sizes = get_bboxs(v[0], v[1], type=v[2])
    means = []
    for pred_boxes, gt_boxes, slide_size in zip(pd_base, gd_base, sizes):
        mean_iou_score = mean_iou(pred_boxes, gt_boxes, slide_size)
        means.append(mean_iou_score)
    print(f"MIOU score for {m}:", np.mean(means))

MIOU score for model 1: 0.09829466291333415
MIOU score for model 2: 0.22393091893337758
MIOU score for model 3: 0.33465671285270704
MIOU score for model 4: 0.07724153442317817


## Presentation making for Two Different Pipelines

### Prepare Presentation Content

In [7]:
def get_presentation_content(data, book_labels):
    presentation_content = {}

    for book, book_label in zip(data, book_labels):
        input_text = book['inp_text']
        section_labels = book_labels[book_label]

        book_presentation = []
        chap_num = 1
        for section, sec_label in zip(input_text, section_labels):
            if section is None:
                continue
            # print(f"================== CHAPTER {chap_num} =====================")
            section_slides = []

            # Concat input text with input figures to create singular list of possible slide objects
            section = section + book['inp_fig']

            slide_count = 1
            for slide_labels in sec_label:
                # print(f"SLIDE {slide_count}")
                slide_sentences = []
                for label, bbox in zip(slide_labels['slide_objs'], slide_labels['locations']):
                    object_ = section[label]

                    if any(object_.lower().endswith(ext) for ext in ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp']):
                        if object_ == 'placeholder_img.png': 
                            chosen_obj = {'figure': object_, 'bbox': bbox}
                        else:
                            # Find pixels of figure in source doc
                            fig = book['inp_fig'].index(object_)
                            chosen_obj = {'figure': book['inp_pix'][fig], 'bbox': bbox}
                    else:    
                        chosen_obj = {'text': object_, 'bbox': bbox}
                    slide_sentences.append(chosen_obj)
                slide_count += 1
                section_slides.append(slide_sentences)
            chap_num += 1
            book_presentation.append(section_slides)
        presentation_content[book['conf'] + "." + str(book['idd'])] = book_presentation
    return presentation_content

In [8]:
story_content = get_presentation_content(stories, story_labels)
story_base_content = get_presentation_content(stories, story_on_base_labels)
sci_paper_content = get_presentation_content(sci_papers, science_labels)
sci_paper_story_content = get_presentation_content(sci_papers, science_on_story_labels)

### Make BOOK2PPT Presentations

In [None]:
def create_prompts(chapter):
    generated_text = []
   
    for section in chapter:
        for slide in section: # each section only has one slide with two objects (text and image)
            slide_text = {}
            for object_ in slide: 
                if 'text' in object_:
                    current_text = object_['text']
                    prev = generated_text[-1]['prompt'] if generated_text != [] else "" # get previous prompt for context; empty for the first generated prompt
                    scene_prompt = get_prompt(current_text, prev)
                    
                    print("context:", prev) 
                    print("prompt:", scene_prompt)
                    print()
                    
                    slide_text['text'] = current_text
                    slide_text['prompt'] = scene_prompt
                    slide_text['context'] = prev
                    generated_text.append(slide_text)
    return generated_text 

chosen_chapters = ['The Goose Girl.5', 'Wuthering Heights.0', 'Frankenstein.volume 1.2']

# Create test slide decks for evaluation
for book_src in chosen_chapters: 
    prompts = create_prompts(story_content[book_src]) 
    with open(f'./generated_test_slides/{book_src}_text_prompts.pkl', 'wb') as file:
        pickle.dump(pkl_file, file)

In [67]:
def create_slides(prs, chp, source, layout="predicted", generate_images=False, prompts=None, vis_generator=""):
    book = {chp: source[chp]}
    
    # Initialize the image generation pipeline
    if vis_generator == "animatediff":
        adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2", torch_dtype=T.float16)
        model_id = "SG161222/Realistic_Vision_V5.1_noVAE"
        pipe = AnimateDiffPipeline.from_pretrained(model_id, motion_adapter=adapter, torch_dtype=T.float16)
        scheduler = DDIMScheduler.from_pretrained(model_id, subfolder="scheduler", clip_sample=False, timestep_spacing="linspace", beta_schedule="linear", steps_offset=1)
        pipe.scheduler = scheduler
        pipe.enable_vae_slicing()
        pipe.enable_model_cpu_offload()
    elif vis_generator == "stablediff":
        pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=T.float16).to("cuda")

    prompts_used = [0]
    
    for chp, sections in book.items():
        for sect in sections:
            for slide_ in sect:
                slide = prs.slides.add_slide(prs.slide_layouts[6])

                # Define four possible layouts for ground truth bounding boxes
                if layout == "ground_truth":
                    predefined_layouts = [{"text": [0.25, 5.25, 9.5, 1], "image": [2.5, 0.1, 5, 5]}, {"text": [0.25, 0.1, 9.5, 1], "image": [2.5, 2.25, 5, 5]}, {"text": [5.25, 1, 4.5, 5], "image": [0.1, 1, 5, 5]}, {"text": [0.1, 1, 4.5, 5], "image": [4.85, 1, 5, 5]}]
                    layout_idx = random.choice([0, 1, 2, 3])
                    t_left, t_top, t_width, t_height = predefined_layouts[layout_idx]['text']
                    i_left, i_top, i_width, i_height = predefined_layouts[layout_idx]['image']
                else:
                    t_left = t_top = t_width = t_height = i_left = i_top = i_width = i_height = 0
                
                # For each chosen object in slide
                for obj in slide_:
                    # Define which layout to use: ground-truth or predicted; if former get a random predefined layout, Otherwises, get predicted bbox of current slide object
                    if layout == "predicted":
                        left, top, width, height = obj['bbox']
                        left = max(0, left * prs.slide_width) / 914400
                        top = max(0, top * prs.slide_height) / 914400
                        width = max(0, width * prs.slide_width) / 914400
                        height = max(0, height * prs.slide_height) / 914400
                    else:
                        left, top, width, height = (i_left, i_top, i_width, i_height) if 'figure' in obj else (t_left, t_top, t_width, t_height)
                    
                    if 'text' in obj:
                        # Paraphrase the text if it has 70 tokens or more 
                        text_box = slide.shapes.add_textbox(left=Inches(left), top=Inches(top), width=Inches(width), height=Inches(height))
                        text_frame = text_box.text_frame
                        if len(obj['text']) > 70:
                            text = summarizer(obj['text'], max_length=150, min_length=30, do_sample=False)[0]['summary_text']
                        else:
                            text = obj['text']
                        #  Ensures that the text fits within the confines of the bounding box
                        wrap_text(text=text, max_width=Inches(width), text_frame=text_frame)
                    
                    elif 'figure' in obj:
                        # If chosen figure states name of placeholder image, generate visuals (or placeholder rectangles for groundtruth slide deck creation)
                        if isinstance(obj['figure'], str):
                            if generate_images:
                                counter = prompts_used[-1]
                                scene_prompt = prompts[counter]['prompt'] if "can I help you today?" and "Join our discord for more:" not in prompts[counter]['prompt'] else prompts[counter]['context']
                                if vis_generator == "animatediff":
                                    image = generate_visuals_vid(pipe, scene_prompt, chap_title=chp, section=counter)
                                    prompts_used.append(counter + 1)
                                else:
                                    image = generate_visuals_img(pipe, scene_prompt, chap_title=chp, section=counter)
                                    prompts_used.append(counter + 1)

                                slide.shapes.add_picture(image, left=Inches(left), top=Inches(top), width=Inches(width), height=Inches(height))
                            else:
                                slide.shapes.add_shape(MSO_SHAPE.RECTANGLE, left=Inches(left), top=Inches(top), width=Inches(width), height=Inches(height))

                        else:
                            image = Image.fromarray(obj['figure'])
                            image_path = "./temp_image.png"
                            image.save(image_path)
                            
                            slide.shapes.add_picture(image_path, left=Inches(left), top=Inches(top), width=Inches(width), height=Inches(height))


In [None]:
# Generate different variations of the model
variants = ['story', 'story_on_base', 'sci_paper', 'sci_paper_on_story'] 
content = [story_content, story_base_content, sci_paper_content, sci_paper_story_content]

for v, type_ in zip(variants[2:3], content[2:3]):
    # Get sample chapters and turn them into a smaller dictionary
    chosen_chapters = random.sample(list(type_.items()), 1)
    for book_src in chosen_chapters:
        prs = Presentation()
        if "story" in v:
            create_slides(prs=prs, chp=book_src[0], source=type_, layout="predicted", generate_images=False)
        elif "sci" in v:
            create_slides(prs=prs, chp='cvpr10.29', source=type_, layout="predicted")
        prs.save(f'{v}.pptx')

In [None]:
chosen_chapters = ['The Goose Girl.5', 'Wuthering Heights.0', 'Frankenstein.volume 1.2']

# Create test slide decks for evaluation
for book_src in chosen_chapters:
    print(book_src)
    prs_1 = Presentation()
    prs_2 = Presentation()
    
    prompts = pickle.load(open(f'./generated_test_slides/{book_src}_text_prompts.pkl', 'rb'))
    
    create_slides(prs=prs_1, chp=book_src, source=story_content, layout="ground_truth", generate_images=True, prompts=prompts, vis_generator="stablediff")
    prs_1.save(f'./generated_test_slides/{book_src}_stablediff.pptx')
    
    create_slides(prs=prs_2, chp=book_src, source=story_content, layout="ground_truth", generate_images=True, prompts=prompts, vis_generator="animatediff")
    prs_2.save(f'./generated_test_slides/{book_src}_animatediff.pptx')
