In [None]:
import base64
import json
import os
import time

import cv2
import numpy as np
from PIL import Image, ImageDraw, ImageFont
from tqdm import tqdm
from langchain_openai import ChatOpenAI
from langchain.schema.messages import HumanMessage, AIMessage
from langchain_core.output_parsers import StrOutputParser, JsonOutputParser
from langchain_core.prompts import ChatPromptTemplate

from key import OPENAI_KEY # Import your own OpenAI key.

In [None]:
text_dir = "../data/text/Subtask_2_test.json" # path to json file
# Load the conversation annotations. The context manager closes the file
# handle deterministically instead of leaving it open until garbage collection.
with open(text_dir) as anno_file:
    anno = json.load(anno_file)

#### Creating images corresponding to each video utterance

In [None]:
def write_text(image, text, fname, font_path="OpenSans-Regular.ttf"):
    """
    Draw `text`, word-wrapped and centred, onto the image stored at `image`
    and save the result as "captions/<fname>.jpg".

    The font size starts at 36 and is reduced by one until the wrapped text is
    no taller than 80% of the image height (or the size counter reaches 1).
    Lines are wrapped so that each one is at most 90% of the image width,
    unless a single word is wider than that on its own.

    Args:
        image: Path (or file object) of the background image to draw on.
        text: Text to render; any run of whitespace is treated as a word break.
        fname: Output file stem; the file is written to "captions/<fname>.jpg".
        font_path: Path to a TrueType font file. Insert your own font path here.
    """
    with Image.open(image) as im:
        # Measure against the real canvas so the text stays centred for any
        # background size (512x192 for the bar image generated in this notebook).
        width, height = im.size
        draw = ImageDraw.Draw(im)

        text_width = width * 0.90
        text_max_height = height * 0.8
        words = text.split()

        size = 36
        # Dynamic resizing
        while size > 1:
            font = ImageFont.truetype(font_path, size)
            lines = []
            line = ""
            for word in words:
                proposed_line = f"{line} {word}" if line else word
                if font.getlength(proposed_line) <= text_width:
                    line = proposed_line
                else:
                    # If this word was added, the line would be too long.
                    # Start a new line instead, but never emit an empty line:
                    # that happens when the very first word is already too wide
                    # and would add a blank row to the measured height.
                    if line:
                        lines.append(line)
                    line = word
            if line:
                lines.append(line)
            wrapped = "\n".join(lines)

            x1, y1, x2, y2 = draw.multiline_textbbox((0, 0), wrapped, font=font)
            w, h = x2 - x1, y2 - y1
            if h <= text_max_height:
                break
            # The text did not fit comfortably into the image.
            # Try again at a smaller font size.
            size -= 1

        # Offset by the bounding-box origin so the inked area, not the text
        # anchor, ends up centred on the canvas.
        draw.multiline_text(
            (width / 2 - w / 2 - x1, height / 2 - h / 2 - y1),
            wrapped,
            font=font,
            align="center",
            fill=(0, 0, 0),
        )
        im.save("captions/" + fname + ".jpg")

def make_video_grid(video_path, video_dir):
    """
    Build a single summary image for one utterance video.

    Up to 9 frames are sampled at a fixed stride from the video, tiled
    row by row into a 3x3 grid, and the grid is resized to 512x320. The caption
    strip stored at "captions/<video_id>.jpg" is stacked underneath and the
    result is written to "frames/<video_id>.jpg", where <video_id> is the part
    of `video_path` before the first ".". Grid cells for which no frame could
    be read stay black.

    Args:
        video_path: File name of the video inside `video_dir`.
        video_dir: Directory that contains the video file.

    Raises:
        IOError: If the video cannot be opened.
        FileNotFoundError: If the caption image for this video cannot be read.
    """
    grid_size = 3
    cells = grid_size * grid_size
    # Same stem convention as the caption and frame file names used by the
    # rest of the notebook.
    video_id = video_path.split(".")[0]
    full_video_path = os.path.join(video_dir, video_path)

    # Open video
    cap = cv2.VideoCapture(full_video_path)
    try:
        if not cap.isOpened():
            raise IOError("Could not open video: " + full_video_path)

        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        # Frame skipping interval
        if frame_count >= cells:
            frame_skip = frame_count // cells
        else:
            # Too few frames to subsample: take every frame that exists.
            print("frame count:", frame_count)
            frame_skip = 1

        # Frames are counted from 1, so the frames kept are numbers
        # frame_skip, 2*frame_skip, ... until the grid is full.
        frame_num = 1
        frame_buffer = []
        while len(frame_buffer) < cells:
            ret, frame = cap.read()
            if not ret:
                break
            if frame_num % frame_skip == 0:
                frame_buffer.append(frame)
            frame_num += 1
    finally:
        # Release the capture even when opening or decoding fails.
        cap.release()

    # Load text bar
    caption_path = "captions/" + video_id + ".jpg"
    text = cv2.imread(caption_path)
    if text is None:
        # cv2.imread signals failure by returning None; fail with a clear
        # message here instead of an obscure error inside np.vstack.
        raise FileNotFoundError("Could not read caption image: " + caption_path)

    # Initialize output image and paste the frames in row-major order.
    grid_image = np.zeros((frame_height * grid_size, frame_width * grid_size, 3), np.uint8)
    for index, frame in enumerate(frame_buffer):
        row, col = divmod(index, grid_size)
        grid_image[
            row * frame_height:(row + 1) * frame_height,
            col * frame_width:(col + 1) * frame_width,
        ] = frame

    image = cv2.resize(grid_image, (512, 320))
    image = np.vstack([image, text])
    cv2.imwrite("frames/" + video_id + ".jpg", image)

In [None]:
# A plain white strip (192 rows x 512 columns) that is placed under the frame
# grid and carries the speaker name and the utterance text.
bar = np.full((512 - 320, 512, 3), 255, dtype=np.uint8)
cv2.imwrite("bar.jpg", bar)

for out_dir in ("frames", "captions"):
    os.makedirs(out_dir, exist_ok=True)

# Pass 1: render one caption strip per utterance of every conversation.
for an in anno:
    conv_id = an['conversation_ID']
    for utt in an['conversation']:
        caption_text = '{}: "{}"'.format(utt['speaker'], utt['text'])
        caption_name = "dia{}utt{}".format(conv_id, utt['utterance_ID'])
        write_text("bar.jpg", caption_text, caption_name)

# Pass 2: build the frame-grid image for every utterance video, skipping
# videos whose output image already exists.
for an in tqdm(anno):
    for utt in an['conversation']:
        video_path = utt['video_name']
        grid_path = "frames/" + video_path.split(".")[0] + ".jpg"
        if os.path.exists(grid_path):
            continue
        make_video_grid(video_path, "eval_videos/videos/")

#### Prompting GPT-4 Vision API

In [None]:
def encode_image(image_path):
    """
    Return the bytes of the file at `image_path` as a base64 string.

    NOTE(review): the notebook called encode_image without defining it; this
    implementation assumes the plain base64 text of the JPEG file is what the
    {"image": ...} message parts built in get_caption expect - confirm.
    """
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")


def get_conv_imgs_batched(conversation, batch_size=10):
    """
    Encode the utterance images of one conversation and split them into batches.

    For every utterance the image "frames/<stem>.jpg" is read, where <stem> is
    the part of utt['video_name'] before the first ".".

    Args:
        conversation: Iterable of utterance dicts, each with a 'video_name' key.
        batch_size: Maximum number of images per batch.

    Returns:
        A list of lists of base64 strings in utterance order. Every batch holds
        `batch_size` images except possibly the last one; an empty conversation
        yields an empty list.

    Raises:
        ValueError: If `batch_size` is 0.
    """
    conv_imgs = [
        encode_image("frames/" + utt['video_name'].split(".")[0] + ".jpg")
        for utt in conversation
    ]
    # Stepping the range by batch_size gives ceil(len / batch_size) slices
    # without any float arithmetic.
    return [
        conv_imgs[start:start + batch_size]
        for start in range(0, len(conv_imgs), batch_size)
    ]


In [None]:
chain = ChatOpenAI(
    model="gpt-4-vision-preview",
    openai_api_key=OPENAI_KEY,
)

def get_caption(batch):
    """
    Ask the vision model to describe one batch of utterance images.

    Each element of `batch` is one encoded utterance image (a 3x3 grid of video
    frames with the speaker and utterance text printed underneath). The request
    consists of a persona message followed by a single human message holding
    the instruction text and one {"image": ...} part per element of `batch`.

    Returns the `content` of the model's reply.
    """
    # NOTE(review): the persona is sent as an AIMessage rather than a system
    # message - confirm this is intentional before changing it.
    persona = AIMessage(
        content="You are an expert of Friends TV Show. You can understand a video scene from a few of its frames shown in sequence. You give precise descriptive analysis."
    )
    # The backslash continuation keeps the second line's leading spaces inside
    # the string, so the whitespace before "Scene Description" is part of the prompt.
    instruction = "Describe what is likely going on in following images of video frames of each utterance in conversation. The caption below provides speaker context. Give output as:\n\
                        Scene Description: {}"
    image_parts = [{"image": encoded} for encoded in batch]
    request = HumanMessage(content=[instruction, *image_parts])

    reply = chain.invoke([persona, request])
    return reply.content

In [None]:
# Raw captions per conversation: conversation ID -> list with one caption per image batch.
# To resume an interrupted run, load the checkpoint instead of starting empty:
#     video_raw_out = json.load(open("eval_raw_out.json"))
video_raw_out = dict()

In [None]:
# Processing all the conversations to get video captions.
# Save steps and sleeps are to make sure that progress is not lost due to hitting
# GPT-4 API Rate Limits. Please be careful!!

save_step = 10
new_results = 0  # conversations captioned during this run
try:
    for i, an in enumerate(anno):
        conv_id = an['conversation_ID']
        # A checkpoint reloaded from JSON has string keys, while entries added
        # in this session use the original ID, so both forms are checked.
        if str(conv_id) in video_raw_out or conv_id in video_raw_out:
            continue
        batches = get_conv_imgs_batched(an['conversation'])
        outputs = []
        for batch in tqdm(batches):
            out = get_caption(batch)
            outputs.append(out)
            time.sleep(0.5)  # brief pause between API calls
        video_raw_out[conv_id] = outputs
        new_results += 1
        print("[{}/{}] Processed Conv {}".format(i+1, len(anno), conv_id))
        if i % save_step == 0:
            print("json dump...")
            # The context manager flushes and closes the checkpoint file.
            with open("eval_raw_out.json", "w") as checkpoint_file:
                json.dump(video_raw_out, checkpoint_file)
finally:
    # Also persist the results gathered since the last periodic checkpoint when
    # an API call raises. Nothing is written when this run produced no new
    # result, so an existing checkpoint is not replaced by a run that failed
    # on its first request.
    if new_results:
        with open("eval_raw_out.json", "w") as checkpoint_file:
            json.dump(video_raw_out, checkpoint_file)

In [None]:
# Write the complete set of raw captions. The context manager flushes and
# closes the file instead of leaving the handle open.
with open("eval_raw_out.json", "w") as raw_out_file:
    json.dump(video_raw_out, raw_out_file)

#### Postprocessing Raw Batched Video Captions into a Coherent Caption for the Whole Conversation

In [None]:
# Reload the raw captions from disk (keys come back as strings). The context
# manager closes the file handle once the JSON has been parsed.
with open("eval_raw_out.json") as raw_out_file:
    video_raw_out = json.load(raw_out_file)

In [None]:
# Two parallel lists in the dict's iteration order: the conversation IDs, and
# for each one the prompt input whose "description" is that conversation's
# batch captions joined with newlines.
conv_ids = list(video_raw_out.keys())
descs = [
    {"description": "\n".join(batch_captions)}
    for batch_captions in video_raw_out.values()
]

In [None]:
# Instruction text for the summarisation step; {description} is filled with
# the joined batch captions of one conversation.
SUMMARY_TEMPLATE = """
Following is a descriptions of video clip from Friends TV show for a particular conversation.
The descriptions are broken from each other. Stitch the description into a continous coherent
narrative of the whole scene
{description}
"""
summary_prompt = ChatPromptTemplate.from_template(SUMMARY_TEMPLATE)

# No model name is passed, so ChatOpenAI falls back to its own default model.
output_parser = StrOutputParser()
model = ChatOpenAI(openai_api_key=OPENAI_KEY)

# prompt -> chat model -> plain string
description_chain = summary_prompt | model | output_parser

In [None]:
# Run the summarisation chain over every conversation, with the concurrency
# limit set to 10.
batch_config = {"max_concurrency": 10}
outs = description_chain.batch(descs, config=batch_config)

In [None]:
# Pair every summary with the conversation ID at the same position.
video_proc_out = {
    conv_ids[position]: summary
    for position, summary in enumerate(outs)
}

In [None]:
# Write the stitched per-conversation captions. The context manager flushes
# and closes the file instead of leaving the handle open.
with open("eval_proc_out.json", "w") as proc_out_file:
    json.dump(video_proc_out, proc_out_file)