In [1]:
!pip install ipython-autotime
%load_ext autotime

Defaulting to user installation because normal site-packages is not writeable
time: 221 µs (started: 2024-02-07 23:43:01 -05:00)


In [2]:
# !pip install --quiet ipython-autotime
# !pip install moviepy==2.0.0.dev2
# !pip install imageio==2.25.1
# !sudo apt install imagemagick

time: 221 µs (started: 2024-02-07 23:43:02 -05:00)


In [3]:
!cat /etc/ImageMagick-6/policy.xml | sed 's/none/read,write/g'> /etc/ImageMagick-6/policy.xml

# If there is a problem with installing imagemagick please follow'
# the instrauctions for the policy file as explained here
# https://www.reddit.com/r/moviepy/comments/4nin6q/update_imagemagik_and_moviepy_has_broken/

/bin/bash: line 1: /etc/ImageMagick-6/policy.xml: Permission denied
cat: write error: Broken pipe
time: 108 ms (started: 2024-02-07 23:43:04 -05:00)


**Script takes.json files with list of words with start and end points in seconds, and creates karaoke-style captions that play simultaneously over audio to QC forced-alignment and speech-to-text ouput files**

Adapted from:
https://github.com/ramsrigouthamg/Supertranslate.ai/tree/main/Descript_like_wordhighlights_subtitles

In [4]:
import json
from pathlib import Path

from moviepy.editor import (
    ColorClip,
    CompositeVideoClip,
    TextClip,
    VideoFileClip,
)

time: 642 ms (started: 2024-02-07 23:43:07 -05:00)


In [5]:
# Support functions and export parameters


def split_text_into_lines(data: str) -> dict[str, str]:
    """Splits the text into lines.

    Args:
      data: Word extracted from the  AA aligned json file.

    Returns:
      subtitles: Subtitle transcript.
    """
    max_chars = 80
    # maximum duration in seconds
    max_duration = 3.0

    # Split if nothing is spoken (gap) for these many seconds
    max_gap = 1.5
    subtitles = []
    line = []
    line_duration = 0

    for idx, word_data in enumerate(data):
        onset = word_data["onset"]
        offset = word_data["offset"]

        line.append(word_data)
        line_duration += offset - onset

        temp = " ".join(item["word"] for item in line)

        # Check if adding a new word exceeds the maximum character count
        # or duration
        new_line_chars = len(temp)

        duration_exceeded = line_duration > max_duration
        chars_exceeded = new_line_chars > max_chars
        if idx > 0:
            gap = word_data["onset"] - data[idx - 1]["offset"]
            max_gap_exceeded = gap > max_gap
        else:
            max_gap_exceeded = False

        if duration_exceeded or chars_exceeded or max_gap_exceeded:
            if line:
                subtitle_line = {
                    "word": " ".join(item["word"] for item in line),
                    "onset": line[0]["onset"],
                    "offset": line[-1]["offset"],
                    "textcontents": line,
                }
                subtitles.append(subtitle_line)
                line = []
                line_duration = 0

    if line:
        subtitle_line = {
            "word": " ".join(item["word"] for item in line),
            "onset": line[0]["onset"],
            "offset": line[-1]["offset"],
            "textcontents": line,
        }
        subtitles.append(subtitle_line)

    return subtitles


def create_caption(
    text_json: json,
    frame_size: tuple[int],
    font: str = "Helvetica-Bold",
    font_size: int = 80,
    color: str = "white",
    bg_color: str = "blue",
) -> list:
    """Combines the video frames with the subtitles.

    Args:
      text_json: text extracted from the subtitles
      frame_size: apirori defined output video frame size.
      font: (Optional) Font for the subtitles
      font_size: (Optional) font_size of the subtitles
      color: (Optional) Font color of the subtitles
      bg_color: (Optional) Background of the video frame

    Return:
      word_clips: aligned word clips

    """
    full_duration = text_json["offset"] - text_json["onset"]

    word_clips = []
    xy_textclips_positions = []

    x_pos = 0
    y_pos = 0
    frame_width = frame_size[0]
    frame_height = frame_size[1]
    x_buffer = frame_width * 1 / 10
    y_buffer = frame_height * 1 / 5

    space_width = ""
    space_height = ""

    for index, word_json in enumerate(text_json["textcontents"]):
        duration = word_json["offset"] - word_json["onset"]
        word_clip = (
            TextClip(
                word_json["word"],
                font=font,
                fontsize=font_size,
                color=color,
            )
            .set_start(text_json["onset"])
            .set_duration(full_duration)
        )
        word_clip_space = (
            TextClip(" ", font=font, fontsize=font_size, color=color)
            .set_start(text_json["onset"])
            .set_duration(full_duration)
        )
        word_width, word_height = word_clip.size
        space_width, space_height = word_clip_space.size
        if x_pos + word_width + space_width > frame_width - 2 * x_buffer:
            # Move to the next line
            x_pos = 0
            y_pos = y_pos + word_height + 40

            # Store info of each word_clip created
            xy_textclips_positions.append(
                {
                    "x_pos": x_pos + x_buffer,
                    "y_pos": y_pos + y_buffer,
                    "width": word_width,
                    "height": word_height,
                    "word": word_json["word"],
                    "onset": word_json["onset"],
                    "offset": word_json["offset"],
                    "duration": duration,
                },
            )

            word_clip = word_clip.set_position(
                (x_pos + x_buffer, y_pos + y_buffer)
            )
            word_clip_space = word_clip_space.set_position(
                (x_pos + word_width + x_buffer, y_pos + y_buffer),
            )
            x_pos = word_width + space_width
        else:
            # Store info of each word_clip created
            xy_textclips_positions.append(
                {
                    "x_pos": x_pos + x_buffer,
                    "y_pos": y_pos + y_buffer,
                    "width": word_width,
                    "height": word_height,
                    "word": word_json["word"],
                    "onset": word_json["onset"],
                    "offset": word_json["offset"],
                    "duration": duration,
                },
            )

            word_clip = word_clip.set_position(
                (x_pos + x_buffer, y_pos + y_buffer),
            )
            word_clip_space = word_clip_space.set_position(
                (x_pos + word_width + x_buffer, y_pos + y_buffer),
            )

            x_pos = x_pos + word_width + space_width

        word_clips.append(word_clip)
        word_clips.append(word_clip_space)

    for highlight_word in xy_textclips_positions:
        word_clip_highlight = (
            TextClip(
                highlight_word["word"],
                font=font,
                fontsize=font_size,
                color=color,
                bg_color=bg_color,
            )
            .set_start(highlight_word["onset"])
            .set_duration(highlight_word["duration"])
        )
        word_clip_highlight = word_clip_highlight.set_position(
            (highlight_word["x_pos"], highlight_word["y_pos"]),
        )
        word_clips.append(word_clip_highlight)

    return word_clips

time: 3.02 ms (started: 2024-02-07 23:43:09 -05:00)


In [6]:
# Amend the details regarding the dataset and directory paths
stimuli_name = "stimuli_set_name"

# set up your local video input path
vid_local = "path_to_mkv_directory"

# set up your local alignment path
align_path_aa = "path_to_AssemblyAI_annotations"


# set up your local output path for the produced alignment video
output_local = "path_to_output_directory"

frame_size = (1080, 1080)

time: 337 µs (started: 2024-02-07 23:43:18 -05:00)


In [7]:
# Export AssemblyAI alignment as karaoke-style video


mkv_dir = Path(vid_local) / stimuli_name
mkv_files = sorted(mkv_dir.glob("*.mkv"))

# iterate across the movie files
for segment_file in mkv_files:
    segment_name = segment_file.stem
    out_dir = Path(output_local) / stimuli_name
    out_dir.mkdir(parents=True, exist_ok=True)
    out_file = Path(out_dir) / f"{segment_name}.mp4"

    j_path = Path(align_path_aa) / stimuli_name / f"{segment_name}.json"

    with open(j_path) as json_file:
        j_file = json.load(json_file)

    words = j_file["results"]["channels"][0]["alternatives"][0]["words"]

    linelevel_subtitles = split_text_into_lines(words)

    all_linelevel_splits = []

    for line in linelevel_subtitles:
        out = create_caption(line, frame_size)
        all_linelevel_splits.extend(out)

    # Load the input video
    input_video = VideoFileClip(segment_file)

    # Get the duration of the input video
    input_video_duration = input_video.duration

    # Create a color clip with the given frame size, color, and duration
    background_clip = ColorClip(size=frame_size, color=(0, 0, 0)).set_duration(
        input_video_duration
    )

    # If you want to overlay this on the original video uncomment
    # and also change frame_size, font size and color accordingly.
    # final_video = CompositeVideoClip([input_video] + all_linelevel_splits)

    final_video = CompositeVideoClip([background_clip] + all_linelevel_splits)

    # Set the audio of the final video to be the same as the input video
    final_video = final_video.set_audio(input_video.audio)

    # Save the final clip as a video file with the audio included
    final_video.write_videofile(
        out_file,
        fps=24,
        codec="libx264",
        audio_codec="aac",
    )

Moviepy - Building video /home/isil/ComplexBrains/alignment_dev/qc_outputs/wolf/wolf01.mp4.
MoviePy - Writing audio in wolf01TEMP_MPY_wvf_snd.mp4


                                                                        

MoviePy - Done.
Moviepy - Writing video /home/isil/ComplexBrains/alignment_dev/qc_outputs/wolf/wolf01.mp4



                                                                  

Moviepy - Done !
Moviepy - video ready /home/isil/ComplexBrains/alignment_dev/qc_outputs/wolf/wolf01.mp4
time: 8min 10s (started: 2024-02-07 23:43:27 -05:00)
