This code is divided into three steps.

<h3>First step: create subtitles CSV</h3>

In the first step, we use regular expressions to create and save a CSV file. This CSV file lists the ID of every subtitle found in the XML screenplay document, along with its dialog start and end timestamps.



In [None]:
import re
import csv

# Read the XML content from the external file
xml_file = "eyes-wide-shut-1999-transcription.xml"
with open(xml_file, "r", encoding="utf-8") as f:
    xml_content = f.read()

# Define regular expressions for extracting relevant data.
# The extraction is done in two stages so that a match can never run past a
# closing </timeline> tag: a single lazy pattern spanning the whole document
# would pair the id of a timeline that lacks line_start/line_end with the
# timestamps of the NEXT timeline (and silently drop that next timeline).
timeline_pattern = r'<timeline xml:id="([^"]*)"(.*?)</timeline>'
when_pattern = (
    r'<when xml:id="line_start" absolute="([^"]*)"'
    r'.*?<when xml:id="line_end" absolute="([^"]*)"'
)

# Collect one (index, line_start, line_end) tuple per complete timeline
matches = []
for timeline_id, timeline_body in re.findall(timeline_pattern, xml_content, re.DOTALL):
    when_match = re.search(when_pattern, timeline_body, re.DOTALL)
    if when_match is None:
        # Timeline without a line_start/line_end pair: nothing to export
        continue
    matches.append((timeline_id, when_match.group(1), when_match.group(2)))

# Initialize the CSV data list with headers, then add one row per timeline
csv_data = [["index", "line_start", "line_end"]]
csv_data.extend(list(match) for match in matches)

# Write data to CSV file (newline="" lets the csv module control line endings)
csv_filename = "output.csv"
with open(csv_filename, "w", newline="", encoding="utf-8") as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerows(csv_data)

print(f"CSV file '{csv_filename}' created successfully.")

<h3>Second step: cut movie clip</h3>

In this step we process the full-length MP4 video file and extract a single clip from it. Users choose which clip to extract by setting the desired subtitle ID (`desired_index`).

In [None]:
from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip
import pandas as pd
import os 

# Subtitle timings produced by the first step (one row per subtitle ID)
csv_path = 'output.csv'
df = pd.read_csv(filepath_or_buffer=csv_path)

# Full-length movie the clip will be cut from
input_video_path = 'EWS_movie.mp4'

def convert_to_seconds(time_str):
    """Convert a colon-separated timestamp string to seconds.

    Accepts ``"HH:MM:SS"``, ``"MM:SS"`` or plain ``"SS"``; every field may
    carry a fractional part (e.g. ``"00:01:02.5"``).

    Args:
        time_str: Timestamp text with one to three ``:``-separated fields.

    Returns:
        The timestamp expressed in seconds, as a float.

    Raises:
        ValueError: If there are more than three fields or a field is not
            a valid number.
    """
    parts = time_str.split(':')
    if len(parts) > 3:
        raise ValueError(f"Unsupported timestamp format: {time_str!r}")

    # Fold from the most significant field down: each step promotes the
    # running total by one base-60 position before adding the next field.
    seconds = 0.0
    for part in parts:
        seconds = seconds * 60 + float(part)
    return seconds

# Specify the desired index from the CSV
desired_index = "SUB357"  # Change this to the desired index value

# Look up the subtitle row. An unknown ID would otherwise surface as an
# opaque "single positional indexer is out-of-bounds" error from .iloc[0];
# the exception type is kept (IndexError) but the message now names the ID.
matching_rows = df[df['index'] == desired_index]
if matching_rows.empty:
    raise IndexError(f"Subtitle index '{desired_index}' not found in '{csv_path}'")
row = matching_rows.iloc[0]

start_time_str = row.line_start
end_time_str = row.line_end

# Convert timestamp strings to seconds
start_time = convert_to_seconds(start_time_str)
end_time = convert_to_seconds(end_time_str)

# Specify the name of the new folder
output_folder = 'output_clips'

# Create the output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Specify the output video file path within the output folder
output_video_path = os.path.join(output_folder, 'output_clip.mp4')

# Extract the clip. The output path is passed positionally because the name
# of that keyword differs between moviepy releases, while its position
# (fourth argument) does not.
ffmpeg_extract_subclip(input_video_path, start_time, end_time, output_video_path)


print(f"Clip extracted from {start_time} to {end_time} seconds and saved to {output_video_path}")


<h3>Third step: save frames</h3>

In this step we capture frames from the selected clip and save them as image files. Users can also customize the number of frames to extract from the clip, as well as the temporal intervals between frames.



In [None]:
import os
import random
import imageio
from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip
from moviepy.video.io.VideoFileClip import VideoFileClip

# Specify the folder containing the input MP4 files
input_folder = 'output_clips'

# List all MP4 files in the input folder
mp4_files = [f for f in os.listdir(input_folder) if f.endswith('.mp4')]

# Check if there are any MP4 files in the folder
if not mp4_files:
    print("No MP4 files found in the input folder.")
else:
    # Choose a random MP4 file from the list
    random_mp4_file = random.choice(mp4_files)

    # Specify the input video file path
    input_video_path = os.path.join(input_folder, random_mp4_file)

    # Specify the number of frames to extract
    num_frames = 3

    # Specify the name of the new folder
    output_folder = 'frames'

    # Create the output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    # Open the input video; it is closed in the `finally` below so the
    # underlying reader is released even if a frame fails to decode or save
    video = VideoFileClip(input_video_path)
    try:
        # Get the duration of the video in seconds
        video_duration = video.duration

        # Margin kept free at both ends of the clip when sampling times
        time_interval = video_duration / (num_frames + 1)

        # Generate random times to extract frames, in ascending order
        random_times = sorted(
            random.uniform(time_interval, video_duration - time_interval)
            for _ in range(num_frames)
        )

        for i, frame_time in enumerate(random_times):
            # Get the frame at the specified time
            frame = video.get_frame(frame_time)

            # Specify the output frame file path within the output folder
            output_frame_path = os.path.join(output_folder, f'frame_{i + 1}.jpg')

            # Save the frame as an image using imageio
            imageio.imsave(output_frame_path, frame)
    finally:
        video.close()

    # Report success only on this branch: random_mp4_file is undefined
    # when no MP4 file was found
    print(f"Random frames extracted from '{random_mp4_file}' and saved to {output_folder}")
