In [1]:
import pandas as pd
import os
from datetime import datetime, timedelta
import srt
import re

In [2]:
# Dataset locations used by the cells below (relative paths).
# video_dir is scanned for .mp4 files; subtitle_dir holds the .srt files.
video_dir = "../../Dataset/Videos/"
subtitle_dir = "../../Dataset/Subtitles/"

In [23]:
def get_partial_subtitles(vid_dir, sub_dir, output_dir="../../Dataset"):
    """Cut per-clip subtitles out of full-length SRT files and tabulate them by word.

    Every ``.mp4`` file in ``vid_dir`` is expected to be named
    ``<source>_<start>_<end>.mp4``, where ``<start>`` and ``<end>`` are whole
    seconds. For each clip, ``<source>.srt`` is read from ``sub_dir``, every
    subtitle that overlaps ``[start, end]`` (boundaries inclusive) is kept, and
    its times are shifted so that the clip begins at 0 seconds.

    Side effects:
        * Writes ``<source>_<start>_<end>.srt`` next to each clip in ``vid_dir``.
        * Writes ``Truncated_Transcriptions__<timestamp>.xlsx`` into
          ``output_dir`` (skipped when nothing was extracted).

    Args:
        vid_dir: Directory scanned for ``.mp4`` clips.
        sub_dir: Directory holding the source ``.srt`` files.
        output_dir: Directory that receives the Excel export.

    Returns:
        pandas.DataFrame with columns ``file_name``, ``line``, ``start_time``,
        ``end_time`` (both in seconds relative to the clip start) and ``words``,
        one row per whitespace-separated word of each subtitle line. An empty
        frame with those columns is returned when no subtitles were extracted.
    """
    clip_name_pattern = re.compile(r"(.+)_(\d+)_(\d+)")
    columns = ["file_name", "line", "start_time", "end_time"]
    zero = timedelta(0)

    # os.listdir returns names in arbitrary order; sort so the output is reproducible.
    video_files = sorted(
        f
        for f in os.listdir(vid_dir)
        if os.path.splitext(f)[1] == ".mp4" and os.path.isfile(os.path.join(vid_dir, f))
    )

    records = []
    for video_file in video_files:
        stem = os.path.splitext(video_file)[0]
        name_match = clip_name_pattern.fullmatch(stem)
        if name_match is None:
            # Same best-effort policy as for a missing subtitle file: report and move on.
            print(f"Skipping {video_file}: expected a name like <source>_<start>_<end>.mp4")
            continue
        source_name, start_token, end_token = name_match.groups()
        clip_start = timedelta(seconds=int(start_token))
        clip_end = timedelta(seconds=int(end_token))

        srt_path = os.path.join(sub_dir, f"{source_name}.srt")
        output_srt = os.path.join(vid_dir, f"{stem}.srt")

        try:
            # utf-8-sig reads plain UTF-8 and also strips a leading BOM if present.
            with open(srt_path, "r", encoding="utf-8-sig") as handle:
                # srt.parse copes with multi-line cues, unlike a fixed
                # four-lines-per-cue assumption.
                subtitles = list(srt.parse(handle.read()))
        except FileNotFoundError as e:
            print(e)
            continue

        clip_subtitles = []
        for subtitle in subtitles:
            # Keep every cue that touches the clip window (inclusive on both ends).
            if subtitle.start > clip_end or subtitle.end < clip_start:
                continue

            # A cue already running when the clip starts is clamped to begin at 0.
            rel_start = max(subtitle.start - clip_start, zero)
            # The end is not clamped to the clip length (unchanged behaviour).
            rel_end = subtitle.end - clip_start

            clip_subtitles.append(
                srt.Subtitle(
                    index=len(clip_subtitles) + 1,
                    start=rel_start,
                    end=rel_end,
                    content=subtitle.content,
                )
            )
            records.append(
                {
                    "file_name": video_file,
                    # Flatten multi-line cues so each table row holds one line of text.
                    "line": subtitle.content.replace("\n", " "),
                    "start_time": rel_start.total_seconds(),
                    "end_time": rel_end.total_seconds(),
                }
            )

        with open(output_srt, "w", encoding="utf-8") as f:
            f.write(srt.compose(clip_subtitles))

    if not records:
        # pd.concat / explode on nothing would fail; return an empty, well-formed frame.
        print("No subtitles were extracted; nothing written to Excel.")
        return pd.DataFrame(columns=columns + ["words"])

    final_df = pd.DataFrame(records, columns=columns)

    # Split on any whitespace so doubled spaces do not produce empty "words".
    final_df["words"] = final_df["line"].str.split()
    final_df = final_df.explode("words").reset_index(drop=True)

    final_df.to_excel(
        os.path.join(output_dir, f"Truncated_Transcriptions_{datetime.now().strftime('_%Y%m%d_%H%M%S')}.xlsx"),
        index=False,
    )

    return final_df


In [24]:
# Build the per-word subtitle table for all clips. The call also writes the
# per-clip .srt files and an Excel export as side effects.
df = get_partial_subtitles(video_dir, subtitle_dir)
df

Unnamed: 0,file_name,line,start_time,end_time,words
0,MagnusCarlson_542_599.mp4,I'm going to name a sport you have to,0.00,1.64,I'm
1,MagnusCarlson_542_599.mp4,I'm going to name a sport you have to,0.00,1.64,going
2,MagnusCarlson_542_599.mp4,I'm going to name a sport you have to,0.00,1.64,to
3,MagnusCarlson_542_599.mp4,I'm going to name a sport you have to,0.00,1.64,name
4,MagnusCarlson_542_599.mp4,I'm going to name a sport you have to,0.00,1.64,a
...,...,...,...,...,...
3532,StarTalk_Sleep_748_796.mp4,so isn't it isn't it a strange thing and,45.36,48.56,it
3533,StarTalk_Sleep_748_796.mp4,so isn't it isn't it a strange thing and,45.36,48.56,a
3534,StarTalk_Sleep_748_796.mp4,so isn't it isn't it a strange thing and,45.36,48.56,strange
3535,StarTalk_Sleep_748_796.mp4,so isn't it isn't it a strange thing and,45.36,48.56,thing


In [6]:
# Gather the names of all regular files in video_dir that have an .mp4 extension.
video_files = []
for entry in os.listdir(video_dir):
    entry_path = os.path.join(video_dir, entry)
    if not os.path.isfile(entry_path):
        continue
    if os.path.splitext(entry_path)[1] in [".mp4"]:
        video_files.append(entry)
video_files

['MagnusCarlson_542_599.mp4',
 'NDT_India_19_88.mp4',
 'StarTalk_CMBR_190_225.mp4',
 'StarTalk_CMBR_270_308.mp4',
 'StarTalk_CMBR_319_356.mp4',
 'StarTalk_CMBR_92_152.mp4',
 'StarTalk_FlyingVehicles_1001_1043.mp4',
 'StarTalk_FlyingVehicles_1980_2040.mp4',
 'StarTalk_FlyingVehicles_2446_2508.mp4',
 'StarTalk_FlyingVehicles_2670_2710.mp4',
 'StarTalk_FlyingVehicles_300_340.mp4',
 'StarTalk_FlyingVehicles_674_719.mp4',
 'StarTalk_FlyingVehicles_780_811.mp4',
 'StarTalk_FlyingVehicles_949_1000.mp4',
 'StarTalk_Sleep_1152_1211.mp4',
 'StarTalk_Sleep_1602_1639.mp4',
 'StarTalk_Sleep_1980_2041.mp4',
 'StarTalk_Sleep_2099_2160.mp4',
 'StarTalk_Sleep_2379_2443.mp4',
 'StarTalk_Sleep_2470_2551.mp4',
 'StarTalk_Sleep_382_450.mp4',
 'StarTalk_Sleep_748_796.mp4']

In [39]:
# Derive, for each clip name of the form <source>_<start>_<end>.mp4, the source
# subtitle file, the clip bounds and the name of the clip-level .srt to write.
input_data = {}
for video_file in video_files:
    stem = os.path.splitext(video_file)[0]
    split_name = stem.split("_")
    source_name = "_".join(split_name[:-2])
    input_data[video_file] = dict(
        srt_file=f"{source_name}.srt",
        start=int(split_name[-2]),
        end=int(split_name[-1]),
        output_srt=f"{stem}.srt",
    )
input_data

{'MagnusCarlson_542_599.mp4': {'srt_file': 'MagnusCarlson.srt',
  'start': 542,
  'end': 599,
  'output_srt': 'MagnusCarlson_542_599.srt'},
 'NDT_India_19_88.mp4': {'srt_file': 'NDT_India.srt',
  'start': 19,
  'end': 88,
  'output_srt': 'NDT_India_19_88.srt'},
 'StarTalk_CMBR_190_225.mp4': {'srt_file': 'StarTalk_CMBR.srt',
  'start': 190,
  'end': 225,
  'output_srt': 'StarTalk_CMBR_190_225.srt'},
 'StarTalk_CMBR_270_308.mp4': {'srt_file': 'StarTalk_CMBR.srt',
  'start': 270,
  'end': 308,
  'output_srt': 'StarTalk_CMBR_270_308.srt'},
 'StarTalk_CMBR_319_356.mp4': {'srt_file': 'StarTalk_CMBR.srt',
  'start': 319,
  'end': 356,
  'output_srt': 'StarTalk_CMBR_319_356.srt'},
 'StarTalk_CMBR_92_152.mp4': {'srt_file': 'StarTalk_CMBR.srt',
  'start': 92,
  'end': 152,
  'output_srt': 'StarTalk_CMBR_92_152.srt'},
 'StarTalk_FlyingVehicles_1001_1043.mp4': {'srt_file': 'StarTalk_FlyingVehicles.srt',
  'start': 1001,
  'end': 1043,
  'output_srt': 'StarTalk_FlyingVehicles_1001_1043.srt'},
 'Sta

In [76]:
# Try to read a subtitle file; a missing file is reported instead of raising.
try:
    h = open("../../Dataset/Subtitles/NDT_India1.srt", 'r')
except FileNotFoundError as e:
    print(e)
else:
    with h:
        sub = h.readlines()

[Errno 2] No such file or directory: '../../Dataset/Subtitles/NDT_India1.srt'


In [12]:
import re

# Read the subtitle file as a list of raw text lines.
with open("../../Dataset/Subtitles/NDT_India.srt", 'r') as h:
    sub = h.readlines()

re_pattern = r'[0-9]{2}:[0-9]{2}:[0-9]{2},[0-9]{3} -->'
regex = re.compile(re_pattern)

# Timestamp rows are the ones containing "HH:MM:SS,mmm -->".
start_end_times = [row for row in sub if regex.search(row)]

# Split each timestamp row into its start and end parts.
start_times = []
end_times = []
for row in start_end_times:
    pieces = row.split(' --> ')
    start_times.append(pieces[0])
    end_times.append(pieces[1][:-1])  # drop the final character of the row

# Subtitle text sits on every 4th line starting at index 2; drop its final character.
lines = [row[:-1] for row in sub[2::4]]

In [27]:
# Tabulate the subtitle lines and convert both timestamp columns from
# "HH:MM:SS,mmm" strings to seconds as floats.
df = pd.DataFrame({"line": lines, "start_time": start_times, "end_time": end_times})
for column in ("start_time", "end_time"):
    stamps = pd.to_datetime(df[column], format='%H:%M:%S,%f')
    df[column] = (
        stamps.dt.hour * 3600
        + stamps.dt.minute * 60
        + stamps.dt.second
        + stamps.dt.microsecond / 1000000
    )
df

Unnamed: 0,line,start_time,end_time
0,we choose to go to the Moon,0.120,2.540
1,we choose to go to the Moon,2.540,6.560
2,we choose to go to the moon in this,10.920,13.500
3,decade and do the other things not,13.500,15.839
4,because they are easy but because they,15.839,18.359
...,...,...,...
76,watching don't miss the Today show every,179.040,180.959
77,weekday at 11 A.M Eastern 8 Pacific on,180.959,184.140
78,our streaming Channel today all day to,184.140,186.360
79,watch head to today.com all day or click,186.360,190.680


In [None]:
# Boolean mask: rows whose start_time and end_time both fall within [start_time, end_time].
# NOTE(review): start_time / end_time are only assigned in a later cell, so this
# expression would raise NameError when the notebook is run top to bottom on a fresh kernel.
(df["start_time"] >= start_time) & (df["end_time"] <= end_time)

In [70]:
# Clip window in seconds.
start_time = 19
end_time = 88

# Rows that span the clip start, span the clip end, or lie fully inside the clip.
spans_clip_start = df["start_time"].le(start_time) & df["end_time"].ge(start_time)
spans_clip_end = df["start_time"].le(end_time) & df["end_time"].ge(end_time)
inside_clip = df["start_time"].ge(start_time) & df["end_time"].le(end_time)

temp = df[spans_clip_start | spans_clip_end | inside_clip]

In [80]:
# Convert the selected rows to plain Python lists of [line, start_time, end_time].
temp_list = temp.values.tolist()
temp_list

[['are hard', 18.359, 19.32],
 ['that by the way one of the most iconic', 19.32, 21.72],
 ['speeches in American history that was', 21.72, 24.119],
 ['September 1962 that was John F Kennedy', 24.119, 27.06],
 ["announcing America's intention to put a", 27.06, 29.76],
 ['man on the moon well here we are 61', 29.76, 32.098],
 ["years later and we're talking about", 32.098, 33.719],
 ['another historic Moon moment yes we are', 33.719, 35.94],
 ['because India just became the fourth', 35.94, 38.76],
 ['Nation to successfully land the', 38.76, 40.92],
 ['spacecraft on the moon but India is the', 40.92, 43.92],
 ['only country that has ever landed near', 43.92, 45.899],
 ["the moon's South Pole in that", 45.899, 47.28],
 ['significant here to explain why this is', 47.28, 49.14],
 ['so significant we have astrophysicist', 49.14, 50.82],
 ['and author Neil deGrasse Tyson here to', 50.82, 52.62],
 ['break it down for us good morning to you', 52.62, 54.059],
 ["good morning so let's put this in"

In [73]:
# Re-time the selected subtitles so that the clip begins at 0 seconds.
srt_list = []
for idx, (text, sub_start, sub_end) in enumerate(temp_list, start=1):
    # A subtitle that begins before the clip is clamped to start at 0.
    _start_time = sub_start - start_time if start_time <= sub_start else 0
    srt_list.append(
        srt.Subtitle(
            index=idx,
            start=timedelta(seconds=_start_time),
            # Shift by start_time rather than a hard-coded 19, so the end offset
            # stays correct when the clip window is changed.
            end=timedelta(seconds=sub_end - start_time),
            content=text,
        )
    )
srt_list

[Subtitle(index=1, start=datetime.timedelta(0), end=datetime.timedelta(microseconds=320000), content='are hard', proprietary=''),
 Subtitle(index=2, start=datetime.timedelta(microseconds=320000), end=datetime.timedelta(seconds=2, microseconds=720000), content='that by the way one of the most iconic', proprietary=''),
 Subtitle(index=3, start=datetime.timedelta(seconds=2, microseconds=720000), end=datetime.timedelta(seconds=5, microseconds=119000), content='speeches in American history that was', proprietary=''),
 Subtitle(index=4, start=datetime.timedelta(seconds=5, microseconds=119000), end=datetime.timedelta(seconds=8, microseconds=60000), content='September 1962 that was John F Kennedy', proprietary=''),
 Subtitle(index=5, start=datetime.timedelta(seconds=8, microseconds=60000), end=datetime.timedelta(seconds=10, microseconds=760000), content="announcing America's intention to put a", proprietary=''),
 Subtitle(index=6, start=datetime.timedelta(seconds=10, microseconds=760000), end=

In [74]:
# Serialise the re-timed subtitles and save them under the clip's .srt name in video_dir.
srt_string = srt.compose(srt_list)
output_path = os.path.join(video_dir, input_data['NDT_India_19_88.mp4']['output_srt'])
with open(output_path, "w") as f:
    f.write(srt_string)