# Re-Align HEB SRT

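When Hebrew audio is transcribed with the Whisper transcription system (not WhisperX), the resulting subtitle blocks are sometimes too short to read comfortably. This code addresses the issue by merging each consecutive pair of subtitle blocks into a single block and writing the result to a new SRT file (the source file name with `_updated.srt` in place of `.srt`).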

The goal is to improve the readability and coherence of the subtitles by combining shorter blocks into longer ones, providing a more natural flow for the viewer.

Yedidya Harris


## Functions

In [1]:
import pandas as pd
import math

def parse_srt_to_df(srt_file, encoding="utf-8-sig"):
    """Parse an SRT subtitle file into a DataFrame with one row per block.

    Blank lines are ignored. The first non-blank line must be a block ID and
    the line after a block ID must be a timing line ("start --> end"). Later
    on, a line starts a new block only when it consists of digits AND the
    following non-blank line is a timing line; every other line is subtitle
    text. Multiple text lines of one block are joined with single spaces.

    Args:
        srt_file: Path of the SRT file to read.
        encoding: Text encoding used to read the file. The default,
            "utf-8-sig", reads UTF-8 and drops a leading byte-order mark if
            one is present.

    Returns:
        A pandas DataFrame with the columns ``block_id`` (int),
        ``timestamp_start``, ``timestamp_end`` and ``text``. A file without
        any blocks yields an empty DataFrame that still has these columns.

    Raises:
        ValueError: If the first non-blank line is not an integer block ID,
            or a timing line does not split into exactly a start and an end.
    """
    columns = ["block_id", "timestamp_start", "timestamp_end", "text"]

    # An explicit encoding keeps Hebrew text independent of the platform
    # default; the "-sig" variant also strips a BOM that would otherwise
    # make int() fail on the very first block ID.
    with open(srt_file, "r", encoding=encoding) as file:
        lines = [stripped for stripped in map(str.strip, file) if stripped]

    blocks = []
    current = None          # block currently being assembled
    text_parts = []         # text lines collected for the current block
    awaiting_timing = False  # True right after a block ID has been read

    for index, line in enumerate(lines):
        if awaiting_timing:
            current["timestamp_start"], current["timestamp_end"] = line.split(" --> ")
            awaiting_timing = False
            continue

        # Subtitle text can itself be a bare number (e.g. "2023"), so a digit
        # line only counts as a block ID when a timing line follows it.
        next_is_timing = index + 1 < len(lines) and " --> " in lines[index + 1]
        if current is None or (line.isdigit() and next_is_timing):
            if current is not None:
                current["text"] = " ".join(text_parts)
                blocks.append(current)
            current = {
                "block_id": int(line),
                "timestamp_start": None,
                "timestamp_end": None,
                "text": "",
            }
            text_parts = []
            awaiting_timing = True
        else:
            text_parts.append(line)

    # Flush the final block; nothing is added for an empty file.
    if current is not None:
        current["text"] = " ".join(text_parts)
        blocks.append(current)

    return pd.DataFrame(blocks, columns=columns)


In [2]:

def merge_text_pairs(df):
    """Merge consecutive subtitle blocks two at a time.

    Rows are taken in order as (0, 1), (2, 3), ... Each pair becomes one row
    whose text is the two texts joined with ", ", whose start is the first
    row's ``timestamp_start`` and whose end is the second row's
    ``timestamp_end``. When the number of rows is odd, the final unpaired
    row is carried over unchanged instead of being dropped.

    Args:
        df: DataFrame with the columns ``timestamp_start``,
            ``timestamp_end`` and ``text`` (string values).

    Returns:
        A new DataFrame with the columns ``block_id``, ``timestamp_start``,
        ``timestamp_end`` and ``text``; ``block_id`` is renumbered from 1.
        The input DataFrame is not modified.
    """
    columns = ['block_id', 'timestamp_start', 'timestamp_end', 'text']

    # Collect plain dicts and build the DataFrame once at the end; growing a
    # DataFrame with pd.concat inside the loop copies it on every iteration.
    merged_rows = []
    for first in range(0, len(df), 2):
        # One or two rows; a single row only occurs for an odd trailing block.
        group = df.iloc[first:first + 2]
        merged_rows.append({
            'block_id': len(merged_rows) + 1,
            'timestamp_start': group['timestamp_start'].iloc[0],
            'timestamp_end': group['timestamp_end'].iloc[-1],
            'text': ', '.join(group['text']),
        })

    return pd.DataFrame(merged_rows, columns=columns)


In [3]:

def _timestamp_to_ms(timestamp):
    """Convert an "HH:MM:SS" or "HH:MM:SS,mmm" timestamp to milliseconds."""
    hours, minutes, seconds = timestamp.split(':')[:3]
    whole_seconds, _, millis = seconds.partition(',')
    total_seconds = (int(hours) * 60 + int(minutes)) * 60 + int(whole_seconds)
    return total_seconds * 1000 + int(millis or 0)


def _end_before_next(current_start, next_start):
    """Return a block's end time derived from the next block's start time."""
    hours, minutes, seconds = next_start.split(':')[:3]
    whole_seconds = int(seconds.split(',')[0])
    # Drop the milliseconds but keep the full "HH:MM:SS,mmm" layout so every
    # timestamp in the frame has the same shape.
    truncated = f"{hours}:{minutes}:{str(whole_seconds).zfill(2)},000"
    # If both blocks start within the same second, truncating would put the
    # end before this block's own start; use the exact next start instead.
    if _timestamp_to_ms(truncated) < _timestamp_to_ms(current_start):
        return next_start
    return truncated


def round_down_timestamps(df):
    """Set each block's end to the next block's start, rounded down to a second.

    The existing ``timestamp_end`` values are discarded. Every row except the
    last gets the following row's ``timestamp_start`` with the milliseconds
    set to ``000``; when that would be earlier than the row's own start, the
    following row's exact start is used instead. The last row gets an empty
    string, since there is no following block to take a time from.

    Args:
        df: DataFrame with a ``timestamp_start`` column of "HH:MM:SS,mmm"
            strings. Rows are processed by position, so any index works.

    Returns:
        The same DataFrame object, whose ``timestamp_end`` column has been
        overwritten in place. An empty DataFrame stays empty.
    """
    starts = df['timestamp_start'].tolist()

    ends = [
        _end_before_next(current_start, next_start)
        for current_start, next_start in zip(starts, starts[1:])
    ]
    if starts:
        # The last block has no successor; its end is left blank.
        ends.append('')

    # Assigning a list is positional, so no row is ever added by label.
    df['timestamp_end'] = ends

    return df



In [4]:
def create_srt_file(df, filename, encoding="utf-8"):
    """Write subtitle blocks from a DataFrame to a subtitle file.

    Each row is written as four lines: the block ID, the timing line
    "start --> end", the text, and a blank separator line. Commas in the two
    timestamps are replaced with dots; the text is written unchanged.

    Args:
        df: DataFrame with the columns ``block_id``, ``timestamp_start``,
            ``timestamp_end`` and ``text``.
        filename: Path of the file to create (an existing file is
            overwritten).
        encoding: Text encoding of the output file. Defaults to "utf-8" so
            that Hebrew text is written the same way on every platform.
    """
    with open(filename, 'w', encoding=encoding) as f:
        rows = zip(df['block_id'], df['timestamp_start'], df['timestamp_end'], df['text'])
        for block_id, start, end, text in rows:
            # NOTE(review): SubRip timestamps conventionally use a comma
            # before the milliseconds; the dot substitution is kept exactly
            # as before - confirm the target player expects it.
            timing = str(start).replace(',', '.') + ' --> ' + str(end).replace(',', '.')
            # One write per block: ID, timing, text, blank separator.
            f.write(f"{block_id}\n{timing}\n{text}\n\n")

In [5]:

def get_last_timestamp_end(df):
    """Return the ``timestamp_end`` value stored in the final row of ``df``."""
    return df.iloc[-1].loc['timestamp_end']


## Usage

In [8]:
# Example usage
source_file = '/content/eli_Yomjerusalem_5777 (2).srt' # enter your source srt here

# Parse the source and remember where its final block ends, because
# round_down_timestamps leaves the last row's timestamp_end empty.
df = parse_srt_to_df(source_file)
last_timestamp_end = get_last_timestamp_end(df)

# Merge the blocks in pairs, then derive each end time from the next start.
df = round_down_timestamps(merge_text_pairs(df))

# Restore the original final end time on the last merged block.
df.iloc[-1, df.columns.get_loc('timestamp_end')] = last_timestamp_end

# Write next to the source: the last four characters (".srt") are replaced
# with "_updated.srt".
create_srt_file(df, source_file[:-4] + '_updated.srt')
