<a href="https://colab.research.google.com/github/detektor777/colab_list_video/blob/main/split_subtitles_into_sentences.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title ##**Upload subtitles (.srt)** { display-mode: "form" }

from google.colab import files
import os

# Open the Colab upload dialog; the result maps each uploaded file name to
# its contents.
uploaded = files.upload()

# An empty result (e.g. the dialog was dismissed without choosing a file)
# would otherwise surface as an opaque IndexError, so fail with a clear message.
if not uploaded:
    raise RuntimeError(
        "No file was uploaded. Run this cell again and select a subtitle file."
    )

# Only the first uploaded file is used by the following cell.
subtitle_filename = next(iter(uploaded))
print(f"Uploaded: {subtitle_filename}")

In [None]:
#@title ##**Split subtitles into sentences** { display-mode: "form" }
import re
from google.colab import files
import os

def srt_time_to_seconds(srt_time):
    """Convert an SRT timestamp such as ``HH:MM:SS,mmm`` into seconds.

    The decimal comma is replaced by a dot, after which the three
    colon-separated fields are parsed as floats. A string that does not
    split into exactly three numeric fields raises ``ValueError``.
    """
    fields = srt_time.replace(',', '.').split(':')
    h, m, s = [float(field) for field in fields]
    return h * 3600 + m * 60 + s

def seconds_to_srt_time(seconds):
    """Format a duration in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    The value is rounded once to whole milliseconds and then split with
    integer arithmetic. Truncating the fractional part instead would lose a
    millisecond to binary float error (1.001 s would render as ``,000``) and
    could never carry into the seconds field.

    Args:
        seconds: Non-negative duration in seconds (int or float).

    Returns:
        The timestamp string; hours are zero-padded to at least two digits.
    """
    total_millis = round(seconds * 1000)
    hours, remainder = divmod(total_millis, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    secs, millis = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"

# Merge consecutive subtitle cues into sentence-level cues: cue text is
# accumulated until a cue ends with '.', '!' or '?', and the merged cue spans
# from the first fragment's start time to the last fragment's end time.
try:
    with open(subtitle_filename, 'r', encoding='utf-8') as srt_in:
        srt_content = srt_in.read()

    # Cues are separated by blank lines; separator lines holding only
    # whitespace are accepted too, so neighbouring cues are not glued together.
    subtitle_blocks = re.split(r'\n\s*\n', srt_content.strip())
    sentences = []
    current_sentence = []
    current_start = None
    current_end = None

    for block in subtitle_blocks:
        lines = block.strip().split('\n')
        # A usable cue needs an index line, a timing line and some text.
        if len(lines) < 3:
            continue
        start_field, end_field = lines[1].split(' --> ', 1)
        # Some SRT files append position settings after the end timestamp;
        # keep only the timestamp token so they do not break parsing.
        end_field = end_field.split(maxsplit=1)[0]
        start_time = srt_time_to_seconds(start_field)
        end_time = srt_time_to_seconds(end_field)
        text = ' '.join(lines[2:]).strip()

        # The first fragment of a sentence fixes its start time; every
        # fragment pushes the end time forward.
        if current_start is None:
            current_start = start_time
        current_end = end_time
        current_sentence.append(text)

        # Sentence-final punctuation closes the sentence being accumulated.
        if text.endswith(('.', '!', '?')):
            sentences.append({
                'start': current_start,
                'end': current_end,
                'text': ' '.join(current_sentence)
            })
            current_sentence = []
            current_start = None

    # Flush trailing text that never reached sentence-final punctuation.
    if current_sentence and current_start is not None:
        sentences.append({
            'start': current_start,
            'end': current_end,
            'text': ' '.join(current_sentence)
        })

    # Build the output in one join instead of repeated string concatenation.
    new_srt_content = "".join(
        f"{index}\n"
        f"{seconds_to_srt_time(sentence['start'])} --> "
        f"{seconds_to_srt_time(sentence['end'])}\n"
        f"{sentence['text']}\n\n"
        for index, sentence in enumerate(sentences, start=1)
    )

    output_filename = os.path.splitext(subtitle_filename)[0] + "_reformatted.srt"
    with open(output_filename, 'w', encoding='utf-8') as srt_file:
        srt_file.write(new_srt_content)

    print(f"The reformatted SRT file is saved as {output_filename}")

    # Hand the result to the browser as a download.
    files.download(output_filename)

except FileNotFoundError:
    print("Subtitle file not found. Please upload the file again.")
except Exception as e:
    # Notebook-level boundary: report the problem instead of a raw traceback.
    print(f"An error occurred: {str(e)}")