<a href="https://colab.research.google.com/github/guilhermelaviola/SRTTranslator/blob/main/SRTAnalyser.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# Importing all the necessary libraries:
import re
from datetime import timedelta

In [10]:
# Parsing the .srt file:
def parse_srt(file_path):
    """Read an .srt subtitle file and return its entries as a list of dicts.

    Args:
        file_path: Path to the subtitle file. It is decoded as 'utf-8-sig',
            so a leading byte-order mark (if any) is dropped.

    Returns:
        A list with one dict per subtitle block, in file order, holding:
          'sequence'   - the block's sequence number as an int,
          'start_time' - parse_timestamp() of the text before '-->',
          'end_time'   - parse_timestamp() of the text after '-->',
          'text'       - the caption lines joined with single spaces.
        An empty file yields an empty list.

    Blocks that lack a timing line, whose first line is not an integer, or
    whose timing line has no '-->' are reported with a warning and skipped,
    so one damaged block does not abort the whole parse.
    """
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        content = file.read()

    subtitles = []

    # Splitting the file into blocks. Any run of blank (or whitespace-only)
    # lines counts as a single separator, so extra empty lines between
    # entries do not produce empty blocks.
    for block in re.split(r'\n\s*\n', content.strip()):
        block = block.strip()
        if not block:
            # Only happens for an empty file.
            continue

        lines = block.split("\n")
        if len(lines) < 2:
            print(f"Warning: Skipping malformed subtitle block: {block!r}")
            continue

        # Extracting sequence number, timestamp, and text:
        try:
            sequence_number = int(lines[0])
        except ValueError:
            print(f"Warning: Skipping block with invalid sequence number: {lines[0]!r}")
            continue

        timestamp = lines[1]
        text = " ".join(lines[2:])

        # Parsing timestamps. partition() tolerates missing or extra spaces
        # around the arrow; each side is stripped before it is parsed.
        start_raw, arrow, end_raw = timestamp.partition("-->")
        if not arrow:
            print(f"Warning: Skipping block with invalid timing line: {timestamp!r}")
            continue
        start_time = parse_timestamp(start_raw.strip())
        end_time = parse_timestamp(end_raw.strip())

        subtitles.append({
            'sequence': sequence_number,
            'start_time': start_time,
            'end_time': end_time,
            'text': text,
        })

    return subtitles

In [14]:
# Parsing timestamp:
def parse_timestamp(timestamp):
    """Convert an SRT timestamp such as '01:02:03,456' into a timedelta.

    Args:
        timestamp: Text starting with 'H:MM:SS,mmm'. Leading whitespace is
            ignored, the fraction separator may be ',' or '.', and anything
            after the fraction digits is ignored.

    Returns:
        A timedelta built from the hour, minute and second fields, with the
        digits after the separator taken as a millisecond count. If the text
        does not match, a warning is printed and a zero timedelta is
        returned so that callers can keep going.
    """
    # One capture group per field; match() anchors at the start only.
    match = re.match(r'\s*(\d+):(\d+):(\d+)[,.](\d+)', timestamp)
    if match:
        hours, minutes, seconds, milliseconds = map(int, match.groups())
        return timedelta(hours=hours, minutes=minutes, seconds=seconds, milliseconds=milliseconds)

    # Handling cases where timestamp format is unexpected
    print(f"Warning: Invalid timestamp format: {timestamp}")
    return timedelta()  # Returning a zero timedelta for invalid timestamps

In [15]:
# Analysing the .srt file:
def analyze_srt(subtitles):
    """Print the summed on-screen time and the word count of parsed subtitles.

    Args:
        subtitles: Iterable of dicts carrying 'start_time' and 'end_time'
            (values that subtract to a timedelta) and 'text' (a string).

    Prints two lines, "Total Duration: ..." and "Total Word Count: ...".
    Nothing is returned.
    """
    elapsed, words = timedelta(), 0

    for entry in subtitles:
        # Time this entry stays on screen.
        elapsed = elapsed + (entry['end_time'] - entry['start_time'])
        # Words are whitespace-separated tokens of the caption text.
        words = words + len(entry['text'].split())

    print(f"Total Duration: {elapsed}")
    print(f"Total Word Count: {words}")

In [16]:
# Testing: parse the sample subtitle file and print its total duration and
# word count. The path is relative, so the .srt file must be in the current
# working directory.
file_path = "All the Little Animals (1998) - EN.srt"
subtitles = parse_srt(file_path)
analyze_srt(subtitles)

Total Duration: 0:38:21.544000
Total Word Count: 6114
