In [None]:
# Transcript cleaning
from pathlib import Path
import re
def clean_transcript(file_path, output_path):
    """Collapse a speaker-labelled transcript into one entry per speaker turn.

    The input is read line by line. Blank lines, digit-only cue numbers and
    cue timing lines are dropped; consecutive lines attributed to the same
    speaker are merged into a single turn; lines without a ``Speaker:`` prefix
    are treated as a continuation of the current turn. Lines that appear
    before the first speaker label (such as a ``WEBVTT`` header) are ignored.

    Each turn is written as the start timestamp of its first cue (followed by
    a space and a newline), then ``<speaker>: <text>`` and a blank line. The
    timestamp line is omitted when no timing line preceded the turn.

    Args:
        file_path: Path-like object exposing ``read_text`` (e.g.
            ``pathlib.Path``) pointing at the transcript to read.
        output_path: Path-like object exposing ``write_text`` pointing at the
            file to create or overwrite.

    Returns:
        None. Read and write failures are reported with ``print`` instead of
        being raised.
    """
    try:
        # 'utf-8-sig' decodes plain UTF-8 identically but also strips a leading
        # BOM, which would otherwise stop the first line from matching the
        # speaker pattern.
        lines = file_path.read_text(encoding='utf-8-sig').splitlines()
    except Exception as e:
        print(f"Error: Failed to read {file_path}: {e}")
        return

    # Cue timing line: "start --> end", where the hour field is optional, the
    # millisecond separator may be "." or ",", and cue settings may follow.
    timestamp_pattern = re.compile(
        r'^((?:\d{2,}:)?\d{2}:\d{2}[.,]\d{3})\s+-->\s+'
        r'((?:\d{2,}:)?\d{2}:\d{2}[.,]\d{3})(?:\s.*)?$'
    )
    numbering_pattern = re.compile(r'^\d+$')
    speaker_pattern = re.compile(r"^([\w\s\'-]+):\s+(.+)")

    def format_turn(timestamp, speaker, text):
        # Without a preceding timing line there is nothing meaningful to
        # print, so the timestamp line is left out rather than showing "None".
        if timestamp is None:
            return f"{speaker}: {text}\n\n"
        return f"{timestamp} \n{speaker}: {text}\n\n"

    cleaned_lines = []
    prev_speaker = None
    prev_text = ""
    current_timestamp = None   # start time of the most recent cue seen
    speaker_timestamp = None   # start time of the current speaker's turn
    for line in lines:
        line = line.strip()
        # Blank separators and cue numbers carry no transcript content.
        if not line or numbering_pattern.match(line):
            continue
        timestamp_match = timestamp_pattern.match(line)
        if timestamp_match:
            current_timestamp = timestamp_match.group(1)
            continue
        speaker_match = speaker_pattern.match(line)
        if speaker_match:
            speaker, text = speaker_match.groups()
            if speaker == prev_speaker:
                # Same speaker keeps talking: extend the current turn.
                prev_text += " " + text
            else:
                if prev_speaker is not None:
                    cleaned_lines.append(
                        format_turn(speaker_timestamp, prev_speaker, prev_text)
                    )
                prev_speaker = speaker
                prev_text = text
                speaker_timestamp = current_timestamp
        elif prev_speaker is not None:
            # Unlabelled line: continuation of the current speaker's text.
            prev_text += " " + line

    # Flush the final turn, which no later speaker change will trigger.
    if prev_speaker is not None:
        cleaned_lines.append(format_turn(speaker_timestamp, prev_speaker, prev_text))

    try:
        output_path.write_text("".join(cleaned_lines), encoding='utf-8')
        print(f"{output_path} has been created")
    except Exception as e:
        print(f"Error: Failed to write to {output_path}: {e}")

In [None]:
# Function to loop through a list of transcript paths
def transcript_loop(list_path_str):
    """Clean every transcript named in a list file, one path per line.

    Surrounding whitespace and double quotes are stripped from the list path
    and from each entry. Blank entries are skipped. Each existing ``.vtt`` or
    ``.txt`` file is passed to ``clean_transcript`` together with an output
    path of ``<stem>_cleaned.txt`` in the same directory as the input.

    Args:
        list_path_str: Path (as a string) of the text file listing the
            transcripts to process.

    Returns:
        None. Problems with the list file or with individual entries are
        reported with ``print``; a bad entry does not stop the remaining
        entries from being processed.
    """
    list_path = Path(list_path_str.strip().strip('"'))
    try:
        # 'utf-8-sig' drops a leading BOM that would otherwise become part of
        # the first listed path.
        input_files = list_path.read_text(encoding='utf-8-sig').splitlines()
    except FileNotFoundError:
        print(f"Error: List file not found at {list_path}")
        return
    except (OSError, UnicodeDecodeError) as e:
        # e.g. the path is a directory, is unreadable, or is not UTF-8 text.
        print(f"Error: Failed to read {list_path}: {e}")
        return

    supported_suffixes = {'.vtt', '.txt'}
    for input_file_str in input_files:
        entry = input_file_str.strip().strip('"')
        if not entry:
            # An empty entry would resolve to the current directory and be
            # reported as an unsupported file type.
            continue
        input_path = Path(entry)
        if not input_path.exists():
            print(f"Error: Input file does not exist: {input_path}")
            continue
        if input_path.suffix.lower() not in supported_suffixes:
            print(f"Error: Unsupported file type: {input_path.name}")
            continue
        output_path = input_path.with_name(input_path.stem + "_cleaned.txt")
        clean_transcript(input_path, output_path)

In [None]:
# Invocation
if __name__ == "__main__":
    # input() returns the typed path verbatim, so it is passed on unchanged.
    # Re-encoding it with "unicode_escape" would double every backslash and
    # turn non-ASCII characters into literal escape text, corrupting the path.
    transcript_list_file = input("Enter the file path: ").strip()
    if transcript_list_file.strip('"').strip():
        transcript_loop(transcript_list_file)
    else:
        # An empty answer would otherwise resolve to the current directory.
        print("Error: No file path entered")