In [1]:
import re
import os
from fpdf import FPDF


def split_into_pages(text):
    pattern = r'(?=\[[0-9]{2}:[0-9]{2}:[0-9]{2}\])'
    parts = re.split(pattern, text)
    return [p.strip() for p in parts if p.strip()]


def txt_to_pdf(input_path, output_path,
               font_name='Arial', font_size=12, line_height=8,
               unicode_font_path=None, unicode_font_name='DejaVu'):
    
    with open(input_path, 'r', encoding='utf-8-sig') as f:
        content = f.read()

    pages = split_into_pages(content)
    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)

    # Register Unicode font if provided, else use default
    if unicode_font_path:
        pdf.add_font(unicode_font_name, '', unicode_font_path, uni=True)
        pdf.set_font(unicode_font_name, size=font_size)
    else:
        pdf.set_font(font_name, size=font_size)

    for page in pages:
        pdf.add_page()
        for line in page.splitlines():
            pdf.multi_cell(0, line_height, line)

    pdf.output(output_path)
    print(f"Created PDF: {output_path}")


def process_directory(input_dir, output_dir,
                      font_name='Arial', font_size=12, line_height=8,
                      unicode_font_path=None, unicode_font_name='DejaVu'):
    
    os.makedirs(output_dir, exist_ok=True)
    for fname in os.listdir(input_dir):
        if fname.lower().endswith('.txt'):
            in_path = os.path.join(input_dir, fname)
            out_path = os.path.join(output_dir, os.path.splitext(fname)[0] + '.pdf')
            txt_to_pdf(in_path, out_path,
                       font_name, font_size, line_height,
                       unicode_font_path, unicode_font_name)

In [3]:
process_directory('../../RAG/data/Videos_TRUMPF/', '../../RAG/data/Videos_TRUMPF/')

Created PDF: ../../RAG/data/Videos_TRUMPF/transcription_teil4.pdf
Created PDF: ../../RAG/data/Videos_TRUMPF/transcription_teil5.pdf
Created PDF: ../../RAG/data/Videos_TRUMPF/transcription_teil7.pdf
Created PDF: ../../RAG/data/Videos_TRUMPF/transcription_teil6.pdf
Created PDF: ../../RAG/data/Videos_TRUMPF/transcription_teil2.pdf
Created PDF: ../../RAG/data/Videos_TRUMPF/transcription_Teil3.pdf
Created PDF: ../../RAG/data/Videos_TRUMPF/transcription_teil1.pdf
Created PDF: ../../RAG/data/Videos_TRUMPF/transcription_teil10.pdf
Created PDF: ../../RAG/data/Videos_TRUMPF/transcription_teil11.pdf
Created PDF: ../../RAG/data/Videos_TRUMPF/transcription_teil8.pdf
Created PDF: ../../RAG/data/Videos_TRUMPF/transcription_teil9.pdf
Created PDF: ../../RAG/data/Videos_TRUMPF/transcription_teil12.pdf
