In [118]:
from pathlib import Path
import pandas as pd
from dotenv import load_dotenv
import os


In [119]:
load_dotenv(dotenv_path=".env")  # Relative to root directory

TRANSCRIPTIONS_DIR = Path(os.getenv("TRANSCRIPTIONS_DIR")).resolve()
TRANSCRIPTIONS_CLEANED_DIR = Path(os.getenv("TRANSCRIPTIONS_CLEANED_DIR")).resolve()
assert TRANSCRIPTIONS_DIR.is_dir(), "Could not find transcriptions source directory."
assert (
    TRANSCRIPTIONS_DIR != TRANSCRIPTIONS_CLEANED_DIR
), "Transcription directory and cleaned transcription directory should be separate."

TSV_COLUMNS = ["start", "end", "text"]


In [120]:
TRANSCRIPTIONS_CLEANED_DIR.mkdir(parents=False, exist_ok=True)

tsv_files = sorted(list(TRANSCRIPTIONS_DIR.glob("*.tsv")))

In [121]:
filepath = tsv_files[0]
df = pd.read_csv(str(filepath), sep="\t")
df

Unnamed: 0,start,end,text
0,46800,52400,Der Reich ist in voller Schärfe entbrannt. Wie...
1,52900,58220,"Wieder werden die Sammelpunkte des Feindes, se..."
2,58700,63940,Die Verbindungen im Hinterlande sollen vernich...
3,64660,78920,Nächtlicher Angriff auf Le Ava. Die Feinds sin...
4,78920,91330,Der Feind fühlt sich hier bereits geschlagen. ...
...,...,...,...
193,2480730,2482750,in dem Napoleon I. beigesetzt ist.
194,2501870,2502510,Deutscher Hand.
195,2503550,2508650,Die sofort anschließende Eroberung Verdans und...
196,2508650,2512050,"veranlassten Marschall Pétain zu der Erklärung,"


In [122]:
# df.at[0, "start"] = 0

In [123]:
def convert_ms_to_mmss(miliseconds: int) -> str:
    """Converts miliseconds to minute number format"""

    (seconds, miliseconds) = divmod(miliseconds, 1000)
    (minutes, seconds) = divmod(seconds, 60)
    return f"{minutes:02.0f}:{seconds:05.0f}"

In [None]:
def merge_comma_at_end(df: pd.DataFrame) -> pd.DataFrame:
    """Merge cells which end with comma with the next one
    Use start from the first and end from the second one)"""

    new_rows = []
    i = 0

    while i < len(df):
        current_row = df.iloc[i].copy()

        # Check if current row's text ends with comma and there's a next row
        if (
            i < len(df) - 1
            and isinstance(current_row["text"], str)
            and current_row["text"].rstrip().endswith(",")
        ):
            print(i)
            next_row = df.iloc[i + 1].copy()
            # Merge: remove comma from current, combine texts, use start from current and end from next
            merged_text = current_row["text"].rstrip().strip() + " " + str(next_row["text"]).strip()
            merged_row = {
                "start": current_row["start"],
                "end": next_row["end"],
                "text": merged_text,
            }
            new_rows.append(merged_row)
            i += 2  # Skip both rows as we've merged them
        else:
            new_rows.append(current_row.to_dict())
            i += 1

    return pd.DataFrame(new_rows)

In [125]:
df = merge_comma_at_end(df)

68
70
75
155
171
180
192
196
{'start': 46800, 'end': 52400, 'text': 'Der Reich ist in voller Schärfe entbrannt. Wieder stürmen die Geschwader unserer Luftwaffe den Erdgruppen voraus.'}


In [126]:
df["start_mm:ss"] = df["start"].apply(convert_ms_to_mmss)
df["end_mm:ss"] = df["end"].apply(convert_ms_to_mmss)

In [131]:
df.iloc[68]["end"]

np.int64(983120)