In [27]:
import os
import polars as pl

def read_rttm_file(file_path):
    """Parse one RTTM file into a polars DataFrame of string columns.

    Every non-blank line is split on whitespace and its first nine fields
    form one row. Values are kept as strings; callers cast what they need.

    Args:
        file_path: Path of the RTTM file to read.

    Returns:
        pl.DataFrame with the nine string columns listed in ``columns``.
        A file with no data lines yields an empty frame with the same
        string schema, so it can still be concatenated with other files.

    Raises:
        ValueError: If a non-blank line has fewer than nine fields.
    """
    # NOTE(review): the NIST RTTM layout appears to order fields 6-9 as
    # orthography, speaker type, speaker name, confidence. The labels below
    # are kept unchanged because downstream cells select columns by these
    # names ("SpeakerID" does hold the speaker label) -- confirm before renaming.
    columns = ["Type", "FileID", "Channel", "StartTime", "Duration", "SpeakerType", "Confidence", "SpeakerID", "Signal"]
    n_fields = len(columns)

    rows = []
    with open(file_path, 'r') as file:
        for line_no, line in enumerate(file, start=1):
            fields = line.split()
            if not fields:
                # Blank lines (e.g. a trailing newline at end of file) carry no record.
                continue
            if len(fields) < n_fields:
                raise ValueError(
                    f"{file_path}:{line_no}: expected at least {n_fields} fields, got {len(fields)}"
                )
            rows.append(fields[:n_fields])

    # Explicit dtypes keep the schema stable even when `rows` is empty, and
    # orient="row" avoids relying on polars guessing the data orientation.
    schema = {name: pl.Utf8 for name in columns}
    return pl.DataFrame(rows, schema=schema, orient="row")

def combine_rttm_files(directory_path):
    """Read every ``.rttm`` file in a directory and stack them into one DataFrame.

    Files are processed in sorted filename order. ``os.listdir`` returns
    names in arbitrary order, so sorting makes the row order of the combined
    frame (and anything exported from it) reproducible across runs.

    Args:
        directory_path: Directory to scan (non-recursively) for ``.rttm`` files.

    Returns:
        The vertically concatenated polars DataFrame, or ``None`` (after
        printing a message) when the directory contains no ``.rttm`` files.
    """
    rttm_names = sorted(
        name for name in os.listdir(directory_path) if name.endswith(".rttm")
    )

    if not rttm_names:
        print("No .rttm files found in the specified directory.")
        return None

    frames = [
        read_rttm_file(os.path.join(directory_path, name))
        for name in rttm_names
    ]
    return pl.concat(frames)

In [28]:
# Load all diarization RTTM files from the data directory into one DataFrame.
directory_path = "../data/speech_audio_diarized"
result_df = combine_rttm_files(directory_path)

# combine_rttm_files returns None when the directory holds no .rttm files,
# so only preview the first rows when something was actually loaded.
if result_df is not None:
    print(result_df.head())

shape: (5, 9)
┌─────────┬─────────────┬─────────┬───────────┬───┬─────────────┬────────────┬────────────┬────────┐
│ Type    ┆ FileID      ┆ Channel ┆ StartTime ┆ … ┆ SpeakerType ┆ Confidence ┆ SpeakerID  ┆ Signal │
│ ---     ┆ ---         ┆ ---     ┆ ---       ┆   ┆ ---         ┆ ---        ┆ ---        ┆ ---    │
│ str     ┆ str         ┆ str     ┆ str       ┆   ┆ str         ┆ str        ┆ str        ┆ str    │
╞═════════╪═════════════╪═════════╪═══════════╪═══╪═════════════╪════════════╪════════════╪════════╡
│ SPEAKER ┆ Bush_vs_Gor ┆ 1       ┆ 7.513     ┆ … ┆ <NA>        ┆ <NA>       ┆ SPEAKER_02 ┆ <NA>   │
│         ┆ e_The_third ┆         ┆           ┆   ┆             ┆            ┆            ┆        │
│         ┆ _2000_pres… ┆         ┆           ┆   ┆             ┆            ┆            ┆        │
│ SPEAKER ┆ Bush_vs_Gor ┆ 1       ┆ 19.194    ┆ … ┆ <NA>        ┆ <NA>       ┆ SPEAKER_02 ┆ <NA>   │
│         ┆ e_The_third ┆         ┆           ┆   ┆             ┆            

In [33]:
# Reduce the raw RTTM table to the columns used for validation, convert the
# timing columns to numbers and derive each segment's end time.
df = (
    result_df
    .select(
        pl.col("FileID", "SpeakerID"),
        # The RTTM fields were read as strings. Float64 is used because
        # Float32 (~7 significant digits) cannot hold millisecond precision
        # once start times reach the thousands of seconds, and the rounding
        # error would carry into `end`.
        pl.col("StartTime", "Duration").cast(pl.Float64),
    )
    .rename({"FileID": "file_id", "StartTime": "start", "Duration": "duration", "SpeakerID": "speaker_est"})
    .with_columns((pl.col("start") + pl.col("duration")).alias("end"))
)

In [39]:
# Only the last expression of a cell is rendered, so the preview is printed
# explicitly; a bare `df.head()` here would be evaluated and discarded.
print(df.head())
df.shape

(13090, 5)

In [41]:
# Persist the validation table (file_id, speaker_est, start, duration, end) as CSV.
df.write_csv("../data/speech_validation.csv")