In [1]:
import os
import polars as pl

def read_rttm_file(file_path):
    # Read an RTTM file and return its content as a DataFrame
    columns = ["Type", "FileID", "Channel", "StartTime", "Duration", "SpeakerType", "Confidence", "SpeakerID", "Signal"]

    with open(file_path, 'r') as file:
        lines = file.readlines()

    data = [line.strip().split()[:9] for line in lines]
    df = pl.DataFrame(data, schema=columns)

    return df

def combine_rttm_files(directory_path):
    # Combine all RTTM files in a directory into a single DataFrame
    all_dataframes = []

    for filename in os.listdir(directory_path):
        if filename.endswith(".rttm"):
            file_path = os.path.join(directory_path, filename)
            df = read_rttm_file(file_path)
            all_dataframes.append(df)

    if not all_dataframes:
        print("No .rttm files found in the specified directory.")
        return None

    combined_df = pl.concat(all_dataframes)
    return combined_df

In [2]:
# Example usage:
directory_path = "../data/ad_audio_diarized"
result_df = combine_rttm_files(directory_path)

# If result_df is not None, you can now work with the combined DataFrame.
if result_df is not None:
    print(result_df.head())
    # Perform any additional operations on the DataFrame as needed.

shape: (5, 9)
┌─────────┬─────────────┬─────────┬───────────┬───┬─────────────┬────────────┬────────────┬────────┐
│ Type    ┆ FileID      ┆ Channel ┆ StartTime ┆ … ┆ SpeakerType ┆ Confidence ┆ SpeakerID  ┆ Signal │
│ ---     ┆ ---         ┆ ---     ┆ ---       ┆   ┆ ---         ┆ ---        ┆ ---        ┆ ---    │
│ str     ┆ str         ┆ str     ┆ str       ┆   ┆ str         ┆ str        ┆ str        ┆ str    │
╞═════════╪═════════════╪═════════╪═══════════╪═══╪═════════════╪════════════╪════════════╪════════╡
│ SPEAKER ┆ pres_trimme ┆ 1       ┆ 3.879     ┆ … ┆ <NA>        ┆ <NA>       ┆ SPEAKER_04 ┆ <NA>   │
│         ┆ d_incl_scen ┆         ┆           ┆   ┆             ┆            ┆            ┆        │
│         ┆ e-P-1904-1… ┆         ┆           ┆   ┆             ┆            ┆            ┆        │
│ SPEAKER ┆ pres_trimme ┆ 1       ┆ 9.228     ┆ … ┆ <NA>        ┆ <NA>       ┆ SPEAKER_02 ┆ <NA>   │
│         ┆ d_incl_scen ┆         ┆           ┆   ┆             ┆            

In [6]:
df = (
    result_df
    .select(pl.col("FileID", "SpeakerID"), pl.col("StartTime", "Duration").cast(pl.Float32))
    .rename({"FileID": "file_id", "StartTime": "start", "Duration": "duration", "SpeakerID": "speaker_est"})
    .with_columns((pl.col("start") + pl.col("duration")).alias("end"))
)

In [8]:
df.head()


file_id,speaker_est,start,duration,end
str,str,f32,f32,f32
"""P-1904-108497""","""SPEAKER_04""",3.879,5.195,9.074
"""P-1904-108497""","""SPEAKER_02""",9.228,2.733,11.960999
"""P-1904-108497""","""SPEAKER_02""",13.166,3.311,16.477001
"""P-1904-108497""","""SPEAKER_04""",16.426001,8.761,25.187
"""P-1904-108497""","""SPEAKER_01""",25.934,3.107,29.041


In [9]:
df.write_csv("../data/ad_validation.csv")