In [22]:
# Output mode for the alignment cell further down:
#   True  -> one merged entry per speaker turn (chunk texts joined together)
#   False -> one entry per ASR chunk, each tagged with its speaker
group_by_speaker = False
# Base name (without extension) of the input file; it is read from `output/<filename>.json`.
filename = "combined_smaller_sample_2"

In [1]:
import json

def read_json_file(filepath):
    """
    Reads a JSON file and returns the data as a Python object.

    The file is always decoded as UTF-8 instead of the platform's locale
    encoding, so files containing non-ASCII text load identically on every
    machine.

    Args:
        filepath (str): The path to the JSON file.

    Returns:
        dict or list: The data from the JSON file, or None if the file does
        not exist or does not contain valid JSON. In both failure cases an
        error message is printed rather than an exception being raised.
    """
    try:
        # explicit encoding: without it open() falls back to the locale default
        with open(filepath, 'r', encoding='utf-8') as file:
            return json.load(file)
    except FileNotFoundError:
        print(f"Error: File not found at {filepath}")
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON format in {filepath}")
    # reached only after one of the handled errors above
    return None

In [None]:
import json

json_data = read_json_file(f'output/{filename}.json')
# read_json_file reports problems by printing a message and returning None. Stop here so
# the failure surfaces at the load step instead of as an unrelated TypeError when a later
# cell tries to index into None.
if json_data is None:
    raise RuntimeError(f"No usable data loaded from 'output/{filename}.json'")
print(json_data)

{'speakers': [{'speaker': 'SPEAKER_01', 'text': ' Yet, he is of all creatures the most formatively armed.', 'timestamp': [0.0, 4.52]}, {'speaker': 'SPEAKER_00', 'text': ' What then is the devilfish? It is the sea vampire.', 'timestamp': [4.52, 9.24]}, {'speaker': 'SPEAKER_02', 'text': ' To make sure this sad part of my story, we went the way of Hall sailors. The punch was made and I was made half drunk with it.', 'timestamp': [9.24, 18.04]}, {'speaker': 'SPEAKER_01', 'text': " At about two o'clock we heard the loud cry of sailho from a loft.", 'timestamp': [18.04, 23.4]}], 'asr': {'text': " Yet, he is of all creatures the most formatively armed. What then is the devilfish? It is the sea vampire. To make sure this sad part of my story, we went the way of Hall sailors. The punch was made and I was made half drunk with it. At about two o'clock we heard the loud cry of sailho from a loft.", 'chunks': [{'timestamp': [0.0, 4.52], 'text': ' Yet, he is of all creatures the most formatively arm

In [18]:
segments = json_data["diarization"]
print("segments", segments)

# diarizer output may contain consecutive segments from the same speaker (e.g. {(0 -> 1, speaker_1), (1 -> 1.5, speaker_1), ...})
# we combine these segments to give overall timestamps for each speaker's turn (e.g. {(0 -> 1.5, speaker_1), ...})
new_segments = []

# an empty diarization result yields an empty list of turns (indexing segments[0] would raise IndexError)
if segments:
    # prev_segment is the first segment of the turn being built, cur_segment the one being inspected
    prev_segment = cur_segment = segments[0]

    for cur_segment in segments[1:]:
        # a different "label" means the speaker changed, so the turn in progress is complete
        if cur_segment["label"] != prev_segment["label"]:
            # the finished turn spans from its first segment's start to the START of the new
            # speaker's segment, so any gap between the two is counted towards the finished turn
            new_segments.append(
                {
                    "segment": {"start": prev_segment["segment"]["start"], "end": cur_segment["segment"]["start"]},
                    "speaker": prev_segment["label"],
                }
            )
            prev_segment = cur_segment

    # always close the final turn; unlike the others it ends at the END of the last segment
    new_segments.append(
        {
            "segment": {"start": prev_segment["segment"]["start"], "end": cur_segment["segment"]["end"]},
            "speaker": prev_segment["label"],
        }
    )

print("new_segments", new_segments)

segments [{'segment': {'start': 0.45284375, 'end': 0.5372187500000001}, 'track': 'A', 'label': 'SPEAKER_01'}, {'segment': {'start': 0.5372187500000001, 'end': 8.02971875}, 'track': 'B', 'label': 'SPEAKER_00'}, {'segment': {'start': 9.160343750000003, 'end': 9.177218750000002}, 'track': 'C', 'label': 'SPEAKER_00'}, {'segment': {'start': 9.177218750000002, 'end': 17.36159375}, 'track': 'D', 'label': 'SPEAKER_02'}, {'segment': {'start': 17.98596875, 'end': 23.43659375}, 'track': 'E', 'label': 'SPEAKER_01'}]
new_segments [{'segment': {'start': 0.45284375, 'end': 0.5372187500000001}, 'speaker': 'SPEAKER_01'}, {'segment': {'start': 0.5372187500000001, 'end': 9.177218750000002}, 'speaker': 'SPEAKER_00'}, {'segment': {'start': 9.177218750000002, 'end': 17.98596875}, 'speaker': 'SPEAKER_02'}, {'segment': {'start': 17.98596875, 'end': 23.43659375}, 'speaker': 'SPEAKER_01'}]


In [None]:
import numpy as np

transcript = json_data["asr"]["chunks"]
print("transcript", transcript)

# get the end timestamps for each chunk from the ASR output
# if a chunk has no end timestamp (None) it is mapped to +inf: the array stays numeric (a None
# would make it object dtype and break the subtraction below) and such a chunk is only chosen
# as a cut point when no chunk with a real end timestamp remains
end_timestamps = np.array(
    [chunk["timestamp"][-1] if chunk["timestamp"][-1] is not None else np.inf for chunk in transcript],
    dtype=float,
)
segmented_preds = []

# align the diarizer timestamps and the ASR timestamps
for segment in new_segments:
    # every ASR chunk has already been assigned, so the remaining turns have nothing to claim
    # (np.argmin raises ValueError on an empty array)
    if len(end_timestamps) == 0:
        break

    # get the diarizer end timestamp
    end_time = segment["segment"]["end"]
    # find the ASR end timestamp that is closest to the diarizer's end timestamp and cut the transcript to here
    upto_idx = int(np.argmin(np.abs(end_timestamps - end_time)))
    turn_chunks = transcript[: upto_idx + 1]

    if group_by_speaker:
        # one entry per turn: concatenated text, spanning first chunk start -> last chunk end
        segmented_preds.append(
            {
                "speaker": segment["speaker"],
                "text": "".join([chunk["text"] for chunk in turn_chunks]),
                "timestamp": (transcript[0]["timestamp"][0], transcript[upto_idx]["timestamp"][1]),
            }
        )
    else:
        # one entry per ASR chunk, keeping the chunk's own fields and adding the speaker
        segmented_preds.extend({"speaker": segment["speaker"], **chunk} for chunk in turn_chunks)

    # crop the transcripts and timestamp lists according to the latest timestamp (for faster argmin)
    transcript = transcript[upto_idx + 1 :]
    end_timestamps = end_timestamps[upto_idx + 1 :]

print(segmented_preds)

[{'timestamp': [0.0, 4.52], 'text': ' Yet, he is of all creatures the most formatively armed.'}, {'timestamp': [4.52, 6.44], 'text': ' What then is the devilfish?'}, {'timestamp': [6.44, 9.24], 'text': ' It is the sea vampire.'}, {'timestamp': [9.24, 14.24], 'text': ' To make sure this sad part of my story, we went the way of Hall sailors.'}, {'timestamp': [14.24, 18.04], 'text': ' The punch was made and I was made half drunk with it.'}, {'timestamp': [18.04, 23.4], 'text': " At about two o'clock we heard the loud cry of sailho from a loft."}]
