In [None]:
import re
import numpy as np
import pandas as pd
import requests
import librosa
import jiwer
import warnings
import whisper.normalizers
import ptitprince as pt
from scipy import stats
from pathlib import Path
from tqdm.notebook import tqdm
from matplotlib import (
    pyplot as plt,
    ticker,
)

# Plot appearance: project style sheet plus a 300 dpi figure resolution.
plt.style.use("assets/minimal.mplstyle")
plt.rcParams.update({"figure.dpi": 300})

# Suppress every FutureWarning for the rest of the session.
warnings.filterwarnings("ignore", category=FutureWarning)

class COLORMAP:
    """Namespace for the named colors used by the plots in this notebook."""

    # Hex code of the red accent color.
    F1_red = "#E10600"

# Relative directory under which the audio clips are stored.
audio_files_path = Path("data", "audio_clips")
# Whisper's English text normalizer, used to canonicalize transcriptions before scoring.
text_normalizer = whisper.normalizers.EnglishTextNormalizer()

- ALEALB01_23_20240229_154846 optics > upshift
- LANNOR01_4_20250419_205244 im an f'ing idiot
- MAXVER01_1_20250413_181201 lado > lando
- LANNOR01_4_20240824_122457 landon > lando
- LANNOR01_4_20250906_163307 map > lap
- LANNOR01_4_20241201_113249 can't hear your – strat 10!!!
- MAXVER01_1_20251109_152028 deck > deg(radation)
- ANDANT01_12_20250316_170136 [silence gets transcribed as "thank you"] 
- LIALAW01_30_20250615_151510 peel > PU
- GUAZHO01_24_20240706_113306 upship > upshift
- LANNOR01_4_20240405_034437 landau > Lando
- LANNOR01_4_20240302_153321 lina > Lando

In [None]:
def get_json(url, timeout=30):
    """Fetch ``url`` with an HTTP GET request and return the decoded JSON body.

    Parameters
    ----------
    url : str
        Address of the JSON endpoint to query.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30), so a
        stalled connection cannot hang the notebook indefinitely.

    Returns
    -------
    The payload as parsed by ``Response.json()``.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of parsing an error page as data.
    response.raise_for_status()
    return response.json()

# Download the team radio catalogue and the session metadata from the OpenF1 API.
team_radio_records = pd.DataFrame.from_records(get_json("https://api.openf1.org/v1/team_radio"))
session_details = pd.DataFrame.from_records(get_json("https://api.openf1.org/v1/sessions"))[
    ["session_key", "location", "year", "session_name"]
]

# Attach the session details to every message, then order the messages chronologically.
radio_messages = team_radio_records.merge(session_details, on="session_key", how="left")
radio_messages = radio_messages.rename(columns={"date": "timestamp"})
radio_messages = radio_messages.astype({"timestamp": "datetime64[ns, UTC]"})
radio_messages = radio_messages.sort_values("timestamp")

# The last path segment of the recording URL doubles as the clip identifier
# and as the file name below `audio_files_path`.
radio_messages["identifier"] = radio_messages["recording_url"].str.rsplit("/", n=1).str[-1]
radio_messages["file_path"] = (audio_files_path / radio_messages["identifier"]).astype(str)

# Headline numbers; first/last row are the earliest/latest message because the
# frame is sorted by timestamp.
message_count = len(radio_messages)
session_count = radio_messages["session_key"].nunique()
event_count = radio_messages["meeting_key"].nunique()
first_day = radio_messages["timestamp"].iloc[0].date()
last_day = radio_messages["timestamp"].iloc[-1].date()
print(f"Found {message_count} radio messages across {session_count} sessions over {event_count} Formula 1 events from {first_day} to {last_day}")
radio_messages.tail(3)

In [None]:
def get_clip_duration(file_path):
    """Return the duration of the audio file at ``file_path``.

    The value comes straight from ``librosa.get_duration(path=...)``. Any
    ordinary error raised while reading the file yields ``np.nan`` so that a
    single bad clip does not abort a bulk ``apply`` over many files.

    Only ``Exception`` subclasses are converted to NaN: ``KeyboardInterrupt``
    and ``SystemExit`` still propagate, so a long scan can be interrupted.
    """
    try:
        return librosa.get_duration(path=file_path)
    except Exception:
        return np.nan

# Measure every clip once; the guard skips the scan when the column already
# exists (e.g. when this cell is re-run in the same kernel).
if "clip_duration" not in radio_messages.columns:
    radio_messages["clip_duration"] = radio_messages["file_path"].map(get_clip_duration)
    # Optional filter, currently disabled: keep only clips shorter than 30 seconds.
    # radio_messages = radio_messages[radio_messages["clip_duration"] < 30]

total_hours = radio_messages["clip_duration"].sum() / 60 / 60
print(f"Total duration of all {len(radio_messages)} radio messages: {total_hours:.0f} hours")

# Histogram of clip durations in half-second bins (last bin edge at 44.5),
# with the mean and median marked as vertical reference lines.
clip_durations = radio_messages["clip_duration"]
mean_duration = clip_durations.mean()
median_duration = clip_durations.median()

plt.hist(
    clip_durations,
    bins=np.arange(0, 45, 0.5),
    color=COLORMAP.F1_red,
    edgecolor="white",
    linewidth=1,
)
plt.axvline(
    x=mean_duration,
    color="silver",
    linestyle="dotted",
    label=f"Mean {mean_duration:.1f} sec",
)
plt.axvline(
    x=median_duration,
    color="silver",
    linestyle="dashed",
    label=f"Median {median_duration:.1f} sec",
)
plt.legend(loc="upper right")
plt.xlabel("Audio clip duration [sec]")
plt.ylabel("Occurrences")
plt.show()

In [None]:
def cache_audiofiles(file_path_todo_list):
    """Download every listed radio clip that is not yet present on disk.

    Parameters
    ----------
    file_path_todo_list : iterable of (index, row) pairs
        For example the result of ``DataFrame.iterrows()``. Each row must
        provide ``"recording_url"`` (download source) and ``"file_path"``
        (destination on disk).

    Notes
    -----
    Best effort: a clip that cannot be downloaded is reported and skipped so
    the remaining clips are still fetched. The download happens before the
    destination is touched and the data is moved into place only once it is
    completely written, so a failed download never leaves behind an empty or
    truncated file that a later run would mistake for a cached clip.
    """
    for _, radio_message in tqdm(list(file_path_todo_list), smoothing=0):
        destination = Path(radio_message["file_path"])
        if destination.exists():
            continue
        try:
            response = requests.get(radio_message["recording_url"], timeout=60)
            # Do not store an HTTP error page as if it were audio data.
            response.raise_for_status()
            destination.parent.mkdir(parents=True, exist_ok=True)
            # Write next to the target first, then rename into place.
            partial_download = destination.with_name(destination.name + ".part")
            partial_download.write_bytes(response.content)
            partial_download.replace(destination)
        except Exception as error:
            print("Failed on", radio_message["file_path"], f"({error})")

# Queue only the clips whose file is missing on disk, in reversed row order.
already_on_disk = radio_messages["file_path"].apply(Path).apply(Path.exists)
download_queue = radio_messages[~already_on_disk][["recording_url", "file_path"]].iloc[::-1]
cache_audiofiles(file_path_todo_list=download_queue.iterrows())

---

In [None]:
# Read the export whose path sorts last in `label-studio/export`
# (presumably the newest one — TODO confirm the file naming scheme).
latest_export_file = sorted(Path("label-studio/export").iterdir())[-1]
human_labeling_export = (
    pd.read_json(latest_export_file)
    .rename(columns={"transcription": "human_transcription"})
    # When a clip appears several times, the last entry in the export wins.
    .drop_duplicates(subset="identifier", keep="last")
)

human_reference_columns = human_labeling_export[
    ["identifier", "human_transcription", "lead_time", "updated_at"]
].rename(columns={"updated_at": "human_transcription_timestamp"})

# Right join: keep every labelled clip, enriched with its radio message metadata.
with_human_reference = radio_messages.merge(
    human_reference_columns,
    on="identifier",
    how="right",
)

# Discard entries that carry no transcription.
with_human_reference = with_human_reference[with_human_reference["human_transcription"].notna()]

# Write the identifier, audio path and human transcription of every labelled clip to JSON.
reference_export = with_human_reference[["identifier", "file_path", "human_transcription"]]
reference_export.to_json(
    "data/with_human_reference.json",
    orient="records",
    index=False,
)

labelled_clip_count = len(with_human_reference.drop_duplicates(subset="identifier"))
labelled_minutes = with_human_reference["clip_duration"].sum() / 60
print(f"Found {labelled_clip_count} radio message clips with human reference, total duration {labelled_minutes:.0f} minutes")
with_human_reference.sample(3)

---

In [None]:
# The TSV is read without a header row; these names are assigned to its columns in order.
transcription_columns = [
    "transcription_timestamp",
    "identifier",
    "modelidentifier",
    "machine_transcription",
    "avg_logprob",
    "text_nbest",
    "no_speech_prob",
    "temperature",
    "compression_ratio",
    "sum_logprob_nbest",
    "token_nbest",
]

machine_transcriptions = pd.read_csv(
    "exports/transcriptions.tsv",
    sep="\t",
    names=transcription_columns,
)
machine_transcriptions = machine_transcriptions.astype(
    {"transcription_timestamp": "datetime64[ns, UTC]"}
)

# Pair each human reference with the machine transcriptions of the same clip.
# Sorting by transcription time before de-duplicating keeps the earliest
# transcription per (clip, model) combination.
machine_vs_human_transcriptions = (
    with_human_reference
    .drop(columns=["meeting_key", "session_key"])
    .merge(machine_transcriptions, on="identifier", how="left")
    .sort_values("transcription_timestamp")
    .drop_duplicates(subset=["identifier", "modelidentifier"], keep="first")
)
# Clips without any machine transcription are dropped.
machine_vs_human_transcriptions = machine_vs_human_transcriptions[
    machine_vs_human_transcriptions["machine_transcription"].notna()
]

clips_with_both = machine_vs_human_transcriptions.drop_duplicates(subset="identifier")
clips_with_both_minutes = clips_with_both["clip_duration"].sum() / 60
print(f"Found {len(clips_with_both)} machine transcribed radio messages that have human transcription, total duration {clips_with_both_minutes:.0f} minutes")
machine_vs_human_transcriptions.tail(3)

In [None]:
# Machine and human transcription side by side, ordered by clip and model.
comparison_columns = ["identifier", "modelidentifier", "machine_transcription", "human_transcription"]
comparison_view = machine_vs_human_transcriptions[comparison_columns]
comparison_view.sort_values(["identifier", "modelidentifier"])

---

<!-- ln -s radio_messages/ /home/ucloud/.local/share/label-studio/radio_messages -->

In [None]:
# Score every machine transcription against its human reference.
#
# jiwer's measures take (reference, hypothesis) in that order: the human
# transcription is the reference and the machine output is the hypothesis.
# Passing them the other way round normalizes WER by the length of the machine
# output instead of the reference length.
#
# Both texts are normalized once up front rather than once per metric.
normalized_reference = machine_vs_human_transcriptions["human_transcription"].map(text_normalizer)
normalized_hypothesis = machine_vs_human_transcriptions["machine_transcription"].map(text_normalizer)

# A reference that is empty after normalization has no defined error rate, so
# an explicit convention is used instead of calling jiwer: full agreement when
# the hypothesis is empty as well, total disagreement otherwise.
for metric, estimator, score_if_both_empty, score_if_only_reference_empty in [
    ("WER", jiwer.wer, 0.0, 1.0),
    ("WIP", jiwer.wip, 1.0, 0.0),
]:
    scores = []
    for reference, hypothesis in zip(normalized_reference, normalized_hypothesis):
        if reference.strip():
            scores.append(estimator(reference, hypothesis))
        elif hypothesis.strip():
            scores.append(score_if_only_reference_empty)
        else:
            scores.append(score_if_both_empty)
    machine_vs_human_transcriptions[metric] = scores


In [None]:
# for metric in ["WER", "WIP"]:
#     plt.hist(
#         machine_vs_human_transcriptions[metric],
#         bins=dict(
#             wer=200,
#             wip=np.linspace(0, 1, 100),
#         )[metric],
#         density=True,
#         color=COLORMAP.F1_red,
#         edgecolor="white",
#         linewidth=1,
#     )
#     plt.axvline(
#         color="silver",
#         linestyle="dashed",
#     )
#     plt.axvline(
#         x=machine_vs_human_transcriptions[metric].mean(),
#         color="silver",
#         linestyle="dotted",
#         label=f"Mean {machine_vs_human_transcriptions[metric].mean():.1%}",
#     )
#     plt.axvline(
#         x=machine_vs_human_transcriptions[metric].median(),
#         color="silver",
#         linestyle="dashed",
#         label=f"Median {machine_vs_human_transcriptions[metric].median():.1%}",
#     )
#     plt.legend(loc="upper right")
#     plt.xlabel(metric.upper())
#     plt.ylabel("PMF")
#     if metric == "WIP":
#         plt.xlim(0, 1)
#     elif metric == "WER":
#         plt.xlim(0, 3)
#     plt.gca().xaxis.set_major_formatter(ticker.PercentFormatter(1))
#     plt.show()

## WER stats

In [None]:
# Summary statistics of WER for each model, rounded to three decimals.
wer_by_model = machine_vs_human_transcriptions.groupby("modelidentifier")["WER"]
wer_by_model.describe().round(3)

In [None]:
# One subset of the scored transcriptions per model identifier.
is_stock_model = machine_vs_human_transcriptions["modelidentifier"] == "stockWhisper"
is_tcpgen_model = machine_vs_human_transcriptions["modelidentifier"] == "TCPGenWhisper"
stock_transcriptions = machine_vs_human_transcriptions[is_stock_model]
TCPGen_transcriptions = machine_vs_human_transcriptions[is_tcpgen_model]

In [None]:
# Welch's t-test (no equal-variance assumption) comparing WER between the two models.
# NOTE(review): both models appear to score the same clips, so a paired test
# may be more appropriate — confirm.
wer_stock = stock_transcriptions["WER"]
wer_tcpgen = TCPGen_transcriptions["WER"]
stats.ttest_ind(wer_stock, wer_tcpgen, equal_var=False)

## WIP stats

In [None]:
# Summary statistics of WIP for each model, rounded to three decimals.
wip_by_model = machine_vs_human_transcriptions.groupby("modelidentifier")["WIP"]
wip_by_model.describe().round(3)

In [None]:
# Welch's t-test (no equal-variance assumption) comparing WIP between the two models.
wip_stock = stock_transcriptions["WIP"]
wip_tcpgen = TCPGen_transcriptions["WIP"]
stats.ttest_ind(wip_stock, wip_tcpgen, equal_var=False)

In [None]:
# Horizontal raincloud plot of the WIP distribution, one cloud per model.
raincloud_options = dict(
    bw=0.1,
    palette=(COLORMAP.F1_red, "grey"),
    width_viol=0.6,
    width_box=0.2,
    linewidth=0,
    alpha=1,
    point_size=2,
    orient="h",
    rain_clip_on=False,
    pointplot=True,
    linecolor="black",
    point_linewidth=1.4,
    point_errorbar="ci",
    point_errwidth=1.4,
    point_capsize=0.05,
)

fig, ax = plt.subplots()
ax = pt.RainCloud(
    data=machine_vs_human_transcriptions,
    x="modelidentifier",
    y="WIP",
    hue="modelidentifier",
    ax=ax,
    **raincloud_options,
)
plt.ylabel("")
plt.xlabel("WIP")
plt.xlim(0, 1)
# WIP is a fraction in [0, 1]; show the ticks as percentages.
plt.gca().xaxis.set_major_formatter(ticker.PercentFormatter(1))
plt.show()

fig.savefig(Path("exports", "raincloud_plot_wip_comparison.png"))

In [None]:
# Load the biasing vocabulary: one term per line, surrounding whitespace removed.
# Blank lines are skipped so the empty string never becomes a "term"; otherwise
# it would match any token that consists solely of punctuation once cleaned.
with open("data/biasing_list.txt") as file:
    biasing_terms = [term for term in (line.strip() for line in file) if term]

def biasing_terms_in_utterance(utterance):
    """Return the distinct biasing terms that occur in ``utterance``.

    The utterance is upper-cased and split on whitespace; every token is then
    stripped of all characters except ASCII letters, digits and ``*`` before it
    is looked up in the module-level ``biasing_terms`` collection.

    Parameters
    ----------
    utterance : str
        Text to scan.

    Returns
    -------
    list of str
        Matching terms without duplicates, in sorted order so the result is
        deterministic. Tokens that are empty after cleaning (pure punctuation)
        never count as a match.
    """
    known_terms = set(biasing_terms)
    cleaned_tokens = {
        re.sub(r"[^A-Za-z0-9 *]", "", token).strip()
        for token in utterance.upper().split()
    }
    cleaned_tokens.discard("")
    return sorted(cleaned_tokens & known_terms)

# Biasing terms found in the human reference of every clip.
machine_vs_human_transcriptions["biasing_terms"] = machine_vs_human_transcriptions["human_transcription"].map(biasing_terms_in_utterance)

# WIP summary per model, restricted to clips that contain at least one biasing term.
contains_biasing_term = machine_vs_human_transcriptions["biasing_terms"].map(len) > 0
machine_vs_human_transcriptions[contains_biasing_term].groupby("modelidentifier")["WIP"].describe().round(3)

In [None]:
# WIP against the share of reference words that are biasing terms.
# NOTE(review): clips without biasing terms have a proportion of 0, which a
# log-scaled axis cannot display — confirm that omitting them is intended.
biasing_term_count = machine_vs_human_transcriptions["biasing_terms"].map(len)
reference_word_count = machine_vs_human_transcriptions["human_transcription"].str.split().str.len()

plt.scatter(
    biasing_term_count / reference_word_count,
    machine_vs_human_transcriptions["WIP"],
    color=COLORMAP.F1_red,
    edgecolor="white",
    alpha=0.5,
    clip_on=False,
    zorder=5,
)
axes = plt.gca()
axes.yaxis.set_major_formatter(ticker.PercentFormatter(1))
# The x formatter is set after switching to the log scale, as in the scale change
# would otherwise replace it.
axes.set_xscale("log", base=10)
axes.xaxis.set_major_formatter(ticker.PercentFormatter(1))
# plt.xlim(right=1)
axes.set_ylim(0, 1)
axes.set_xlabel("Biasing terms proportion")
axes.set_ylabel("WIP")
plt.show()

In [None]:
# WIP against clip duration, with the duration on a base-10 log axis.
plt.scatter(
    machine_vs_human_transcriptions["clip_duration"],
    machine_vs_human_transcriptions["WIP"],
    color=COLORMAP.F1_red,
    edgecolor="white",
    alpha=0.5,
    clip_on=False,
    zorder=5,
)
axes = plt.gca()
axes.yaxis.set_major_formatter(ticker.PercentFormatter(1))
axes.set_xscale("log", base=10)
# Plain numbers instead of powers of ten on the log axis.
axes.xaxis.set_major_formatter(ticker.ScalarFormatter())
axes.set_ylim(0, 1)
axes.set_xlabel("Clip duration [sec]")
axes.set_ylabel("WIP")
plt.show()

In [None]:
# Word count of the ASR output against the word count of the human reference.
# Words are counted by splitting on runs of whitespace (`.str.split()` without
# a separator): splitting on a single " " would count an extra empty "word"
# for every leading, trailing or doubled space.
human_word_count = machine_vs_human_transcriptions["human_transcription"].str.split().str.len()
machine_word_count = machine_vs_human_transcriptions["machine_transcription"].str.split().str.len()

plt.figure(figsize=(4, 4))
plt.scatter(
    human_word_count,
    machine_word_count,
    color=COLORMAP.F1_red,
    edgecolor="white",
    alpha=0.5,
    clip_on=False,
    zorder=5,
)
# Corner-to-corner diagonal in axes coordinates; with identical limits on both
# log axes this marks equal word counts.
plt.plot(
    [0, 1],
    [0, 1],
    transform=plt.gca().transAxes,
    color="silver",
    linestyle="dotted",
    zorder=-1,
)
plt.xscale("log", base=10)
plt.yscale("log", base=10)
plt.gca().xaxis.set_major_formatter(ticker.ScalarFormatter())
plt.gca().yaxis.set_major_formatter(ticker.ScalarFormatter())
plt.xlim(1, 150)
plt.ylim(1, 150)
plt.xlabel("Human transcription word count")
plt.ylabel("ASR transcription word count")
plt.show()

In [None]:
# Read the training log whose path sorts last among `exports/log*`.
# The file has no header row; the names below are assigned to its columns in order.
model_training_log = pd.read_csv(
    sorted(Path("exports").glob("log*"))[-1],
    names=(
        "timestamp",
        "epoch",
        "training_batch_loss",
        "validation_batch_loss",
        "model_accuracy",
    )
)

# Plot both loss curves over the training epochs. The plotted columns must be
# names declared above; looking up any other name raises a KeyError.
fig, ax = plt.subplots()
for loss_column in (
    "training_batch_loss",
    "validation_batch_loss",
):
    ax.plot(
        model_training_log["epoch"],
        model_training_log[loss_column],
        label=loss_column.replace("_", " ").capitalize(),
        marker="o",
        markerfacecolor="white",
        clip_on=False,
        zorder=10,
    )
ax.legend(loc="upper right")
ax.set_ylim(bottom=0)
ax.set_xlabel("Training epoch")
ax.set_ylabel("Loss")

fig.savefig(Path() / "exports" / "model_training_loss_curve.png")
plt.show()

In [None]:
%load_ext watermark
%watermark -iv -v -m