In [None]:
from pathlib import Path

from codec import run_codec_benchmarks, codec_json_to_df
from ffmpeg_utils import ConversionOptions, ffmpeg_convert
from data import get_wav_txt_file_paths
from config import (RAW_DIR, OUTPUT_DIR, PROCESSED_DIR)

from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns

# Global plot styling for the whole notebook: white grid background with the
# "notebook" context and fonts enlarged by 20%.
sns.set_theme(context="notebook", style="whitegrid", font_scale=1.2)

In [None]:
# Encoder settings to benchmark, passed to run_codec_benchmarks as
# `combination_codecs`. Keys look like output format names; option names mirror
# the ffmpeg flags used later in this notebook (acodec, b:a, ar, ac,
# application). Some options hold a list of values instead of a single string;
# presumably one benchmark run is produced per combination of the listed
# values — TODO confirm against run_codec_benchmarks in codec.py.
CODEC_COMBINATIONS: dict[str, ConversionOptions] = {
    "flac": {
        "acodec": "flac",
        "compression_level": ["0", "5"],
        "ar": ["8000", "16000"],
        "ac": "1"
    },
    "wav": {
        "acodec": "pcm_s16le",
        "ar": ["8000", "16000"],
        "ac": "1",
    },
    "opus": {
        "acodec": "libopus",
        "b:a": ["24k", "32k"],
        "application": "voip", 
        "ar": "16000",
        "ac": "1",
    },
    "aac": {
        "acodec": "aac",
        "b:a": ["32k", "64k"],
        "ar": "16000",
        "ac": "1",
    },
    "mp3": {
        "acodec": "libmp3lame",
        "b:a": ["32k", "64k"],  
        "ar": "16000",
        "ac": "1",
    },
    # Same encoder options as the "aac" entry above; only the key differs.
    "mp4": {
        "acodec": "aac",
        "b:a": ["32k", "64k"],
        "ar": "16000",
        "ac": "1",
    },
}

# Additional single-valued configurations, passed to run_codec_benchmarks as
# `standalone_codecs`. Each key maps to a list of option dicts in which every
# option is a plain string (no value lists).
CODEC_STANDALONES: dict[str, list[ConversionOptions]] = {
    "flac": [
        {
            "acodec": "flac",
            "compression_level": "0",
            "ar": "16000",
            # NOTE(review): every other entry in this cell uses "ac": "1";
            # "0" here may be deliberate or a typo — confirm.
            "ac": "0"
        },
    ],
    # minute settings
    "mp3": [
        {
        "acodec": "libmp3lame",
        "b:a": "192k",  
        "ar": "16000",
        "ac": "1",
        }
    ]
}

## Benchmarks
- WER
- File Size
- CPU Time

In [None]:
# Collect the benchmark inputs from RAW_DIR. The result is used as a mapping
# later in this notebook (audio file path -> text file path).
audio_samples = get_wav_txt_file_paths(RAW_DIR)

# Benchmark every configured codec setting on every sample, with transcription
# enabled. `results_output_path` is the JSON file that the next cell loads via
# codec_json_to_df.
results = run_codec_benchmarks(
    combination_codecs=CODEC_COMBINATIONS,
    standalone_codecs=CODEC_STANDALONES,
    audio_samples=audio_samples,
    results_output_path=OUTPUT_DIR / "codec_benchmarks.json",
    with_transcription=True
)

In [None]:
# Per-file benchmark rows plus derived columns. File size and CPU time are
# divided by the clip duration so clips of different length are comparable.
df = codec_json_to_df(OUTPUT_DIR / "codec_benchmarks.json").assign(
    label=lambda d: d["codec"] + " " + d["spec"],
    file_size_mb=lambda d: d["file_size"] / (1024 ** 2),
    cpu_time_per_sec=lambda d: d["cpu_time"] / d["duration"],
    mb_per_sec=lambda d: d["file_size_mb"] / d["duration"],
)

# Mean of each metric per (codec, spec), ordered by mean WER (lowest first).
summary = (
    df
    .groupby(["codec", "spec"], as_index=False)
    .agg(
        wer=("wer", "mean"),
        cpu_time_per_sec=("cpu_time_per_sec", "mean"),
        mb_per_sec=("mb_per_sec", "mean"),
    )
    .sort_values("wer")
    .assign(label=lambda d: d["codec"] + " " + d["spec"])
)

# Summary rows whose spec string mentions "8000"; highlighted separately in the plots.
sample_rate_8k = summary.loc[summary["spec"].str.contains("8000")]

In [None]:
# One entry per figure: the summary column to plot plus its axis label and title.
plot_specs = [
    {
        "y": "wer",
        "y_label": "Word Error Rate",
        "title": "Codec by Mean WER"
    },
    {
        "y": "mb_per_sec",
        "y_label": "Normalised File Size (MB)",
        "title": "Codec by Mean File Size / Sec"
    },
    {
        "y": "cpu_time_per_sec",
        "y_label": "Normalised CPU Time (Seconds)",
        "title": "Codec by Mean CPU Time / Sec"
    },
]

# Number of distinct file ids (NaN counted, matching len(unique())). Used to
# keep only the per-file hue entries when rebuilding the legend.
n_files = df["file_id"].nunique(dropna=False)

for spec in plot_specs:
    metric = spec["y"]
    plot_df = summary.sort_values(metric)

    # Every layer below is drawn on this explicit axes object instead of
    # relying on pyplot's implicit "current axes".
    fig, ax = plt.subplots(figsize=(15, 10))

    # Invisible layer (alpha=0) drawn first from the sorted means; intended to
    # put the x-axis categories in ascending order of the metric.
    sns.stripplot(
        data=plot_df,
        x="label",
        y=metric,
        alpha=0,
        ax=ax,
    )

    # Individual files, coloured by file id.
    sns.stripplot(
        data=df,
        x="label",
        y=metric,
        jitter=False,
        alpha=0.5,
        size=8,
        hue="file_id",
        ax=ax,
    )

    # Mean per codec/spec.
    sns.stripplot(
        data=plot_df,
        x="label",
        y=metric,
        jitter=False,
        color="lightgreen",
        edgecolor="green",
        linewidth=2,
        size=13,
        ax=ax,
    )

    # Means of the 8 kHz specs, drawn over the green markers to highlight them.
    sns.stripplot(
        data=sample_rate_8k,
        x="label",
        y=metric,
        jitter=False,
        color="pink",
        edgecolor="red",
        linewidth=2,
        size=13,
        alpha=1,
        ax=ax,
    )

    ax.set_xlabel("Specification")
    ax.set_ylabel(spec["y_label"])
    ax.set_title(spec["title"])
    plt.setp(ax.get_xticklabels(), rotation=30, ha="right")

    # The mean layers have no hue, so their legend entries are added by hand
    # after the per-file entries.
    mean_patch = mpatches.Patch(facecolor="lightgreen", label="Mean Value", edgecolor="green", linewidth=2)
    sample8k_patch = mpatches.Patch(facecolor="pink", label="8kHz Mean", edgecolor="red", linewidth=2)
    handles, labels = ax.get_legend_handles_labels()
    handles = handles[:n_files] + [mean_patch, sample8k_patch]
    labels = labels[:n_files] + ["Mean Value", "8kHz Mean"]

    # Anchor the legend's upper-right corner just left of the axes.
    ax.legend(handles=handles, labels=labels, bbox_to_anchor=(-0.07, 1), loc='upper right', title="File ID")

    fig.tight_layout()
    plt.show()

### Scaling and Plotting all 3 metrics

In [None]:
metrics = ["wer", "mb_per_sec", "cpu_time_per_sec"]

# Min-max scale each metric across the codec means, then flip it so that 1 is
# the best value and 0 the worst (for all three raw metrics, lower is better).
scaler = MinMaxScaler()
df_plot = summary[["label", *metrics]].copy()
scaled = scaler.fit_transform(df_plot[metrics])
df_plot[metrics] = 1 - scaled

# Human-readable metric names; these become the legend entries after melting.
display_names = {
    "wer": "WER",
    "mb_per_sec": "File Size / sec",
    "cpu_time_per_sec": "CPU Time / sec",
}
df_plot = df_plot.rename(columns=display_names)

# Long format: one row per (label, metric) pair.
long_df = df_plot.melt(id_vars="label", var_name="Metric", value_name="Score")

In [None]:
# Explicit figure/axes so every call below targets this plot rather than
# pyplot's implicit current axes.
fig, ax = plt.subplots(figsize=(15, 8))

# One marker per (codec spec, metric); linestyles="" leaves the markers unconnected.
sns.pointplot(
    data=long_df,
    x="label",
    y="Score",
    hue="Metric",
    markers="o",
    linestyles="",
    markersize=10,
    ax=ax,
)

# Faint vertical separators halfway between neighbouring category positions
# (and at both outer edges).
n_labels = long_df["label"].nunique()

for boundary in range(n_labels + 1):
    ax.axvline(
        x=boundary - 0.5,
        color="gray",
        linestyle="-",
        alpha=0.15,
        linewidth=1,
        zorder=0
    )

# Restyle the existing tick labels in place instead of re-setting their text
# through set_xticklabels(get_xticklabels()).
plt.setp(ax.get_xticklabels(), rotation=30, ha="right")

ax.set_ylabel("Scaled Performance (Higher = Better)")
ax.set_xlabel("Codec")
ax.set_title("Codec Comparison (Scaled Means)")

ax.grid(axis="y", alpha=0.3)
# Small margin around [0, 1] so markers at the extremes are not clipped.
ax.set_ylim(-0.05, 1.05)

# Anchor the legend's upper-right corner just left of the axes.
ax.legend(
    title="Metric",
    bbox_to_anchor=(-0.07, 1),
    loc="upper right",
    borderaxespad=0
)

fig.tight_layout()
plt.show()


## Benchmark longer files (transcription excluded)

Do file size and CPU time scale linearly with audio length? Benchmark on longer files to check. Exclude transcription to avoid transcribing lots of audio.

Manually update the RAW data folder (`/data/raw`) with longer audio samples if needed.

In [None]:
# Re-read RAW_DIR so that any samples placed there since the earlier run are
# picked up.
audio_samples = get_wav_txt_file_paths(RAW_DIR)

# Same codec settings as the first benchmark, but with transcription disabled
# and results written to a separate JSON file, which the next cell loads.
results = run_codec_benchmarks(
    combination_codecs=CODEC_COMBINATIONS,
    standalone_codecs=CODEC_STANDALONES,
    audio_samples=audio_samples,
    results_output_path=OUTPUT_DIR / "long_codec_benchmarks.json",
    with_transcription=False
)

In [None]:
# Per-file benchmark rows plus derived columns. File size and CPU time are
# divided by the clip duration so clips of different length are comparable.
df = codec_json_to_df(OUTPUT_DIR / "long_codec_benchmarks.json").assign(
    label=lambda d: d["codec"] + " " + d["spec"],
    file_size_mb=lambda d: d["file_size"] / (1024 ** 2),
    cpu_time_per_sec=lambda d: d["cpu_time"] / d["duration"],
    mb_per_sec=lambda d: d["file_size_mb"] / d["duration"],
)

# Mean of each metric per (codec, spec); there is no WER column in this run.
summary = (
    df
    .groupby(["codec", "spec"], as_index=False)
    .agg(
        cpu_time_per_sec=("cpu_time_per_sec", "mean"),
        mb_per_sec=("mb_per_sec", "mean"),
    )
    .assign(label=lambda d: d["codec"] + " " + d["spec"])
)

# Summary rows whose spec string mentions "8000"; highlighted separately in the plots.
sample_rate_8k = summary.loc[summary["spec"].str.contains("8000")]

In [None]:
# One entry per figure: the summary column to plot plus its axis label and title.
plot_specs = [
    {
        "y": "mb_per_sec",
        "y_label": "Normalised File Size (MB)",
        "title": "Codec by Mean File Size / Sec"
    },
    {
        "y": "cpu_time_per_sec",
        "y_label": "Normalised CPU Time (Seconds)",
        "title": "Codec by Mean CPU Time / Sec"
    },
]

# Number of distinct file ids (NaN counted, matching len(unique())). Used to
# keep only the per-file hue entries when rebuilding the legend.
n_files = df["file_id"].nunique(dropna=False)

for spec in plot_specs:
    metric = spec["y"]
    plot_df = summary.sort_values(metric)

    # Every layer below is drawn on this explicit axes object instead of
    # relying on pyplot's implicit "current axes".
    fig, ax = plt.subplots(figsize=(15, 10))

    # Invisible layer (alpha=0) drawn first from the sorted means; intended to
    # put the x-axis categories in ascending order of the metric.
    sns.stripplot(
        data=plot_df,
        x="label",
        y=metric,
        alpha=0,
        ax=ax,
    )

    # Individual files, coloured by file id.
    sns.stripplot(
        data=df,
        x="label",
        y=metric,
        jitter=False,
        alpha=0.5,
        size=8,
        hue="file_id",
        ax=ax,
    )

    # Mean per codec/spec.
    sns.stripplot(
        data=plot_df,
        x="label",
        y=metric,
        jitter=False,
        color="lightgreen",
        edgecolor="green",
        linewidth=2,
        size=13,
        ax=ax,
    )

    # Means of the 8 kHz specs, drawn over the green markers to highlight them.
    sns.stripplot(
        data=sample_rate_8k,
        x="label",
        y=metric,
        jitter=False,
        color="pink",
        edgecolor="red",
        linewidth=2,
        size=13,
        alpha=1,
        ax=ax,
    )

    ax.set_xlabel("Specification")
    ax.set_ylabel(spec["y_label"])
    ax.set_title(spec["title"])
    plt.setp(ax.get_xticklabels(), rotation=30, ha="right")

    # The mean layers have no hue, so their legend entries are added by hand
    # after the per-file entries.
    mean_patch = mpatches.Patch(facecolor="lightgreen", label="Mean Value", edgecolor="green", linewidth=2)
    sample8k_patch = mpatches.Patch(facecolor="pink", label="8kHz Mean", edgecolor="red", linewidth=2)
    handles, labels = ax.get_legend_handles_labels()
    handles = handles[:n_files] + [mean_patch, sample8k_patch]
    labels = labels[:n_files] + ["Mean Value", "8kHz Mean"]

    # Anchor the legend's upper-right corner just left of the axes.
    ax.legend(handles=handles, labels=labels, bbox_to_anchor=(-0.07, 1), loc='upper right', title="File ID")

    fig.tight_layout()
    plt.show()

## Benchmark conversion from Opus

Check whether converting from Opus (instead of WAV) has an impact on the non-transcription metrics (file size and CPU time).

In [None]:
audio_samples = get_wav_txt_file_paths(RAW_DIR)

# Convert each WAV sample to Opus; the converted files are the inputs for the
# Opus-source benchmark in the next cell.
output_path = PROCESSED_DIR / "opus_source"
output_path.mkdir(parents=True, exist_ok=True)

# Iterating the mapping yields its keys (the audio file paths).
for sample in audio_samples:
    # The return value is deliberately not bound: nothing below uses it, and
    # assigning to `_` would shadow IPython's last-output variable.
    ffmpeg_convert(
        sample,
        output_path / f"{sample.stem}.opus",
        ["-acodec", "libopus", "-b:a", "32k", "-application", "voip", "-ar", "16000", "-ac", "1"]
    )

In [None]:
# Build the sample mapping from the converted Opus files. Only regular
# "*.opus" files are used, so stray files in the folder are skipped, and the
# paths are sorted because directory listing order is arbitrary.
# Transcription is disabled, so the text-file value is an unused placeholder.
opus_audio_samples = {
    file: Path('')
    for file in sorted((PROCESSED_DIR / "opus_source").glob("*.opus"))
    if file.is_file()
}

# Same codec settings as the WAV benchmarks, without transcription; results go
# to a separate JSON file, which the next cell loads.
results = run_codec_benchmarks(
    combination_codecs=CODEC_COMBINATIONS,
    standalone_codecs=CODEC_STANDALONES,
    audio_samples=opus_audio_samples,
    results_output_path=OUTPUT_DIR / "opus_codec_benchmarks.json",
    with_transcription=False
)

In [None]:
# Per-file benchmark rows plus derived columns. File size and CPU time are
# divided by the clip duration so clips of different length are comparable.
df = codec_json_to_df(OUTPUT_DIR / "opus_codec_benchmarks.json").assign(
    label=lambda d: d["codec"] + " " + d["spec"],
    file_size_mb=lambda d: d["file_size"] / (1024 ** 2),
    cpu_time_per_sec=lambda d: d["cpu_time"] / d["duration"],
    mb_per_sec=lambda d: d["file_size_mb"] / d["duration"],
)

# Mean of each metric per (codec, spec); there is no WER column in this run.
summary = (
    df
    .groupby(["codec", "spec"], as_index=False)
    .agg(
        cpu_time_per_sec=("cpu_time_per_sec", "mean"),
        mb_per_sec=("mb_per_sec", "mean"),
    )
    .assign(label=lambda d: d["codec"] + " " + d["spec"])
)

# Summary rows whose spec string mentions "8000"; highlighted separately in the plots.
sample_rate_8k = summary.loc[summary["spec"].str.contains("8000")]

In [None]:
# One entry per figure: the summary column to plot plus its axis label and title.
plot_specs = [
    {
        "y": "mb_per_sec",
        "y_label": "Normalised File Size (MB)",
        "title": "Codec by Mean File Size / Sec"
    },
    {
        "y": "cpu_time_per_sec",
        "y_label": "Normalised CPU Time (Seconds)",
        "title": "Codec by Mean CPU Time / Sec"
    },
]

# Number of distinct file ids (NaN counted, matching len(unique())). Used to
# keep only the per-file hue entries when rebuilding the legend.
n_files = df["file_id"].nunique(dropna=False)

for spec in plot_specs:
    metric = spec["y"]
    plot_df = summary.sort_values(metric)

    # Every layer below is drawn on this explicit axes object instead of
    # relying on pyplot's implicit "current axes".
    fig, ax = plt.subplots(figsize=(15, 10))

    # Invisible layer (alpha=0) drawn first from the sorted means; intended to
    # put the x-axis categories in ascending order of the metric.
    sns.stripplot(
        data=plot_df,
        x="label",
        y=metric,
        alpha=0,
        ax=ax,
    )

    # Individual files, coloured by file id.
    sns.stripplot(
        data=df,
        x="label",
        y=metric,
        jitter=False,
        alpha=0.5,
        size=8,
        hue="file_id",
        ax=ax,
    )

    # Mean per codec/spec.
    sns.stripplot(
        data=plot_df,
        x="label",
        y=metric,
        jitter=False,
        color="lightgreen",
        edgecolor="green",
        linewidth=2,
        size=13,
        ax=ax,
    )

    # Means of the 8 kHz specs, drawn over the green markers to highlight them.
    sns.stripplot(
        data=sample_rate_8k,
        x="label",
        y=metric,
        jitter=False,
        color="pink",
        edgecolor="red",
        linewidth=2,
        size=13,
        alpha=1,
        ax=ax,
    )

    ax.set_xlabel("Specification")
    ax.set_ylabel(spec["y_label"])
    ax.set_title(spec["title"])
    plt.setp(ax.get_xticklabels(), rotation=30, ha="right")

    # The mean layers have no hue, so their legend entries are added by hand
    # after the per-file entries.
    mean_patch = mpatches.Patch(facecolor="lightgreen", label="Mean Value", edgecolor="green", linewidth=2)
    sample8k_patch = mpatches.Patch(facecolor="pink", label="8kHz Mean", edgecolor="red", linewidth=2)
    handles, labels = ax.get_legend_handles_labels()
    handles = handles[:n_files] + [mean_patch, sample8k_patch]
    labels = labels[:n_files] + ["Mean Value", "8kHz Mean"]

    # Anchor the legend's upper-right corner just left of the axes.
    ax.legend(handles=handles, labels=labels, bbox_to_anchor=(-0.07, 1), loc='upper right', title="File ID")

    fig.tight_layout()
    plt.show()