<a href="https://colab.research.google.com/github/dungdt-infopstats/TV-command-synthesis/blob/main/notebooks/DDSS_Evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: some of the Kaggle data sources used by this notebook are private.
# Run this cell first to log in, so that those data sources can be imported.
import kagglehub
kagglehub.login()


In [None]:
# IMPORTANT: run this cell to import the Kaggle data sources used by this
# notebook; it can be deleted afterwards.
# NOTE: this notebook environment differs from Kaggle's Python environment,
# so libraries used by the notebook may be missing.

tridungdo_whisper_text_raw_df_json_path = kagglehub.dataset_download('tridungdo/whisper-text-raw-df-json')
tridungdo_wer_cut_res_path = kagglehub.dataset_download('tridungdo/wer-cut-res')
tridungdo_wer_025_raw_path = kagglehub.dataset_download('tridungdo/wer-025-raw')

print('Data source import complete.')


# Old WER results (cleaning needed)

In [None]:
import os

import pandas as pd

# Prefer the directory reported by kagglehub.dataset_download so this cell
# also works outside Kaggle (e.g. on Colab). If the download cell was not run,
# fall back to the Kaggle input mount that was previously hard-coded.
try:
    wer_cut_res_dir = tridungdo_wer_cut_res_path
except NameError:
    wer_cut_res_dir = "/kaggle/input/wer-cut-res"

path = os.path.join(wer_cut_res_dir, "asr_evaluation_results.csv")

# Per-segment ASR evaluation rows; later cells read the columns
# reference_text, recognized_text, segment_index and id_x from this frame.
df = pd.read_csv(path)

In [None]:
import pandas as pd

import pandas as pd
from tqdm import tqdm

def get_transcribe(df, ref_col, reg_col, seg_col, id_col):
    """Rebuild one full transcript per id from its per-segment rows.

    Rows are grouped by ``id_col``. Within each group they are ordered by
    ``seg_col`` and the ``reg_col`` texts are joined with single spaces.
    Segments whose recognized text is missing (NaN/None) are skipped, so they
    do not inject a literal "nan" word into the joined transcript.

    Args:
        df: DataFrame holding one row per segment.
        ref_col: Name of the reference-text column. The value of the first
            row (after sorting) is used for the whole group.
        reg_col: Name of the recognized-text column to concatenate.
        seg_col: Name of the column that defines the segment order.
        id_col: Name of the column that identifies which rows belong together.

    Returns:
        A DataFrame with the columns ``id_col``, ``ref_col`` and ``reg_col``
        and one row per distinct id. The columns are present even when ``df``
        has no rows.
    """
    results = []

    # Group by id_col; tqdm is only used to show progress.
    for id_val, group in tqdm(df.groupby(id_col), desc="Processing IDs"):
        # Sort by seg_col so the segments are joined in the right order.
        # A stable sort keeps the incoming order for equal segment values.
        group_sorted = group.sort_values(by=seg_col, kind="stable")

        ref_value = group_sorted[ref_col].iloc[0]  # taking one value is enough
        # Drop missing texts before casting: astype(str) would otherwise turn
        # NaN into the string "nan".
        reg_joined = " ".join(group_sorted[reg_col].dropna().astype(str))

        results.append({
            id_col: id_val,
            ref_col: ref_value,
            reg_col: reg_joined,
        })

    # Passing the columns explicitly keeps the schema stable for empty input.
    return pd.DataFrame(results, columns=[id_col, ref_col, reg_col])


# Collapse the per-segment rows of `df` into one joined transcript per `id_x`.
transcribed_df = get_transcribe(
    df=df,
    ref_col="reference_text",
    reg_col="recognized_text",
    seg_col="segment_index",
    id_col="id_x",
)

In [None]:
# Keep only the id and its reference text, so that the ground truth can be
# merged onto other transcriptions by id.
df_info = transcribed_df[['id_x', 'reference_text']]  # ground-truth text at its true (full) length

# Raw transcription only

In [None]:
import json
import os

import pandas as pd

# Prefer the directory reported by kagglehub.dataset_download so this cell
# also works outside Kaggle (e.g. on Colab). If the download cell was not run,
# fall back to the Kaggle input mount that was previously hard-coded.
try:
    raw_dir = tridungdo_whisper_text_raw_df_json_path
except NameError:
    raw_dir = "/kaggle/input/whisper-text-raw-df-json"

raw_path = os.path.join(raw_dir, "whisper_text_raw_df.json")

# Read as UTF-8 explicitly instead of relying on the platform default encoding.
with open(raw_path, encoding="utf-8") as f:
    raw_data = json.load(f)

df_raw = pd.DataFrame(raw_data)

# Attach the reference text to each raw transcription by id (default inner
# join: ids without a match on both sides are dropped).
df_raw_final = pd.merge(df_raw, df_info, left_on="id", right_on="id_x")

# Use the same hypothesis column name as the other evaluation frames.
df_raw_final = df_raw_final.rename(columns={"full_text": "recognized_text"})

# Preprocessing

In [None]:
import pandas as pd
import re


def clean_text(text: str) -> str:
    """Remove punctuation from ``text`` and normalise its whitespace.

    Every character that is neither a regex word character nor whitespace is
    deleted. Each run of whitespace is then collapsed to a single space and
    leading/trailing whitespace is trimmed.
    """
    without_punctuation = re.sub(r"[^\w\s]", "", text)
    single_spaced = re.sub(r"\s+", " ", without_punctuation)
    return single_spaced.strip()

def preprocessing(df: pd.DataFrame, ref_col: str, reg_col: str) -> pd.DataFrame:
    """Fill and clean the reference and recognized text columns of ``df``.

    Missing values in both columns are replaced with empty strings, then
    ``clean_text`` is applied to every value. The columns are overwritten on
    the frame that was passed in, and that same frame is returned.
    """
    text_columns = (ref_col, reg_col)

    # First pass: replace missing values with empty strings.
    for column in text_columns:
        df[column] = df[column].fillna("")

    # Second pass: clean every value.
    for column in text_columns:
        df[column] = df[column].apply(clean_text)

    return df

In [None]:
################## COMPUTE_DF
# Frame that the preprocessing and WER cells below operate on.
# NOTE(review): this selects the per-segment `df`; `transcribed_df` and
# `df_raw_final` are built above but are not selected here — confirm which
# frame is intended before running the evaluation.

COMPUTE_DF = df

In [None]:
# Clean the reference and hypothesis text columns of the selected frame.
preprocessed_df = preprocessing(
    df=COMPUTE_DF,
    ref_col="reference_text",
    reg_col="recognized_text",
)

# WER

In [None]:
!pip install jiwer

In [None]:
import pandas as pd
import jiwer
from tqdm import tqdm

def compute_wer(df: pd.DataFrame, ref_col: str, hyp_col: str) -> pd.DataFrame:
    """
    Compute WER metrics for every row of the DataFrame.
    - ref_col: column holding the ground-truth text
    - hyp_col: column holding the predicted text

    Every attribute of the object returned by ``jiwer.process_words`` is
    written to a column named after that attribute, and the number of
    whitespace-separated words in the reference goes to ``words_count``.
    Columns are added to ``df`` itself, which is also the return value.
    """
    progress = tqdm(df.iterrows(), total=len(df), desc="Computing WER")

    for row_label, record in progress:
        ref_value, hyp_value = record[ref_col], record[hyp_col]
        reference = str(ref_value) if pd.notna(ref_value) else ""
        hypothesis = str(hyp_value) if pd.notna(hyp_value) else ""

        measures = vars(jiwer.process_words(reference=reference, hypothesis=hypothesis))

        # The reference word count is stored after the jiwer measures;
        # split() already yields 0 words for an empty or blank reference.
        cells = {**measures, "words_count": len(reference.split())}

        for column, value in cells.items():
            # Create the column the first time it is seen.
            if column not in df.columns:
                df[column] = None
            df.at[row_label, column] = value

    return df


In [None]:
# Add the per-row jiwer measures and reference word counts to the cleaned frame.
df_wer = compute_wer(
    df=preprocessed_df,
    ref_col="reference_text",
    hyp_col="recognized_text",
)

In [None]:
import numpy as np

def statistics(df_wer):
    """Aggregate per-row WER counts into corpus-level totals and rates.

    Args:
        df_wer: Frame with the columns ``words_count``, ``deletions``,
            ``insertions`` and ``substitutions``. NaN entries are ignored
            when summing.

    Returns:
        A dict with the totals ``N`` (reference words), ``D``, ``I``, ``S``
        and the rates ``D_rate``, ``I_rate``, ``S_rate`` and ``WER``, each
        being the matching total divided by ``N``. When ``N`` is 0 the rates
        are NaN instead of raising ``ZeroDivisionError``.
    """
    res = {
        "N": np.nansum(df_wer['words_count']),
        "D": np.nansum(df_wer['deletions']),
        "I": np.nansum(df_wer['insertions']),
        "S": np.nansum(df_wer['substitutions']),
    }
    total_words = res["N"]

    def _rate(count):
        # Object-dtype columns sum to a plain Python int, for which a zero
        # denominator would raise; report an undefined rate as NaN instead.
        return count / total_words if total_words else float("nan")

    res["D_rate"] = _rate(res["D"])
    res["I_rate"] = _rate(res["I"])
    res["S_rate"] = _rate(res["S"])
    res["WER"] = _rate(res["D"] + res["I"] + res["S"])

    return res

In [None]:
# NOTE: despite the `df_` prefix, this is the plain dict returned by statistics().
df_07 = statistics(df_wer)

In [None]:
# NOTE(review): df_025 is not assigned in any visible cell; presumably it is
# left over from an earlier run with a different COMPUTE_DF — TODO confirm.
df_025

In [None]:
# NOTE(review): df_raw is the DataFrame loaded from whisper_text_raw_df.json,
# not a statistics dict like df_07 — confirm this is what should be shown.
df_raw

In [None]:
# Show the aggregate statistics dict computed by statistics(df_wer).
df_07