Merge pull request #5341 from roshansh-cmu/scorefix

Update Scoring for Speech Summarization from NLG-Eval to Huggingface Evaluate
espnet · Jul 22, 2023 · 79a74bc · 79a74bc
2 parents 2f87a2c + 14acebf
commit 79a74bc
Show file tree

Hide file tree

Showing 5 changed files with 29 additions and 40 deletions.
diff --git a/egs2/TEMPLATE/asr1/pyscripts/utils/score_summarization.py b/egs2/TEMPLATE/asr1/pyscripts/utils/score_summarization.py
@@ -1,9 +1,10 @@
-import os
+#!/usr/bin/env python3
+
+# Score summarization outputs using Hugging Face's evaluate library
 import sys
 
+import evaluate
 import numpy as np
-from datasets import load_metric
-from nlgeval import NLGEval, compute_metrics
 
 ref_file = sys.argv[1]
 hyp_file = sys.argv[2]
@@ -24,26 +25,24 @@
 labels = [ref_dict[k] for k, _ in hyp_dict.items()]
 decoded_preds = [v for k, v in hyp_dict.items()]
 
-metric = load_metric("bertscore")
-result_bert = metric.compute(
+
+summ_metrics = evaluate.combine(["rouge", "meteor"])
+
+bertscore_metric = evaluate.load("bertscore")
+
+
+result = summ_metrics.compute(
     predictions=decoded_preds,
     references=labels,
-    lang="en",
 )
 
-
-nlg = NLGEval()  # loads the models
-print("Key", "\t", "METEOR", "\t", "ROUGE-L")
-for key, ref, hyp in zip(keys, labels, decoded_preds):
-    metrics_dict = nlg.compute_individual_metrics([ref], hyp)
-    print(key, "\t", metrics_dict["METEOR"], "\t", metrics_dict["ROUGE_L"])
-refs = [[x] for x in labels]
-metrics_dict = nlg.compute_metrics(ref_list=[labels], hyp_list=decoded_preds)
-metric = load_metric("rouge")
-result = metric.compute(predictions=decoded_preds, references=labels)
-result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
-
-print(
-    f"RESULT {result['rouge1']} {result['rouge2']} {result['rougeL']} \
-    {metrics_dict['METEOR']*100.0} {100*np.mean(result_bert['precision'])}"
+bertscore_result = bertscore_metric.compute(
+    predictions=decoded_preds,
+    references=labels,
+    lang="en",
 )
+
+# Report every score as a percentage: ROUGE-1/2/L, METEOR, BERTScore precision.
+rouge = f"{result['rouge1']*100} {result['rouge2']*100} {result['rougeL']*100}"
+other = f"{result['meteor']*100} {np.mean(bertscore_result['precision'])*100}"
+print(f"RESULT {rouge} {other}")
diff --git a/egs2/how2_2000h/sum1/local/path.sh b/egs2/how2_2000h/sum1/local/path.sh
@@ -5,15 +5,3 @@ if ! python -c 'import longformer; import nlgeval; import datasets' > /dev/null;
     echo "Error: cd ${MAIN_ROOT}/tools && make longformer.done" >&2
     return 1
 fi
-if ! python -c 'import nlgeval' > /dev/null; then
-    echo "Error: it seems that nlgeval is not installed." >&2
-    echo "Error: please install nlgeval as follows." >&2
-    echo "Error: cd ${MAIN_ROOT}/tools && make longformer.done" >&2
-    return 1
-fi
-if ! python -c 'import datasets' > /dev/null; then
-    echo "Error: it seems that datasets is not installed." >&2
-    echo "Error: please install datasets as follows." >&2
-    echo "Error: cd ${MAIN_ROOT}/tools && make longformer.done" >&2
-    return 1
-fi
diff --git a/egs2/how2_2000h/sum1/local/score.sh b/egs2/how2_2000h/sum1/local/score.sh
@@ -2,7 +2,8 @@
 # Copyright 2021 Carnegie Mellon University (Author : Roshan Sharma)
 
 ## begin configuration section.
-data=data/dev5_test_sum
+ref_file=data/dev5_test_sum/text
+inference_tag=decode
 # end configuration section.
 
 
@@ -18,8 +19,10 @@ fi
 
 asr_expdir=$1
 
-name=$(basename ${data}) # e.g. dev5_test
-echo "${asr_expdir}/decode_*/${name}"
-for dir in ${asr_expdir}/decode_*/${name}; do
-    python pyscripts/utils/score_summarization.py $data/text $dir/text $(echo $dir | sed 's/exp//g') > $dir/result.sum
+for decode_dir in $(ls -d "${asr_expdir}"/*/ | grep "${inference_tag}"); do
+	for test_dir in $(ls -d "${decode_dir}"/*/); do
+		# Score one test set; the third argument is its path with every "exp" removed.
+		echo "${decode_dir} ${asr_expdir} ${test_dir} ${test_dir}"
+		python pyscripts/utils/score_summarization.py "${ref_file}" "${test_dir}/text" "$(echo "${test_dir}" | sed 's/exp//g')" > "${test_dir}/result.sum"
+	done
 done
diff --git a/setup.py b/setup.py
@@ -87,6 +87,7 @@
         "fairscale",
         "transformers",
         "gtn==0.0.0",
+        "evaluate",
     ],
     "setup": [
         "pytest-runner",

diff --git a/tools/installers/install_longformer.sh b/tools/installers/install_longformer.sh
@@ -53,8 +53,6 @@ else
 
     if $(pt_plus 1.8.0); then
         python -m pip install git+https://github.com/roshansh-cmu/longformer.git
-        python -m pip install datasets bert-score
-        python -m pip install git+https://github.com/Maluuba/nlg-eval.git@master
     else
         echo "[WARNING] Longformer requires pytorch>=1.8.*"
     fi