In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the release data (a tab-separated file) into a DataFrame.
SUMMARY_TSV_PATH = '../data/release_data_summaries_expert.tsv'
output_df = pd.read_csv(SUMMARY_TSV_PATH, sep="\t")

In [None]:
# Display the first rows of the loaded frame as a quick sanity check.
output_df.head()

# Transcript Analysis


In [None]:
def _word_counts(column):
    """Return, per row of `output_df[column]`, the number of pieces produced by splitting on a single space."""
    return output_df[column].apply(lambda text: len(text.split(" "))).tolist()


# Per-row word counts for both annotators of each modality and for the source transcript.
text_summary1_lengths = _word_counts("Text_Summary1")
text_summary2_lengths = _word_counts("Text_Summary2")
audio_summary1_lengths = _word_counts("Audio_Summary1")
audio_summary2_lengths = _word_counts("Audio_Summary2")
asr_summary1_lengths = _word_counts("Whisper_Transcript_Summary1")
asr_summary2_lengths = _word_counts("Whisper_Transcript_Summary2")
transcript_lengths = _word_counts("transcript")

# Despite the `mean_` prefix, these are the pooled (concatenated) length lists of
# both annotators; the mean/std are taken only when printing.
mean_text_summary = text_summary1_lengths + text_summary2_lengths
mean_audio_summary = audio_summary1_lengths + audio_summary2_lengths
mean_asr_summary = asr_summary1_lengths + asr_summary2_lengths

# Compression ratio per row: average length of the two summaries divided by the transcript length.
audio_compression_ratio = [
    0.5*(first + second)/source
    for first, second, source in zip(audio_summary1_lengths, audio_summary2_lengths, transcript_lengths)
]
text_compression_ratio = [
    0.5*(first + second)/source
    for first, second, source in zip(text_summary1_lengths, text_summary2_lengths, transcript_lengths)
]
asr_compression_ratio = [
    0.5*(first + second)/source
    for first, second, source in zip(asr_summary1_lengths, asr_summary2_lengths, transcript_lengths)
]

# NOTE(review): get_pvalues is not defined in the visible part of this notebook;
# it must exist in the kernel before this cell runs.
# Fix: the audio std slot previously printed np.std(mean_asr_summary).
print(f"SLENGTH {np.mean(mean_text_summary)} {np.std(mean_text_summary)} {np.mean(mean_audio_summary)} {np.std(mean_audio_summary)} {np.mean(mean_asr_summary)} {np.std(mean_asr_summary)} {get_pvalues(mean_text_summary,mean_audio_summary)} {get_pvalues(mean_text_summary,mean_asr_summary)} {get_pvalues(mean_asr_summary,mean_audio_summary)} ")

# NOTE(review): unlike the other report lines, this one lists audio before text and
# omits the ASR mean/std even though ASR p-values are printed — confirm this is intended.
print(f"CRATIO {np.mean(audio_compression_ratio)} {np.std(audio_compression_ratio)} {np.mean(text_compression_ratio)} {np.std(text_compression_ratio)} {get_pvalues(audio_compression_ratio,text_compression_ratio)} {get_pvalues(asr_compression_ratio,text_compression_ratio)} {get_pvalues(asr_compression_ratio,audio_compression_ratio)}")




In [None]:
import spacy
 
# Load the spaCy English pipeline "en_core_web_sm"; get_entities reads the
# entities (`.ents`) this pipeline produces.
nlp = spacy.load("en_core_web_sm")



def get_entities(text):
    """Run the module-level spaCy pipeline `nlp` on `text` and return the text of every entity in `.ents`."""
    parsed = nlp(text)
    entity_texts = []
    for entity in parsed.ents:
        entity_texts.append(entity.text)
    return entity_texts

# Add one "<...>_Entities" column per text column, holding the entity strings
# that get_entities extracts from that column's text.
for _text_column, _entity_column in (
    ("Text_Summary1", "Text_Summary1_Entities"),
    ("Text_Summary2", "Text_Summary2_Entities"),
    ("Audio_Summary1", "Audio_Summary1_Entities"),
    ("Audio_Summary2", "Audio_Summary2_Entities"),
    ("Whisper_Transcript_Summary1", "Whisper_Transcript_Summary1_Entities"),
    ("Whisper_Transcript_Summary2", "Whisper_Transcript_Summary2_Entities"),
    ("transcript", "Transcript_Entities"),
):
    output_df[_entity_column] = output_df[_text_column].apply(lambda text: get_entities(text))

def process_entity_f1(ref_list,hyp_list):
    """Return the F1 overlap between two lists of entity strings.

    Both lists are lower-cased and de-duplicated before comparison.
    Precision is measured against the hypothesis set and recall against the
    reference set; each is 0 when its set is empty, and the result is 0 when
    precision + recall is 0.
    """
    ref_set = {entity.lower() for entity in ref_list}
    hyp_set = {entity.lower() for entity in hyp_list}
    shared = len(ref_set & hyp_set)

    precision = shared/len(hyp_set) if hyp_set else 0
    recall = shared/len(ref_set) if ref_set else 0

    if precision + recall > 0:
        return 2*precision*recall/(precision+recall)
    return 0


In [None]:
def _entity_f1_vs_transcript(entity_column):
    """Row-wise entity F1 with the transcript's entities as reference and `entity_column` as hypothesis."""
    return output_df.apply(
        lambda row: process_entity_f1(row["Transcript_Entities"], row[entity_column]),
        axis=1,
    ).tolist()


text_summary1_entity_f1 = _entity_f1_vs_transcript("Text_Summary1_Entities")
text_summary2_entity_f1 = _entity_f1_vs_transcript("Text_Summary2_Entities")
audio_summary1_entity_f1 = _entity_f1_vs_transcript("Audio_Summary1_Entities")
audio_summary2_entity_f1 = _entity_f1_vs_transcript("Audio_Summary2_Entities")
asr_summary1_entity_f1 = _entity_f1_vs_transcript("Whisper_Transcript_Summary1_Entities")
asr_summary2_entity_f1 = _entity_f1_vs_transcript("Whisper_Transcript_Summary2_Entities")

# Pool both annotators' scores for each modality.
text_ents = text_summary1_entity_f1 + text_summary2_entity_f1
audio_ents = audio_summary1_entity_f1 + audio_summary2_entity_f1
asr_ents = asr_summary1_entity_f1 + asr_summary2_entity_f1

# Report mean/std per modality followed by the three pairwise get_pvalues results.
print(f"ENTITIES {np.mean(text_ents)} {np.std(text_ents)} {np.mean(audio_ents)} {np.std(audio_ents)} {np.mean(asr_ents)} {np.std(asr_ents)} {get_pvalues(text_ents,audio_ents)} {get_pvalues(text_ents,asr_ents)} {get_pvalues(asr_ents,audio_ents)}")







In [None]:
import evaluate 
# Load the 'rouge' metric through `evaluate`; compute_rouge calls its .compute().
rouge = evaluate.load('rouge')

def compute_rouge(hyp,ref):
    """Score a single hypothesis against a single reference and return the "rougeL" entry of the result."""
    return rouge.compute(predictions=[hyp],references=[ref])["rougeL"]

def _rouge_vs_transcript(summary_column):
    """Row-wise compute_rouge with `summary_column` as hypothesis and the transcript as reference."""
    return output_df.apply(
        lambda row: compute_rouge(row[summary_column], row["transcript"]),
        axis=1,
    ).tolist()


text_summary1_transcript_rouge = _rouge_vs_transcript("Text_Summary1")
text_summary2_transcript_rouge = _rouge_vs_transcript("Text_Summary2")
audio_summary1_transcript_rouge = _rouge_vs_transcript("Audio_Summary1")
audio_summary2_transcript_rouge = _rouge_vs_transcript("Audio_Summary2")
asr_summary1_transcript_rouge = _rouge_vs_transcript("Whisper_Transcript_Summary1")
asr_summary2_transcript_rouge = _rouge_vs_transcript("Whisper_Transcript_Summary2")

# Pool both annotators' scores for each modality.
text_rouge = text_summary1_transcript_rouge + text_summary2_transcript_rouge
audio_rouge = audio_summary1_transcript_rouge + audio_summary2_transcript_rouge
asr_rouge = asr_summary1_transcript_rouge + asr_summary2_transcript_rouge

# Report mean/std per modality followed by the three pairwise get_pvalues results.
print(f"ROUGE {np.mean(text_rouge)} {np.std(text_rouge)} {np.mean(audio_rouge)} {np.std(audio_rouge)} {np.mean(asr_rouge)} {np.std(asr_rouge)} {get_pvalues(text_rouge,audio_rouge)} {get_pvalues(text_rouge,asr_rouge)} {get_pvalues(asr_rouge,audio_rouge)}")

In [None]:
import evaluate 


# Load the 'meteor' metric through `evaluate`; compute_meteor calls its .compute().
meteor = evaluate.load('meteor')

def compute_meteor(hyp,ref):
    """Lower-case both strings, score the hypothesis against the reference, and return the "meteor" entry."""
    prediction = hyp.lower()
    reference = ref.lower()
    scores = meteor.compute(predictions=[prediction],references=[reference])
    return scores["meteor"]

def _meteor_vs_transcript(summary_column):
    """Row-wise compute_meteor with `summary_column` as hypothesis and the transcript as reference."""
    return output_df.apply(
        lambda row: compute_meteor(row[summary_column], row["transcript"]),
        axis=1,
    ).tolist()


text_summary1_transcript_meteor = _meteor_vs_transcript("Text_Summary1")
text_summary2_transcript_meteor = _meteor_vs_transcript("Text_Summary2")
audio_summary1_transcript_meteor = _meteor_vs_transcript("Audio_Summary1")
audio_summary2_transcript_meteor = _meteor_vs_transcript("Audio_Summary2")
asr_summary1_transcript_meteor = _meteor_vs_transcript("Whisper_Transcript_Summary1")
asr_summary2_transcript_meteor = _meteor_vs_transcript("Whisper_Transcript_Summary2")

# Pool both annotators' scores for each modality.
text_meteor = text_summary1_transcript_meteor + text_summary2_transcript_meteor
audio_meteor = audio_summary1_transcript_meteor + audio_summary2_transcript_meteor
asr_meteor = asr_summary1_transcript_meteor + asr_summary2_transcript_meteor

# Report mean/std per modality followed by the three pairwise get_pvalues results.
print(f"METEOR {np.mean(text_meteor)} {np.std(text_meteor)} {np.mean(audio_meteor)} {np.std(audio_meteor)} {np.mean(asr_meteor)} {np.std(asr_meteor)} {get_pvalues(text_meteor,audio_meteor)} {get_pvalues(text_meteor,asr_meteor)} {get_pvalues(asr_meteor,audio_meteor)}")



In [None]:
from evaluate import load
# Load the "bertscore" metric through `evaluate.load`; compute_bertscore calls its .compute().
bertscore = load("bertscore")

def compute_bertscore(hyp,ref):
    """Score one hypothesis against one reference (lang="en") and return the first value of the "f1" list."""
    scores = bertscore.compute(predictions=[hyp],references=[ref],lang="en")
    f1_values = scores["f1"]
    return f1_values[0]

def _bertscore_vs_transcript(summary_column):
    """Row-wise compute_bertscore with `summary_column` as hypothesis and the transcript as reference."""
    return output_df.apply(
        lambda row: compute_bertscore(row[summary_column], row["transcript"]),
        axis=1,
    ).tolist()


text_summary1_transcript_bertscore = _bertscore_vs_transcript("Text_Summary1")
text_summary2_transcript_bertscore = _bertscore_vs_transcript("Text_Summary2")
audio_summary1_transcript_bertscore = _bertscore_vs_transcript("Audio_Summary1")
audio_summary2_transcript_bertscore = _bertscore_vs_transcript("Audio_Summary2")
asr_summary1_transcript_bertscore = _bertscore_vs_transcript("Whisper_Transcript_Summary1")
asr_summary2_transcript_bertscore = _bertscore_vs_transcript("Whisper_Transcript_Summary2")

# Pool both annotators' scores for each modality.
text_bertscore = text_summary1_transcript_bertscore + text_summary2_transcript_bertscore
audio_bertscore = audio_summary1_transcript_bertscore + audio_summary2_transcript_bertscore
asr_bertscore = asr_summary1_transcript_bertscore + asr_summary2_transcript_bertscore

# Report mean/std per modality followed by the three pairwise get_pvalues results.
print(f"BERTSCORE {np.mean(text_bertscore)} {np.std(text_bertscore)} {np.mean(audio_bertscore)} {np.std(audio_bertscore)} {np.mean(asr_bertscore)} {np.std(asr_bertscore)} {get_pvalues(text_bertscore,audio_bertscore)} {get_pvalues(text_bertscore,asr_bertscore)} {get_pvalues(asr_bertscore,audio_bertscore)}")



In [None]:
def _bertscore_between(hyp_column, ref_column):
    """Row-wise compute_bertscore with `hyp_column` as hypothesis and `ref_column` as reference."""
    return output_df.apply(
        lambda row: compute_bertscore(row[hyp_column], row[ref_column]),
        axis=1,
    ).tolist()


# Inter-annotator BERTScore: each annotator's summary is scored against the other
# annotator's summary of the same modality, in both directions.
# NOTE: these assignments rebind the `*_transcript_bertscore` names, which the
# summary-vs-transcript BERTScore cell also assigns.
text_summary1_transcript_bertscore = _bertscore_between("Text_Summary1", "Text_Summary2")
text_summary2_transcript_bertscore = _bertscore_between("Text_Summary2", "Text_Summary1")
audio_summary1_transcript_bertscore = _bertscore_between("Audio_Summary1", "Audio_Summary2")
audio_summary2_transcript_bertscore = _bertscore_between("Audio_Summary2", "Audio_Summary1")
asr_summary1_transcript_bertscore = _bertscore_between("Whisper_Transcript_Summary1", "Whisper_Transcript_Summary2")
asr_summary2_transcript_bertscore = _bertscore_between("Whisper_Transcript_Summary2", "Whisper_Transcript_Summary1")

# Report mean/std of the pooled scores per modality, then the three pairwise get_pvalues results.
print(f"BSINT {np.mean(text_summary1_transcript_bertscore+text_summary2_transcript_bertscore)} {np.std(text_summary1_transcript_bertscore+text_summary2_transcript_bertscore)} {np.mean(audio_summary1_transcript_bertscore+audio_summary2_transcript_bertscore)} {np.std(audio_summary1_transcript_bertscore+audio_summary2_transcript_bertscore)} {np.mean(asr_summary1_transcript_bertscore+asr_summary2_transcript_bertscore)} {np.std(asr_summary1_transcript_bertscore+asr_summary2_transcript_bertscore)} {get_pvalues(text_summary1_transcript_bertscore+text_summary2_transcript_bertscore,audio_summary1_transcript_bertscore+audio_summary2_transcript_bertscore)} {get_pvalues(text_summary1_transcript_bertscore+text_summary2_transcript_bertscore,asr_summary1_transcript_bertscore+asr_summary2_transcript_bertscore)} {get_pvalues(asr_summary1_transcript_bertscore+asr_summary2_transcript_bertscore,audio_summary1_transcript_bertscore+audio_summary2_transcript_bertscore)}")


In [None]:
# Pull each summary column out of the frame as a plain Python list.
text_summary1, text_summary2 = (
    list(output_df[column]) for column in ("Text_Summary1", "Text_Summary2")
)
audio_summary1, audio_summary2 = (
    list(output_df[column]) for column in ("Audio_Summary1", "Audio_Summary2")
)
asr_summary1, asr_summary2 = (
    list(output_df[column])
    for column in ("Whisper_Transcript_Summary1", "Whisper_Transcript_Summary2")
)

# Source transcripts and the "ref" column, in the same row order.
src = list(output_df["transcript"])
ref = list(output_df["ref"])

# Sanity check: print the length of every list.
print(*map(len, (text_summary1, text_summary2, audio_summary1, audio_summary2, src, ref, asr_summary1, asr_summary2)))

# UniEval

In [None]:
%cd UniEval

In [None]:
from utils import convert_to_json
from metric.evaluator import get_evaluator

# Build the UniEval evaluator for the 'summarization' task; compute_unieval_scores
# reads this module-level `evaluator`.
task = 'summarization'

evaluator = get_evaluator(task)


def compute_unieval_scores(hyp_list,ref_list,src_list):
    """Evaluate `hyp_list` with the module-level `evaluator` on three dimensions.

    The inputs are packed with convert_to_json and scored without the overall
    score and without printing. Returns a 3-tuple of per-example score lists:
    (coherence, consistency, fluency).
    """
    dimensions = ['coherence', 'consistency', 'fluency']
    payload = convert_to_json(src_list=src_list, ref_list=ref_list,output_list=hyp_list)
    per_example = evaluator.evaluate(
        payload,
        dims=dimensions,
        overall=False,
        print_result=False,
    )
    # One list per dimension, in the order of `dimensions`.
    return tuple([entry[name] for entry in per_example] for name in dimensions)



# Score each annotator's summaries (ts = text, ss = audio, as = ASR) against the
# "ref" list with the transcripts as source.
ts1_coh, ts1_con, ts1_flu = compute_unieval_scores(hyp_list=text_summary1, ref_list=ref, src_list=src)
ts2_coh, ts2_con, ts2_flu = compute_unieval_scores(hyp_list=text_summary2, ref_list=ref, src_list=src)
ss1_coh, ss1_con, ss1_flu = compute_unieval_scores(hyp_list=audio_summary1, ref_list=ref, src_list=src)
ss2_coh, ss2_con, ss2_flu = compute_unieval_scores(hyp_list=audio_summary2, ref_list=ref, src_list=src)
as1_coh, as1_con, as1_flu = compute_unieval_scores(hyp_list=asr_summary1, ref_list=ref, src_list=src)
as2_coh, as2_con, as2_flu = compute_unieval_scores(hyp_list=asr_summary2, ref_list=ref, src_list=src)

# Pool both annotators' scores per modality and dimension.
ts_coh, ss_coh, as_coh = ts1_coh + ts2_coh, ss1_coh + ss2_coh, as1_coh + as2_coh
ts_con, ss_con, as_con = ts1_con + ts2_con, ss1_con + ss2_con, as1_con + as2_con
ts_flu, ss_flu, as_flu = ts1_flu + ts2_flu, ss1_flu + ss2_flu, as1_flu + as2_flu

# One report line per dimension: mean/std per modality, then pairwise get_pvalues results.
print(f"UEVALCOH {np.mean(ts_coh)} {np.std(ts_coh)} {np.mean(ss_coh)} {np.std(ss_coh)} {np.mean(as_coh)} {np.std(as_coh)} {get_pvalues(ts_coh,ss_coh)} {get_pvalues(ts_coh,as_coh)} {get_pvalues(as_coh,ss_coh)}")
print(f"UEVALCON {np.mean(ts_con)} {np.std(ts_con)} {np.mean(ss_con)} {np.std(ss_con)} {np.mean(as_con)} {np.std(as_con)} {get_pvalues(ts_con,ss_con)} {get_pvalues(ts_con,as_con)} {get_pvalues(as_con,ss_con)}")
print(f"UEVALFLU {np.mean(ts_flu)} {np.std(ts_flu)} {np.mean(ss_flu)} {np.std(ss_flu)} {np.mean(as_flu)} {np.std(as_flu)} {get_pvalues(ts_flu,ss_flu)} {get_pvalues(ts_flu,as_flu)} {get_pvalues(as_flu,ss_flu)}")

In [None]:
# Rebuild the 'summarization' evaluator (the same get_evaluator call also appears
# in the earlier UniEval cell).
task = 'summarization'

evaluator = get_evaluator(task)


def compute_unieval_scores(hyp_list,ref_list,src_list):
    """Evaluate `hyp_list` with the module-level `evaluator` on the 'relevance' dimension only.

    Rebinds the name of the earlier three-dimension helper. Returns the list of
    per-example "relevance" scores; nothing is printed by the evaluator.
    """
    payload = convert_to_json(src_list=src_list, ref_list=ref_list,output_list=hyp_list)
    per_example = evaluator.evaluate(
        payload,
        dims=['relevance'],
        overall=False,
        print_result=False,
    )
    relevance = []
    for entry in per_example:
        relevance.append(entry["relevance"])
    return relevance



# Inter-annotator relevance: each annotator's summary is the hypothesis and the
# other annotator's summary of the same modality is the reference.
ts1_rel = compute_unieval_scores(hyp_list=text_summary1, ref_list=text_summary2, src_list=src)
ts2_rel = compute_unieval_scores(hyp_list=text_summary2, ref_list=text_summary1, src_list=src)
ss1_rel = compute_unieval_scores(hyp_list=audio_summary1, ref_list=audio_summary2, src_list=src)
ss2_rel = compute_unieval_scores(hyp_list=audio_summary2, ref_list=audio_summary1, src_list=src)
as1_rel = compute_unieval_scores(hyp_list=asr_summary1, ref_list=asr_summary2, src_list=src)
as2_rel = compute_unieval_scores(hyp_list=asr_summary2, ref_list=asr_summary1, src_list=src)

# Pool both directions per modality.
ts_rel, ss_rel, as_rel = ts1_rel + ts2_rel, ss1_rel + ss2_rel, as1_rel + as2_rel

# Report mean/std per modality followed by the three pairwise get_pvalues results.
print(f"REL {np.mean(ts_rel)} {np.std(ts_rel)} {np.mean(ss_rel)} {np.std(ss_rel)} {np.mean(as_rel)} {np.std(as_rel)} {get_pvalues(ts_rel,ss_rel)} {get_pvalues(ts_rel,as_rel)} {get_pvalues(as_rel,ss_rel)}")

In [None]:
# Rebuild the 'summarization' evaluator (the same get_evaluator call also appears
# in the earlier UniEval cells).
task = 'summarization'

evaluator = get_evaluator(task)


def compute_unieval_scores(hyp_list,ref_list,src_list):
    """Evaluate `hyp_list` with the module-level `evaluator` on the 'relevance' dimension only.

    Same inputs and return value as the earlier relevance helper of this name,
    but the evaluator is called with print_result=True. Returns the list of
    per-example "relevance" scores.
    """
    payload = convert_to_json(src_list=src_list, ref_list=ref_list,output_list=hyp_list)
    per_example = evaluator.evaluate(
        payload,
        dims=['relevance'],
        overall=False,
        print_result=True,
    )
    relevance = []
    for entry in per_example:
        relevance.append(entry["relevance"])
    return relevance



# Cross-modality relevance. Every call is
# compute_unieval_scores(hypothesis_list, reference_list, src).
# Name pattern `<pair><i><j>_rel`: <i> is the annotator number of the hypothesis,
# <j> the annotator number of the reference.

# ts*: text summaries as hypothesis, audio summaries as reference.
ts12_rel =  compute_unieval_scores(text_summary1,audio_summary2,src)
ts11_rel =  compute_unieval_scores(text_summary1,audio_summary1,src)

ts22_rel =  compute_unieval_scores(text_summary2,audio_summary2,src)
ts21_rel = compute_unieval_scores(text_summary2,audio_summary1,src)

# ta*: text summaries as hypothesis, ASR-transcript summaries as reference.
ta12_rel =  compute_unieval_scores(text_summary1,asr_summary2,src)
ta11_rel =  compute_unieval_scores(text_summary1,asr_summary1,src)

ta22_rel =  compute_unieval_scores(text_summary2,asr_summary2,src)
ta21_rel = compute_unieval_scores(text_summary2,asr_summary1,src)


# st*: audio summaries as hypothesis, text summaries as reference.
st12_rel =  compute_unieval_scores(audio_summary1,text_summary2,src)
st11_rel = compute_unieval_scores(audio_summary1,text_summary1,src)

st22_rel = compute_unieval_scores(audio_summary2,text_summary2,src)
st21_rel = compute_unieval_scores(audio_summary2,text_summary1,src)

# sa*: audio summaries as hypothesis, ASR-transcript summaries as reference.
sa11_rel =  compute_unieval_scores(audio_summary1,asr_summary1,src)
sa12_rel =  compute_unieval_scores(audio_summary1,asr_summary2,src)

sa21_rel =  compute_unieval_scores(audio_summary2,asr_summary1,src)
sa22_rel = compute_unieval_scores(audio_summary2,asr_summary2,src)

# as*: ASR-transcript summaries as hypothesis, text summaries as reference.
as12_rel =  compute_unieval_scores(asr_summary1,text_summary2,src)
as11_rel =  compute_unieval_scores(asr_summary1,text_summary1,src)

as22_rel =  compute_unieval_scores(asr_summary2,text_summary2,src)
as21_rel = compute_unieval_scores(asr_summary2,text_summary1,src)

# at*: ASR-transcript summaries as hypothesis, audio summaries as reference.
at12_rel =  compute_unieval_scores(asr_summary1,audio_summary2,src)
at11_rel =  compute_unieval_scores(asr_summary1,audio_summary1,src)

at22_rel =  compute_unieval_scores(asr_summary2,audio_summary2,src)
at21_rel = compute_unieval_scores(asr_summary2,audio_summary1,src)

# Pool by hypothesis modality: all scores where text / audio / ASR summaries were the hypothesis.
text_all = ts12_rel + ts11_rel + ts21_rel + ts22_rel + ta12_rel + ta11_rel + ta21_rel + ta22_rel
audio_all = st12_rel + st11_rel + st21_rel + st22_rel + sa11_rel + sa12_rel + sa21_rel + sa22_rel
asr_all = as12_rel + as11_rel + as21_rel + as22_rel + at12_rel + at11_rel + at21_rel + at22_rel


# Report mean/std per pooled list followed by the three pairwise get_pvalues results.
print(f"RELPAIR {np.mean(text_all)} {np.std(text_all)} {np.mean(audio_all)} {np.std(audio_all)} {np.mean(asr_all)} {np.std(asr_all)} {get_pvalues(text_all,audio_all)} {get_pvalues(text_all,asr_all)} {get_pvalues(asr_all,audio_all)}")

In [None]:
# Rebuild the 'summarization' evaluator (the same get_evaluator call also appears
# in the earlier UniEval cells).
task = 'summarization'

evaluator = get_evaluator(task)


def compute_unieval_scores(hyp_list,ref_list,src_list):
    """Evaluate `hyp_list` with the module-level `evaluator` on the 'consistency' dimension only.

    Rebinds the name used by the earlier relevance helpers. The evaluator is
    called with print_result=True. Returns the list of per-example
    "consistency" scores.
    """
    payload = convert_to_json(src_list=src_list, ref_list=ref_list,output_list=hyp_list)
    per_example = evaluator.evaluate(
        payload,
        dims=['consistency'],
        overall=False,
        print_result=True,
    )
    consistency = []
    for entry in per_example:
        consistency.append(entry["consistency"])
    return consistency



# Cross-modality consistency. Every call is
# compute_unieval_scores(hypothesis_list, reference_list, src).
# NOTE: the variables keep the `_rel` suffix, but the compute_unieval_scores
# defined just above this cell returns "consistency" scores; these assignments
# also rebind the names used by the pairwise relevance cell.
# Name pattern `<pair><i><j>_rel`: <i> is the annotator number of the hypothesis,
# <j> the annotator number of the reference.

# ts*: text summaries as hypothesis, audio summaries as reference.
ts12_rel =  compute_unieval_scores(text_summary1,audio_summary2,src)
ts11_rel =  compute_unieval_scores(text_summary1,audio_summary1,src)

ts22_rel =  compute_unieval_scores(text_summary2,audio_summary2,src)
ts21_rel = compute_unieval_scores(text_summary2,audio_summary1,src)

# ta*: text summaries as hypothesis, ASR-transcript summaries as reference.
ta12_rel =  compute_unieval_scores(text_summary1,asr_summary2,src)
ta11_rel =  compute_unieval_scores(text_summary1,asr_summary1,src)

ta22_rel =  compute_unieval_scores(text_summary2,asr_summary2,src)
ta21_rel = compute_unieval_scores(text_summary2,asr_summary1,src)


# st*: audio summaries as hypothesis, text summaries as reference.
st12_rel =  compute_unieval_scores(audio_summary1,text_summary2,src)
st11_rel = compute_unieval_scores(audio_summary1,text_summary1,src)

st22_rel = compute_unieval_scores(audio_summary2,text_summary2,src)
st21_rel = compute_unieval_scores(audio_summary2,text_summary1,src)

# sa*: audio summaries as hypothesis, ASR-transcript summaries as reference.
sa11_rel =  compute_unieval_scores(audio_summary1,asr_summary1,src)
sa12_rel =  compute_unieval_scores(audio_summary1,asr_summary2,src)

sa21_rel =  compute_unieval_scores(audio_summary2,asr_summary1,src)
sa22_rel = compute_unieval_scores(audio_summary2,asr_summary2,src)

# as*: ASR-transcript summaries as hypothesis, text summaries as reference.
as12_rel =  compute_unieval_scores(asr_summary1,text_summary2,src)
as11_rel =  compute_unieval_scores(asr_summary1,text_summary1,src)

as22_rel =  compute_unieval_scores(asr_summary2,text_summary2,src)
as21_rel = compute_unieval_scores(asr_summary2,text_summary1,src)

# at*: ASR-transcript summaries as hypothesis, audio summaries as reference.
at12_rel =  compute_unieval_scores(asr_summary1,audio_summary2,src)
at11_rel =  compute_unieval_scores(asr_summary1,audio_summary1,src)

at22_rel =  compute_unieval_scores(asr_summary2,audio_summary2,src)
at21_rel = compute_unieval_scores(asr_summary2,audio_summary1,src)

# Pool by hypothesis modality: all scores where text / audio / ASR summaries were the hypothesis.
text_all = ts12_rel + ts11_rel + ts21_rel + ts22_rel + ta12_rel + ta11_rel + ta21_rel + ta22_rel
audio_all = st12_rel + st11_rel + st21_rel + st22_rel + sa11_rel + sa12_rel + sa21_rel + sa22_rel
asr_all = as12_rel + as11_rel + as21_rel + as22_rel + at12_rel + at11_rel + at21_rel + at22_rel


# Report mean/std per pooled list followed by the three pairwise get_pvalues results.
print(f"CONPAIR {np.mean(text_all)} {np.std(text_all)} {np.mean(audio_all)} {np.std(audio_all)} {np.mean(asr_all)} {np.std(asr_all)} {get_pvalues(text_all,audio_all)} {get_pvalues(text_all,asr_all)} {get_pvalues(asr_all,audio_all)}")

In [None]:
%cd ../

# BARTScore

In [None]:
%cd BARTScore

In [None]:
from bart_score import BARTScorer
# Instantiate the BARTScore scorer on device 'cuda:0' with the
# 'facebook/bart-large-cnn' checkpoint; every BARTScore cell uses this object.
bart_scorer = BARTScorer(device='cuda:0', checkpoint='facebook/bart-large-cnn')

def _bart_score_with_transcript(summaries):
    """Call bart_scorer.score with `summaries` as first argument and the transcripts `src` as second."""
    return bart_scorer.score(summaries, src, batch_size=4)


# NOTE(review): the summary is passed first and the transcript second; the meaning
# of BARTScorer.score's argument order is not visible here — confirm the direction.
text1_scores = _bart_score_with_transcript(text_summary1)
text2_scores = _bart_score_with_transcript(text_summary2)
audio1_scores = _bart_score_with_transcript(audio_summary1)
audio2_scores = _bart_score_with_transcript(audio_summary2)
asr1_scores = _bart_score_with_transcript(asr_summary1)
asr2_scores = _bart_score_with_transcript(asr_summary2)

# Pool both annotators' scores for each modality.
text_score = text1_scores + text2_scores
audio_score = audio1_scores + audio2_scores
asr_score = asr1_scores + asr2_scores

# Report mean/std per modality followed by the three pairwise get_pvalues results.
print(f"BARTSCORE {np.mean(text_score)} {np.std(text_score)} {np.mean(audio_score)} {np.std(audio_score)} {np.mean(asr_score)} {np.std(asr_score)} {get_pvalues(text_score,audio_score)} {get_pvalues(text_score,asr_score)} {get_pvalues(asr_score,audio_score)}")

In [None]:
def get_interannotator_bart_score(hyp,ref):
    """Score two summary lists against each other in both directions with `bart_scorer`.

    Returns (mean, std, scores), where `scores` is the hyp->ref result followed
    by the ref->hyp result.
    """
    forward = bart_scorer.score(hyp,ref,batch_size=4)
    backward = bart_scorer.score(ref,hyp,batch_size=4)
    combined = forward + backward
    return np.mean(combined), np.std(combined), combined

# Inter-annotator BARTScore per modality: annotator 1 vs annotator 2, both directions.
text_interannotator_bartscore, text_std, text_sc = get_interannotator_bart_score(
    hyp=text_summary1, ref=text_summary2
)
audio_interannotator_bartscore, audio_std, audio_sc = get_interannotator_bart_score(
    hyp=audio_summary1, ref=audio_summary2
)
asr_interannotator_bartscore, asr_std, asr_sc = get_interannotator_bart_score(
    hyp=asr_summary1, ref=asr_summary2
)

# Report mean/std per modality followed by the three pairwise get_pvalues results.
print(f"INTERBARTSCORE {text_interannotator_bartscore} {text_std} {audio_interannotator_bartscore} {audio_std} {asr_interannotator_bartscore} {asr_std} {get_pvalues(text_sc,audio_sc)} {get_pvalues(text_sc,asr_sc)} {get_pvalues(asr_sc,audio_sc)}")




In [None]:
# Cross-modality BARTScore. Name pattern: `<A>_ref_<B>hyp` is
# bart_scorer.score(<B summaries>, <A summaries>, batch_size=4), i.e. the list
# named "hyp" is the first argument and the list named "ref" is the second
# ("speech" refers to the audio_summary lists).
# NOTE(review): what BARTScorer.score does with its first vs second argument is
# not visible here — confirm this order matches the intended direction.

# Text summaries as "ref", audio summaries as "hyp".
text1_ref_speech1hyp = bart_scorer.score(audio_summary1,text_summary1,batch_size=4)
text1_ref_speech2hyp = bart_scorer.score(audio_summary2,text_summary1,batch_size=4)

text2_ref_speech1hyp = bart_scorer.score(audio_summary1,text_summary2,batch_size=4)
text2_ref_speech2hyp = bart_scorer.score(audio_summary2,text_summary2,batch_size=4)

# Text summaries as "ref", ASR-transcript summaries as "hyp".
text1_ref_asr1hyp = bart_scorer.score(asr_summary1,text_summary1,batch_size=4)
text1_ref_asr2hyp = bart_scorer.score(asr_summary2,text_summary1,batch_size=4)

text2_ref_asr1hyp = bart_scorer.score(asr_summary1,text_summary2,batch_size=4)
text2_ref_asr2hyp = bart_scorer.score(asr_summary2,text_summary2,batch_size=4)

# Audio summaries as "ref", text summaries as "hyp".
speech1_ref_text1hyp = bart_scorer.score(text_summary1,audio_summary1,batch_size=4)
speech1_ref_text2hyp = bart_scorer.score(text_summary2,audio_summary1,batch_size=4)

speech2_ref_text1hyp = bart_scorer.score(text_summary1,audio_summary2,batch_size=4)
speech2_ref_text2hyp = bart_scorer.score(text_summary2,audio_summary2,batch_size=4)

# Audio summaries as "ref", ASR-transcript summaries as "hyp".
speech1_ref_asr1hyp = bart_scorer.score(asr_summary1,audio_summary1,batch_size=4)
speech1_ref_asr2hyp = bart_scorer.score(asr_summary2,audio_summary1,batch_size=4)

speech2_ref_asr1hyp = bart_scorer.score(asr_summary1,audio_summary2,batch_size=4)
speech2_ref_asr2hyp = bart_scorer.score(asr_summary2,audio_summary2,batch_size=4)

# ASR-transcript summaries as "ref", text summaries as "hyp".
asr1_ref_text1hyp = bart_scorer.score(text_summary1,asr_summary1,batch_size=4)
asr1_ref_text2hyp = bart_scorer.score(text_summary2,asr_summary1,batch_size=4)

asr2_ref_text1hyp = bart_scorer.score(text_summary1,asr_summary2,batch_size=4)
asr2_ref_text2hyp = bart_scorer.score(text_summary2,asr_summary2,batch_size=4)

# ASR-transcript summaries as "ref", audio summaries as "hyp".
asr1_ref_speech1hyp = bart_scorer.score(audio_summary1,asr_summary1,batch_size=4)
asr1_ref_speech2hyp = bart_scorer.score(audio_summary2,asr_summary1,batch_size=4)

asr2_ref_speech1hyp = bart_scorer.score(audio_summary1,asr_summary2,batch_size=4)
asr2_ref_speech2hyp = bart_scorer.score(audio_summary2,asr_summary2,batch_size=4)



# Pool by "ref" modality: all scores where text / audio / ASR summaries were the second argument.
text_all = text1_ref_speech1hyp+text1_ref_speech2hyp+text2_ref_speech1hyp+text2_ref_speech2hyp + text1_ref_asr1hyp+text1_ref_asr2hyp+text2_ref_asr1hyp+text2_ref_asr2hyp
speech_all = speech1_ref_text1hyp+speech1_ref_text2hyp+speech2_ref_text1hyp+speech2_ref_text2hyp + speech1_ref_asr1hyp+speech1_ref_asr2hyp+speech2_ref_asr1hyp+speech2_ref_asr2hyp
asr_all = asr1_ref_text1hyp+asr1_ref_text2hyp+asr2_ref_text1hyp+asr2_ref_text2hyp + asr1_ref_speech1hyp+asr1_ref_speech2hyp+asr2_ref_speech1hyp+asr2_ref_speech2hyp

# Report mean/std per pooled list followed by the three pairwise get_pvalues results.
print(f"PAIRBARTSCORE {np.mean(text_all)} {np.std(text_all)} {np.mean(speech_all)} {np.std(speech_all)} {np.mean(asr_all)} {np.std(asr_all)} {get_pvalues(text_all,speech_all)} {get_pvalues(text_all,asr_all)} {get_pvalues(asr_all,speech_all)}")


In [None]:
import scipy

from evaluate import load
# Load the "bertscore" metric again (the same load("bertscore") call appears in an
# earlier cell); get_bertscore_retrieval uses this object.
bertscore = load("bertscore")

def get_bertscore_retrieval(hyp_summary,refs):
    """Score one summary against every entry of `refs` with BERTScore (lang="en", device "cuda:0").

    The summary is repeated once per reference so predictions and references
    line up; returns the "f1" list of the result, one value per reference.
    """
    repeated_summary = [hyp_summary for _ in range(len(refs))]
    scores = bertscore.compute(
        predictions=repeated_summary,
        references=refs,
        lang="en",
        device="cuda:0",
    )
    return scores["f1"]

    
def get_retrieval_acc_entropy(hyp_summary,i):
    """Check whether a summary retrieves its own source transcript, and how peaked the scores are.

    The summary is scored with get_bertscore_retrieval against every transcript
    in the module-level list `src`.

    Args:
        hyp_summary: the summary text to score.
        i: the position in `src` that counts as the correct transcript.

    Returns:
        (retrieval_acc, entropy): retrieval_acc is 1 if the highest-scoring
        transcript is at position `i`, else 0; entropy is scipy.stats.entropy of
        the scores after dividing them by their sum.
    """
    # Import the submodule explicitly: a bare `import scipy` does not guarantee
    # that `scipy.stats` is available as an attribute on every SciPy version.
    from scipy import stats

    scores = np.array(get_bertscore_retrieval(hyp_summary,src))
    index = np.argmax(scores,axis=0)
    retrieval_acc = 1 if index == i else 0
    # Normalise the scores to sum to 1 before taking the entropy.
    normalized_scores = scores/scores.sum()
    entropy = stats.entropy(normalized_scores)
    return retrieval_acc,entropy


In [None]:
# Source-retrieval check: each summary is scored against all transcripts and we
# record (retrieval accuracy, score entropy) per row; x[0] is the accuracy flag
# and x[1] the entropy returned by get_retrieval_acc_entropy.
# NOTE(review): this reads an "index" column from output_df and compares it with a
# position in `src` — confirm the column exists and equals the row position.
text_summary1_retr = output_df.apply(lambda x: get_retrieval_acc_entropy(x["Text_Summary1"],x["index"]),axis=1)
text_summary2_retr = output_df.apply(lambda x: get_retrieval_acc_entropy(x["Text_Summary2"],x["index"]),axis=1)

# Average of the two annotators' mean retrieval accuracy.
text_retr_acc = 0.5 * (np.mean([x[0] for x in text_summary1_retr]) + np.mean([x[0] for x in text_summary2_retr]))



print(f"Text Source Retrieval Acc is {text_retr_acc}")

# Average of the two annotators' mean entropy.
text_retr_ent = 0.5 * (np.mean([x[1] for x in text_summary1_retr]) + np.mean([x[1] for x in text_summary2_retr]))
print(f"Text Source Ent is {text_retr_ent}")




audio_summary1_retr = output_df.apply(lambda x: get_retrieval_acc_entropy(x["Audio_Summary1"],x["index"]),axis=1)
audio_summary2_retr = output_df.apply(lambda x: get_retrieval_acc_entropy(x["Audio_Summary2"],x["index"]),axis=1)
audio_retr_acc = 0.5 * (np.mean([x[0] for x in audio_summary1_retr]) + np.mean([x[0] for x in audio_summary2_retr]))
print(f"Audio Source Retrieval Acc is {audio_retr_acc}")

audio_retr_ent = 0.5 * (np.mean([x[1] for x in audio_summary1_retr]) + np.mean([x[1] for x in audio_summary2_retr]))
# Fix: this value is an entropy, so the label no longer says "Ent Acc"
# (now matches the text line's "Ent is").
print(f"Audio Source Ent is {audio_retr_ent}")




asr_summary1_retr = output_df.apply(lambda x: get_retrieval_acc_entropy(x["Whisper_Transcript_Summary1"],x["index"]),axis=1)
asr_summary2_retr = output_df.apply(lambda x: get_retrieval_acc_entropy(x["Whisper_Transcript_Summary2"],x["index"]),axis=1)
asr_retr_acc = 0.5 * (np.mean([x[0] for x in asr_summary1_retr]) + np.mean([x[0] for x in asr_summary2_retr]))
print(f"ASR Source Retrieval Acc is {asr_retr_acc}")

asr_retr_ent = 0.5 * (np.mean([x[1] for x in asr_summary1_retr]) + np.mean([x[1] for x in asr_summary2_retr]))
# Fix: entropy value, label aligned with the text line ("Ent is").
print(f"ASR Source Ent is {asr_retr_ent}")
