In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt





In [None]:
## Read Data
# The release file is tab-separated, so the delimiter is passed explicitly.
output_df = pd.read_csv(
    '../data/release_data_summaries_nonexpert.tsv',
    sep="\t",
)


In [None]:
# Preview the first rows to sanity-check the loaded columns.
output_df.head()

In [None]:
import numpy as np
from scipy.stats import ttest_ind

def get_pvalues(v1, v2):
    """Return the p-value of scipy's independent two-sample t-test.

    Args:
        v1: First sample (any array-like accepted by ``ttest_ind``).
        v2: Second sample.

    Returns:
        The ``pvalue`` attribute of the ``ttest_ind`` result, computed with
        scipy's default settings.
    """
    return ttest_ind(v1, v2).pvalue


# Summary Length Analysis

In [None]:
## Summary Lengths

def _count_space_tokens(text):
    """Return the number of single-space-delimited tokens in ``text``."""
    # split(" ") is kept (rather than split()) so counts stay comparable with
    # the vocabulary statistics, which tokenize the same way; consecutive
    # spaces therefore contribute empty tokens.
    return len(text.split(" "))


text_summary1_lengths = output_df["Text_Summary1"].apply(_count_space_tokens)
text_summary2_lengths = output_df["Text_Summary2"].apply(_count_space_tokens)
audio_summary1_lengths = output_df["Audio_Summary1"].apply(_count_space_tokens)
audio_summary2_lengths = output_df["Audio_Summary2"].apply(_count_space_tokens)
transcript_lengths = output_df["transcript"].apply(_count_space_tokens)

# Per-item length, averaged over the two annotators of each modality.
mean_text_summary = 0.5*(text_summary1_lengths + text_summary2_lengths)
mean_audio_summary = 0.5*(audio_summary1_lengths + audio_summary2_lengths)

# Compression ratio = transcript length / mean summary length. Computed
# element-wise on the Series (aligned on the DataFrame index) instead of
# looking rows up by label through range(len(...)), which only works when the
# index happens to be 0..n-1. Kept as lists, as before.
text_compression_ratio = (2*transcript_lengths/(text_summary1_lengths + text_summary2_lengths)).tolist()
audio_compression_ratio = (2*transcript_lengths/(audio_summary1_lengths + audio_summary2_lengths)).tolist()

# Report rows follow one layout throughout the notebook:
#   TAG text_mean text_std audio_mean audio_std p_value
# CRATIO previously printed audio before text, and SLENGTH used the pandas
# sample std (ddof=1) while every other row uses np.std (ddof=0); both now
# follow the common convention.
print(f"SLENGTH {np.mean(mean_text_summary)} {np.std(mean_text_summary.to_numpy())} {np.mean(mean_audio_summary)} {np.std(mean_audio_summary.to_numpy())} {get_pvalues(mean_text_summary,mean_audio_summary)}  ")
print(f"CRATIO {np.mean(text_compression_ratio)} {np.std(text_compression_ratio)} {np.mean(audio_compression_ratio)} {np.std(audio_compression_ratio)} {get_pvalues(text_compression_ratio,audio_compression_ratio)}")




In [None]:
fig, axes = plt.subplots(2, 2)

# One histogram per annotator/modality, laid out row by row in the 2x2 grid.
length_panels = [
    (text_summary1_lengths, "Text 1 Summary Length"),
    (text_summary2_lengths, "Text 2 Summary Length"),
    (audio_summary1_lengths, "Audio 1 Summary Length"),
    (audio_summary2_lengths, "Audio 2 Summary Length"),
]
for panel_ax, (panel_lengths, panel_title) in zip(axes.flat, length_panels):
    panel_ax.hist(panel_lengths, bins=20)
    panel_ax.set_title(panel_title)
plt.show()

In [None]:
fig, axes = plt.subplots(1, 2)
text_ax, audio_ax = axes

# Pool both annotators' lengths so each modality gets a single histogram.
pooled_text_lengths = text_summary1_lengths.tolist() + text_summary2_lengths.tolist()
pooled_audio_lengths = audio_summary1_lengths.tolist() + audio_summary2_lengths.tolist()

text_ax.hist(pooled_text_lengths, bins=20)
text_ax.set_title("Text Summary Length")
audio_ax.hist(pooled_audio_lengths, bins=20)
audio_ax.set_title("Audio Summary Length")


## Unique Vocab Size

In [None]:
import spacy
 
# Create a blank English pipeline (no pretrained model is loaded here).
# NOTE: `nlp` is rebound to the trained "en_core_web_sm" model in the entity
# detection section, so results of cells that use `nlp` depend on run order.
nlp = spacy.blank("en")


In [None]:
## Unique Vocab
from itertools import chain


def _unique_tokens(column):
    """Return the distinct lower-cased, single-space-delimited tokens of a column as a list."""
    token_lists = output_df[column].apply(lambda text: text.lower().split(" "))
    return list(set(chain.from_iterable(token_lists)))


text_summary1_unique = _unique_tokens("Text_Summary1")
text_summary2_unique = _unique_tokens("Text_Summary2")
audio_summary1_unique = _unique_tokens("Audio_Summary1")
audio_summary2_unique = _unique_tokens("Audio_Summary2")

ref_unique = _unique_tokens("transcript")

# Vocabulary size per annotator.
print("Unique Text Summary 1 Vocab: ", len(text_summary1_unique))
print("Unique Text Summary 2 Vocab: ", len(text_summary2_unique))
print("Unique Audio Summary 1 Vocab: ", len(audio_summary1_unique))
print("Unique Audio Summary 2 Vocab: ", len(audio_summary2_unique))

# Vocabulary size per modality (union over both annotators) and for the source.
text_vocab = set(text_summary1_unique + text_summary2_unique)
audio_vocab = set(audio_summary1_unique + audio_summary2_unique)
print("Unique Text Summaries Vocab: ", len(text_vocab))
print("Unique Audio Summaries Vocab: ", len(audio_vocab))
print("Unique Source Text Vocab", len(set(ref_unique)))


## Novel Words 

In [None]:
## Overlap with Transcript

def filter_stopwords(text):
    """Tokenize ``text`` with the current ``nlp`` pipeline and return the texts of non-stop-word tokens."""
    kept = []
    for token in nlp(text):
        if token.is_stop:
            continue
        kept.append(token.text)
    return kept


def _novel_word_fraction(row, summary_col):
    """Fraction of a summary's word types that are absent from the transcript.

    Numerator: distinct non-stop-word tokens of the summary that do not occur
    among the transcript's non-stop-word tokens.
    Denominator: distinct single-space-delimited tokens of the summary.
    """
    # NOTE(review): numerator and denominator use different tokenizations
    # (pipeline tokens without stop words vs. raw split(" ") with stop words)
    # - confirm this is the intended definition.
    summary_words = set(filter_stopwords(row[summary_col]))
    transcript_words = set(filter_stopwords(row["transcript"]))
    novel_words = summary_words.difference(transcript_words)
    return len(novel_words) / len(set(row[summary_col].split(" ")))


novel_word_percent_text1 = output_df.apply(lambda row: _novel_word_fraction(row, "Text_Summary1"), axis=1)
novel_word_percent_text2 = output_df.apply(lambda row: _novel_word_fraction(row, "Text_Summary2"), axis=1)
novel_word_percent_audio1 = output_df.apply(lambda row: _novel_word_fraction(row, "Audio_Summary1"), axis=1)
novel_word_percent_audio2 = output_df.apply(lambda row: _novel_word_fraction(row, "Audio_Summary2"), axis=1)

# Mean: average of the two annotator means. Std: over the pooled annotator
# values. p-value: t-test on the per-item annotator averages.
novel_text_mean = 0.5*(novel_word_percent_text1.mean()+novel_word_percent_text2.mean())
novel_text_std = np.std(novel_word_percent_text1.tolist()+novel_word_percent_text2.tolist())
novel_audio_mean = 0.5*(novel_word_percent_audio1.mean()+novel_word_percent_audio2.mean())
novel_audio_std = np.std(novel_word_percent_audio1.tolist()+novel_word_percent_audio2.tolist())
novel_pvalue = get_pvalues(
    0.5*(novel_word_percent_text1+novel_word_percent_text2),
    0.5*(novel_word_percent_audio1+novel_word_percent_audio2),
)
print(f"NOVWORD {novel_text_mean} {novel_text_std} {novel_audio_mean} {novel_audio_std} {novel_pvalue}")



# Entity Detection

In [None]:
import spacy
 
# Load spaCy English model (replaces the blank pipeline bound to `nlp` earlier)
nlp = spacy.load("en_core_web_sm")


def get_entities(text):
    """Run ``nlp`` on ``text`` and return the text of every entity in ``doc.ents``."""
    doc = nlp(text)
    return [entity.text for entity in doc.ents]


# Add one entity-list column per text column (this mutates output_df in place).
for _text_col, _entity_col in (
    ("Text_Summary1", "Text_Summary1_Entities"),
    ("Text_Summary2", "Text_Summary2_Entities"),
    ("Audio_Summary1", "Audio_Summary1_Entities"),
    ("Audio_Summary2", "Audio_Summary2_Entities"),
    ("transcript", "Transcript_Entities"),
):
    output_df[_entity_col] = output_df[_text_col].apply(get_entities)




In [None]:
def process_entity_f1(ref_list, hyp_list):
    """Case-insensitive set-overlap F1 between two lists of entity strings.

    Args:
        ref_list: Reference entity strings.
        hyp_list: Hypothesis entity strings.

    Returns:
        The F1 of the two de-duplicated, lower-cased entity sets, or ``0``
        when either set is empty or they share no entity.
    """
    ref_set = {entity.lower() for entity in ref_list}
    hyp_set = {entity.lower() for entity in hyp_list}

    # With no overlap (including an empty side) precision and recall are both
    # zero, so F1 is defined as 0 instead of dividing by zero.
    tp = len(ref_set & hyp_set)
    if tp == 0:
        return 0

    precision = tp / len(hyp_set)
    recall = tp / len(ref_set)
    return 2 * precision * recall / (precision + recall)

In [None]:


def _entity_f1_vs_transcript(entity_col):
    """Row-wise entity F1 of ``entity_col`` against ``Transcript_Entities`` (a Series)."""
    return output_df.apply(
        lambda row: process_entity_f1(row["Transcript_Entities"], row[entity_col]),
        axis=1,
    )


text_summary1_entity_f1 = _entity_f1_vs_transcript("Text_Summary1_Entities")
text_summary2_entity_f1 = _entity_f1_vs_transcript("Text_Summary2_Entities")
audio_summary1_entity_f1 = _entity_f1_vs_transcript("Audio_Summary1_Entities")
audio_summary2_entity_f1 = _entity_f1_vs_transcript("Audio_Summary2_Entities")

# Pool both annotators of each modality for the mean/std columns.
# Fixes two problems in the previous report line:
#   * `list + Series` adds element-wise instead of concatenating, so the std
#     was taken over per-row sums (the `.tolist()` sat outside np.std);
#   * the mean was taken over per-row sums, i.e. on a 0-2 scale rather than 0-1.
text_entity_f1_pooled = text_summary1_entity_f1.tolist() + text_summary2_entity_f1.tolist()
audio_entity_f1_pooled = audio_summary1_entity_f1.tolist() + audio_summary2_entity_f1.tolist()

# The t-test keeps one value per item and modality (the annotator average);
# rescaling the former per-row sums by 0.5 leaves the p-value unchanged.
entity_f1_pvalue = get_pvalues(
    0.5*(text_summary1_entity_f1 + text_summary2_entity_f1),
    0.5*(audio_summary1_entity_f1 + audio_summary2_entity_f1),
)

print(f"ENTITYF1 {np.mean(text_entity_f1_pooled)} {np.std(text_entity_f1_pooled)} {np.mean(audio_entity_f1_pooled)} {np.std(audio_entity_f1_pooled)} {entity_f1_pvalue}")



In [None]:
# Interannotator Agreement

def _entity_f1_between(ref_col, hyp_col):
    """Row-wise entity F1 with ``ref_col`` as reference and ``hyp_col`` as hypothesis (a Series)."""
    return output_df.apply(
        lambda row: process_entity_f1(row[ref_col], row[hyp_col]),
        axis=1,
    )


text_summary1_entity_f1_int = _entity_f1_between("Text_Summary2_Entities", "Text_Summary1_Entities")
text_summary2_entity_f1_int = _entity_f1_between("Text_Summary1_Entities", "Text_Summary2_Entities")
audio_summary1_entity_f1_int = _entity_f1_between("Audio_Summary2_Entities", "Audio_Summary1_Entities")
audio_summary2_entity_f1_int = _entity_f1_between("Audio_Summary1_Entities", "Audio_Summary2_Entities")

# Pool both directions for the mean/std columns. Fixes in the report line:
#   * the std columns were computed from the transcript-based
#     `*_entity_f1` variables instead of the inter-annotator `*_int` ones;
#   * `list + Series` adds element-wise, so that std was over per-row sums;
#   * the mean was taken over per-row sums (0-2 scale) rather than 0-1.
text_entity_f1_int_pooled = text_summary1_entity_f1_int.tolist() + text_summary2_entity_f1_int.tolist()
audio_entity_f1_int_pooled = audio_summary1_entity_f1_int.tolist() + audio_summary2_entity_f1_int.tolist()

# One value per item and modality for the t-test (average of both directions);
# rescaling the former per-row sums by 0.5 leaves the p-value unchanged.
entity_f1_int_pvalue = get_pvalues(
    0.5*(text_summary1_entity_f1_int + text_summary2_entity_f1_int),
    0.5*(audio_summary1_entity_f1_int + audio_summary2_entity_f1_int),
)

print(f"INTENTF1 {np.mean(text_entity_f1_int_pooled)} {np.std(text_entity_f1_int_pooled)} {np.mean(audio_entity_f1_int_pooled)} {np.std(audio_entity_f1_int_pooled)} {entity_f1_int_pvalue}")



# Metrics with Transcript

## ROUGE-L with Transcript 

In [None]:
import evaluate 
rouge = evaluate.load('rouge')


def compute_rouge(hyp, ref):
    """Score one hypothesis against one reference and return the ``rougeL`` value scaled by 100."""
    scores = rouge.compute(predictions=[hyp], references=[ref])
    return 100 * scores["rougeL"]


def _rouge_vs_transcript(summary_col):
    """Row-wise ``compute_rouge`` of ``summary_col`` against the ``transcript`` column, as a list."""
    per_row = output_df.apply(
        lambda row: compute_rouge(row[summary_col], row["transcript"]),
        axis=1,
    )
    return per_row.tolist()


text_summary1_transcript_rouge = _rouge_vs_transcript("Text_Summary1")
text_summary2_transcript_rouge = _rouge_vs_transcript("Text_Summary2")
audio_summary1_transcript_rouge = _rouge_vs_transcript("Audio_Summary1")
audio_summary2_transcript_rouge = _rouge_vs_transcript("Audio_Summary2")

# Pool both annotators of each modality before computing the report row.
rl_text_pooled = text_summary1_transcript_rouge + text_summary2_transcript_rouge
rl_audio_pooled = audio_summary1_transcript_rouge + audio_summary2_transcript_rouge
print(f"RL {np.mean(rl_text_pooled)} {np.std(rl_text_pooled)} {np.mean(rl_audio_pooled)} {np.std(rl_audio_pooled)} {get_pvalues(rl_text_pooled,rl_audio_pooled)}")


In [None]:
import evaluate 
rouge = evaluate.load('rouge')


def compute_rouge(hyp, ref):
    """Score one hypothesis against one reference and return the ``rougeL`` value scaled by 100."""
    scores = rouge.compute(predictions=[hyp], references=[ref])
    return 100 * scores["rougeL"]


def _rouge_between(hyp_col, ref_col):
    """Row-wise ``compute_rouge`` of ``hyp_col`` against ``ref_col``, as a list."""
    per_row = output_df.apply(
        lambda row: compute_rouge(row[hyp_col], row[ref_col]),
        axis=1,
    )
    return per_row.tolist()


# Inter-annotator agreement: each annotator's summary scored against the other
# annotator of the same modality. The `*_transcript_rouge` names are reused
# here and overwrite the transcript-based scores from the previous cell.
text_summary1_transcript_rouge = _rouge_between("Text_Summary1", "Text_Summary2")
text_summary2_transcript_rouge = _rouge_between("Text_Summary2", "Text_Summary1")
audio_summary1_transcript_rouge = _rouge_between("Audio_Summary1", "Audio_Summary2")
audio_summary2_transcript_rouge = _rouge_between("Audio_Summary2", "Audio_Summary1")

# NOTE(review): if `rougeL` is a symmetric F-measure, both directions give the
# same values and pooling them doubles n in the t-test - confirm.
rlint_text_pooled = text_summary1_transcript_rouge + text_summary2_transcript_rouge
rlint_audio_pooled = audio_summary1_transcript_rouge + audio_summary2_transcript_rouge
print(f"RLINT {np.mean(rlint_text_pooled)} {np.std(rlint_text_pooled)} {np.mean(rlint_audio_pooled)} {np.std(rlint_audio_pooled)} {get_pvalues(rlint_text_pooled,rlint_audio_pooled)}")


## METEOR with Transcript

In [None]:
import evaluate 


meteor = evaluate.load('meteor')


def compute_meteor(hyp, ref):
    """Score the lower-cased hypothesis against the lower-cased reference and return ``meteor`` scaled by 100."""
    scores = meteor.compute(
        predictions=[hyp.lower()],
        references=[ref.lower()],
    )
    return 100 * scores["meteor"]


def _meteor_vs_transcript(summary_col):
    """Row-wise ``compute_meteor`` of ``summary_col`` against the ``transcript`` column, as a list."""
    per_row = output_df.apply(
        lambda row: compute_meteor(row[summary_col], row["transcript"]),
        axis=1,
    )
    return per_row.tolist()


text_summary1_transcript_meteor = _meteor_vs_transcript("Text_Summary1")
text_summary2_transcript_meteor = _meteor_vs_transcript("Text_Summary2")
audio_summary1_transcript_meteor = _meteor_vs_transcript("Audio_Summary1")
audio_summary2_transcript_meteor = _meteor_vs_transcript("Audio_Summary2")

# Pool both annotators of each modality before computing the report row.
mtr_text_pooled = text_summary1_transcript_meteor + text_summary2_transcript_meteor
mtr_audio_pooled = audio_summary1_transcript_meteor + audio_summary2_transcript_meteor
print(f"MTR {np.mean(mtr_text_pooled)} {np.std(mtr_text_pooled)} {np.mean(mtr_audio_pooled)} {np.std(mtr_audio_pooled)} {get_pvalues(mtr_text_pooled,mtr_audio_pooled)}")



## BERTScore with Transcript

In [None]:
from evaluate import load
bertscore = load("bertscore")


def compute_bertscore(hyp, ref):
    """Score the lower-cased hypothesis against the lower-cased reference and return the mean ``f1`` scaled by 100."""
    scores = bertscore.compute(
        predictions=[hyp.lower()],
        references=[ref.lower()],
        lang="en",
    )
    return 100 * np.mean(scores["f1"])


def _bertscore_vs_transcript(summary_col):
    """Row-wise ``compute_bertscore`` of ``summary_col`` against the ``transcript`` column, as a list."""
    per_row = output_df.apply(
        lambda row: compute_bertscore(row[summary_col], row["transcript"]),
        axis=1,
    )
    return per_row.tolist()


text_summary1_transcript_bertscore = _bertscore_vs_transcript("Text_Summary1")
text_summary2_transcript_bertscore = _bertscore_vs_transcript("Text_Summary2")
audio_summary1_transcript_bertscore = _bertscore_vs_transcript("Audio_Summary1")
audio_summary2_transcript_bertscore = _bertscore_vs_transcript("Audio_Summary2")

# Pool both annotators of each modality before computing the report row.
bs_text_pooled = text_summary1_transcript_bertscore + text_summary2_transcript_bertscore
bs_audio_pooled = audio_summary1_transcript_bertscore + audio_summary2_transcript_bertscore
print(f"BS {np.mean(bs_text_pooled)} {np.std(bs_text_pooled)} {np.mean(bs_audio_pooled)} {np.std(bs_audio_pooled)} {get_pvalues(bs_text_pooled,bs_audio_pooled)}")


In [None]:
def _bertscore_between(hyp_col, ref_col):
    """Row-wise ``compute_bertscore`` of ``hyp_col`` against ``ref_col``, as a list."""
    per_row = output_df.apply(
        lambda row: compute_bertscore(row[hyp_col], row[ref_col]),
        axis=1,
    )
    return per_row.tolist()


# Inter-annotator agreement: each annotator scored against the other annotator
# of the same modality. The `*_transcript_bertscore` names are reused and
# overwrite the transcript-based scores computed earlier.
text_summary1_transcript_bertscore = _bertscore_between("Text_Summary1", "Text_Summary2")
text_summary2_transcript_bertscore = _bertscore_between("Text_Summary2", "Text_Summary1")
audio_summary1_transcript_bertscore = _bertscore_between("Audio_Summary1", "Audio_Summary2")
audio_summary2_transcript_bertscore = _bertscore_between("Audio_Summary2", "Audio_Summary1")

# NOTE(review): if the two directions yield identical F1 values, pooling them
# doubles n in the t-test - confirm.
bsint_text_pooled = text_summary1_transcript_bertscore + text_summary2_transcript_bertscore
bsint_audio_pooled = audio_summary1_transcript_bertscore + audio_summary2_transcript_bertscore
print(f"BSINT {np.mean(bsint_text_pooled)} {np.std(bsint_text_pooled)} {np.mean(bsint_audio_pooled)} {np.std(bsint_audio_pooled)} {get_pvalues(bsint_text_pooled,bsint_audio_pooled)}")


In [None]:
# Plain Python lists of each text column, in DataFrame row order; these feed
# the list-based scorers used in the following sections.
text_summary1 = output_df["Text_Summary1"].tolist()
text_summary2 = output_df["Text_Summary2"].tolist()
audio_summary1 = output_df["Audio_Summary1"].tolist()
audio_summary2 = output_df["Audio_Summary2"].tolist()
src = output_df["transcript"].tolist()
ref = output_df["ref"].tolist()

# Sanity check: all six lists should have the same length.
print(*(len(column_values) for column_values in (text_summary1, text_summary2, audio_summary1, audio_summary2, src, ref)))

## UniEval with Transcript 

In [None]:

%cd UniEval

In [None]:
from utils import convert_to_json
from metric.evaluator import get_evaluator

task = 'summarization'

evaluator = get_evaluator(task)


def compute_unieval_scores(hyp_list, ref_list, src_list):
    """Evaluate the summaries on coherence, consistency and fluency.

    Returns:
        Three lists (coherence, consistency, fluency), each holding one value
        per entry of the evaluator's result.
    """
    dims = ['coherence', 'consistency', 'fluency']
    data = convert_to_json(output_list=hyp_list, src_list=src_list, ref_list=ref_list)
    eval_scores = evaluator.evaluate(data, dims=dims, overall=False, print_result=False)
    return tuple([item[dim] for item in eval_scores] for dim in dims)


ts1_coh, ts1_con, ts1_flu = compute_unieval_scores(text_summary1, ref, src)
ts2_coh, ts2_con, ts2_flu = compute_unieval_scores(text_summary2, ref, src)
ss1_coh, ss1_con, ss1_flu = compute_unieval_scores(audio_summary1, ref, src)
ss2_coh, ss2_con, ss2_flu = compute_unieval_scores(audio_summary2, ref, src)

# Pool both annotators of each modality, one report row per dimension.
coh_text, coh_audio = ts1_coh + ts2_coh, ss1_coh + ss2_coh
con_text, con_audio = ts1_con + ts2_con, ss1_con + ss2_con
flu_text, flu_audio = ts1_flu + ts2_flu, ss1_flu + ss2_flu

print(f"COH {np.mean(coh_text)} {np.std(coh_text)} {np.mean(coh_audio)} {np.std(coh_audio)} {get_pvalues(coh_text,coh_audio)}")
print(f"CON {np.mean(con_text)} {np.std(con_text)} {np.mean(con_audio)} {np.std(con_audio)} {get_pvalues(con_text,con_audio)}")
print(f"FL {np.mean(flu_text)} {np.std(flu_text)} {np.mean(flu_audio)} {np.std(flu_audio)} {get_pvalues(flu_text,flu_audio)}")




In [None]:
# Interannotator agreement

task = 'summarization'
evaluator = get_evaluator(task)


def compute_unieval_scores(hyp_list, ref_list, src_list):
    """Evaluate the summaries on relevance and return the list of ``relevance`` values.

    NOTE: this rebinds ``compute_unieval_scores``; unlike the three-dimension
    version defined earlier it returns a single list.
    """
    data = convert_to_json(output_list=hyp_list, src_list=src_list, ref_list=ref_list)
    eval_scores = evaluator.evaluate(
        data,
        dims=['relevance'],
        overall=False,
        print_result=False,
    )
    return [item["relevance"] for item in eval_scores]


# Each annotator's summary is scored with the other annotator of the same
# modality passed as the reference list.
ts1_rel = compute_unieval_scores(text_summary1, text_summary2, src)
ts2_rel = compute_unieval_scores(text_summary2, text_summary1, src)
ss1_rel = compute_unieval_scores(audio_summary1, audio_summary2, src)
ss2_rel = compute_unieval_scores(audio_summary2, audio_summary1, src)

relint_text = ts1_rel + ts2_rel
relint_audio = ss1_rel + ss2_rel
print(f"RELINT {np.mean(relint_text)} {np.std(relint_text)} {np.mean(relint_audio)} {np.std(relint_audio)} {get_pvalues(relint_text,relint_audio)}")


In [None]:
## Relevance between Modalities
## (this cell scores the 'relevance' dimension; 'consistency' between
## modalities is computed in the following cell)
task = 'summarization'
evaluator = get_evaluator(task)


def compute_unieval_scores(hyp_list, ref_list, src_list):
    """Evaluate the summaries on relevance (results are also printed) and return the ``relevance`` values."""
    data = convert_to_json(output_list=hyp_list, src_list=src_list, ref_list=ref_list)
    eval_scores = evaluator.evaluate(
        data,
        dims=['relevance'],
        overall=False,
        print_result=True,
    )
    return [item["relevance"] for item in eval_scores]


# Text summaries scored with each audio summary passed as the reference list.
ts12_rel = compute_unieval_scores(text_summary1, audio_summary2, src)
ts11_rel = compute_unieval_scores(text_summary1, audio_summary1, src)

ts22_rel = compute_unieval_scores(text_summary2, audio_summary2, src)
ts21_rel = compute_unieval_scores(text_summary2, audio_summary1, src)

# Audio summaries scored with each text summary passed as the reference list.
ss12_rel = compute_unieval_scores(audio_summary1, text_summary2, src)
ss11_rel = compute_unieval_scores(audio_summary1, text_summary1, src)

ss22_rel = compute_unieval_scores(audio_summary2, text_summary2, src)
ss21_rel = compute_unieval_scores(audio_summary2, text_summary1, src)

# Pool the four pairings per direction.
text_all = ts12_rel + ts11_rel + ts21_rel + ts22_rel
audio_all = ss12_rel + ss11_rel + ss21_rel + ss22_rel

relpair_pvalue = get_pvalues(text_all, audio_all)
print(f"RELPAIR {np.mean(text_all)} {np.std(text_all)} {np.mean(audio_all)} {np.std(audio_all)} {relpair_pvalue}")




In [None]:
task = 'summarization'

evaluator = get_evaluator(task)


def compute_unieval_scores(hyp_list, ref_list, src_list):
    """Evaluate the summaries on consistency (results are also printed) and return the ``consistency`` values."""
    data = convert_to_json(output_list=hyp_list, src_list=src_list, ref_list=ref_list)
    eval_scores = evaluator.evaluate(
        data,
        dims=['consistency'],
        overall=False,
        print_result=True,
    )
    return [item["consistency"] for item in eval_scores]


def _consistency_against(hyp_list, other_list):
    """Score ``hyp_list`` with ``other_list`` passed as both the reference list and the source list."""
    return compute_unieval_scores(hyp_list, other_list, other_list)


# NOTE: the `*_rel` names are kept from the relevance cell, but here they hold
# consistency values.
ts12_rel = _consistency_against(text_summary1, audio_summary2)
ts11_rel = _consistency_against(text_summary1, audio_summary1)

ts22_rel = _consistency_against(text_summary2, audio_summary2)
ts21_rel = _consistency_against(text_summary2, audio_summary1)

ss12_rel = _consistency_against(audio_summary1, text_summary2)
ss11_rel = _consistency_against(audio_summary1, text_summary1)

ss22_rel = _consistency_against(audio_summary2, text_summary2)
ss21_rel = _consistency_against(audio_summary2, text_summary1)

# Pool the four pairings per direction.
text_all = ts12_rel + ts11_rel + ts21_rel + ts22_rel
audio_all = ss12_rel + ss11_rel + ss21_rel + ss22_rel

conpair_pvalue = get_pvalues(text_all, audio_all)
print(f"CONPAIR {np.mean(text_all)} {np.std(text_all)} {np.mean(audio_all)} {np.std(audio_all)} {conpair_pvalue}")



In [None]:
%cd ../

# BARTScore

In [None]:
%cd BARTScore

In [None]:
from bart_score import BARTScorer
# NOTE: the scorer is pinned to the first CUDA device.
bart_scorer = BARTScorer(device='cuda:0', checkpoint='facebook/bart-large-cnn')

# Score every summary list against the transcripts (`src`).
text1_scores = bart_scorer.score(text_summary1, src, batch_size=4)
text2_scores = bart_scorer.score(text_summary2, src, batch_size=4)
audio1_scores = bart_scorer.score(audio_summary1, src, batch_size=4)
audio2_scores = bart_scorer.score(audio_summary2, src, batch_size=4)

# Pool both annotators of each modality before computing the report row.
bart_trans_text = text1_scores + text2_scores
bart_trans_audio = audio1_scores + audio2_scores

text_score = np.mean(bart_trans_text)
audio_score = np.mean(bart_trans_audio)

print(f"BARTTrans {text_score} {np.std(bart_trans_text)} {audio_score} {np.std(bart_trans_audio)} {get_pvalues(bart_trans_text,bart_trans_audio)}")
    

In [None]:
# Same scoring as the transcript-based cell, but against the `ref` list.
text1_scores = bart_scorer.score(text_summary1, ref, batch_size=4)
text2_scores = bart_scorer.score(text_summary2, ref, batch_size=4)
audio1_scores = bart_scorer.score(audio_summary1, ref, batch_size=4)
audio2_scores = bart_scorer.score(audio_summary2, ref, batch_size=4)

# Pool both annotators of each modality before computing the report row.
bart_ref_text = text1_scores + text2_scores
bart_ref_audio = audio1_scores + audio2_scores

text_score = np.mean(bart_ref_text)
audio_score = np.mean(bart_ref_audio)

print(f"BARTRef {text_score} {np.std(bart_ref_text)} {audio_score} {np.std(bart_ref_audio)} {get_pvalues(bart_ref_text,bart_ref_audio)}")


In [None]:
def get_interannotator_bart_score(hyp, ref):
    """Score two summary lists against each other in both directions.

    Returns:
        A ``(mean, std, scores)`` tuple, where ``scores`` is the hyp-vs-ref
        score list followed by the ref-vs-hyp score list.
    """
    forward = bart_scorer.score(hyp, ref, batch_size=4)
    backward = bart_scorer.score(ref, hyp, batch_size=4)
    both_directions = forward + backward
    return np.mean(both_directions), np.std(both_directions), both_directions


text_interannotator_bartscore, text_std, text_sc = get_interannotator_bart_score(text_summary1, text_summary2)
audio_interannotator_bartscore, audio_std, audio_sc = get_interannotator_bart_score(audio_summary1, audio_summary2)

bartint_pvalue = get_pvalues(text_sc, audio_sc)
print(f"BARTINT {text_interannotator_bartscore} {text_std} {audio_interannotator_bartscore} {audio_std} {bartint_pvalue}")

In [None]:

def _bart_pair(first_list, second_list):
    """Call ``bart_scorer.score(first_list, second_list)`` with the batch size used throughout."""
    return bart_scorer.score(first_list, second_list, batch_size=4)


# Audio summaries passed first, text summaries second.
text1_ref_speech1hyp = _bart_pair(audio_summary1, text_summary1)
text1_ref_speech2hyp = _bart_pair(audio_summary2, text_summary1)

text2_ref_speech1hyp = _bart_pair(audio_summary1, text_summary2)
text2_ref_speech2hyp = _bart_pair(audio_summary2, text_summary2)

# Text summaries passed first, audio summaries second.
speech1_ref_text1hyp = _bart_pair(text_summary1, audio_summary1)
speech1_ref_text2hyp = _bart_pair(text_summary2, audio_summary1)

speech2_ref_text1hyp = _bart_pair(text_summary1, audio_summary2)
speech2_ref_text2hyp = _bart_pair(text_summary2, audio_summary2)

# Pool the four pairings per direction.
text_all = text1_ref_speech1hyp + text1_ref_speech2hyp + text2_ref_speech1hyp + text2_ref_speech2hyp
speech_all = speech1_ref_text1hyp + speech1_ref_text2hyp + speech2_ref_text1hyp + speech2_ref_text2hyp

text_all_mean, text_all_std = np.mean(text_all), np.std(text_all)
speech_all_mean, speech_all_std = np.mean(speech_all), np.std(speech_all)
bartpair_pvalue = get_pvalues(text_all, speech_all)

print("Text Ref BS", text_all_mean)
print("Speech Ref BS", speech_all_mean)

print("Text Ref BS STD", text_all_std)
print("Speech Ref BS STD", speech_all_std)

print(f"Stat Result {bartpair_pvalue}")


print(f"BARTPAIR {text_all_mean} {text_all_std} {speech_all_mean} {speech_all_std} {bartpair_pvalue}")

In [None]:
%cd ../

## Retrieval-based Evaluation

In [None]:
import scipy
import torchmetrics
import torch

def get_bertscore_retrieval(hyp_summary, refs):
    """Score one summary against every entry of ``refs`` with BERTScore.

    Args:
        hyp_summary: The summary used as the prediction for every comparison.
        refs: The candidate references.

    Returns:
        The ``f1`` entry of the BERTScore result.
    """
    # The metric compares predictions and references pairwise, so the single
    # summary is repeated once per candidate.
    repeated_hyp = [hyp_summary for _ in range(len(refs))]
    scores = bertscore.compute(
        predictions=repeated_hyp,
        references=refs,
        lang="en",
        device="cuda:0",
    )
    return scores["f1"]

    
def get_retrieval_acc_entropy(hyp_summary, gt_index):
    """Retrieve among the module-level ``ref`` list using BERTScore F1.

    Args:
        hyp_summary: The summary used as the query.
        gt_index: Position in ``ref`` of the correct entry.

    Returns:
        ``(retrieval_acc, mrr)``: ``retrieval_acc`` is 1 when the
        highest-scoring entry is at ``gt_index`` and 0 otherwise; ``mrr`` is
        the value returned by ``get_mrr``. Despite the function name, no
        entropy is computed.
    """
    scores = np.array(get_bertscore_retrieval(hyp_summary, ref))
    best_index = np.argmax(scores, axis=0)
    retrieval_acc = int(best_index == gt_index)
    return retrieval_acc, get_mrr(scores, gt_index)

def get_mrr(scores, gt_index):
    """Reciprocal rank of the entry at ``gt_index`` under ``scores``.

    Args:
        scores: NumPy array of retrieval scores, one per candidate.
        gt_index: Position of the single relevant candidate.

    Returns:
        The result of torchmetrics' ``retrieval_reciprocal_rank``.
    """
    # Boolean relevance mask with exactly one relevant candidate.
    relevance = torch.zeros(len(scores), dtype=torch.bool)
    relevance[gt_index] = True
    return torchmetrics.functional.retrieval.retrieval_reciprocal_rank(
        torch.from_numpy(scores),
        relevance,
        top_k=None,
    )
    

In [None]:
def _retrieval_results(summary_col):
    """Row-wise ``(accuracy, mrr)`` tuples for ``summary_col``; the row label is passed as the ground-truth index."""
    return output_df.apply(
        lambda row: get_retrieval_acc_entropy(row[summary_col], row.name),
        axis=1,
    )


text_summary1_retr = _retrieval_results("Text_Summary1")
text_summary2_retr = _retrieval_results("Text_Summary2")


audio_summary1_retr = _retrieval_results("Audio_Summary1")
audio_summary2_retr = _retrieval_results("Audio_Summary2")


# Pool both annotators of each modality; element 0 is accuracy, element 1 is MRR.
text_acc = [x[0] for x in text_summary1_retr] + [x[0] for x in text_summary2_retr]
text_mrr = [x[1] for x in text_summary1_retr] + [x[1] for x in text_summary2_retr]

audio_acc = [x[0] for x in audio_summary1_retr] + [x[0] for x in audio_summary2_retr]
audio_mrr = [x[1] for x in audio_summary1_retr] + [x[1] for x in audio_summary2_retr]


print(f"RETRACC {np.mean(text_acc)} {np.std(text_acc)} {np.mean(audio_acc)} {np.std(audio_acc)} {get_pvalues(text_acc,audio_acc)}")
# Fix: the MRR significance test must compare text MRR with audio MRR; it
# previously compared text *accuracy* with audio MRR.
print(f"RETRMRR {np.mean(text_mrr)} {np.std(text_mrr)} {np.mean(audio_mrr)} {np.std(audio_mrr)} {get_pvalues(text_mrr,audio_mrr)}")


