# ROUGE (Recall-Oriented Understudy for Gisting Evaluation)

## Install packages

In [18]:
!pip install evaluate
!pip install rouge-score
!pip install pandas



## Load the rouge metric

In [129]:
import evaluate
# Load the ROUGE metric once; the cells below score summaries via `rouge.compute(...)`.
rouge = evaluate.load("rouge")

## Provide predictions and references

In [153]:
# Toy example: a single prediction scored against several candidate references.
# `references` holds one inner list per prediction; here the first reference is an
# exact copy of the prediction and the other two are unrelated strings.
predictions = ["the cat was found under the bed"]
references = [
    [
        "the cat was found under the bed",
        "hellow worldd",
        "goodbye bird",
    ]
]

rouge.compute(predictions=predictions, references=references)

{'rouge1': 1.0, 'rouge2': 1.0, 'rougeL': 1.0, 'rougeLsum': 1.0}

In [180]:
# Summary under evaluation (the Masha/Hero divorce dialogue).
test_2_tuned_summary = "#Person1# tells Kate that Masha and Hero are getting divorced. Kate is surprised and can't believe it. Masha and Hero are having a separation for 2 months and filed for divorce."

# Three candidate reference summaries to score it against.
ref1 = 'Masha and Hero are getting divorced.'
ref2 = 'Someone is telling Kate that Masha and Hero are getting a peaceful divorce. Kate feels surprised and asks about their children.'
ref3 = '#Person1# and Kate talk about the divorce between Masha and Hero. Kate feels surprised because she thought they are well matched'
test_2_refs = [ref1, ref2, ref3]

# First: one call with all three references supplied together.
print('all 3 refs in one array')
print(rouge.compute(predictions=[test_2_tuned_summary], references=[test_2_refs]))

# Then: one call per reference, labelled 1..3, so the individual scores can be
# compared with the combined result printed above.
for ref_number, single_ref in enumerate(test_2_refs, start=1):
  print(ref_number)
  print(rouge.compute(predictions=[test_2_tuned_summary], references=[single_ref]))

all 3 refs in one array
{'rouge1': 0.49056603773584906, 'rouge2': 0.27777777777777773, 'rougeL': 0.37735849056603776, 'rougeLsum': 0.37735849056603776}
1
{'rouge1': 0.3157894736842105, 'rouge2': 0.27777777777777773, 'rougeL': 0.3157894736842105, 'rougeLsum': 0.3157894736842105}
2
{'rouge1': 0.49056603773584906, 'rouge2': 0.2745098039215686, 'rougeL': 0.37735849056603776, 'rougeLsum': 0.37735849056603776}
3
{'rouge1': 0.37735849056603776, 'rouge2': 0.0784313725490196, 'rougeL': 0.3018867924528302, 'rougeLsum': 0.3018867924528302}


In [83]:
# Score `test_02_dialogue` against three references passed together in one call.
# NOTE(review): `test_02_dialogue` and `test_02_ref1` / `test_02_ref2` / `test_02_ref3`
# are not defined anywhere in the visible notebook, so on a fresh kernel this cell
# raises NameError unless they are defined elsewhere — confirm, or define them
# before this cell.
print(rouge.compute(
    predictions=[test_02_dialogue],
    references=[[test_02_ref1,test_02_ref2,test_02_ref3]]))


{'rouge1': 0.04761904761904762, 'rouge2': 0.0, 'rougeL': 0.04761904761904762, 'rougeLsum': 0.04761904761904762}


In [71]:
# Score whatever `predictions` / `references` currently hold in the kernel.
# NOTE(review): the outputs recorded under this cell (rouge1 of about 0.43 and 0.96)
# do not match the toy example defined earlier in this notebook (which scores 1.0),
# so this cell was presumably run against different values — confirm its inputs.
results = rouge.compute(predictions=predictions, references=references)

results

{'rouge1': 0.43076923076923074,
 'rouge2': 0.19047619047619047,
 'rougeL': 0.27692307692307694,
 'rougeLsum': 0.27692307692307694}

{'rouge1': 0.9615384615384615,
 'rouge2': 0.8636363636363635,
 'rougeL': 0.9615384615384615,
 'rougeLsum': 0.9615384615384615}

## Import CSV files

In [185]:
import pandas as pd

# Reference data: read the test prompts and discard the three topic columns,
# which the scoring below does not use.
test_prompts = pd.read_csv(
    'file://localhost/content/dialogsum.test.truncated.csv'
).drop(columns=['topic1', 'topic2', 'topic3'])

# Model output for the same prompts.
tuned_model_results = pd.read_csv(
    'file://localhost/content/dialogsum.test.truncated.output.csv'
)

# Optional (disabled): sort both frames identically on fname and dialogue.
# test_prompts.sort_values(by=['fname', 'dialogue'], inplace=True)
# tuned_model_results.sort_values(by=['fname', 'dialogue'], inplace=True)

# Optional (disabled): temporarily keep only the first 3 rows of each frame
# while debugging (index[3:] drops everything after the third row).
# test_prompts.drop(test_prompts.index[3:], inplace=True)
# tuned_model_results.drop(tuned_model_results.index[3:], inplace=True)

In [41]:
# Inspect the reference frame: fname, dialogue and the three reference summaries
# (summary1..summary3) that remain after the topic columns were dropped.
test_prompts


Unnamed: 0,fname,dialogue,summary1,summary2,summary3
0,test_0,"#Person1#: Ms. Dawson, I need you to take a di...",Ms. Dawson helps #Person1# to write a memo to ...,In order to prevent employees from wasting tim...,Ms. Dawson takes a dictation for #Person1# abo...
1,test_1,#Person1#: You're finally here! What took so l...,#Person2# arrives late because of traffic jam....,#Person2# decides to follow #Person1#'s sugges...,#Person2# complains to #Person1# about the tra...
2,test_2,"#Person1#: Kate, you never believe what's happ...",#Person1# tells Kate that Masha and Hero get d...,#Person1# tells Kate that Masha and Hero are g...,#Person1# and Kate talk about the divorce betw...


In [42]:
# Inspect the model-output frame: fname, dialogue and a single `summary` column per row.
tuned_model_results

Unnamed: 0,fname,dialogue,summary
0,test_0,"#Person1#: Ms. Dawson, I need you to take a di...",#Person1# dictates a memo to #Person2# to rest...
1,test_1,#Person1#: You're finally here! What took so l...,#Person2# got stuck in traffic again. #Person1...
2,test_2,"#Person1#: Kate, you never believe what's happ...",#Person1# tells Kate that Masha and Hero are g...


In [None]:
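# Target shape: each row should have 3 scores, 1 for each reference summary (summary1..summary3).
# NOTE: the scoring loop later in this notebook passes all three references in one call and
# stores a single score dict per fname, so it does not yet produce this shape.
# {'fname': STRING, 'scores': ARRAY[{'rouge1': FLOAT, 'rouge2': FLOAT, 'rougeL': FLOAT, 'rougeLsum': FLOAT}]}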

# {'fname': 'test_0',
#   'scores': [{
#     'rouge1': 0.8421052631578948,
#     'rouge2': 0.5882352941176471,
#     'rougeL': 0.8421052631578948,
#     'rougeLsum': 0.8421052631578948
#   },
#   {
#     'rouge1': 0.8421052631578948,
#     'rouge2': 0.5882352941176471,
#     'rougeL': 0.8421052631578948,
#     'rougeLsum': 0.8421052631578948
#   }]
#}

In [None]:
# scores
# {
#   'test_01': ARRAY[
#     {'rouge1': FLOAT, 'rouge2': FLOAT, 'rougeL': FLOAT, 'rougeLsum': FLOAT},
#     {'rouge1': FLOAT, 'rouge2': FLOAT, 'rougeL': FLOAT, 'rougeLsum': FLOAT},
#    ]}

In [None]:
# Score every model-generated summary against the three reference summaries of the
# matching test prompt. Result: `all_scores` maps fname -> ROUGE score dict.
reference_columns = ['summary1', 'summary2', 'summary3']

all_scores = {}
for _, row in tuned_model_results.iterrows():
  fname = row['fname']
  dialogue = row['dialogue']
  tuned_summary = row['summary']

  # Look the prompt up once (matching on both fname and dialogue) and pull all three
  # reference columns together, instead of rebuilding the same filter per column.
  is_match = (test_prompts['fname'] == fname) & (test_prompts['dialogue'] == dialogue)
  matching_refs = test_prompts.loc[is_match, reference_columns]
  if matching_refs.empty:
    # Same exception type the former `.values[0]` raised on an empty selection,
    # but with enough context to find the offending row.
    raise IndexError(f"no row in test_prompts matches fname={fname!r} and its dialogue")

  # Test data includes 3 variations of accepted summaries; if several rows match,
  # the first one is used.
  ref_summaries = matching_refs.iloc[0].tolist()

  # One prediction with a list of accepted references -> one score dict.
  # The lists are passed inline so the notebook-level `predictions` / `references`
  # variables read by earlier cells are not overwritten with the last row's values.
  all_scores[fname] = rouge.compute(
      predictions=[tuned_summary],
      references=[ref_summaries])

print(all_scores)