# ROUGE (Recall-Oriented Understudy for Gisting Evaluation)

## Install packages

In [1]:
!pip install evaluate
!pip install rouge-score
!pip install pandas

Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-2.19.1-py3-none-any.whl.metadata (19 kB)
Collecting numpy>=1.17 (from evaluate)
  Using cached numpy-1.26.4-cp39-cp39-macosx_10_9_x86_64.whl.metadata (61 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting pandas (from evaluate)
  Using cached pandas-2.2.2-cp39-cp39-macosx_10_9_x86_64.whl.metadata (19 kB)
Collecting requests>=2.19.0 (from evaluate)
  Using cached requests-2.31.0-py3-none-any.whl.metadata (4.6 kB)
Collecting tqdm>=4.62.1 (from evaluate)
  Downloading tqdm-4.66.4-py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting xxhash (from evaluate)
  Downloading xxhash-3.4.1-cp39-cp39-macosx_10_9_x86_64.whl.metadata (12 kB)
Collecting multiprocess 

## Load the rouge metric

In [2]:
# Load the ROUGE metric through the `evaluate` library. The resulting `rouge`
# object is what every `rouge.compute(...)` call later in this notebook uses.
import evaluate
rouge = evaluate.load("rouge")

  from .autonotebook import tqdm as notebook_tqdm
Downloading builder script: 100%|██████████| 6.27k/6.27k [00:00<00:00, 3.96MB/s]


## Provide predictions and references

### Sample of `rouge.compute`

In [3]:
# One prediction per prompt:
#   ['prompt1 tuned response', 'prompt2 tuned response']
sample_predictions = [
    'the cow jumped over the moon',
    'the dish ran away with the spoon',
]

# One list of accepted references per prompt, in the same order as the predictions:
#   [['prompt1 reference response 1', 'prompt1 reference response 2'],
#    ['prompt2 reference response 1', 'prompt2 reference response 2']]
sample_references = [
    ['the cow lept over the moon', 'the cow went over the moon'],
    ['the plate dashed away with the spoon', 'the dish ran with the spoon'],
]

# Last expression of the cell, so the resulting score dict is displayed.
rouge.compute(predictions=sample_predictions, references=sample_references)

{'rouge1': 0.8782051282051282,
 'rouge2': 0.6636363636363636,
 'rougeL': 0.8782051282051282,
 'rougeLsum': 0.8782051282051282}

### Import CSV files

In [15]:
import pandas as pd

# Reference prompts: read the test set and discard the topic columns,
# which are not used anywhere in the evaluation.
reference_prompts = (
    pd.read_csv('dialogsum.test.truncated.csv')
    .drop(columns=['topic1', 'topic2', 'topic3'])
)

# Summaries produced by the tuned model for the same prompts.
tuned_model_output = pd.read_csv('./tuned_results/dev_api/tuned.sdk.small.output.csv')

In [16]:
# Preview the reference prompts (dialogue plus the three reference summaries).
reference_prompts

Unnamed: 0,fname,dialogue,summary1,summary2,summary3
0,test_0,"#Person1#: Ms. Dawson, I need you to take a di...",Ms. Dawson helps #Person1# to write a memo to ...,In order to prevent employees from wasting tim...,Ms. Dawson takes a dictation for #Person1# abo...
1,test_1,#Person1#: You're finally here! What took so l...,#Person2# arrives late because of traffic jam....,#Person2# decides to follow #Person1#'s sugges...,#Person2# complains to #Person1# about the tra...
2,test_2,"#Person1#: Kate, you never believe what's happ...",#Person1# tells Kate that Masha and Hero get d...,#Person1# tells Kate that Masha and Hero are g...,#Person1# and Kate talk about the divorce betw...
3,test_3,"#Person1#: Happy Birthday, this is for you, Br...",#Person1# and Brian are at the birthday party ...,#Person1# attends Brian's birthday party. Bria...,#Person1# has a dance with Brian at Brian's bi...
4,test_4,#Person1#: This Olympic park is so big!\n#Pers...,#Person1# is surprised at the Olympic Stadium'...,#Person2# shows #Person1# around the construct...,#Person2# introduces the Olympic Stadium's fin...
...,...,...,...,...,...
95,test_95,#Person1#: Thank you. Steven. That was the mos...,Steven and Lin just had a great meal. Then the...,Steven buys Lin a magnificent dinner in Americ...,Steven treats Lin to a nice meal. Then they ta...
96,test_96,"#Person1#: Bill, how can you hear so happy tod...",Bill is happy because he made a move to know h...,#Person1# and Bill talk about Bill's roommate ...,Bill tells #Person1# that he has made a move t...
97,test_97,#Person1#: I'd like to pay my bill now. \n#Per...,#Person2# checks Tom Wilson's information and ...,Tom Wilson pays for his bill for hotel and foo...,Tom Wilson pays the bill for his room and meal...
98,test_98,#Person1#: Carol telephone.\n#Person2#: Who is...,Susan calls Carol to ask about the party time....,"Carol is taking a shower when Carol calls her,...",Susan calls to ask Carol about the party time....


In [17]:
# Preview the tuned model's output (dialogue plus the generated summary).
tuned_model_output

Unnamed: 0,fname,dialogue,summary
0,test_0,"#Person1#: Ms. Dawson, I need you to take a di...",#Person1# asks Ms. Dawson to take a dictation ...
1,test_1,#Person1#: You're finally here! What took so l...,#Person1# suggests #Person2# take public trans...
2,test_2,"#Person1#: Kate, you never believe what's happ...",#Person1# tells Kate that Masha and Hero are g...
3,test_3,"#Person1#: Happy Birthday, this is for you, Br...",#Person1# gives Brian a birthday present and t...
4,test_4,#Person1#: This Olympic park is so big!\n#Pers...,#Person1# and #Person2# are in the Olympic sta...
...,...,...,...
95,test_95,#Person1#: Thank you. Steven. That was the mos...,Lin treats Steven a meal abroad. Steven asks L...
96,test_96,"#Person1#: Bill, how can you hear so happy tod...",Bill is happy because he made a move today. He...
97,test_97,#Person1#: I'd like to pay my bill now. \n#Per...,#Person1# pays the bill by credit card with th...
98,test_98,#Person1#: Carol telephone.\n#Person2#: Who is...,#Person1# answers the phone for Carol and tell...


In [19]:
# The tuned model did not respond to some prompts due to safety settings.
# Filter those prompts out of both dataframes so predictions and references stay paired.

# A row counts as "no response" when its summary contains the "FinishReason" marker.
# na=True also flags rows whose summary is missing altogether; without an explicit `na`
# the mask would contain NaN for those rows and could not be used for boolean indexing.
no_response_condition = tuned_model_output['summary'].str.contains("FinishReason", na=True)

# prompts with no responses
no_response_prompts = tuned_model_output[no_response_condition]

# get their unique identifiers (fname column)
fnames_to_drop = no_response_prompts['fname'].tolist()
print('prompts with no response:\n', fnames_to_drop)

# Build one mask per table from that table's OWN fname column. Reusing the mask
# computed on tuned_model_output to index reference_prompts is only correct when
# both frames happen to share the same index and row order.
filter_no_reponses = tuned_model_output['fname'].isin(fnames_to_drop)
filter_no_response_references = reference_prompts['fname'].isin(fnames_to_drop)

# Filter out the rows that match the fname from both tables
tuned_model_output = tuned_model_output[~filter_no_reponses]
reference_prompts = reference_prompts[~filter_no_response_references]


prompts with no response:
 []


In [20]:
# Re-inspect the reference prompts after the no-response filter has been applied.
reference_prompts


Unnamed: 0,fname,dialogue,summary1,summary2,summary3
0,test_0,"#Person1#: Ms. Dawson, I need you to take a di...",Ms. Dawson helps #Person1# to write a memo to ...,In order to prevent employees from wasting tim...,Ms. Dawson takes a dictation for #Person1# abo...
1,test_1,#Person1#: You're finally here! What took so l...,#Person2# arrives late because of traffic jam....,#Person2# decides to follow #Person1#'s sugges...,#Person2# complains to #Person1# about the tra...
2,test_2,"#Person1#: Kate, you never believe what's happ...",#Person1# tells Kate that Masha and Hero get d...,#Person1# tells Kate that Masha and Hero are g...,#Person1# and Kate talk about the divorce betw...
3,test_3,"#Person1#: Happy Birthday, this is for you, Br...",#Person1# and Brian are at the birthday party ...,#Person1# attends Brian's birthday party. Bria...,#Person1# has a dance with Brian at Brian's bi...
4,test_4,#Person1#: This Olympic park is so big!\n#Pers...,#Person1# is surprised at the Olympic Stadium'...,#Person2# shows #Person1# around the construct...,#Person2# introduces the Olympic Stadium's fin...
...,...,...,...,...,...
95,test_95,#Person1#: Thank you. Steven. That was the mos...,Steven and Lin just had a great meal. Then the...,Steven buys Lin a magnificent dinner in Americ...,Steven treats Lin to a nice meal. Then they ta...
96,test_96,"#Person1#: Bill, how can you hear so happy tod...",Bill is happy because he made a move to know h...,#Person1# and Bill talk about Bill's roommate ...,Bill tells #Person1# that he has made a move t...
97,test_97,#Person1#: I'd like to pay my bill now. \n#Per...,#Person2# checks Tom Wilson's information and ...,Tom Wilson pays for his bill for hotel and foo...,Tom Wilson pays the bill for his room and meal...
98,test_98,#Person1#: Carol telephone.\n#Person2#: Who is...,Susan calls Carol to ask about the party time....,"Carol is taking a shower when Carol calls her,...",Susan calls to ask Carol about the party time....


In [21]:
# Re-inspect the tuned model output after the no-response filter has been applied.
tuned_model_output

Unnamed: 0,fname,dialogue,summary
0,test_0,"#Person1#: Ms. Dawson, I need you to take a di...",#Person1# asks Ms. Dawson to take a dictation ...
1,test_1,#Person1#: You're finally here! What took so l...,#Person1# suggests #Person2# take public trans...
2,test_2,"#Person1#: Kate, you never believe what's happ...",#Person1# tells Kate that Masha and Hero are g...
3,test_3,"#Person1#: Happy Birthday, this is for you, Br...",#Person1# gives Brian a birthday present and t...
4,test_4,#Person1#: This Olympic park is so big!\n#Pers...,#Person1# and #Person2# are in the Olympic sta...
...,...,...,...
95,test_95,#Person1#: Thank you. Steven. That was the mos...,Lin treats Steven a meal abroad. Steven asks L...
96,test_96,"#Person1#: Bill, how can you hear so happy tod...",Bill is happy because he made a move today. He...
97,test_97,#Person1#: I'd like to pay my bill now. \n#Per...,#Person1# pays the bill by credit card with th...
98,test_98,#Person1#: Carol telephone.\n#Person2#: Who is...,#Person1# answers the phone for Carol and tell...


## Run ROUGE Evaluation

In [23]:
# Build the two parallel lists that rouge.compute expects:
#   predictions[i] -> the tuned model's summary for prompt i
#   references[i]  -> the three reference summaries for that same prompt
predictions = []
references = []

reference_columns = ['summary1', 'summary2', 'summary3']

for _, row in tuned_model_output.iterrows():
  # Locate the reference row for this prompt once (matching on both fname and
  # dialogue) instead of repeating the same lookup for every summary column.
  is_same_prompt = (reference_prompts['fname'] == row['fname']) & (reference_prompts['dialogue'] == row['dialogue'])
  matching_references = reference_prompts.loc[is_same_prompt, reference_columns]

  predictions.append(row['summary'])
  # .iloc[0] takes the first matching row; it raises IndexError when no
  # reference row matches this prompt.
  references.append(matching_references.iloc[0].tolist())

print(predictions)
print()
print(references)


['#Person1# asks Ms. Dawson to take a dictation for him. #Person1# dictates a memorandum to all employees that the use of Instant Message programs by employees during working hours is strictly prohibited.', '#Person1# suggests #Person2# take public transport to work to avoid traffic jams and protect the environment. #Person2# agrees and decides to quit driving to work.', "#Person1# tells Kate that Masha and Hero are getting divorced. Kate is surprised and can't believe it.", '#Person1# gives Brian a birthday present and they have a dance together. #Person1# thinks the party is fine and they should have a drink together.', '#Person1# and #Person2# are in the Olympic stadium. #Person1# is surprised by the size of the stadium and the number of seats. #Person2# shows #Person1# the tracks and the jumping pit.', '#Person1# decides to create a company and asks #Person2# for advice. #Person2# tells #Person1# how to write a business plan. #Person1# gives up the idea after hearing the details.',

In [24]:
# Score all prompts against 3 reference responses for one score
score = rouge.compute(
            predictions=predictions,
            references=references)


print(score)

{'rouge1': 0.5688350823910646, 'rouge2': 0.3237342478359688, 'rougeL': 0.48906197184975014, 'rougeLsum': 0.48969075221631087}


In [106]:
# Score each prediction prompt against 3 reference responses, individually
all_scores = {}
for index, row in tuned_model_results.iterrows():
  fname = row['fname']
  dialogue = row['dialogue']
  tuned_summary = row['summary']

  # Test data includes 3 variations of accepted summaries. Add them to an array
  ref_summaries = []
  ref_summaries.append(test_prompts.loc[(test_prompts['fname'] == fname) & (test_prompts['dialogue'] == dialogue)]['summary1'].values[0])
  ref_summaries.append(test_prompts.loc[(test_prompts['fname'] == fname) & (test_prompts['dialogue'] == dialogue)]['summary2'].values[0])
  ref_summaries.append(test_prompts.loc[(test_prompts['fname'] == fname) & (test_prompts['dialogue'] == dialogue)]['summary3'].values[0])

  predictions = [tuned_summary]
  references = [ref_summaries]

  score = rouge.compute(
            predictions=predictions,
            references=references)

  all_scores[fname] = score

print(all_scores)

{'test_0': {'rouge1': 0.43076923076923074, 'rouge2': 0.19047619047619047, 'rougeL': 0.2857142857142857, 'rougeLsum': 0.2857142857142857}, 'test_1': {'rouge1': 0.380952380952381, 'rouge2': 0.09999999999999999, 'rougeL': 0.31111111111111117, 'rougeLsum': 0.31111111111111117}, 'test_2': {'rouge1': 0.5384615384615384, 'rouge2': 0.3673469387755102, 'rougeL': 0.47058823529411764, 'rougeLsum': 0.47058823529411764}, 'test_3': {'rouge1': 0.5, 'rouge2': 0.09523809523809525, 'rougeL': 0.40909090909090906, 'rougeLsum': 0.40909090909090906}, 'test_4': {'rouge1': 0.39130434782608703, 'rouge2': 0.18181818181818182, 'rougeL': 0.2608695652173913, 'rougeLsum': 0.2608695652173913}, 'test_5': {'rouge1': 0.591549295774648, 'rouge2': 0.20289855072463767, 'rougeL': 0.34567901234567905, 'rougeLsum': 0.34567901234567905}, 'test_6': {'rouge1': 0.6046511627906976, 'rouge2': 0.4878048780487805, 'rougeL': 0.6046511627906976, 'rougeLsum': 0.6046511627906976}, 'test_7': {'rouge1': 0.5263157894736842, 'rouge2': 0.275