# ROUGE (Recall-Oriented Understudy for Gisting Evaluation)

## Install packages

In [24]:
!pip install evaluate
!pip install rouge-score
!pip install pandas



## Load the ROUGE metric

In [25]:
import evaluate
# Load the metric registered under the name "rouge" through the `evaluate` library.
# The resulting `rouge` object is reused by every `rouge.compute(...)` call below.
rouge = evaluate.load("rouge")

## Provide predictions and references

### Sample of `rouge.compute`

In [26]:
# One prediction per prompt:
# ['prompt1 tuned response', 'prompt2 tuned response']
sample_predictions = [
    'the cow jumped over the moon',
    'the dish ran away with the spoon',
]
# One list of reference responses per prompt, in the same order as the predictions:
# [['prompt1 reference response 1', 'prompt1 reference response 2'],
#  ['prompt2 reference response 1', 'prompt2 reference response 2']]
sample_references = [
    ['the cow lept over the moon', 'the cow went over the moon'],
    ['the plate dashed away with the spoon', 'the dish ran with the spoon'],
]

# Kept as the last expression so the notebook displays the returned scores.
rouge.compute(predictions=sample_predictions, references=sample_references)

{'rouge1': 0.8782051282051282,
 'rouge2': 0.6636363636363636,
 'rougeL': 0.8782051282051282,
 'rougeLsum': 0.8782051282051282}

### Import CSV files

In [27]:
import pandas as pd

# Reference prompts: read the test set and discard the topic columns in one chain.
reference_prompts = (
    pd.read_csv('./gemini_core_datasets/testing.csv')
    .drop(columns=['topic1', 'topic2', 'topic3'])
)

# Model outputs: one DataFrame per model, keyed by a short model name.
# The mapping below is read in order, so the dictionary keeps this ordering.
model_outputs = {
    name: pd.read_csv(path)
    for name, path in {
        'base_gemini': './tuned_results/base.gemini-1.0-pro-002.output.csv',

        # Disabled runs (kept for reference):
        # 'core_sdk_small': './tuned_results/dev_api/tuned.sdk.small.output.csv',
        # 'core_sdk_large': './tuned_results/dev_api/tuned.sdk.large.output.csv',
        # 'studio_small': './tuned_results/dev_api/tuned.studio.small.output.csv',
        # 'studio_large': './tuned_results/dev_api/tuned.studio.large.output.csv',

        'vertex_sdk_small': './tuned_results/vertex_api/tuned.vertex.sdk.small.output.csv',
        'vertex_sdk_large': './tuned_results/vertex_api/tuned.vertex.sdk.large.output.csv',
        'vertex_studio_small': './tuned_results/vertex_api/tuned.vertex.studio.small.output.csv',
        'vertex_studio_large': './tuned_results/vertex_api/tuned.vertex.studio.large.output.csv',
        'vertex_studio_large_w_validation': './tuned_results/vertex_api/tuned.vertex.studio.large.w.validation.output.csv',
    }.items()
}



In [28]:
# Print every loaded model-output frame, in the order the models were loaded.
for output_frame in model_outputs.values():
    print(output_frame)

      fname                                           dialogue  \
0    test_0  #Person1#: Ms. Dawson, I need you to take a di...   
1    test_1  #Person1#: You're finally here! What took so l...   
2    test_2  #Person1#: Kate, you never believe what's happ...   
3    test_3  #Person1#: Happy Birthday, this is for you, Br...   
4    test_4  #Person1#: This Olympic park is so big!\n#Pers...   
..      ...                                                ...   
95  test_95  #Person1#: Thank you. Steven. That was the mos...   
96  test_96  #Person1#: Bill, how can you hear so happy tod...   
97  test_97  #Person1#: I'd like to pay my bill now. \n#Per...   
98  test_98  #Person1#: Carol telephone.\n#Person2#: Who is...   
99  test_99  #Person1#: Hey, don't I know you from somewher...   

                                              summary  
0   ## Intra-Office Memorandum\n\n**To:** All Empl...  
1   #Person1#: I'm glad you've decided to stop dri...  
2   #Person2#: Wow, this is a big news.

In [29]:
# The tuned model did not respond to some prompts due to safety settings.
# For every model, drop those prompts from the model's outputs and build a
# matching, per-model copy of the reference prompts without them.
filtered_reference_prompts = {}

# {model_name: df}
# Iterate over a snapshot of the items because `model_outputs` is reassigned
# inside the loop.
for model_name, output_df in list(model_outputs.items()):
    # A prompt counts as "no response" when its summary contains the literal
    # text "FinishReason". A missing summary is treated the same way: without
    # `na=False` a NaN would propagate into the mask and break the boolean
    # indexing below, and a missing summary cannot be scored anyway.
    no_response_condition = (
        output_df['summary'].isna()
        | output_df['summary'].str.contains("FinishReason", regex=False, na=False)
    )

    # prompts with no responses
    fnames_to_drop = output_df.loc[no_response_condition, 'fname'].tolist()

    print(model_name, ' --- ', fnames_to_drop)

    # Filter out the rows that match the fname from both tables.
    # Each table is filtered on its OWN `fname` column: a mask built from
    # `output_df` is only valid for `reference_prompts` when both frames share
    # an identical index and row order, which nothing here guarantees.
    model_outputs[model_name] = output_df[~output_df['fname'].isin(fnames_to_drop)]
    filtered_reference_prompts[model_name] = reference_prompts[
        ~reference_prompts['fname'].isin(fnames_to_drop)
    ]



base_gemini  ---  ['test_23', 'test_46', 'test_65', 'test_67']
vertex_sdk_small  ---  ['test_16', 'test_41']
vertex_sdk_large  ---  ['test_41']
vertex_studio_small  ---  ['test_16', 'test_41', 'test_58', 'test_65']
vertex_studio_large  ---  ['test_30', 'test_41']
vertex_studio_large_w_validation  ---  ['test_41']


In [33]:
# Spot-check one model's frame after filtering: rows reported as dropped by the
# filtering cell should no longer appear here.
model_outputs['vertex_sdk_large']

Unnamed: 0,fname,dialogue,summary
0,test_0,"#Person1#: Ms. Dawson, I need you to take a di...",#Person1# asks Ms. Dawson to take a dictation ...
1,test_1,#Person1#: You're finally here! What took so l...,#Person2# got stuck in traffic again. #Person1...
2,test_2,"#Person1#: Kate, you never believe what's happ...",#Person1# tells Kate that Masha and Hero are g...
3,test_3,"#Person1#: Happy Birthday, this is for you, Br...",#Person1# gives Brian a present on his birthda...
4,test_4,#Person1#: This Olympic park is so big!\n#Pers...,#Person1# thinks the Olympic park is big. #Per...
...,...,...,...
95,test_95,#Person1#: Thank you. Steven. That was the mos...,Lin thanks Steven for the magnificent meal. Li...
96,test_96,"#Person1#: Bill, how can you hear so happy tod...",Bill tells #Person1# he made a move today and ...
97,test_97,#Person1#: I'd like to pay my bill now. \n#Per...,#Person2# helps Tom Wilson pay his bill with a...
98,test_98,#Person1#: Carol telephone.\n#Person2#: Who is...,#Person1# answers the phone for Carol. Carol i...


## Run ROUGE Evaluation

In [31]:

# Build, for every model, the two parallel lists that `rouge.compute` expects:
#   predictions_per_model[model_name] -> [summary, ...]
#   references_per_model[model_name]  -> [[summary1, summary2, summary3], ...]
predictions_per_model = {}
references_per_model = {}

REFERENCE_COLUMNS = ['summary1', 'summary2', 'summary3']

for model_name, output_df in model_outputs.items():
    # Use a loop-local name here. Rebinding the notebook-level
    # `reference_prompts` to a per-model filtered frame would make a re-run of
    # the filtering cell start from already-filtered data.
    model_references = filtered_reference_prompts[model_name]

    predictions = []
    references = []

    for _, row in output_df.iterrows():
        fname = row['fname']
        dialogue = row['dialogue']

        # Look the reference row up once (matching on both fname and dialogue)
        # and take all three reference summaries from it.
        matches = model_references.loc[
            (model_references['fname'] == fname)
            & (model_references['dialogue'] == dialogue),
            REFERENCE_COLUMNS,
        ]
        if matches.empty:
            raise ValueError(
                f"No reference row matches fname={fname!r} for model {model_name!r}"
            )

        predictions.append(row['summary'])
        references.append(matches.iloc[0].tolist())

    # Assign once per model, outside the row loop, so a model whose frame has
    # no rows still gets an (empty) entry instead of a KeyError later on.
    predictions_per_model[model_name] = predictions
    references_per_model[model_name] = references


print(predictions_per_model)
print()
print(references_per_model)



{'base_gemini': [['Ms. Dawson helps #Person1# to write a memo to inform every employee that they have to change the communication method and should not use Instant Messaging anymore.', 'In order to prevent employees from wasting time on Instant Message programs, #Person1# decides to terminate the use of those programs and asks Ms. Dawson to send out a memo to all employees by the afternoon.', 'Ms. Dawson takes a dictation for #Person1# about prohibiting the use of Instant Message programs in the office. They argue about its reasonability but #Person1# still insists.'], ['#Person2# arrives late because of traffic jam. #Person1# persuades #Person2# to use public transportations to keep healthy and to protect the environment.', "#Person2# decides to follow #Person1#'s suggestions on quitting driving to work and will try to use public transportations.", '#Person2# complains to #Person1# about the traffic jam, #Person1# suggests quitting driving and taking public transportation instead.'],

In [32]:
# Score all prompts against 3 reference responses for one score per model.
# `scores` maps each model name to the result returned by `rouge.compute`.
scores = {
    name: rouge.compute(
        predictions=predictions_per_model[name],
        references=references_per_model[name],
    )
    for name in model_outputs
}

# Report one line per model, in the order the models were loaded.
for name, model_scores in scores.items():
    print(name, ' --- ', model_scores)



base_gemini  ---  {'rouge1': 0.16186534895209137, 'rouge2': 0.0453878111544593, 'rougeL': 0.12434220411396402, 'rougeLsum': 0.1418452796600015}
vertex_sdk_small  ---  {'rouge1': 0.5267998115442215, 'rouge2': 0.28331536887828446, 'rougeL': 0.4583534222474014, 'rougeLsum': 0.457816011742501}
vertex_sdk_large  ---  {'rouge1': 0.5441469911506053, 'rouge2': 0.29809703686987343, 'rougeL': 0.47148420181215894, 'rougeLsum': 0.47011074167228645}
vertex_studio_small  ---  {'rouge1': 0.5282531249662937, 'rouge2': 0.26570070727725426, 'rougeL': 0.4463253283804182, 'rougeLsum': 0.4458456416613107}
vertex_studio_large  ---  {'rouge1': 0.5572866925825298, 'rouge2': 0.3126900180712632, 'rougeL': 0.4863236624057099, 'rougeLsum': 0.4860592699994632}
vertex_studio_large_w_validation  ---  {'rouge1': 0.554393529256747, 'rouge2': 0.3059111890892891, 'rougeL': 0.47734949923656433, 'rougeLsum': 0.47747216614091803}
