# ROUGE (Recall-Oriented Understudy for Gisting Evaluation)

## Install packages

In [22]:
!pip install evaluate
!pip install rouge-score
!pip install pandas



## Load the rouge metric

In [None]:
# Load the ROUGE metric through the `evaluate` library. The resulting `rouge`
# object is what the later cells call `.compute(...)` on.
import evaluate
rouge = evaluate.load("rouge")

  from .autonotebook import tqdm as notebook_tqdm


## Provide predictions and references

### Sample of `rouge.compute`

In [None]:
# One prediction per prompt:
# ['prompt1 tuned response', 'prompt2 tuned response']
sample_predictions = [
    'the cow jumped over the moon',
    'the dish ran away with the spoon',
]

# One list of reference responses per prompt, in the same order as the predictions:
# [ ['prompt1 reference response 1', 'prompt1 reference response 2'],
#   ['prompt2 reference response 1', 'prompt2 reference response 2'] ]
sample_references = [
    ['the cow lept over the moon', 'the cow went over the moon'],
    ['the plate dashed away with the spoon', 'the dish ran with the spoon'],
]

rouge.compute(
    predictions=sample_predictions,
    references=sample_references,
)

{'rouge1': 0.8782051282051282,
 'rouge2': 0.6636363636363636,
 'rougeL': 0.8782051282051282,
 'rougeLsum': 0.8782051282051282}

### Import CSV files

In [None]:
import pandas as pd

# Reference data: read the CSV and discard the three topic columns in one step.
reference_prompts = pd.read_csv('dialogsum.test.truncated.csv').drop(
    columns=['topic1', 'topic2', 'topic3']
)

# Output CSV of every evaluated model, keyed by the short name used in later cells.
model_output_paths = {
    'base_gemini': './tuned_results/base.gemini-1.0-pro-002.output.csv',

    'core_sdk_small': './tuned_results/dev_api/tuned.sdk.small.output.csv',
    'core_sdk_large': './tuned_results/dev_api/tuned.sdk.large.output.csv',
    'studio_small': './tuned_results/dev_api/tuned.studio.small.output.csv',
    'studio_large': './tuned_results/dev_api/tuned.studio.large.output.csv',

    'vertex_sdk_small': './tuned_results/vertex_api/tuned.vertex.sdk.small.output.csv',
    'vertex_sdk_large': './tuned_results/vertex_api/tuned.vertex.sdk.large.output.csv',
    'vertex_studio_small': './tuned_results/vertex_api/tuned.vertex.studio.small.output.csv',
    'vertex_studio_large': './tuned_results/vertex_api/tuned.vertex.studio.large.output.csv',
}

# Load each file into a DataFrame; the dict keeps the order of the table above.
model_outputs = {
    name: pd.read_csv(csv_path)
    for name, csv_path in model_output_paths.items()
}



In [None]:
# Print every loaded model-output DataFrame, in the order the models were added.
for model_df in model_outputs.values():
    print(model_df)

      fname                                           dialogue  \
0    test_0  #Person1#: Ms. Dawson, I need you to take a di...   
1    test_1  #Person1#: You're finally here! What took so l...   
2    test_2  #Person1#: Kate, you never believe what's happ...   
3    test_3  #Person1#: Happy Birthday, this is for you, Br...   
4    test_4  #Person1#: This Olympic park is so big!\n#Pers...   
..      ...                                                ...   
95  test_95  #Person1#: Thank you. Steven. That was the mos...   
96  test_96  #Person1#: Bill, how can you hear so happy tod...   
97  test_97  #Person1#: I'd like to pay my bill now. \n#Per...   
98  test_98  #Person1#: Carol telephone.\n#Person2#: Who is...   
99  test_99  #Person1#: Hey, don't I know you from somewher...   

                                              summary  
0   ## Intra-Office Memorandum\n\n**To**: All Empl...  
1   This is a good example of a conversation betwe...  
2   The excerpt you have provided is a 

In [None]:
# The tuned model did not respond to some prompts due to safety settings.
# Such rows carry the text "FinishReason" in their `summary` column; a missing
# summary is treated as "no response" as well. Those prompts are removed from
# the model's outputs and from that model's copy of the reference prompts.
#
# NOTE: this cell overwrites the entries of `model_outputs` with the filtered
# frames, so running it a second time prints empty lists.
filtered_reference_prompts = {}

# model_outputs maps {model_name: df}
for model_name, output_df in model_outputs.items():
    summaries = output_df['summary']
    no_response_condition = summaries.isna() | summaries.str.contains(
        "FinishReason", regex=False, na=False
    )

    # fnames of the prompts that got no response from this model
    fnames_to_drop = output_df.loc[no_response_condition, 'fname'].tolist()

    print(model_name, ' --- ', fnames_to_drop)

    # Build a separate mask for each table from its own `fname` column.
    # Re-using the output frame's mask on `reference_prompts` would only be
    # correct if both frames shared exactly the same index and row order.
    keep_outputs = ~output_df['fname'].isin(fnames_to_drop)
    keep_references = ~reference_prompts['fname'].isin(fnames_to_drop)

    model_outputs[model_name] = output_df[keep_outputs]
    filtered_reference_prompts[model_name] = reference_prompts[keep_references]



base_gemini  ---  ['test_16', 'test_65']
core_sdk_small  ---  ['test_16', 'test_37', 'test_41', 'test_82']
core_sdk_large  ---  ['test_16', 'test_30', 'test_41', 'test_58', 'test_59']
studio_small  ---  ['test_16', 'test_30', 'test_41', 'test_82']
studio_large  ---  ['test_16', 'test_41', 'test_58', 'test_59']
vertex_sdk_small  ---  ['test_16', 'test_37', 'test_41', 'test_58', 'test_59']
vertex_sdk_large  ---  ['test_16', 'test_17', 'test_28', 'test_41']
vertex_studio_small  ---  ['test_16', 'test_41', 'test_58']
vertex_studio_large  ---  ['test_41', 'test_58', 'test_92']


In [None]:
# Display the dict of per-model DataFrames currently held in `model_outputs`.
model_outputs

{'base_gemini':       fname                                           dialogue  \
 0    test_0  #Person1#: Ms. Dawson, I need you to take a di...   
 1    test_1  #Person1#: You're finally here! What took so l...   
 2    test_2  #Person1#: Kate, you never believe what's happ...   
 3    test_3  #Person1#: Happy Birthday, this is for you, Br...   
 4    test_4  #Person1#: This Olympic park is so big!\n#Pers...   
 ..      ...                                                ...   
 95  test_95  #Person1#: Thank you. Steven. That was the mos...   
 96  test_96  #Person1#: Bill, how can you hear so happy tod...   
 97  test_97  #Person1#: I'd like to pay my bill now. \n#Per...   
 98  test_98  #Person1#: Carol telephone.\n#Person2#: Who is...   
 99  test_99  #Person1#: Hey, don't I know you from somewher...   
 
                                               summary  
 0   ## Intra-Office Memorandum\n\n**To**: All Empl...  
 1   This is a good example of a conversation betwe...  
 2   The

## Run Rouge Evaluation

In [None]:

# For every model, collect the predicted summaries and, in the same order, the
# three reference summaries of each prompt. Both dicts are keyed by model name
# and are consumed by the scoring cell.
predictions_per_model = {}
references_per_model = {}

reference_columns = ['summary1', 'summary2', 'summary3']

for model_name, output_df in model_outputs.items():
  predictions = []
  references = []
  # Deliberately NOT named `reference_prompts`: rebinding that notebook-level
  # name here would leave it pointing at the last model's filtered frame.
  model_references = filtered_reference_prompts[model_name]

  for _, row in output_df.iterrows():
    predictions.append(row['summary'])

    # Find the reference row once, matching on both fname and dialogue.
    is_match = (
        (model_references['fname'] == row['fname'])
        & (model_references['dialogue'] == row['dialogue'])
    )
    matching_rows = model_references.loc[is_match, reference_columns]

    # .iloc[0] raises IndexError when no reference row matches.
    references.append(matching_rows.iloc[0].tolist())

  # Store once per model, after the row loop, so that a model whose frame is
  # empty still gets an (empty) entry instead of a later KeyError.
  predictions_per_model[model_name] = predictions
  references_per_model[model_name] = references



print(predictions_per_model)
print()
print(references_per_model)


{'base_gemini': ['## Intra-Office Memorandum\n\n**To**: All Employees\n\n**From**: [Your Name]\n\n**Date**: October 26, 2023\n\n**Subject**: New Policy Regarding Instant Messaging\n\nEffective immediately, all office communications are restricted to email correspondence and official memos. The use of Instant Message programs by employees during working hours is strictly prohibited.\n\n**Scope**: This policy applies to all communications within the organization, including communications between employees and external parties.\n\n**Reason for Policy**: This policy is being implemented to ensure efficient and professional communication within the organization. Instant Messaging can be disruptive and distracting, and it can also lead to misunderstandings and miscommunications.\n\n**Enforcement**: Any employee who violates this policy will be subject to disciplinary action, up to and including termination of employment.\n\n**Questions**: Any questions regarding this policy should be directe

In [None]:
# Score all prompts of each model against their 3 reference responses,
# keeping one `rouge.compute` result per model name.
scores = {
    name: rouge.compute(
        predictions=predictions_per_model[name],
        references=references_per_model[name],
    )
    for name in model_outputs
}

# Report the result for every model.
for name, result in scores.items():
    print(name, ' --- ', result)



{'base_gemini': {'rouge1': [0.2061855670103093, 0.16161616161616163, 0.22222222222222224, 0.10714285714285714, 0.16216216216216217, 0.16724738675958187, 0.07999999999999999, 0.29885057471264365, 0.04519774011299435, 0.14285714285714285, 0.08547008547008549, 0.13636363636363635, 0.0457516339869281, 0.2790697674418605, 0.13836477987421386, 0.3333333333333333, 0.12, 0.0, 0.1, 0.22222222222222218, 0.10112359550561797, 0.1487603305785124, 0.078125, 0.07407407407407407, 0.288659793814433, 0.14193548387096774, 0.13186813186813187, 0.1477272727272727, 0.12903225806451613, 0.1142857142857143, 0.05620608899297424, 0.1348314606741573, 0.08415841584158416, 0.05673758865248227, 0.2411347517730496, 0.20588235294117643, 0.05208333333333333, 0.09322033898305085, 0.2119205298013245, 0.11111111111111112, 0.1342281879194631, 0.25, 0.06483790523690773, 0.1423948220064725, 0.10149253731343284, 0.12, 0.12454212454212454, 0.05423728813559322, 0.1641025641025641, 0.2608695652173913, 0.18705035971223022, 0.102