In [6]:
# pip installs
!pip install rouge
!pip install bert_score



In [7]:
!pip install transformers
!pip install bert-score



In [8]:
# Imports
import os
from rouge import Rouge
from nltk import word_tokenize
from nltk.translate import meteor
import nltk
import bert_score
import pandas as pd
import warnings
import numpy as np
import matplotlib.pyplot as plt
warnings.filterwarnings("ignore")
nltk.download('punkt')
nltk.download('wordnet')
# for blue score
import nltk.translate.bleu_score as bleu
import math
import numpy
try:
  nltk.data.find('tokenizers/punkt')
except LookupError:
  nltk.download('punkt')
import pickle

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [9]:
# connect to drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Score functions
from transformers import BertTokenizer, BertForMaskedLM, BertModel
from bert_score import BERTScorer
def evaluate_sentence(candidate, reference):
  result = []
  candidate = str(candidate)
  reference = str(reference)
  # Calculate Rouge
  ROUGE = Rouge()
  score_rouge = ROUGE.get_scores(candidate, reference)

  # F-1 Score
  result.append(score_rouge[0]['rouge-1']['f'])
  result.append(score_rouge[0]['rouge-2']['f'])
  result.append(score_rouge[0]['rouge-l']['f'])

  # Calculate Bleu
  reference = word_tokenize(reference)
  candidate = word_tokenize(candidate)
  score_bleu = bleu.sentence_bleu([reference], candidate)
  result.append(score_bleu)

  # Calculate Meteor
  # score_meteor = meteor([candidate],reference)
  # result.append(score_meteor)

  # BERTScore calculation
  candidate = str(candidate)
  reference = str(reference)
  scorer = BERTScorer(model_type='bert-base-uncased')
  P, R, F1 = scorer.score([candidate], [reference])
  result.append(float(F1))

  return result


In [None]:
# Demo
hyp_sample = 'she read the book because she was interested in world history'
ref_sample = 'she was interested in world history because she read the book'

evaluate_sentence(hyp_sample, ref_sample)

[0.999999995,
 0.899999995,
 0.5999999950000001,
 0.7400828044922853,
 0.9461895227432251]

In [None]:
file_path = '/content/drive/My Drive/CSC490/CSC490 Project/results/test_gen_captions_blip.pkl'
with open(file_path, 'rb') as file:
    blip_captions = pickle.load(file)
print(blip_captions)

Mounted at /content/drive
[['a ct scan with a ct scan', ' Chest CT image in a bone window setting shows osteosclerotic lesions in the vertebral body of the thoracic spine arrow head and sternum white arrow as well as proliferative new bone formation over the right transverse process black arrow Bilateral pleural effusions and partial passive atelectasis of left basal lung are present'], ['a radiograph of the chest and a radiograph of the chest', ' a  linear radiopacity extended to the external jugular vein was observed on posteroanterior chest Xray b  0038inch  guidewire was observed on computed tomography and the distal part of the guidewire was detected in the external jugular vein'], ['a chest with a small chest and a small chest', '  15 month old boy presented with restlessness Bowel sounds were heard on auscultation in the left hemithorax Chest x ray examination confirmed the diagnosis of DH'], ['a picture of a man with a cervical surgery procedure', ' Top left patient chest wall 

In [None]:
# Evaluate generated captions from un-fine tuned blip
result_scores = []

for i in range(0, len(blip_captions)):
  result_scores.append(evaluate_sentence(blip_captions[i][0], blip_captions[i][1]))

result_df = pd.DataFrame(result_scores, columns=["rouge-1", "rouge-2", "rouge-l", "bleu", "bert-f1"])
result_df

Unnamed: 0,rouge-1,rouge-2,rouge-l,bleu,bert-f1
0,0.042553,0.000000,0.042553,1.808592e-234,0.736710
1,0.312500,0.055556,0.312500,6.781841e-156,0.812947
2,0.066667,0.000000,0.066667,2.068412e-232,0.786194
3,0.054054,0.000000,0.054054,1.697856e-233,0.723468
4,0.315789,0.111111,0.315789,6.373704e-155,0.848856
...,...,...,...,...,...
295,0.200000,0.000000,0.200000,8.020204e-232,0.859636
296,0.266667,0.153846,0.266667,1.656224e-155,0.784055
297,0.105263,0.043478,0.105263,1.886247e-158,0.767352
298,0.066667,0.026667,0.066667,3.264398e-156,0.757598


In [None]:
file_path = '/content/drive/My Drive/CSC490/CSC490 Project/results/test_blip_scores.pkl'

with open(file_path, 'wb') as file:
    pickle.dump(result_df, file)

In [None]:
# Evaluate generated captions from fine tuned BLIP
file_path = '/content/drive/My Drive/CSC490/CSC490 Project/results/test_gen_captions.pkl'
with open(file_path, 'rb') as file:
    ft_captions = pickle.load(file)
result_scores_ft = []

for i in range(0, len(ft_captions)):
  result_scores_ft.append(evaluate_sentence(ft_captions[i][0], ft_captions[i][1]))

result_df_ft = pd.DataFrame(result_scores_ft, columns=["rouge-1", "rouge-2", "rouge-l", "bleu", "bert-f1"])
file_path = '/content/drive/My Drive/CSC490/CSC490 Project/results/test_ft_scores.pkl'
with open(file_path, 'wb') as file:
    pickle.dump(result_df_ft, file)

In [None]:
# Evaluate generated captions from extra fine tuned BLIP
file_path = '/content/drive/My Drive/CSC490/CSC490 Project/results/test_gen_captions2.pkl'
with open(file_path, 'rb') as file:
    ex_captions = pickle.load(file)
result_scores_ex = []

for i in range(0, len(ft_captions)):
  result_scores_ex.append(evaluate_sentence(ex_captions[i][0], ex_captions[i][1]))

result_df_ex = pd.DataFrame(result_scores_ex, columns=["rouge-1", "rouge-2", "rouge-l", "bleu", "bert-f1"])
file_path = '/content/drive/My Drive/CSC490/CSC490 Project/results/test_ex_scores.pkl'
with open(file_path, 'wb') as file:
    pickle.dump(result_df_ex, file)

In [None]:
# Save as new csv file
path = '/content/drive/My Drive/CSC490/CSC490 Project/results/validation_result_scores.csv'
result_df.to_csv(path, index=False)

In [None]:
# BLIP
bleu_mean = result_df['bleu'].mean()
rouge1_mean = result_df['rouge-1'].mean()
rouge2_mean = result_df['rouge-2'].mean()
rougel_mean = result_df['rouge-l'].mean()
meteor_mean = result_df['bert-f1'].mean()

print(f'bleu mean: {bleu_mean}')
print(f'rouge-1 mean: {rouge1_mean}')
print(f'rouge-2 mean: {rouge2_mean}')
print(f'rouge-l mean: {rougel_mean}')
print(f'bert-f1 mean: {meteor_mean}')

bleu mean: 0.00017367193482808075
rouge-1 mean: 0.12037195249593999
rouge-2 mean: 0.016953815410902042
rouge-l mean: 0.10744364179467619
bert-f1 mean: 0.7878416933616003


In [None]:
# FT-1
bleu_mean = result_df_ft['bleu'].mean()
rouge1_mean = result_df_ft['rouge-1'].mean()
rouge2_mean = result_df_ft['rouge-2'].mean()
rougel_mean = result_df_ft['rouge-l'].mean()
meteor_mean = result_df_ft['bert-f1'].mean()

print(f'bleu mean: {bleu_mean}')
print(f'rouge-1 mean: {rouge1_mean}')
print(f'rouge-2 mean: {rouge2_mean}')
print(f'rouge-l mean: {rougel_mean}')
print(f'bert-f1 mean: {meteor_mean}')

bleu mean: 0.011614946360723888
rouge-1 mean: 0.22050356925164663
rouge-2 mean: 0.06722715073350484
rouge-l mean: 0.19514539559894292
bert-f1 mean: 0.834068706035614


In [None]:
#FT-2
bleu_mean = result_df_ex['bleu'].mean()
rouge1_mean = result_df_ex['rouge-1'].mean()
rouge2_mean = result_df_ex['rouge-2'].mean()
rougel_mean = result_df_ex['rouge-l'].mean()
meteor_mean = result_df_ex['bert-f1'].mean()

print(f'bleu mean: {bleu_mean}')
print(f'rouge-1 mean: {rouge1_mean}')
print(f'rouge-2 mean: {rouge2_mean}')
print(f'rouge-l mean: {rougel_mean}')
print(f'bert-f1 mean: {meteor_mean}')

bleu mean: 0.018605384719434303
rouge-1 mean: 0.241023432689097
rouge-2 mean: 0.07960306232090125
rouge-l mean: 0.21189371155871084
bert-f1 mean: 0.8405693197250366


In [None]:
# max bleu
bleu_max = result_df['bleu'].max()
max_index = result_df['bleu'].idxmax()

print(f'The max bleu score is {bleu_max} and its index is {max_index}')

The max bleu score is 0.861173529963367 and its index is 303


In [None]:
# min bleu
bleu_min = result_df['bleu'].min()
min_index = result_df['bleu'].idxmin()

print(f'The max bleu score is {bleu_min} and its index is {min_index}')

The max bleu score is 0.23152888964854848 and its index is 680


In [None]:
# Calculate BERT score
from transformers import BertTokenizer, BertForMaskedLM, BertModel
from bert_score import BERTScorer

def calculate_bert(candidate, reference):
  candidate = str(candidate)
  reference = str(reference)
  result = []
  # BERTScore calculation
  scorer = BERTScorer(model_type='bert-base-uncased')
  P, R, F1 = scorer.score([candidate], [reference])
  # print(f"BERTScore Precision: {P.mean():.4f}, Recall: {R.mean():.4f}, F1: {F1.mean():.4f}")
  result.append(P)
  result.append(R)
  result.append(F1)
  return result

In [None]:
# Evaluate generated captions
result_bert= []

for i in range(0, len(blip_captions)):
  result_bert.append(calculate_bert(blip_captions[i][0], blip_captions[i][1]))

result_bert = pd.DataFrame(result_bert, columns=["precision", "recall", "f1"])
result_bert

Unnamed: 0,precision,recall,f1
0,[tensor(0.5910)],[tensor(0.3046)],[tensor(0.4020)]
1,[tensor(0.5747)],[tensor(0.4065)],[tensor(0.4762)]
2,[tensor(0.4477)],[tensor(0.3572)],[tensor(0.3974)]
3,[tensor(0.4055)],[tensor(0.2772)],[tensor(0.3293)]
4,[tensor(0.4817)],[tensor(0.4433)],[tensor(0.4617)]
...,...,...,...
295,[tensor(0.6507)],[tensor(0.5082)],[tensor(0.5707)]
296,[tensor(0.5352)],[tensor(0.4427)],[tensor(0.4846)]
297,[tensor(0.5563)],[tensor(0.4007)],[tensor(0.4658)]
298,[tensor(0.4933)],[tensor(0.2659)],[tensor(0.3455)]


In [None]:
# Save as new csv file
path = '/content/drive/My Drive/CSC490/CSC490 Project/results/validation_result_bert.csv'
result_bert.to_csv(path, index=False)

In [None]:
bert_mean = result_bert['f1'].mean()
bert_sd = result_bert['f1'].std()
print(bert_mean)
print(bert_sd)

0.7650656387209892
0.06630050394562262


In [None]:
# max bert
result_bert['f1'] = result_bert['f1'].astype(float)
bleu_max = result_bert['f1'].max()
max_index = result_bert['f1'].idxmax()

print(f'The max bert score is {bleu_max} and its index is {max_index}')

# min bert
bert_min = result_bert['f1'].min()
min_index = result_bert['f1'].idxmin()

print(f'The min bert score is {bleu_min} and its index is {min_index}')

The max bert score is 0.9207987189292908 and its index is 660
The max bert score is 0.23152888964854848 and its index is 344


In [11]:
# get saved df
file_path = '/content/drive/My Drive/CSC490/CSC490 Project/results/test_blip_scores.pkl'
file_path2 = '/content/drive/My Drive/CSC490/CSC490 Project/results/test_ft_scores.pkl'
file_path3 = '/content/drive/My Drive/CSC490/CSC490 Project/results/test_ex_scores.pkl'
with open(file_path, 'rb') as file:
  result_df = pickle.load(file)

with open(file_path2, 'rb') as file:
  result_df_ft = pickle.load(file)

with open(file_path3, 'rb') as file:
  result_df_ex = pickle.load(file)

In [12]:
# Standard Deviation BLIP
bleu_mean = result_df['bleu'].std()
rouge1_mean = result_df['rouge-1'].std()
rouge2_mean = result_df['rouge-2'].std()
rougel_mean = result_df['rouge-l'].std()
meteor_mean = result_df['bert-f1'].std()

print(f'bleu mean: {bleu_mean}')
print(f'rouge-1 mean: {rouge1_mean}')
print(f'rouge-2 mean: {rouge2_mean}')
print(f'rouge-l mean: {rougel_mean}')
print(f'bert-f1 mean: {meteor_mean}')

bleu mean: 0.0030080861497102664
rouge-1 mean: 0.08868070107548576
rouge-2 mean: 0.03454729194988594
rouge-l mean: 0.08031711478182002
bert-f1 mean: 0.041277374700074206


In [13]:
# Standard Deviation FT-1
bleu_mean = result_df_ft['bleu'].std()
rouge1_mean = result_df_ft['rouge-1'].std()
rouge2_mean = result_df_ft['rouge-2'].std()
rougel_mean = result_df_ft['rouge-l'].std()
meteor_mean = result_df_ft['bert-f1'].std()

print(f'bleu mean: {bleu_mean}')
print(f'rouge-1 mean: {rouge1_mean}')
print(f'rouge-2 mean: {rouge2_mean}')
print(f'rouge-l mean: {rougel_mean}')
print(f'bert-f1 mean: {meteor_mean}')

bleu mean: 0.0534250197049724
rouge-1 mean: 0.11902576354642251
rouge-2 mean: 0.09127935545439869
rouge-l mean: 0.11376627491391615
bert-f1 mean: 0.037013537581050186


In [14]:
# Standard Deviation FT-2
bleu_mean = result_df_ex['bleu'].std()
rouge1_mean = result_df_ex['rouge-1'].std()
rouge2_mean = result_df_ex['rouge-2'].std()
rougel_mean = result_df_ex['rouge-l'].std()
meteor_mean = result_df_ex['bert-f1'].std()

print(f'bleu mean: {bleu_mean}')
print(f'rouge-1 mean: {rouge1_mean}')
print(f'rouge-2 mean: {rouge2_mean}')
print(f'rouge-l mean: {rougel_mean}')
print(f'bert-f1 mean: {meteor_mean}')

bleu mean: 0.059443337116386274
rouge-1 mean: 0.12555687635884366
rouge-2 mean: 0.0967447915430078
rouge-l mean: 0.12574640695570843
bert-f1 mean: 0.041122845919076084
