### Automated testing

In [1]:
import pandas as pd
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from bert_score import BERTScorer

# Location of the medical Q&A workbook; edit this if your copy lives somewhere else
excel_file_path = 'Data cleaning/medicalQnA.xlsx'

# Load the workbook into a DataFrame (pandas reads the first sheet by default)
df = pd.read_excel(io=excel_file_path)

In [2]:
# Preview the DataFrame loaded from the Excel workbook (pandas truncates the middle rows)
df

Unnamed: 0,answer,question,url,tags
0,adhd and bipolar mood disorder (bmd) can coexi...,my 5 1/2-year-old son displays adhd symptoms f...,http://answers.webmd.com/answers/1194205/my-5-...,['attention deficit hyperactivity disorder']
1,stimulants in general tend to decrease appetit...,my son has add and mild autism. he has been su...,http://answers.webmd.com/answers/1194206/my-so...,"['autism', 'weight loss']"
2,while any of the stimulant medications can inc...,my son is 13 and is depressed. he has been tak...,http://answers.webmd.com/answers/1198557/my-so...,[]
3,seventy percent of teens diagnosed when they a...,my 17-year-old has stopped taking concerta aft...,http://answers.webmd.com/answers/1195455/my-17...,['attention deficit hyperactivity disorder']
4,try claritin-d which is located behind the pha...,i've been taking respa-ar for allergies. i can...,http://answers.webmd.com/answers/1182576/i-ve-...,['allergy']
...,...,...,...,...
29747,get it confirmed by doing venous doppler of bo...,pain in legs – varicose veins?,https://questiondoctors.com/question-pain-in-l...,[]
29748,hi dear. there are very less chances of pregna...,headaches really tired all the time feeling na...,https://questiondoctors.com/question-headaches...,"[""don't know if its too early to take a pregna..."
29749,i’m a radiologist. may be an mri misread. when...,46 year old male had stroke on right side of b...,https://questiondoctors.com/question-46-year-o...,['stroke on right side of brain']
29750,could be reaction to oxycodone…. ”the most fre...,46 year old male had stroke on right side of b...,https://questiondoctors.com/question-46-year-o...,['stroke on right side of brain']


In [3]:
# Path to the text file of generated answers, written as "question: ..." / "answer: ..." lines
file_path = '5_11_random_subset_1.txt'

# Line markers recognised by the parser (matched after surrounding whitespace is stripped)
QUESTION_PREFIX = 'question:'
ANSWER_PREFIX = 'answer:'


def parse_qa_lines(lines):
    """Build a ``{question: answer}`` dict from "question:" / "answer:" lines.

    Args:
        lines: Iterable of text lines (for example an open text file).

    Returns:
        dict mapping each question text to its answer text. Several "answer:"
        lines following one question are joined with a single space. A question
        that has no "answer:" line maps to ``None``. If a question appears more
        than once, the last occurrence wins.

    Lines that start with neither marker are ignored.
    """
    parsed = {}
    current_question = None
    current_answer = None

    for raw_line in lines:
        line = raw_line.strip()  # Remove leading/trailing white spaces

        if line.startswith(QUESTION_PREFIX):
            # A new question closes the record that was being collected
            if current_question:
                parsed[current_question] = current_answer
            # Slice by the exact marker that was matched, then strip, so that
            # "question:foo" and "question:  foo" both yield "foo"
            current_question = line[len(QUESTION_PREFIX):].strip()
            current_answer = None  # Reset the current answer
        elif line.startswith(ANSWER_PREFIX):
            answer_part = line[len(ANSWER_PREFIX):].strip()
            if current_answer:
                current_answer += " " + answer_part  # Append to the current answer
            else:
                current_answer = answer_part

    # Add the last question and answer to the dictionary
    if current_question:
        parsed[current_question] = current_answer

    return parsed


# Parse the file line by line; the resulting dictionary is used by the scoring cell
with open(file_path, 'r') as file:
    qa_dict = parse_qa_lines(file)

# # Print the dictionary
# for question, answer in qa_dict.items():
#     print(f"Question: {question}")
#     print(f"Answer: {answer}")
#     print()

In [4]:
# Maximum number of question/answer pairs to score; adjust as needed
MAX_SAMPLES = 5

# Initialize the BERTScorer
scorer = BERTScorer(lang="en")  # "en" for English, you can use other languages as well

# Create a smoothing function for BLEU. `method1` (used below) adds a small epsilon to
# n-gram precisions that have zero matches (add-epsilon smoothing, not Laplace).
smoother = SmoothingFunction()

# Running totals: number of pairs scored so far, summed BERTScore F1, summed BLEU
start = 0
bertScore = 0
bleuScore = 0

for question, generated_text in qa_dict.items():
    if start >= MAX_SAMPLES:
        break

    # The parser stores None for a question without an answer; nothing to score then
    if not generated_text:
        print(f"Skipping (no generated answer): {question}")
        continue

    # Look up the reference answer; when a question occurs several times in the
    # dataset the first matching row is used
    matches = df.loc[df["question"] == question, "answer"]
    if matches.empty or not isinstance(matches.iloc[0], str):
        print(f"Skipping (no reference answer in dataset): {question}")
        continue
    reference_text = matches.iloc[0]

    # Calculate BERTScore
    precision, recall, f1 = scorer.score([generated_text], [reference_text])

    # Print the BERTScore values
    print(f"BERTScore: {f1.item():.4f}")

    # Calculate BLEU Score on whitespace-separated tokens
    generated_tokens = generated_text.split()
    reference_tokens = reference_text.split()
    bleu = sentence_bleu([reference_tokens], generated_tokens, smoothing_function=smoother.method1)

    # Print the BLEU Score
    print(f"BLEU Score: {bleu:.4f}")

    bertScore += f1.item()
    bleuScore += bleu
    start += 1

# Average over the number of pairs actually scored, which can be fewer than MAX_SAMPLES
if start:
    print("Average BERTScore:", bertScore / start)
    print("Average BLEU Score:", bleuScore / start)
else:
    print("No question/answer pairs could be scored.")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore: 0.8389
BLEU Score: 0.0215
BERTScore: 0.8120
BLEU Score: 0.0060
BERTScore: 0.8126
BLEU Score: 0.0000
BERTScore: 0.8097
BLEU Score: 0.0000
BERTScore: 0.8429
BLEU Score: 0.0101
Average BERTScore: 0.8232302069664001
Average BLEU Score: 0.007524031162565327
Exact Match (Accuracy): 0.0


### Human evaluation

In [11]:
import pandas as pd
import random

# Placeholder ratings only: the values below are randomly generated example data,
# not scores collected from human annotators.
NUM_SAMPLES = 5

# Dedicated, seeded generator so that re-running the cell reproduces the same table
# without touching the global `random` state
rng = random.Random(42)

# Inclusive (low, high) integer range for each rating column
rating_ranges = {
    'correctness': (5, 10),
    'coherence': (1, 5),
    'fluency': (1, 5),
}

# randint samples every integer in the range with equal probability
# (round(uniform(a, b)) would under-sample both endpoints)
data = {
    column: [rng.randint(low, high) for _ in range(NUM_SAMPLES)]
    for column, (low, high) in rating_ranges.items()
}

# Create a DataFrame from the dictionary
df = pd.DataFrame(data)

# Print the resulting DataFrame
print(df)

# Save the table to an Excel file in the working directory (the index is written as the first column)
df.to_excel("output.xlsx")

   correctness  coherence  fluency
0            8          4        2
1            8          3        2
2            9          5        2
3            7          2        5
4            6          4        3


In [8]:
# Display whatever `df` currently refers to.
# NOTE(review): the saved output below has 10 rows and an extra "Unnamed: 0" column, and this
# cell's execution count (8) is lower than the preceding cell's (11), so the output was not
# produced from the 5-row frame built above. Re-run the notebook top to bottom to refresh it.
df

Unnamed: 0,correctness,coherence,fluency
0,7,4,3
1,5,3,4
2,9,3,3
3,9,5,5
4,10,5,4
5,8,2,3
6,9,2,2
7,10,4,5
8,5,1,4
9,7,5,1
