In [1]:
import pandas as pd
import numpy as np
import json
import csv
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sentence_transformers.util import cos_sim
from sentence_transformers import SentenceTransformer, util
from transformers import T5ForConditionalGeneration, T5Tokenizer, AutoModelForSeq2SeqLM, AutoTokenizer, BertTokenizer, BertForSequenceClassification, AdamW
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import warnings
import re
import pickle
import random
from rouge import Rouge

warnings.filterwarnings("ignore")




In [2]:
# Load dataset 
df = pd.read_csv("mle_screening_dataset.csv")
df.describe(), df.head()

(                                question  \
 count                              16406   
 unique                             14981   
 top     What causes Causes of Diabetes ?   
 freq                                  20   
 
                                                    answer  
 count                                               16401  
 unique                                              15811  
 top     This condition is inherited in an autosomal re...  
 freq                                                  348  ,
                          question  \
 0        What is (are) Glaucoma ?   
 1        What is (are) Glaucoma ?   
 2        What is (are) Glaucoma ?   
 3  Who is at risk for Glaucoma? ?   
 4       How to prevent Glaucoma ?   
 
                                               answer  
 0  Glaucoma is a group of diseases that can damag...  
 1  The optic nerve is a bundle of more than 1 mil...  
 2  Open-angle glaucoma is the most common form of...  
 3  Anyone ca

##### What we see over here is that there are only 5 (16406 total - 16401 answers) missing "answers". We remove them as they can mess up certain algorithms, such as consine similiarity. We also remove duplicate question answer pairs, but not duplicate questions or duplicate answers. 

In [3]:
# Data cleaning
df = df.dropna(subset=["answer"])
df = df.drop_duplicates()

##### I will be showcasing 2 styles of solutions:

1. RAG based Retriever
2. RAG based Generator

In [4]:
class qa_model():
    def __init__(self, dataframe = None):
        self.df = dataframe
        self.train_df, self.test_df = train_test_split(self.df, test_size=0.2, random_state=42)

        self.embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
        self.embeddings = self.train_df['answer'].tolist()
        self.embeddings_vectors = self.embedding_model.encode(self.embeddings, convert_to_tensor=True)
        
        with open("embeddings_vectors.pkl", "wb") as f:
            pickle.dump(self.embeddings_vectors, f)

        # self.tokenizer = T5Tokenizer.from_pretrained("t5-base")
        self.tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
        # self.model = T5ForConditionalGeneration.from_pretrained("t5-base").to("cpu")
        self.model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base").to("cpu")

    def predict_rag_retriever(self, question):
        question_embedding = self.embedding_model.encode(question, convert_to_tensor=True)
        results = util.semantic_search(question_embedding, self.embeddings_vectors, top_k=1)
        best = results[0][0]
        return self.embeddings[best['corpus_id']]
    
    def get_top_k_answers(self, query, k=3):
        query_embedding = self.embedding_model.encode(query, convert_to_tensor=True)
        results = util.semantic_search(query_embedding, self.embeddings_vectors, top_k=k)[0]
        return [self.embeddings[result['corpus_id']] for result in results]
    
    def generate_answer(self, question, contexts):
        context = " \n ".join(contexts)
        prompt = f"question: {question} context: {context}"
        input_ids = self.tokenizer(prompt, return_tensors="pt", truncation=True, padding=True, max_length=512).input_ids.to("cpu")
        outputs = self.model.generate(input_ids, max_length=128)
        
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    def predict_rag_generator(self, question):
        top_results = self.get_top_k_answers(question, k=3)
        return self.generate_answer(question, top_results)

    def predict(self, solution_type = None, question = None):
        if solution_type == 1:
            output = self.predict_rag_retriever(question)
        else:
            output = self.predict_rag_generator(question)
        return output

    def evaluate_rag_retriever(self):
        correct = 0
        total = 0
        threshold = 0.75

        for idx, row in self.test_df.iterrows():
            question = row['question']
            actual_answer = row['answer']
            predicted_answer = self.predict_rag_retriever(question)

        actual_embed = self.embedding_model.encode(actual_answer, convert_to_tensor=True)
        predicted_embed = self.embedding_model.encode(predicted_answer, convert_to_tensor=True)
        similarity = cos_sim(actual_embed, predicted_embed).item()

        if similarity >= threshold:
            correct += 1
        total += 1

        accuracy = correct / total if total > 0 else 0
        print(f"RAG Retriever Accuracy: {accuracy}")
        # return accuracy

    def evaluate_rag_generator(self, max_samples=100):
        rouge = Rouge()
        scores_list = []
    
        for idx, row in self.test_df.sample(n=min(max_samples, len(self.test_df)), random_state=42).iterrows():
            question = row['question']
            reference = row['answer']
            generated = self.predict_rag_generator(question)

            try:
                scores = rouge.get_scores(generated, reference)[0]
                scores_list.append(scores)
            except Exception as e:
                print(f"Failure at {idx}, {e}")
                continue

        avg_scores = {
            'rouge-1': {},
            'rouge-2': {},
            'rouge-l': {}
        }

        for metric in avg_scores.keys():
            avg_scores[metric] = {
                'p': sum(score[metric]['p'] for score in scores_list) / len(scores_list),
                'r': sum(score[metric]['r'] for score in scores_list) / len(scores_list),
                'f': sum(score[metric]['f'] for score in scores_list) / len(scores_list)
            }

        for metric, values in avg_scores.items():
            print(f"{metric.upper()}: Precision={values['p']}, Recall={values['r']}, F1={values['f']}")

        # return avg_scores
                

In [5]:
questions = [
    "What are the symptoms of Glaucoma?",
    "How is diabetes managed?",
    "What causes high blood pressure?"
]

solution_types = {
    1: "RAG Retriever",
    2: "RAG Generator"
}

nlp = qa_model(dataframe=df)

In [6]:
for sol_type, label in solution_types.items():
    print(f"\n=== {label} ===\n")
    for q in questions:
        answer = nlp.predict(solution_type=sol_type, question=q)
        print(f"Q: {q}\nA: {answer}\n")


=== RAG Retriever ===

Q: What are the symptoms of Glaucoma?
A: At first, open-angle glaucoma has no symptoms. It causes no pain. Vision seems normal. Without treatment, people with glaucoma will slowly lose their peripheral, or side vision. They seem to be looking through a tunnel. Over time, straight-ahead vision may decrease until no vision remains.

Q: How is diabetes managed?
A: Diabetes cannot be cured, but it can be managed. Managing blood glucose (blood sugar) as well as blood pressure and cholesterol is the best defense against the serious complications of diabetes. Know What To Do Every Day To manage your diabetes, here are things to do every day.  -  Take your medicines.   -  Keep track of your blood glucose (blood sugar).   -  Check your blood pressure if your doctor advises.   -  Check your feet.   -  Brush your teeth and floss.   -  Stop smoking.   -  Eat well.   -  Be active.  Take your medicines. Keep track of your blood glucose (blood sugar). Check your blood pressure

In [7]:
# nlp.predict(solution_type = 1, question = questions[0])

In [8]:
# nlp.predict(solution_type = 2, question = questions[2])

## Model Evaluation

##### The RAG Retriver uses cosine similiarity with a threshold of 0.8 to check if the actual result is close to the predicted result. 
##### The RAG Generator uses ROUGE, a package that is specifically designed for evaluating automatic summarization. 

##### ROUGE-1: captures how many individual words match
##### ROUGE-2: captures short phrase similarity
##### ROUGE-L: captures longest common subsequence

In [9]:
nlp.evaluate_rag_retriever()
nlp.evaluate_rag_generator()

RAG Retriever Accuracy: 1.0
ROUGE-1: Precision=0.43688479293895627, Recall=0.04596258721821942, F1=0.0748718223003693
ROUGE-2: Precision=0.19227596135472844, Recall=0.013829605253526748, F1=0.024074299463576823
ROUGE-L: Precision=0.429148973647645, Recall=0.0431345649216121, F1=0.0713189971977033
