In [None]:
from qdrant_client import models, QdrantClient
from langchain.embeddings import HuggingFaceEmbeddings, OpenAIEmbeddings, HuggingFaceBgeEmbeddings
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.vectorstores.qdrant import Qdrant
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI, ChatAnthropic
from langchain import PromptTemplate
#
from tqdm.auto import tqdm
from uuid import uuid4
from transformers import pipeline
from getpass import getpass
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import openai
import tiktoken
#
import os
import json
#
import io
#
import mlflow
import itertools

**First, let's start MLflow. Launch a terminal and run `mlflow server --host 127.0.0.1 --port 8080` before running the cells below. If you need a different IP address or port, change it in the command and in the `mlflow.set_tracking_uri(...)` call in the next cell.**

In [90]:
# Ask for the API keys interactively so they never appear in the notebook source
for env_var, prompt_text in (
    ('OPENAI_API_KEY', "Enter Openai key:"),
    ('ANTHROPIC_API_KEY', "Enter Anthropic key:"),
):
    os.environ[env_var] = getpass(prompt_text)

# Cache location exposed to sentence-transformers through SENTENCE_TRANSFORMERS_HOME
os.environ['SENTENCE_TRANSFORMERS_HOME'] = '/mnt/data/RAG-mktg/model_cache/'

# Change the IP address and port if you are using a different one
mlflow.set_tracking_uri("http://127.0.0.1:8080")

In [102]:
# Read the disease CSV; the "link" column is passed to the loader as each document's source
csv_path = '/mnt/code/data/disease_components.csv'
loader = CSVLoader(file_path=csv_path, source_column="link")
data = loader.load()

In [None]:
# Registry of vector stores, keyed by embedding model name (filled by compute_embedding)
embeddings_dict = {}

# Split the loaded documents into parallel lists of metadata and text
metadatas = [doc.metadata for doc in data]
texts = [doc.page_content for doc in data]
print(len(metadatas),len(texts))

In [None]:
# Prompt used by the RetrievalQA chain built in get_qa_chain:
#   {context}  - filled with the retrieved documents
#   {question} - the user question, wrapped in triple backticks
# The model is told to admit when it does not know and to quote supporting text.
prompt_template = """Use the following pieces of context to answer the question enclosed within  3 backticks at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.
Please provide an answer which is factually correct and based on the information retrieved from the vector store.
Please also mention any quotes supporting the answer if any present in the context supplied within two double quotes "" .

{context}

QUESTION:```{question}```
ANSWER:
"""
PROMPT = PromptTemplate(template=prompt_template, input_variables=["context","question"])
# Chain keyword arguments carrying the prompt; get_qa_chain builds an equivalent dict inline
chain_type_kwargs = {"prompt": PROMPT}

In [None]:
def get_qa_chain(llm_name, embedding_model_name='OpenAI'):
    """Build a RetrievalQA chain for one LLM / embedding-store combination.

    Args:
        llm_name: 'Anthropic' selects ChatAnthropic and 'gpt-4' selects the
            OpenAI gpt-4 chat model. Any other value (including 'OpenAI')
            falls back to gpt-3.5-turbo-16k.
        embedding_model_name: Key of the vector store in ``embeddings_dict``,
            as registered by ``compute_embedding``.

    Returns:
        A "stuff" RetrievalQA chain that retrieves the top 5 documents,
        answers with ``PROMPT`` and also returns the source documents.

    Raises:
        ValueError: If no vector store has been built for
            ``embedding_model_name``.
    """
    doc_store = embeddings_dict.get(embedding_model_name)
    if doc_store is None:
        # Fail early with an actionable message instead of an AttributeError
        # on None after an LLM client has already been constructed.
        raise ValueError(
            f"No vector store for embedding model {embedding_model_name!r}; "
            f"run compute_embedding({embedding_model_name!r}) first. "
            f"Available stores: {sorted(embeddings_dict)}"
        )

    # All LLMs run at temperature 0 so that runs are as repeatable as possible.
    if llm_name == 'Anthropic':
        rag_llm = ChatAnthropic(temperature=0,
                                anthropic_api_key=os.environ["ANTHROPIC_API_KEY"])
    elif llm_name == 'gpt-4':
        rag_llm = ChatOpenAI(model_name='gpt-4',
                             openai_api_key=os.environ["OPENAI_API_KEY"],
                             temperature=0)
    else:
        # Default branch: every other name maps to the gpt-3.5 16k chat model.
        rag_llm = ChatOpenAI(model_name='gpt-3.5-turbo-16k',
                             openai_api_key=os.environ["OPENAI_API_KEY"],
                             temperature=0)

    return RetrievalQA.from_chain_type(llm=rag_llm,
                                       chain_type="stuff",
                                       chain_type_kwargs={"prompt": PROMPT},
                                       retriever=doc_store.as_retriever(search_kwargs={"k": 5}),
                                       return_source_documents=True
                                      )

In [None]:
def compute_embedding(embedding_model_name='HuggingFace'):
    """Embed the corpus with the chosen model and register the resulting store.

    Builds an in-memory Qdrant collection from the module-level ``texts`` and
    ``metadatas`` and saves it in ``embeddings_dict`` under
    ``embedding_model_name``, replacing any store already saved under that key.

    Args:
        embedding_model_name: 'OpenAI' uses OpenAIEmbeddings and 'BGE' uses
            BAAI/bge-small-en. Any other value falls back to
            HuggingFaceEmbeddings with its default model.

    Returns:
        None. The store is published through ``embeddings_dict``.
    """
    # Local HuggingFace models run on CPU and emit normalized vectors.
    model_kwargs = {'device': 'cpu'}
    encode_kwargs = {'normalize_embeddings': True}

    if embedding_model_name == 'OpenAI':
        embeddings = OpenAIEmbeddings()
    elif embedding_model_name == 'BGE':
        embeddings = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-small-en",
                                              model_kwargs=model_kwargs,
                                              encode_kwargs=encode_kwargs)
    else:
        embeddings = HuggingFaceEmbeddings(model_kwargs=model_kwargs,
                                           encode_kwargs=encode_kwargs)

    # The keyword is `collection_name`. The previous `collection=` spelling
    # was not a named parameter of Qdrant.from_texts, so the intended
    # collection name was not applied.
    store = Qdrant.from_texts(texts,
                              metadatas=metadatas,
                              embedding=embeddings,
                              location=":memory:",
                              prefer_grpc=True,
                              collection_name=f"{embedding_model_name}_medical_qa_search")

    embeddings_dict[embedding_model_name] = store

In [None]:
# Search parameters for the evaluation
# LLM names passed to get_qa_chain ('OpenAI' takes its default branch, i.e. gpt-3.5-turbo-16k)
llms = (
    'OpenAI',
    'Anthropic',
)
# Embedding names passed to compute_embedding; only BGE for now so that just the LLMs are compared
embedding_models = (
    'BGE',
)
# embedding_models = ('HuggingFace', 'OpenAI', 'BGE')  # Embedding model options

In [None]:
# Start from an empty registry of Qdrant stores (keyed by embedding model name)
embeddings_dict = dict()

In [None]:
# Build one vector store for every embedding model listed in embedding_models
for embedding_name in embedding_models:
    print(f"Generating embeddings for {embedding_name}")
    compute_embedding(embedding_name)

In [None]:
# Questions used to evaluate the RAG pipeline, held in a one-column data frame
eval_questions = [
    "I have persistent back pain since 4 weeks,I workout but havent had any sports injury.What might be the cause of the back pain?",
    "I have shortness of breath and frequently feel nauseated and tired.What can be the possible cause?",
    "My 12 year old son has Poor coordination Unsteady walk and a tendency to stumble while walking and poor coordination between two hands.What might be the possible cuase?",
    "What is Baby acne ?",
    "What is Botulism ?",
]
eval_df = pd.DataFrame({"questions": eval_questions})

In [None]:
# Set up a faithfulness metric
from mlflow.metrics.genai import faithfulness, EvaluationExample

# Both examples answer the same question against the same context
achalasia_question = "What is the cause of Achalasia?"
achalasia_context_text = "The exact cause of achalasia is poorly understood. Researchers suspect it may be caused by a loss of nerve cells in the esophagus. There are theories about what causes this, but viral infection or autoimmune responses have been suspected. Very rarely, achalasia may be caused by an inherited genetic disorder or infection"

# A low-scoring example: the answer leaves out most of the context
faithfulness_bad_example = EvaluationExample(
    input=achalasia_question,
    output="We don't know the exact cause of achalasia and there are many theories about its cause ",
    score=2,
    justification="The output provides an answer that does not use all the information that was present in the context and therefore does not convey a comprehensive answer.",
    grading_context={"context": achalasia_context_text},
)

# A high-scoring example: the answer draws on most of the context
faithfulness_good_example = EvaluationExample(
    input=achalasia_question,
    output="The exact cause is not well understood and there are many theories about its cause. Researchers suspect it may be caused by nerve cell loss in the esophagus but is rarely caused by an inherited genetic disorder",
    score=5,
    justification="The output provides an answer using most of the information provided in the context.",
    grading_context={"context": achalasia_context_text},
)

faithfulness_examples = [faithfulness_bad_example, faithfulness_good_example]

# gpt-4 acts as the judge model for this metric
faithfulness_metric = faithfulness(model="openai:/gpt-4", examples=faithfulness_examples)
print(faithfulness_metric)

In [None]:
# Set up a relevance metric
from mlflow.metrics.genai import relevance, EvaluationExample

# Both examples answer the same question against the same context
achilles_question = "At what age is it most common to rupture an Achilles tendon?"
achilles_context_text = "Factors that may increase your risk of Achilles tendon rupture include:Age. The peak age for Achilles tendon rupture is 30 to 40.\', \'Sex. Achilles tendon rupture is up to five times more likely to occur in men than in women."

# A low-scoring example: correct information padded with irrelevant detail
relevance_bad_example = EvaluationExample(
    input=achilles_question,
    output="Your Achilles tendon helps you point your foot downward, rise on your toes and push off your foot as you walk. You rely on it virtually every time you walk and move your foot.The peak age for Achilles tendon rupture is 30 to 40 and more likely to occur in men than women",
    score=2,
    justification="The output provides an answer that has the information but also some irrelevant information about what the Achilles tendon does",
    grading_context={"context": achilles_context_text},
)

# A high-scoring example: only the relevant facts are extracted
relevance_good_example = EvaluationExample(
    input=achilles_question,
    output="The peak age for Achilles tendon rupture is 30 to 40 with men being five times more likely to rupture an Achilles tendon than women",
    score=5,
    justification="The output provides a relevant answer to the question being asked by extracting the relevant information from the context",
    grading_context={"context": achilles_context_text},
)

relevance_examples = [relevance_bad_example, relevance_good_example]

# gpt-4 acts as the judge model for this metric
relevance_metric = relevance(model="openai:/gpt-4", examples=relevance_examples)
print(relevance_metric)

In [96]:
def model(input_df):
    """Run every question in ``input_df`` through the current RAG chain.

    The chain is read from the module-level name ``qa`` at call time, so
    ``qa`` must be bound to a chain before this function is called with a
    non-empty frame.

    Args:
        input_df: Data frame with a "questions" column.

    Returns:
        A list with one chain response per row, in row order.
    """
    return [qa(question) for question in input_df["questions"]]

In [None]:
# Run the evaluation for every LLM / embedding-model combination

# Build the grid in this cell so that it does not depend on a `search_space`
# left over from an earlier kernel session. It is materialised as a list so
# the cell can be re-run without exhausting an iterator.
search_space = list(itertools.product(llms, embedding_models))

# `model` reads this module-level name; it is rebound for each combination.
qa = None
metrics_by_run = {}

# Iterate through each combination and execute the evaluation
for llm_name, embedding_model_name in search_space:
    run_name = f"{llm_name}_{embedding_model_name}_run"
    print(f'run_name={run_name}')
    print(f"model : {llm_name}")
    print(f"embedding : {embedding_model_name}")
    qa = get_qa_chain(llm_name, embedding_model_name)
    # Run the evaluation
    results = mlflow.evaluate(
        model,
        eval_df,
        model_type="question-answering",
        evaluators="default",
        predictions="result",
        extra_metrics=[faithfulness_metric, relevance_metric, mlflow.metrics.latency()],
        evaluator_config={
            "col_mapping": {
                "inputs": "questions",
                "context": "source_documents",
            }
        },
    )
    metrics_by_run[f'{llm_name}_{embedding_model_name}'] = pd.Series(results.metrics)

# One column per combination, one row per metric. The frame is built once
# from the collected Series because DataFrame.append was removed in pandas 2.0.
df_metrics = pd.DataFrame(metrics_by_run)

In [101]:
# In the saved output below, Anthropic_BGE beats gpt-3.5_BGE on mean faithfulness (4.6 vs 2.6) and mean relevance (3.6 vs 2.2).
# NOTE(review): the saved column label "gpt-3.5_BGE" does not match the current llms tuple, which would yield "OpenAI_BGE" - re-run to refresh.
df_metrics # The Anthropic model does better than the gpt-3.5 model

Unnamed: 0,gpt-3.5_BGE,Anthropic_BGE
faithfulness/v1/mean,2.6,4.6
faithfulness/v1/variance,3.84,0.24
faithfulness/v1/p90,5.0,5.0
relevance/v1/mean,2.2,3.6
relevance/v1/variance,2.16,1.84
relevance/v1/p90,4.0,4.6


In [None]:
# Log the collected metrics to the tracking server named by MLFLOW_TRACKING_URI (Domino)
mlflow.set_tracking_uri(os.environ['MLFLOW_TRACKING_URI'])

# Define the experiment name
experiment_name = 'RAG eval'
mlflow.set_experiment(experiment_name)

# One MLflow run per column of df_metrics, named after that column
for run_label, run_metrics in df_metrics.items():
    with mlflow.start_run(run_name=run_label):
        for metric_name, metric_value in run_metrics.items():
            # Log the metric
            mlflow.log_metric(metric_name, metric_value)