In [11]:
from qdrant_client import models, QdrantClient
from langchain.embeddings import HuggingFaceEmbeddings, OpenAIEmbeddings, HuggingFaceBgeEmbeddings
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.vectorstores.qdrant import Qdrant
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI, Cohere
from langchain.chat_models import ChatOpenAI
from langchain import PromptTemplate
#
from tqdm.auto import tqdm
from uuid import uuid4
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util
import numpy as np
import pandas as pd
from time import time,sleep
import openai
import tiktoken
#
import os
import json
#
import io
#
import mlflow
import itertools

In [2]:
from getpass import getpass

# Prompt for the provider credentials without echoing them, and expose them to
# the client libraries through the process environment.
# OpenAI keys: https://platform.openai.com/account/api-keys
os.environ["OPENAI_API_KEY"] = getpass(prompt="Enter Openai key:")
# Cohere keys: https://dashboard.cohere.ai/
os.environ["COHERE_API_KEY"] = getpass(prompt="Enter Cohere key:")

Enter Openai key: ········
Enter Cohere key: ········


In [3]:
# Read the disease CSV into documents; the "link" column is passed as the
# source column for each document.
loader = CSVLoader(
    source_column="link",
    file_path='/mnt/code/data/disease_components.csv',
)
data = loader.load()

In [4]:
# Stores are cached here, keyed by embedding model name.
embeddings_dict = {}

# Split each loaded document into two parallel lists: its metadata and its text.
metadatas = [doc.metadata for doc in data]
texts = [doc.page_content for doc in data]
print(len(metadatas), len(texts))

1183 1183


In [5]:
# Prompt text for the QA chain. It is assembled from adjacent string literals;
# the resulting text has two placeholders, {context} and {question}.
prompt_template = (
    "Use the following pieces of context to answer the question enclosed within  3 backticks at the end. "
    "If you don't know the answer, just say that you don't know, don't try to make up an answer.\n"
    "Please provide an answer which is factually correct and based on the information retrieved from the vector store.\n"
    'Please also mention any quotes supporting the answer if any present in the context supplied within two double quotes "" .\n'
    "\n"
    "{context}\n"
    "\n"
    "QUESTION:```{question}```\n"
    "ANSWER:\n"
)
PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)
# Keyword arguments wrapping the prompt, in the shape a chain constructor takes.
chain_type_kwargs = dict(prompt=PROMPT)

In [6]:
# Build a RetrievalQA chain for a given LLM / embedding-store combination
def get_qa_chain(llm_name='OpenAI', embedding_model_name='OpenAI'):
    """Return a RetrievalQA chain for the requested LLM and embedding store.

    Args:
        llm_name: 'Cohere' selects the Cohere "command" model; any other value
            selects ChatOpenAI with 'gpt-3.5-turbo-16k'.
        embedding_model_name: Key into the module-level ``embeddings_dict``
            identifying the vector store to retrieve from.

    Returns:
        A RetrievalQA chain ("stuff" chain type) using ``PROMPT``, retrieving
        the top 5 documents and returning its source documents.

    Raises:
        ValueError: If no store has been registered under
            ``embedding_model_name`` in ``embeddings_dict``.
        KeyError: If the API key for the selected provider is not present in
            the environment.
    """
    doc_store = embeddings_dict.get(embedding_model_name)
    if doc_store is None:
        # Fail early with a clear message rather than on `.as_retriever` below.
        raise ValueError(
            f"No vector store found for embedding model '{embedding_model_name}'; "
            "run compute_embedding() for it first."
        )

    if llm_name == 'Cohere':
        rag_llm = Cohere(model="command",
                         temperature=0,
                         cohere_api_key=os.environ["COHERE_API_KEY"])
    else:
        # The OpenAI client must be given the OpenAI key, not the Cohere one.
        rag_llm = ChatOpenAI(model_name='gpt-3.5-turbo-16k',
                             openai_api_key=os.environ["OPENAI_API_KEY"],
                             temperature=0)

    # Built for both providers: this return sits outside the if/else on purpose.
    return RetrievalQA.from_chain_type(llm=rag_llm,
                                       chain_type="stuff",
                                       chain_type_kwargs={"prompt": PROMPT},
                                       retriever=doc_store.as_retriever(search_kwargs={"k": 5}),
                                       return_source_documents=True)

In [7]:
def compute_embedding(embedding_model_name='HuggingFace'):
    """Embed the loaded texts with the chosen model and register the store.

    The resulting Qdrant store is saved in the module-level
    ``embeddings_dict`` under ``embedding_model_name``.

    Args:
        embedding_model_name: 'OpenAI' uses OpenAIEmbeddings, 'BGE' uses
            HuggingFaceBgeEmbeddings with "BAAI/bge-small-en", and any other
            value uses HuggingFaceEmbeddings with its default model.

    Returns:
        None. The store is only registered in ``embeddings_dict``.
    """
    # Shared settings for the two HuggingFace-backed embedders.
    model_kwargs = {'device': 'cpu'}
    encode_kwargs = {'normalize_embeddings': True}

    if embedding_model_name == 'OpenAI':
        embeddings = OpenAIEmbeddings()
    elif embedding_model_name == 'BGE':
        embeddings = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-small-en",
                                              model_kwargs=model_kwargs,
                                              encode_kwargs=encode_kwargs)
    else:
        embeddings = HuggingFaceEmbeddings(model_kwargs=model_kwargs,
                                           encode_kwargs=encode_kwargs)

    # NOTE(review): every embedding model writes to the same on-disk path.
    # Local-mode Qdrant may refuse a second client on a folder that is already
    # open in this process - confirm, and use one path per model if so.
    store = Qdrant.from_texts(texts,
                              metadatas=metadatas,
                              embedding=embeddings,
                              path="/mnt/data/RAG-mktg/",
                              prefer_grpc=True,
                              # The keyword is `collection_name`, not `collection`.
                              collection_name=f"{embedding_model_name}_medical_qa_search")

    embeddings_dict[embedding_model_name] = store

In [12]:
# Define your search params
llms = ('OpenAI', 'Cohere')  # Model names
# Alternative embedding sets, kept for reference:
# embedding_models = ('HuggingFace', 'OpenAI', 'BGE')  # Embedding names
# embedding_models = ('HuggingFace', 'BGE')  # Embedding names
# The trailing comma matters: ('BGE') is just the string 'BGE', and iterating
# over it would yield the characters 'B', 'G', 'E' instead of one model name.
embedding_models = ('BGE',)  # Embedding names

In [9]:
# Start from an empty mapping of embedding model name -> Qdrant store.
embeddings_dict = dict()

In [10]:
# Build one vector store for every configured embedding model, announcing
# each model before its embeddings are computed.
for model_name in embedding_models:
    print(f"Generating embeddings for {model_name}")
    compute_embedding(embedding_model_name=model_name)

Generating embeddings for HuggingFace


Downloading (…)99753/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)0cdb299753/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)db299753/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)753/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)99753/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)9753/train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading (…)0cdb299753/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)b299753/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

Generating embeddings for BGE


NameError: name 'HuggingFaceBgeEmbeddings' is not defined

In [None]:
# To evaluate the QA system we wrote a few relevant questions together with
# reference answers.
eval_questions = [
    "I have persistent back pain since 4 weeks,I workout but havent had any sports injury.What might be the cause of the back pain?",
    "I have shortness of breath and frequently feel nauseated and tired.What can be the possible cause?",
    "My 12 year old son has Poor coordination Unsteady walk and a tendency to stumble while walking and poor coordination between two hands.What might be the possible cuase?",
    "What is Baby acne ?",
    "What is Botulism ?",
]

# Reference answers, positionally aligned with eval_questions.
eval_answers = [
    "From the symptoms mentioned you might have a disloacted disk",  # incorrect answer
    "You might have asthama.",  # incorrect answer
    " Movement and coordination problems associated with cerebral palsy.Please consult a doctor for better diagnosis.",
    "Baby acne is small, inflamed bumps on a baby's face, neck, back or chest.",
    "Botulism is a rare and potentially fatal illness caused by a toxin produced by the bacterium Clostridium botulinum.",
]

# Every question needs exactly one reference answer.
assert len(eval_questions) == len(eval_answers), "questions and answers are misaligned"

# One record per question: the query plus its list of ground-truth answers.
examples = [
    {"query": question, "ground_truths": [answer]}
    for question, answer in zip(eval_questions, eval_answers)
]
print(examples)

In [None]:
# Set up one ragas evaluator chain per metric
from ragas.langchain.evalchain import RagasEvaluatorChain
from ragas.metrics import (
    answer_relevancy,
    context_recall,
    context_relevancy,
    faithfulness,
)

# The chains are built in the same order as the names they are unpacked into.
(
    faithfulness_chain,
    answer_rel_chain,
    context_rel_chain,
    context_recall_chain,
) = [
    RagasEvaluatorChain(metric=ragas_metric)
    for ragas_metric in (faithfulness, answer_relevancy, context_relevancy, context_recall)
]

In [None]:
# Generate every (llm, embedding model) combination to evaluate
search_space = list(itertools.product(llms, embedding_models))
# Display the combinations as the cell output.
search_space

In [None]:
def _score_values(results):
    """Flatten one evaluator chain's output into a plain list of scores.

    Each item may be a bare number or a mapping holding the score. For a
    mapping, the entry whose key ends in "_score" is used, falling back to the
    first value (NaN if the mapping is empty).

    NOTE(review): the ragas LangChain integration docs show ``evaluate``
    returning one ``{"<metric>_score": value}`` dict per example - confirm
    against the installed ragas version.
    """
    values = []
    for item in results:
        if isinstance(item, dict):
            score_keys = [key for key in item if str(key).endswith("_score")]
            if score_keys:
                item = item[score_keys[0]]
            else:
                item = next(iter(item.values()), float("nan"))
        values.append(item)
    return values


def run_experiment(model_name, embedding_name):
    """Run the QA chain on ``examples`` and return the median of each metric.

    Args:
        model_name: LLM name passed to ``get_qa_chain``.
        embedding_name: Embedding model name passed to ``get_qa_chain``.

    Returns:
        dict mapping 'faithfulness', 'context_recall', 'answer_relevance' and
        'context_relevance' to the median score across all examples.
    """
    qa_chain = get_qa_chain(model_name, embedding_name)

    # Use the chain built above; there is no separate `qa` object.
    predictions = qa_chain.batch(examples)

    # Score the same predictions with each evaluator chain.
    df_scores = pd.DataFrame({
        'faithfulness': _score_values(faithfulness_chain.evaluate(examples, predictions)),
        'context_recall': _score_values(context_recall_chain.evaluate(examples, predictions)),
        'answer_relevance': _score_values(answer_rel_chain.evaluate(examples, predictions)),
        'context_relevance': _score_values(context_rel_chain.evaluate(examples, predictions)),
    })

    # Summarise each metric by its median over the examples.
    return df_scores.median().to_dict()

In [None]:
# Create and set the experiment name
experiment_name = "RAG_Parameter_search"
mlflow.set_experiment(experiment_name)

# Iterate through each (LLM, embedding model) combination in search_space and
# print its parameters and median scores. Nothing is logged to an MLflow run.
for llm_name, embedding_model_name in search_space:
    run_name = f"{llm_name}_{embedding_model_name}_run"
    print(f'run_name={run_name}')
    # Report parameters
    print(f"model : {llm_name}")
    print(f"embedding : {embedding_model_name}")
    # Run the experiment
    results = run_experiment(llm_name, embedding_model_name)
    # Report results
    for key, value in results.items():
        print(key, value)

In [None]:
# # Create and set the experiment name
# experiment_name = "RAG_Parameter_search"
# mlflow.set_experiment(experiment_name)

# # Iterate through each combination and execute the MLflow runs
# for llm_name, embedding_model_name in search_space:
#     run_name = f"{llm_name}_{embedding_model_name}_run"
#     with mlflow.start_run(run_name=run_name):
#         # Log parameters
#         mlflow.log_param("model", llm_name)
#         mlflow.log_param("embedding", embedding_model_name)

#         # Run the experiment
#         results = run_experiment(llm_name, embedding_model_name)

#         # Log results
#         for key, value in results.items():
#             mlflow.log_metric(key, value)

#         # End the run
#         mlflow.end_run()