In [12]:
! pip install pinecone-client==2.2.4 #restart the kernel after executing this cell



In [20]:
# What options do I have for adding code packages to my executions?
%run RAG.ipynb

In [21]:
import mlflow
import pandas as pd

from mlflow.metrics.genai import faithfulness, relevance, EvaluationExample
from langchain.chains import RetrievalQA
from langchain import hub
from langchain.prompts.chat import ChatPromptTemplate

In [22]:
# Creating a data frame with e-commerce related questions
eval_df = pd.DataFrame(
    {
        "questions": [
            "What options do I have for adding code packages to my executions?",
            "Does Domino support Project environment variables?",
            "Does Domino have its own managed storage?",
            "Can I attach an external data volumne?",
            "Which vector databases does Domino currently support?",
            "Does Domino have a Feature Store?"
        ],
    }
)

In [23]:
# Create good and bad examples for faithfulness in the context of support questions
faithfulness_examples = [
    EvaluationExample(
        input="Does Domino have its own managed storage?",
        output="Domino Datasets provides high-performance, versioned, and structured filesystem storage in Domino.",
        score=2,
        justification="The output provides a partially correct answer but misses important context about the the advantages of using Domino Datasets for reproducability",
        grading_context={
            "context": " You can use Datasets to build multiple curated collections of data in one Project and share them with your collaborators to use in their Projects. Likewise, you can mount Datasets from other Projects in your own Project if they are shared with you."
        },
    ),
    EvaluationExample(
        input="Which vector databases does Domino currently support?",
        output="Domino currently supports Pinecone and QDrant",
        score=5,
        justification="The output accurately reflects the databases currently supported, but does not indicate our plan to add more connectors",
        grading_context={
            "context": "Domino is in the process of adding more vector database connectors"
        },
    ),
]
# Uncomment below if you don't want to use an AI g/w endpoint
# faithfulness_metric = faithfulness(model="openai:/gpt-4", examples=faithfulness_examples)
faithfulness_metric = faithfulness(model="endpoints:/chat-gpt4-ja", examples=faithfulness_examples)

In [24]:
# Create good and bad examples for relevance in the context of e-commerce questions
relevance_examples = [
    EvaluationExample(
        input="Why can't I see the hardware tier I was expecting to see in my project?",
        output="Root cause: The hardware tiers you are looking for may have restricted access.",
        score=2,
        justification="The output provides general information, but does not provide a solution. For example, Your local Domino admin team will often restrict access to larger GPU/CPU tiers in order to contain costs on those tiers.",
        grading_context={
            "context": "If you don't see hardware tiers you are looking for, reach out to your local admin team for access to those tiers.  Access to hardware tiers in Domino is managed at the organization level.  So you will likely just need to be added to the appropriate Domino org. with access to that hardware tier by your local admin team."
        },
    ),
    EvaluationExample(
        input="What does Suggested and Popular Projects mean?",
        output="The top popular projects are calculated based on how many jobs are getting executed, the number of tags, and the number of collaborators with specific weightings.",
        score=5,
        justification="The output is highly relevant to the question, providing a clear and concise explanation on exactly how Projects are sugested",
        grading_context={
            "context": "The top popular projects are calculated based on how many jobs are getting executed, the number of tags, and the number of collaborators with specific weightings. In Domino's config (this config is not exposed to users), you can add weights to each value (tags, collaborators, job count)."
        },
    ),
]
# Uncomment below if you don't want to use an AI g/w endpoint
# relevance_metric = relevance(model="openai:/gpt-4", examples=relevance_examples)
relevance_metric = relevance(model="endpoints:/chat-gpt4-ja", examples=relevance_examples)

In [None]:
# Function that returns the response from the RAG for the evaluation dataset
def model(input_df):
    answer = []
    for index, row in input_df.iterrows():
        system_prompt, contexts = build_system_prompt(row["questions"], use_hyde=False)            
        messages = [
            SystemMessage(
                content=system_prompt
            ),
            HumanMessage(
                content=row["questions"]
            ),
        ]
        response = qa.predict(input=messages)
       
        answer.append({"result":qa.predict(input=messages),
                      "source_documents":contexts})

    return answer

In [30]:
# Lets run the evaluation for the llm-embedding model combinations

qa = None
df_metrics = pd.DataFrame()

# llms = ('OpenAI', 'Anthropic')
llms = (['OpenAI'])

# Iterate through each combination and execute the evaluations
for llm_name in llms:
    run_name = f"{llm_name}_{embedding_model_name}_run"
    print(f'run_name={run_name}')
    # Log parameters
    print(f"model : {llm_name}")
    print(f"embedding : {embedding_model_name}")
    if "OpenAI" in llm_name:
        qa = conversation_openai
    elif "Anthropic" in llm_name:
        qa= conversation_anthropic
    # Run the evaluation
    results = mlflow.evaluate(
    model,
    eval_df,
    model_type="question-answering",
    evaluators="default",
    predictions="result",
    extra_metrics=[faithfulness_metric, relevance_metric, mlflow.metrics.latency()],
    evaluator_config={
        "col_mapping": {
            "inputs": "questions",
            "context": "source_documents",
            }
        },
    )
    metrics_series = pd.Series(results.metrics, name=f'{llm_name}_{embedding_model_name}')
    metrics_df = pd.DataFrame([metrics_series])
    df_metrics = pd.concat([df_metrics, metrics_df], ignore_index=True)
    
df_metrics = df_metrics.T
df_metrics.columns = llms

run_name=OpenAI_BAAI/bge-small-en_run
model : OpenAI
embedding : BAAI/bge-small-en


2024/05/11 16:18:03 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2024/05/11 16:18:03 INFO mlflow.models.evaluation.default_evaluator: Computing model predictions.


5


100%|██████████| 1/1 [00:00<00:00,  6.17it/s]


Indexing error: list index out of range




5


100%|██████████| 1/1 [00:00<00:00,  3.75it/s]


Indexing error: list index out of range




5


100%|██████████| 1/1 [00:00<00:00,  2.48it/s]


Indexing error: list index out of range




5


100%|██████████| 1/1 [00:00<00:00,  1.55it/s]


Indexing error: list index out of range




5


100%|██████████| 1/1 [00:00<00:00,  6.30it/s]

Indexing error: list index out of range





5


100%|██████████| 1/1 [00:00<00:00,  3.12it/s]


Indexing error: list index out of range


2024/05/11 16:19:01 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...


  0%|          | 0/1 [00:00<?, ?it/s]

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = ret.dtype.type(ret / rcount)


  0%|          | 0/1 [00:00<?, ?it/s]



  0%|          | 0/6 [00:00<?, ?it/s]

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = ret.dtype.type(ret / rcount)


  0%|          | 0/6 [00:00<?, ?it/s]

In [31]:
df_metrics

Unnamed: 0,OpenAI
latency/mean,9.675278
latency/variance,1.586657
latency/p90,10.998815
faithfulness/v1/mean,
faithfulness/v1/variance,
relevance/v1/mean,
relevance/v1/variance,


In [33]:
# Lets now log these metrics in Domino
# Define the experiment name
experiment_name = 'default-project-6602e00c6cd93c572ea55308'
mlflow.set_experiment(experiment_name)
for column in df_metrics:
    with mlflow.start_run(run_name=column):
        for metric_name, metric_value in df_metrics[column].items():
            # Log the metric
            mlflow.log_metric(metric_name, metric_value)

In [None]:
# Looks good lets push the prompt to a prompt hub
# set LANGCHAIN_HUB_API_KEY in an env variable
# hub.push("subirmansukhani/rakuten-qa-rag", ChatPromptTemplate.from_template(template), new_repo_is_public=False)

In [None]:
# Lets take a look at the prompt hub

# from IPython.display import Javascript, display

# # Define the URL you want to open
# url = 'https://smith.langchain.com/hub/my-prompts?organizationId=6ac11f6f-c332-4bac-b45b-28a8a96410b4'

# # JavaScript code to open a new tab with the specified URL and display it in the cell's output area
# js_code = f'''
# var newWindow = window.open("{url}");
# element.append(newWindow.document.body);
# '''

# # Display the JavaScript output in the cell's output area
# display(Javascript(js_code))