In [46]:
import pandas as pd
import sys
import os
import json
from tqdm import tqdm

notebook_dir = os.getcwd()

# Add the data folder (relative to where you launched the notebook)
data_path = os.path.abspath(os.path.join(notebook_dir, '../data'))
sys.path.append(data_path)

import minsearch

### Step 1 construct basic pipeline for ChatGPT RAG

In [47]:
# The tutorial used ChatGPT to generate a fake exercise dataset.
# data = pd.read_csv('../data/data.csv')
data = pd.read_csv('../data/data_unclean.csv', sep=';')
print(data.shape)
# Drop repeated exercises, then renumber the rows so the ids assigned below are
# contiguous (0..n-1) instead of keeping gaps where duplicates were removed.
# NOTE(review): ground-truth-retrieval.csv presumably refers to contiguous ids — confirm.
data = data.drop_duplicates(subset='Exercise Name').reset_index(drop=True)
print(data.shape)
# Normalise column names to snake_case so they can be used as template fields.
data.columns = data.columns.str.lower().str.replace(' ', '_')
data.insert(0, 'id', data.index)
print(data.columns)
data.head(2)

(209, 7)
(207, 7)
Index(['id', 'exercise_name', 'type_of_activity', 'type_of_equipment',
       'body_part', 'type', 'muscle_groups_activated', 'instructions'],
      dtype='object')


Unnamed: 0,id,exercise_name,type_of_activity,type_of_equipment,body_part,type,muscle_groups_activated,instructions
0,0,Push-Ups,Strength,Bodyweight,Upper Body,Push,"Pectorals, Triceps, Deltoids",Start in a high plank position with your hands...
1,1,Squats,Strength,Bodyweight,Lower Body,Push,"Quadriceps, Glutes, Hamstrings",Stand with feet shoulder-width apart. Lower yo...


In [48]:
documents = data.to_dict(orient='records')
documents[:3]

[{'id': 0,
  'exercise_name': 'Push-Ups',
  'type_of_activity': 'Strength',
  'type_of_equipment': 'Bodyweight',
  'body_part': 'Upper Body',
  'type': 'Push',
  'muscle_groups_activated': 'Pectorals, Triceps, Deltoids',
  'instructions': 'Start in a high plank position with your hands under your shoulders. Lower your body until your chest nearly touches the floor. Push back up to the starting position.'},
 {'id': 1,
  'exercise_name': 'Squats',
  'type_of_activity': 'Strength',
  'type_of_equipment': 'Bodyweight',
  'body_part': 'Lower Body',
  'type': 'Push',
  'muscle_groups_activated': 'Quadriceps, Glutes, Hamstrings',
  'instructions': 'Stand with feet shoulder-width apart. Lower your body as if sitting back into a chair, keeping your chest up. Return to standing.'},
 {'id': 2,
  'exercise_name': 'Plank',
  'type_of_activity': 'Strength/Mobility',
  'type_of_equipment': 'Bodyweight',
  'body_part': 'Core',
  'type': 'Hold',
  'muscle_groups_activated': 'Rectus Abdominis, Transvers

#### 1.1 Ingest raw data into the index database

In [49]:
# Index every record so it can be searched: the descriptive columns are passed
# as text fields and 'id' as a keyword field.
index = minsearch.Index(
    text_fields = ['exercise_name', 'type_of_activity', 'type_of_equipment', 'body_part', 'type', 'muscle_groups_activated', 'instructions'],
    keyword_fields = ['id']
)
# Sample question reused by the search and RAG demo cells below.
query = 'give me leg exercises for hamstrings'

In [50]:
index.fit(documents)

<minsearch.Index at 0x119c45f90>

In [51]:
# Basic minsearch usage: no extra parameters, just return the 3 closest records.
index.search(query, num_results=3)

[{'id': 118,
  'exercise_name': 'Leg Curl',
  'type_of_activity': 'Strength',
  'type_of_equipment': 'Machine',
  'body_part': 'Lower Body',
  'type': 'Pull',
  'muscle_groups_activated': 'Hamstrings',
  'instructions': 'Lie face down on a leg curl machine and curl the pad towards your glutes, then return to the starting position.'},
 {'id': 95,
  'exercise_name': 'Machine Leg Curl',
  'type_of_activity': 'Strength',
  'type_of_equipment': 'Machine',
  'body_part': 'Lower Body',
  'type': 'Pull',
  'muscle_groups_activated': 'Hamstrings',
  'instructions': 'Sit on a leg curl machine and curl the pad down towards your glutes, then return to the starting position.'},
 {'id': 109,
  'exercise_name': 'Seated Leg Curl',
  'type_of_activity': 'Strength',
  'type_of_equipment': 'Machine',
  'body_part': 'Lower Body',
  'type': 'Pull',
  'muscle_groups_activated': 'Hamstrings',
  'instructions': 'Sit on a leg curl machine and curl the pad down towards your glutes, then return to the starting p

In [None]:
# Smoke-test the plain OpenAI chat API (not yet wired up to minsearch).
from openai import OpenAI

# Read the key from the environment so no credential is stored in the notebook;
# a missing variable fails right here with a KeyError.
client = OpenAI(api_key=os.environ['OPENAI_API_KEY'])
response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {"role": "user", "content": query}
    ]
)
response.choices[0].message.content

KeyboardInterrupt: 

#### 1.2 build the RAG flow

In [None]:
# Baseline retrieval: the 5 closest records, later fed into the OpenAI prompt.
def search(query):
    """Return the top 5 minsearch hits for `query`, with no filtering or boosting."""
    return index.search(
        query=query,
        filter_dict={},   # no filtering
        boost_dict={},    # no boosting
        num_results=5,
    )

In [None]:
# Templates used to combine the user's question with the minsearch results.
# prompt_template is the outer prompt; it has {question} and {context} slots.
# NOTE: .strip() only trims the two ends, so the 4-space indentation of the
# inner lines stays in the text (it is visible in the printed prompt).
prompt_template = """
    You're a fitness instructor. Answer the QUESTION base on the CONTEXT from our exercise database. 
    Only use the facts from the CONTEXT when answering the question.
    
    QUESTION: {question}
    
    CONTEXT: {context}
    """.strip()
# entry_template renders one retrieved record; its slots are the record's columns.
entry_template = """
    exercise_name: {exercise_name},
    type_of_activity: {type_of_activity},
    type_of_equipment: {type_of_equipment},
    body_part: {body_part},
    type: {type},
    muscle_groups_activated: {muscle_groups_activated},
    instructions: {instructions}
    """.strip()
def build_prompt(query, searching_result):
    """Fill the prompt template with the question and the formatted search hits.

    Each document in `searching_result` is rendered through `entry_template`
    and followed by a blank line; the rendered entries, concatenated in order,
    become the CONTEXT section of `prompt_template`.
    """
    rendered_entries = [entry_template.format(**doc) + "\n\n" for doc in searching_result]
    context = "".join(rendered_entries)
    return prompt_template.format(question=query, context=context).strip()

In [None]:
print(build_prompt(query, search(query)))

You're a fitness instructor. Answer the QUESTION base on the CONTEXT from our exercise database. 
    Only use the facts from the CONTEXT when answering the question.
    
    QUESTION: give me leg exercises for hamstrings
    
    CONTEXT: exercise_name: Leg Curl,
    type_of_activity: Strength,
    type_of_equipment: Machine,
    body_part: Lower Body,
    type: Pull,
    muscle_groups_activated: Hamstrings,
    instructions: Lie face down on a leg curl machine and curl the pad towards your glutes, then return to the starting position.

exercise_name: Machine Leg Curl,
    type_of_activity: Strength,
    type_of_equipment: Machine,
    body_part: Lower Body,
    type: Pull,
    muscle_groups_activated: Hamstrings,
    instructions: Sit on a leg curl machine and curl the pad down towards your glutes, then return to the starting position.

exercise_name: Seated Leg Curl,
    type_of_activity: Strength,
    type_of_equipment: Machine,
    body_part: Lower Body,
    type: Pull,
    muscl

In [74]:
# Thin wrapper around the OpenAI chat-completions call.
def llm(prompt, model = 'gpt-4o-mini'):
    """Send `prompt` as a single user message to `model` and return the reply text."""
    messages = [{'role': 'user', 'content': prompt}]
    completion = client.chat.completions.create(model=model, messages=messages)
    return completion.choices[0].message.content

def rag(query, model = 'gpt-4o-mini'):
    """Answer `query`: retrieve records, build the prompt from them, ask `model`."""
    retrieved = search(query)
    return llm(build_prompt(query, retrieved), model)

In [None]:
print(rag(query))

Here are some leg exercises for hamstrings:

1. **Leg Curl**
   - Type of Activity: Strength
   - Equipment: Machine
   - Instructions: Lie face down on a leg curl machine and curl the pad towards your glutes, then return to the starting position.

2. **Machine Leg Curl**
   - Type of Activity: Strength
   - Equipment: Machine
   - Instructions: Sit on a leg curl machine and curl the pad down towards your glutes, then return to the starting position.

3. **Seated Leg Curl**
   - Type of Activity: Strength
   - Equipment: Machine
   - Instructions: Sit on a leg curl machine and curl the pad down towards your glutes, then return to the starting position.

4. **Lying Leg Curl**
   - Type of Activity: Strength
   - Equipment: Machine
   - Instructions: Lie face down on a leg curl machine and curl the pad towards your glutes, then return to the starting position.

5. **Prone Leg Curl**
   - Type of Activity: Strength
   - Equipment: Machine
   - Instructions: Lie face down on a leg curl mac

In [None]:
query = 'I also want some exercise that is helpful for my back'
print(rag(query))

For exercises that are helpful for your back, you can consider the following:

1. **Machine Back Extension**: 
   - **Type**: Strength
   - **Muscle Groups Activated**: Lower Back, Glutes, Hamstrings
   - **Instructions**: Sit on a back extension machine with your feet braced. Extend your hips to lift your torso, then lower back down.

2. **Superman Exercise**: 
   - **Type**: Strength
   - **Muscle Groups Activated**: Lower Back, Glutes, Hamstrings
   - **Instructions**: Lie face down on the floor with arms extended. Lift your arms, chest, and legs off the ground simultaneously, then lower them back down.

3. **Bulgarian Deadlift**: 
   - **Type**: Strength
   - **Muscle Groups Activated**: Hamstrings, Glutes, Lower Back
   - **Instructions**: Stand on one leg holding a dumbbell in each hand. Bend at the hips to lower the weights, then lift back up.

4. **Dumbbell Deadlift**: 
   - **Type**: Strength
   - **Muscle Groups Activated**: Glutes, Hamstrings, Lower Back
   - **Instructions*

### Step 2 Evaluate Retrieval

#### 2.1 design pipeline with 1 example

In [None]:
# Prompt that asks the model to invent 5 user questions for one exercise record.
# The JSON example uses a double-quoted key: the reply is parsed with json.loads,
# and a single-quoted key is not valid JSON.
prompt_template_5_questions = """
You emulate a user of our fitness assistant application.
Formulate 5 questions this user might ask based on a provided exercise. 
Make the questions specific to this exercise.
The record should contain the answer to the questions, and the questions should be complete and not too short.
If possible, use as fewer words as possible from the record. 

The record:

exercise_name: {exercise_name},
type_of_activity: {type_of_activity},
type_of_equipment: {type_of_equipment},
body_part: {body_part},
type: {type},
muscle_groups_activated: {muscle_groups_activated},
instructions: {instructions}     

Provide the output in parsable JSON without using code blocks:

{{"questions": ["question1", "question2", ..., "question5"]}}
""".strip()

In [None]:
documents[0]

{'id': 0,
 'exercise_name': 'Push-Ups',
 'type_of_activity': 'Strength',
 'type_of_equipment': 'Bodyweight',
 'body_part': 'Upper Body',
 'type': 'Push',
 'muscle_groups_activated': 'Pectorals, Triceps, Deltoids',
 'instructions': 'Start in a high plank position with your hands under your shoulders. Lower your body until your chest nearly touches the floor. Push back up to the starting position.'}

In [None]:
print(prompt_template_5_questions.format(**documents[0]))

You emulate a user of our fitness assistant application.
Formulate 5 questions this user might ask based on a provided exercise. 
Make the questions specific to this exercise.
The record should contain the answer to the questions, and the questions should be complete and not too short.
If possible, use as fewer words as possible from the record. 

The record:

exercise_name: Push-Ups,
type_of_activity: Strength,
type_of_equipment: Bodyweight,
body_part: Upper Body,
type: Push,
muscle_groups_activated: Pectorals, Triceps, Deltoids,
instructions: Start in a high plank position with your hands under your shoulders. Lower your body until your chest nearly touches the floor. Push back up to the starting position.     

Provide the output in parsable JSON without using code blocks:

{'questions': ["question1", "question2", ..., "question5"]}


In [None]:
prompt_0 = prompt_template_5_questions.format(**documents[0])
questions = llm(prompt_0)
questions

'{\n  "questions": [\n    "What is the starting position for performing push-ups?",\n    "Which muscles are primarily targeted during push-ups?",\n    "Do I need any equipment to perform push-ups?",\n    "How do I correctly lower my body when doing push-ups?",\n    "Can push-ups help strengthen my upper body?"\n  ]\n}'

In [None]:
print(questions)

{
  "questions": [
    "What is the starting position for performing push-ups?",
    "Which muscles are primarily targeted during push-ups?",
    "Do I need any equipment to perform push-ups?",
    "How do I correctly lower my body when doing push-ups?",
    "Can push-ups help strengthen my upper body?"
  ]
}


In [None]:
json.loads(questions)

{'questions': ['What is the starting position for performing push-ups?',
  'Which muscles are primarily targeted during push-ups?',
  'Do I need any equipment to perform push-ups?',
  'How do I correctly lower my body when doing push-ups?',
  'Can push-ups help strengthen my upper body?']}

In [None]:
# Check that the answer to a question about this record is consistent with the record.
question = 'What body part do push-ups primarily engage during the exercise?'
print(rag(question))

Push-ups primarily engage the upper body, specifically activating the pectorals, triceps, and deltoids.


#### 2.2 Apply the pipeline to the full dataset
Generate 5 questions for each record in the dataset.

In [None]:
def generate_questions(doc):
    """Ask gpt-4o-mini for questions about `doc` and return the raw reply text.

    `doc` must provide every field referenced by `prompt_template_5_questions`.
    The reply is returned exactly as the model produced it; it is not parsed here.
    """
    user_message = {"role": "user", "content": prompt_template_5_questions.format(**doc)}
    completion = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[user_message],
    )
    return completion.choices[0].message.content

In [None]:
# Generate questions for the first 20 documents, keyed by document id.
results = {}
for doc in tqdm(documents[:20]):
    doc_id = doc['id']
    # Skip ids that already have questions stored.
    if doc_id in results:
        continue
    questions_raw = generate_questions(doc)
    # json.loads raises if the model's reply is not valid JSON.
    questions = json.loads(questions_raw)
    results[doc_id] = questions['questions']
results

100%|██████████| 20/20 [00:33<00:00,  1.68s/it]


{0: ['What position should my body be in before starting push-ups?',
  'How far should I lower my body during the exercise?',
  'What muscles are primarily worked when doing push-ups?',
  'Do I need any equipment to perform push-ups correctly?',
  'How can I ensure my form is correct while doing push-ups?'],
 1: ['What are the primary muscle groups worked during squats?',
  'Can squats be performed without any equipment?',
  'How should my body be positioned while performing a squat?',
  'Are squats more effective for strength training or endurance?',
  'What body part do squats primarily target?'],
 2: ['What is the correct position for my elbows when starting the plank?',
  'How should my body be aligned during the plank exercise?',
  'Which muscle groups are primarily engaged when performing a plank?',
  'What type of exercise is the plank classified as?',
  'Do I need any equipment to perform a plank effectively?'],
 3: ['What specific body parts are targeted when performing a dead

In [None]:
# Flatten {doc_id: [questions]} into one (doc_id, question) row per question.
final_results = []
for doc_id, questions in results.items():
    for q in questions:
        final_results.append((doc_id, q))
ground_truth_df = pd.DataFrame(final_results, columns = ['id', 'question'])
ground_truth_df.head(5)

Unnamed: 0,id,question
0,0,What position should my body be in before star...
1,0,How far should I lower my body during the exer...
2,0,What muscles are primarily worked when doing p...
3,0,Do I need any equipment to perform push-ups co...
4,0,How can I ensure my form is correct while doin...


#### 2.3 use the generated questions to evaluate the retrieval performance
逻辑是： 

1）对ground_truth里的每一个question， run minsearch, 这样可以得到与这个question最相关的5个document records。

2）检查一下这个5个document records 中有没有 ground_truth question带的那一个

3）设计不同的metrics计算ground_truth 能够 hit document 的准确性

In [None]:
# Here we use the full ground-truth dataset generated in the tutorial.
# This rebinds ground_truth_df, replacing the 20-document frame built above.
ground_truth_df = pd.read_csv('../data/ground-truth-retrieval.csv')
ground_truth_df.head(2)

Unnamed: 0,id,question
0,0,What is the starting position for doing push-ups?
1,0,Which muscle groups are activated during push-...


In [None]:
ground_truth = ground_truth_df.to_dict(orient='records')
ground_truth[0]

{'id': 0, 'question': 'What is the starting position for doing push-ups?'}

In [None]:
def hit_rate(relevance_total):
    """Fraction of queries whose result list contains at least one relevant hit.

    `relevance_total` holds one list of booleans per query; a query counts as a
    hit (recall-style, score 1) when any of its flags is true.
    """
    queries_with_hit = 0
    for flags in relevance_total:
        if any(flags):
            queries_with_hit += 1
    return queries_with_hit / len(relevance_total)

def mrr(relevance_total):
    """Mean Reciprocal Rank (MRR) over all queries.

    `relevance_total` holds one list of booleans per query, ordered by rank.
    Each query contributes 1/rank of its FIRST relevant result (1, 1/2, 1/3, ...)
    or 0 when none of its results is relevant; the contributions are averaged
    over the number of queries.
    """
    total_score = 0.0
    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Only the first relevant result counts; without this a query
                # with several relevant hits would score above 1.
                break
    return total_score / len(relevance_total)

In [None]:
def minsearch_search(query, boost=None):
    """Return the top 5 minsearch hits for `query`, with no filtering.

    `boost` optionally maps field names to boost weights; when it is omitted no
    boosting is applied, which is what the baseline evaluation uses. Accepting
    the argument keeps this definition call-compatible with the redefinition
    in the tuning section, so `objective` works whichever cell ran last.
    """
    if boost is None:
        boost = {}  # no boosting
    results = index.search(
        query = query,
        filter_dict = {}, # no filtering
        boost_dict = boost,
        num_results = 5
    )
    return results

In [None]:
def evaluation(ground_truth, search_function):
    """Score `search_function` against the ground-truth questions.

    For every ground-truth entry the question is searched, and each returned
    document is flagged by whether its id equals the id of the record the
    question was generated from. Returns a dict with the hit rate and the MRR
    computed from those per-question flag lists.
    """
    relevance_total = []
    for entry in tqdm(ground_truth):
        expected_id = entry['id']
        # Records that search_function considers closest to this question.
        hits = search_function(entry['question'])
        # One flag per hit: is it the record this question came from?
        relevance_total.append([hit['id'] == expected_id for hit in hits])
    return dict(
        hit_rate=hit_rate(relevance_total),
        mrr=mrr(relevance_total),
    )

In [None]:
evaluation(ground_truth, minsearch_search)

100%|██████████| 1035/1035 [00:01<00:00, 830.19it/s]


{'hit_rate': 0.7980676328502415, 'mrr': 0.7297101449275367}

#### 2.4 improve retrieval performance

by optimizing the boost parameters

In [None]:
# Use the first 100 ground-truth rows for tuning and the remaining rows as the test set.
# NOTE(review): the split is positional, not shuffled — presumably the first 100
# rows cover only the first few documents; confirm this is intended.
df_validation = ground_truth_df[:100]
df_test = ground_truth_df[100:]
gt_val = df_validation.to_dict(orient='records')
gt_test = df_test.to_dict(orient='records')

In [None]:
import random

def simple_optimize(param_ranges, objective_function, n_iterations=10, seed=None):
    """Random search that MAXIMIZES `objective_function`.

    Args:
        param_ranges: dict mapping parameter name -> (min_val, max_val). When
            both bounds are ints an integer is drawn with randint (bounds
            inclusive); otherwise a float is drawn with uniform.
        objective_function: callable taking a {name: value} dict and returning
            a score; higher is better.
        n_iterations: number of random parameter sets to try.
        seed: optional seed for a private random generator, which makes the
            search reproducible. When None the global `random` module is used.

    Returns:
        (best_params, best_score). If no iteration runs, (None, -inf).
    """
    rng = random if seed is None else random.Random(seed)

    best_params = None
    best_score = float('-inf')  # maximizing, so start below every possible score

    for _ in range(n_iterations):
        # Draw one random value per parameter.
        current_params = {}
        for param, (min_val, max_val) in param_ranges.items():
            if isinstance(min_val, int) and isinstance(max_val, int):
                current_params[param] = rng.randint(min_val, max_val)
            else:
                current_params[param] = rng.uniform(min_val, max_val)

        current_score = objective_function(current_params)

        # Keep the highest-scoring parameter set seen so far.
        if current_score > best_score:
            best_score = current_score
            best_params = current_params

    return best_params, best_score

In [None]:
def minsearch_search(query, boost=None):
    """Top 5 minsearch hits for `query`, optionally weighting fields via `boost`.

    When `boost` is None an empty boost dict is passed, i.e. no boosting.
    """
    boost_dict = {} if boost is None else boost
    return index.search(
        query=query,
        filter_dict={},  # no filtering
        boost_dict=boost_dict,
        num_results=5,
    )

In [None]:
# Every text field gets the same boost search range.
param_ranges = dict.fromkeys(
    [
        'exercise_name',
        'type_of_activity',
        'type_of_equipment',
        'body_part',
        'type',
        'muscle_groups_activated',
        'instructions',
    ],
    (0.0, 3.0),
)

def objective(boost_params):
    """Score one set of boost weights: MRR of the boosted search on the validation set."""
    scores = evaluation(gt_val, lambda q: minsearch_search(q, boost_params))
    return scores['mrr']

In [None]:
simple_optimize(param_ranges, objective, n_iterations = 20)

100%|██████████| 100/100 [00:00<00:00, 725.33it/s]
100%|██████████| 100/100 [00:00<00:00, 834.31it/s]
100%|██████████| 100/100 [00:00<00:00, 830.24it/s]
100%|██████████| 100/100 [00:00<00:00, 824.57it/s]
100%|██████████| 100/100 [00:00<00:00, 822.88it/s]
100%|██████████| 100/100 [00:00<00:00, 831.06it/s]
100%|██████████| 100/100 [00:00<00:00, 825.16it/s]
100%|██████████| 100/100 [00:00<00:00, 826.60it/s]
100%|██████████| 100/100 [00:00<00:00, 822.49it/s]
100%|██████████| 100/100 [00:00<00:00, 812.80it/s]
100%|██████████| 100/100 [00:00<00:00, 827.62it/s]
100%|██████████| 100/100 [00:00<00:00, 827.26it/s]
100%|██████████| 100/100 [00:00<00:00, 816.79it/s]
100%|██████████| 100/100 [00:00<00:00, 806.95it/s]
100%|██████████| 100/100 [00:00<00:00, 815.83it/s]
100%|██████████| 100/100 [00:00<00:00, 816.51it/s]
100%|██████████| 100/100 [00:00<00:00, 819.57it/s]
100%|██████████| 100/100 [00:00<00:00, 803.76it/s]
100%|██████████| 100/100 [00:00<00:00, 817.24it/s]
100%|██████████| 100/100 [00:00

({'exercise_name': 2.954866260303666,
  'type_of_activity': 0.48518325946276974,
  'type_of_equipment': 0.367909398481313,
  'body_part': 0.3309808059875827,
  'type': 0.8927653469623865,
  'muscle_groups_activated': 0.5829335885894592,
  'instructions': 1.5245728409760042},
 0.855)

In [None]:
# Check how much the tuned boosts improve retrieval.
def minsearch_improved(query, boost=None):
    """Top 5 minsearch hits for `query` using tuned boost weights.

    `boost` overrides the weights; when it is None the hard-coded tuned weights
    below are used, so calls that pass only `query` behave as before.
    """
    if boost is None:
        # NOTE(review): these values differ from the optimizer output displayed
        # above — presumably copied from an earlier, unseeded run; confirm.
        boost = {'exercise_name': 2.041749120144542,
                 'type_of_activity': 0.9408602989217097,
                 'type_of_equipment': 0.812726835870334,
                 'body_part': 1.1889065452138718,
                 'type': 2.7221709718145775,
                 'muscle_groups_activated': 2.598837199711558,
                 'instructions': 0.7980050177735197}
    results = index.search(
        query = query,
        filter_dict = {}, # no filtering
        boost_dict = boost,
        num_results = 5
    )
    return results

In [None]:
evaluation(ground_truth, minsearch_improved)

100%|██████████| 1035/1035 [00:01<00:00, 816.59it/s]


{'hit_rate': 0.8299516908212561, 'mrr': 0.7998228663446056}

In [None]:
evaluation(gt_val, minsearch_improved)

100%|██████████| 100/100 [00:00<00:00, 783.41it/s]


{'hit_rate': 0.92, 'mrr': 0.8533333333333333}

In [None]:
evaluation(gt_test, minsearch_improved)

100%|██████████| 935/935 [00:01<00:00, 783.66it/s]


{'hit_rate': 0.8203208556149733, 'mrr': 0.7940998217468808}

### Step 3 RAG Evaluation

copy code from Module 4 Monitoring (LLM as a Judge)

relevant/partly relevant/irrelevant

对比ChatGPT（用document records）生成的问题， 和ChatGPT+minsearch 生成的该问题的答案， 是不是相关

In [None]:
# LLM-as-a-judge prompt: asks the model to grade how relevant a generated answer
# is to its question and to reply with JSON holding "Relevance" and "Explanation".
# Slots: {question} and {answer_llm}; the doubled braces are literal braces.
prompt_rag_evaluation = """
You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

In [None]:
question = ground_truth[0]['question']
question

'What is the starting position for doing push-ups?'

In [None]:
answer_llm = rag(question)
prompt = prompt_rag_evaluation.format(question = question, answer_llm = answer_llm)
print(prompt)

You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: What is the starting position for doing push-ups?
Generated Answer: The starting position for doing push-ups is to begin in a high plank position with your hands under your shoulders.

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}


In [55]:
ground_truth[20]

{'id': 4, 'question': 'What equipment do I need for performing bicep curls?'}

In [88]:
# Judge the RAG answers (default model) for the first 20 ground-truth questions.
evaluations = []
for record in tqdm(ground_truth[:20]):
    # The loop names avoid `id` (a builtin) and `evaluation` (the retrieval
    # scoring function defined earlier), which assignments here would overwrite
    # and break any re-run of the retrieval-evaluation cells.
    record_id = record['id']
    question = record['question']
    answer_llm = rag(question)

    prompt = prompt_rag_evaluation.format(question = question, answer_llm = answer_llm)
    judge_reply = llm(prompt)
    evaluations.append({
        'id': record_id,
        'question': question,
        'answer_llm': answer_llm,
        # json.loads raises if the judge's reply is not valid JSON.
        'evaluation': json.loads(judge_reply)
    })

100%|██████████| 20/20 [00:36<00:00,  1.84s/it]


In [91]:
# Unpack the judge's JSON verdict into 'relevance' and 'explanation' columns.
evaluations_df = pd.DataFrame(evaluations)
evaluations_df['relevance'] = evaluations_df.evaluation.apply(lambda d: d['Relevance'])
evaluations_df['explanation'] = evaluations_df.evaluation.apply(lambda d: d['Explanation'])
# Drop the nested dict column now that its fields have been unpacked.
del evaluations_df['evaluation']
evaluations_df.head(2)

Unnamed: 0,id,question,answer_llm,relevance,explanation
0,0,What is the starting position for doing push-ups?,The starting position for doing push-ups is to...,RELEVANT,The generated answer accurately describes the ...
1,0,Which muscle groups are activated during push-...,"During push-ups, the muscle groups activated a...",RELEVANT,The generated answer accurately identifies the...


In [92]:
# Same judging loop, but the answers are generated with 'gpt-4o'.
# The judge call itself still uses llm()'s default model.
evaluations_gpt_4 = []
for record in tqdm(ground_truth[:20]):
    # The loop names avoid `id` (a builtin) and `evaluation` (the retrieval
    # scoring function defined earlier), which assignments here would overwrite
    # and break any re-run of the retrieval-evaluation cells.
    record_id = record['id']
    question = record['question']
    answer_llm = rag(question, model = 'gpt-4o')

    prompt = prompt_rag_evaluation.format(question = question, answer_llm = answer_llm)
    judge_reply = llm(prompt)
    evaluations_gpt_4.append({
        'id': record_id,
        'question': question,
        'answer_llm': answer_llm,
        # json.loads raises if the judge's reply is not valid JSON.
        'evaluation': json.loads(judge_reply)
    })

100%|██████████| 20/20 [00:42<00:00,  2.13s/it]


In [93]:
# Unpack the judge's JSON verdict for the gpt-4o answers into flat columns.
evaluations_df_gpt_4 = pd.DataFrame(evaluations_gpt_4)
evaluations_df_gpt_4['relevance'] = evaluations_df_gpt_4.evaluation.apply(lambda d: d['Relevance'])
evaluations_df_gpt_4['explanation'] = evaluations_df_gpt_4.evaluation.apply(lambda d: d['Explanation'])
# Drop the nested dict column now that its fields have been unpacked.
del evaluations_df_gpt_4['evaluation']
evaluations_df_gpt_4.head(2)

Unnamed: 0,id,question,answer_llm,relevance,explanation
0,0,What is the starting position for doing push-ups?,The starting position for doing push-ups is to...,RELEVANT,The generated answer directly addresses the qu...
1,0,Which muscle groups are activated during push-...,"During push-ups, the muscle groups activated a...",RELEVANT,The generated answer directly addresses the qu...


In [94]:
evaluations_df.relevance.value_counts()

relevance
RELEVANT           18
PARTLY_RELEVANT     2
Name: count, dtype: int64

In [95]:
evaluations_df_gpt_4.relevance.value_counts()

relevance
RELEVANT           17
PARTLY_RELEVANT     3
Name: count, dtype: int64

### Step 4 Interface and ingestion pipeline