In [14]:
import json
from tqdm import tqdm
import numpy as np

# Load Original Locomo dataset

In [15]:
# Load the original dataset; the context manager guarantees the file handle
# is closed as soon as the JSON has been parsed.
with open("data/locomo.json") as dataset_file:
    original_data = json.load(dataset_file)

In [20]:
# Sanity check: number of top-level entries in the loaded dataset
# (run_eval later indexes original_data[i] once per entry).
print(len(original_data))

10


# [IMPORTANT] Download the files from the Google Drive folder. The folder contains the results of 10 Zep runs on the correct implementation.

Link: https://drive.google.com/drive/folders/1wi3dYQmuV-1rpZD5MVFRcyw6dudW8D--?usp=drive_link

Place the folder `eval_files` in the same directory as this notebook.

# Match Questions to Categories and Compute Accuracy per Run

In [17]:
# Accuracy of every evaluated run, appended to by run_eval().
accuracy_list = []


def run_eval(index):
    """Score one Zep run against the original LOCOMO questions.

    Loads ``eval_files/zep_locomo_grades_{index}.json`` and, for every entry
    of the module-level ``original_data``, compares the run's questions with
    the dataset's questions position by position. Users whose question count
    differs, and individual questions whose text differs, are reported and
    skipped. Category 5 questions are included in the total but excluded
    from the correct/incorrect counts.

    Prints a summary and, when at least one question was graded, appends the
    accuracy to ``accuracy_list``.

    Args:
        index: Run number used to build the grades file name.

    Returns:
        The accuracy (correct / graded) as a float, or ``None`` when no
        question was graded (nothing is appended in that case).
    """
    grades_path = f"eval_files/zep_locomo_grades_{index}.json"
    # Context manager so the file handle is closed right after parsing.
    with open(grades_path) as grades_file:
        zep_data = json.load(grades_file)

    correct = 0
    incorrect = 0
    total_questions = 0

    # Process each user's data
    for i in tqdm(range(len(original_data))):
        zep_results = zep_data[f"locomo_experiment_user_{i}"]
        original_data_item = original_data[i]["qa"]

        # Validate data integrity: both sides must hold the same questions
        if len(original_data_item) != len(zep_results):
            print(f"Error: Length mismatch for user {i}")
            continue

        # Lengths are equal here, so zip pairs entries by position
        for j, (original_qa, zep_qa) in enumerate(zip(original_data_item, zep_results)):
            question_original = original_qa["question"]
            category = original_qa["category"]
            question_zep = zep_qa["question"]

            # Ensure questions match between datasets
            if question_original != question_zep:
                print(f"Error: Question mismatch for user {i} at index {j}")
                continue

            zep_grade = zep_qa["grade"]
            total_questions += 1

            # Category 5 counts towards the total but not towards accuracy
            if category == 5:
                continue
            if zep_grade:
                correct += 1
            else:
                incorrect += 1

    graded = correct + incorrect

    print("Results Summary:")
    print(f"📊 Total questions in LOCOMO dataset: {total_questions}")
    print(f"📊 Total questions in LOCOMO dataset (without category 5): {graded}")
    print(f"✓ Correct answers: {correct}")
    print(f"✗ Incorrect answers: {incorrect}")

    # Guard against dividing by zero when every question was skipped
    if graded == 0:
        print("📈 Accuracy rate: N/A (no graded questions)")
        return None

    accuracy = correct / graded
    print(f"📈 Accuracy rate: {accuracy:.2%}")
    accuracy_list.append(accuracy)
    return accuracy


In [18]:
# Start from an empty list so re-running this cell does not append the same
# runs a second time to accuracy_list.
accuracy_list.clear()

# Grade files are numbered 1 through 10 (one per run).
for run_index in range(1, 11):
    run_eval(run_index)
    print("--------------------------------")

100%|██████████| 10/10 [00:00<00:00, 17978.16it/s]


Results Summary:
📊 Total questions in LOCOMO dataset: 1986
📊 Total questions in LOCOMO dataset (without category 5): 1540
✓ Correct answers: 901
✗ Incorrect answers: 639
📈 Accuracy rate: 58.51%
--------------------------------


100%|██████████| 10/10 [00:00<00:00, 20126.22it/s]


Results Summary:
📊 Total questions in LOCOMO dataset: 1986
📊 Total questions in LOCOMO dataset (without category 5): 1540
✓ Correct answers: 903
✗ Incorrect answers: 637
📈 Accuracy rate: 58.64%
--------------------------------


100%|██████████| 10/10 [00:00<00:00, 13833.46it/s]


Results Summary:
📊 Total questions in LOCOMO dataset: 1986
📊 Total questions in LOCOMO dataset (without category 5): 1540
✓ Correct answers: 898
✗ Incorrect answers: 642
📈 Accuracy rate: 58.31%
--------------------------------


100%|██████████| 10/10 [00:00<00:00, 6531.15it/s]


Results Summary:
📊 Total questions in LOCOMO dataset: 1986
📊 Total questions in LOCOMO dataset (without category 5): 1540
✓ Correct answers: 904
✗ Incorrect answers: 636
📈 Accuracy rate: 58.70%
--------------------------------


100%|██████████| 10/10 [00:00<00:00, 8430.76it/s]


Results Summary:
📊 Total questions in LOCOMO dataset: 1986
📊 Total questions in LOCOMO dataset (without category 5): 1540
✓ Correct answers: 896
✗ Incorrect answers: 644
📈 Accuracy rate: 58.18%
--------------------------------


100%|██████████| 10/10 [00:00<00:00, 17855.70it/s]


Results Summary:
📊 Total questions in LOCOMO dataset: 1986
📊 Total questions in LOCOMO dataset (without category 5): 1540
✓ Correct answers: 897
✗ Incorrect answers: 643
📈 Accuracy rate: 58.25%
--------------------------------


100%|██████████| 10/10 [00:00<00:00, 8538.89it/s]


Results Summary:
📊 Total questions in LOCOMO dataset: 1986
📊 Total questions in LOCOMO dataset (without category 5): 1540
✓ Correct answers: 902
✗ Incorrect answers: 638
📈 Accuracy rate: 58.57%
--------------------------------


100%|██████████| 10/10 [00:00<00:00, 14685.94it/s]


Results Summary:
📊 Total questions in LOCOMO dataset: 1986
📊 Total questions in LOCOMO dataset (without category 5): 1540
✓ Correct answers: 902
✗ Incorrect answers: 638
📈 Accuracy rate: 58.57%
--------------------------------


100%|██████████| 10/10 [00:00<00:00, 29248.98it/s]


Results Summary:
📊 Total questions in LOCOMO dataset: 1986
📊 Total questions in LOCOMO dataset (without category 5): 1540
✓ Correct answers: 895
✗ Incorrect answers: 645
📈 Accuracy rate: 58.12%
--------------------------------


100%|██████████| 10/10 [00:00<00:00, 17512.75it/s]

Results Summary:
📊 Total questions in LOCOMO dataset: 1986
📊 Total questions in LOCOMO dataset (without category 5): 1540
✓ Correct answers: 902
✗ Incorrect answers: 638
📈 Accuracy rate: 58.57%
--------------------------------





In [19]:
# Aggregate the per-run accuracies collected in accuracy_list.
# Note: np.std uses ddof=0 by default (population standard deviation).
mean_accuracy, std_accuracy = np.mean(accuracy_list), np.std(accuracy_list)
print(
    f"Mean accuracy: {mean_accuracy:.2%}",
    f"Standard deviation: {std_accuracy:.2%}",
    sep="\n",
)

Mean accuracy: 58.44%
Standard deviation: 0.20%


# Latency

In [30]:
# Load the search results; the context manager closes the file handle
# once the JSON has been parsed.
with open("data/zep_locomo_search_results.json") as latency_file:
    latency_zep = json.load(latency_file)

# Flatten the 'duration_ms' of every item under every top-level key into
# a single list.
all_latencies = [
    item["duration_ms"]
    for value in latency_zep.values()
    for item in value
]

In [31]:
import numpy as np

# Median (50th) and 95th percentile over all collected durations,
# computed in a single call.
p50, p95 = np.percentile(all_latencies, [50, 95])

print(f"P50 (median) latency: {p50:.2f} ms")
print(f"P95 latency: {p95:.2f} ms")


P50 (median) latency: 508.01 ms
P95 latency: 1148.48 ms
