In [None]:
######## Installations

!pip install zep-cloud openai gdown
!pip install scipy

In [None]:
######## Imports

import asyncio
import pandas as pd
import gdown
import json
import tarfile
from zep_cloud.client import AsyncZep
from zep_cloud import Message
from openai import AsyncOpenAI
from dotenv import load_dotenv
import os
from datetime import datetime, timezone
from pydantic import BaseModel, Field
from zep_cloud import EntityEdge, EntityNode
from time import time

In [None]:
######## Download the eval dataset from the official Google Drive source

file_id = "1zJgtYRFhOh5zDQzzatiddfjYhFSnyQ80"
url = f"https://drive.google.com/uc?id={file_id}"
file_path = "longmemeval_data.tar.gz"

# Download the compressed dataset unless a previous run already fetched it.
if not os.path.exists(file_path):
    gdown.download(url, file_path, quiet=False)
else:
    print(f"'{file_path}' already exists, skipping download.")

# Extract the tar.gz unless the oracle split is already present.
if not os.path.exists("./longmemeval_oracle.json"):
    with tarfile.open(file_path, "r:gz") as tar:
        # The archive comes from the network, so extract with the "data" filter
        # (rejects absolute paths, path traversal and special files) whenever
        # this Python version provides it; older versions fall back to the
        # previous unfiltered behaviour.
        if hasattr(tarfile, "data_filter"):
            tar.extractall(filter="data")
        else:
            tar.extractall()
else:
    print("'longmemeval_oracle.json' already exists, so skipping extraction.")

In [None]:
######## Load the eval dataset

# Path of the LongMemEval split to evaluate; the file-name suffix selects the
# variant (_oracle, _s, or _m).
# NOTE(review): the extraction cell checks for "./longmemeval_oracle.json" in the
# working directory while this path points into "data/" — confirm where the
# archive actually unpacks.
lme_dataset_option = "data/longmemeval_s.json"  # Can be _oracle, _s, or _m
lme_dataset_df = pd.read_json(lme_dataset_option)

In [None]:
######## (Optionally skip this cell) Display some rows of the dataframe

# The bare last expression is rendered as the cell output.
lme_dataset_df.head()

In [None]:
######## Start up Zep and OpenAI clients
# ZEP_API_KEY and OPENAI_API_KEY are read from environment variables;
# load_dotenv() also picks them up from a local .env file when one exists.
load_dotenv()

# The Zep endpoint defaults to the development API this notebook has been using.
# Set ZEP_BASE_URL to point the notebook at a different deployment.
zep = AsyncZep(
    api_key=os.getenv("ZEP_API_KEY"),
    base_url=os.getenv("ZEP_BASE_URL", "https://api.development.getzep.com/api/v2"),
)
oai_client = AsyncOpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),
)

In [None]:
######## LongMemEval  - ingest each multi-session as its own single Zep user/session pair

num_multi_sessions = 500


for multi_session_idx in range(num_multi_sessions):
    multi_session = lme_dataset_df["haystack_sessions"].iloc[multi_session_idx]
    multi_session_dates = lme_dataset_df["haystack_dates"].iloc[multi_session_idx]

    question_type = lme_dataset_df["question_type"][multi_session_idx]

    if question_type != "single-session-assistant":
        continue

    print(question_type)

    # Create a unique Zep user and session for this multi-session.
    # We only use one Zep session because it doesn't change things to use multiple Zep sessions
    user_id = "lme_s_experiment_user_" + str(multi_session_idx)
    session_id = "lme_s_experiment_session_" + str(multi_session_idx)

    # Uncomment this code to delete existing users
    # try:
    #     await zep.user.delete(user_id)
    #     await zep.memory.delete(session_id)
    # except:
    #     pass
    #
    # continue

    await zep.user.add(user_id=user_id)
    await zep.memory.add_session(
        user_id=user_id,
        session_id=session_id,
    )

    for session_idx, session in enumerate(multi_session):
        for msx_idx, msg in enumerate(session):
            date = multi_session_dates[session_idx] + " UTC"
            date_format = "%Y/%m/%d (%a) %H:%M UTC"
            date_string = datetime.strptime(date, date_format).replace(
                tzinfo=timezone.utc
            )
            await zep.memory.add(
                session_id=session_id,
                messages=[
                    Message(
                        role=msg["role"],
                        role_type=msg["role"],
                        content=msg["content"][:8000],
                        created_at=date_string.isoformat(),
                    )
                ],
            )

In [None]:
######## Define prompts for LongMemEval (LME) eval
async def lme_response(llm_client, context: str, question: str) -> str:
    """Ask the chat model to briefly answer `question` from `context`.

    Args:
        llm_client: client exposing an awaitable `chat.completions.create`.
        context: text inserted between the <CONTEXT> tags of the prompt.
        question: text inserted between the <QUESTION> tags of the prompt.

    Returns:
        The content of the first completion choice, or "" when that content
        is empty or None.
    """
    system_message = """
        You are a helpful expert assistant answering questions from lme_experiment users based on the provided context.
        """

    user_message = f"""
            Your task is to briefly answer the question. You are given the following context from the previous conversation. If you don't know how to answer the question, abstain from answering.
                <CONTEXT>
                {context}
                </CONTEXT>
                <QUESTION>
                {question}
                </QUESTION>

            Answer:
            """

    chat_messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_message},
    ]

    # temperature=0 is passed so the request asks for the least random sampling.
    completion = await llm_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=chat_messages,
        temperature=0,
    )

    answer = completion.choices[0].message.content
    return answer if answer else ""


class Grade(BaseModel):
    """Structured grader verdict; passed to the grading request as its response format."""
    # Free-text verdict; the field description asks the model for "yes" or "no".
    is_correct: str = Field(description="yes or no")


async def lme_grader(
    llm_client, question: str, gold_answer: str, response: str, question_type: str
) -> bool:
    """Ask the chat model whether `response` matches `gold_answer`.

    The grading instructions depend on `question_type`: "temporal-reasoning",
    "knowledge-update" and "single-session-preference" each have a dedicated
    prompt, and every other value uses the default prompt.

    Args:
        llm_client: client exposing an awaitable `beta.chat.completions.parse`.
        question: the question that was asked.
        gold_answer: the reference answer (or rubric, for preference questions).
        response: the model answer being graded.
        question_type: question category used to select the grading prompt.

    Returns:
        True only when the parsed verdict is "yes" (ignoring case and
        surrounding whitespace). False otherwise, including when the
        completion carries no parsed verdict.
    """
    system_prompt = """
        You are an expert grader that determines if answers to questions match a gold standard answer
        """

    TEMPORAL_REASONING_PROMPT = f"""
    I will give you a question, a correct answer, and a response from a model. Please answer yes if the response contains the correct answer. Otherwise, answer no. If the response is equivalent to the correct answer or contains all the intermediate steps to get the correct answer, you should also answer yes. If the response only contains a subset of the information required by the answer, answer no. In addition, do not penalize off-by-one errors for the number of days. If the question asks for the number of days/weeks/months, etc., and the model makes off-by-one errors (e.g., predicting 19 days when the answer is 18), the model’s response is still correct.

    <QUESTION>
    B: {question}
    </QUESTION>
    <CORRECT ANSWER>
    {gold_answer}
    </CORRECT ANSWER>
    <RESPONSE>
    A: {response}
    </RESPONSE>
    """

    KNOWLEDGE_UPDATE_PROMPT = f"""
    I will give you a question, a correct answer, and a response from a model. Please answer yes if the response contains the correct answer. Otherwise, answer no. If the response contains some previous information along with an updated answer, the response should be considered as correct as long as the updated answer is the required answer.
    
    <QUESTION>
    B: {question}
    </QUESTION>
    <CORRECT ANSWER>
    {gold_answer}
    </CORRECT ANSWER>
    <RESPONSE>
    A: {response}
    </RESPONSE>
    """

    SINGLE_SESSION_PREFERENCE = f"""
    I will give you a question, a rubric for desired personalized response, and a response from a model. Please answer yes if the response satisfies the desired response. Otherwise, answer no. The model does not need to reflect all the points in the rubric. The response is correct as long as it recalls and utilizes the user’s personal information correctly.
    
    <QUESTION>
    B: {question}
    </QUESTION>
    <RUBRIC>
    {gold_answer}
    </RUBRIC>
    <RESPONSE>
    A: {response}
    </RESPONSE>
    """

    DEFAULT_PROMPT = f"""         
    I will give you a question, a correct answer, and a response from a model. Please answer yes if the response contains the correct answer. Otherwise, answer no. If the response is equivalent to the correct answer or contains all the intermediate steps to get the correct answer, you should also answer yes. If the response only contains a subset of the information required by the answer, answer no.
            
    <QUESTION>
    B: {question}
    </QUESTION>
    <CORRECT ANSWER>
    {gold_answer}
    </CORRECT ANSWER>
    <RESPONSE>
    A: {response}
    </RESPONSE>
    """

    # Question types without a dedicated prompt fall back to the default one.
    prompts_by_type = {
        "temporal-reasoning": TEMPORAL_REASONING_PROMPT,
        "knowledge-update": KNOWLEDGE_UPDATE_PROMPT,
        "single-session-preference": SINGLE_SESSION_PREFERENCE,
    }
    prompt = prompts_by_type.get(question_type, DEFAULT_PROMPT)

    # Named `completion` so the `response` argument is not shadowed.
    completion = await llm_client.beta.chat.completions.parse(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ],
        response_format=Grade,
        temperature=0,
    )
    verdict = completion.choices[0].message.parsed

    # No parsed verdict (e.g. the model declined to answer): count the answer
    # as incorrect instead of failing on attribute access.
    if verdict is None:
        return False

    return verdict.is_correct.strip().lower() == "yes"

In [None]:
######## LongMemEval  - evaluation

# Output file for the per-question records collected by the Zep evaluation.
zep_answers_file_path = "./longmemeval_zep_answers.jsonl"

# Layout of the context handed to the answering model; the `{facts}` and
# `{entities}` placeholders are filled in by compose_search_context.
TEMPLATE = """
FACTS and ENTITIES represent relevant context to the current conversation.

# These are the most relevant facts and their valid date ranges. If the fact is about an event, the event takes place during this time.
# format: FACT (Date range: from - to)
<FACTS>
{facts}
</FACTS>

# These are the most relevant entities
# ENTITY_NAME: entity summary
<ENTITIES>
{entities}
</ENTITIES>
"""

# NOTE(review): not referenced by any cell visible here — confirm before removing.
total_duration = 0


def format_edge_date_range(edge: EntityEdge) -> str:
    """Render an edge's validity window as "<valid_at> - <invalid_at>".

    A falsy `valid_at` is shown as 'date unknown' and a falsy `invalid_at`
    as 'present'; other values are interpolated as they are.
    """
    valid_from = edge.valid_at or "date unknown"
    valid_until = edge.invalid_at or "present"
    return f"{valid_from} - {valid_until}"


def compose_search_context(edges: list[EntityEdge], nodes: list[EntityNode]) -> str:
    """Fill TEMPLATE with one bullet per fact (edge) and per entity (node).

    Args:
        edges: edges whose `fact` and date range are listed under <FACTS>.
        nodes: nodes whose `name` and `summary` are listed under <ENTITIES>.
            Either argument may be None or empty, in which case its section
            is left empty.

    Returns:
        The formatted context string.
    """
    # A search that yields no result list is treated as an empty one rather
    # than failing on iteration.
    facts = [
        f"  - {edge.fact} ({format_edge_date_range(edge)})" for edge in (edges or [])
    ]
    entities = [f"  - {node.name}: {node.summary}" for node in (nodes or [])]
    return TEMPLATE.format(facts="\n".join(facts), entities="\n".join(entities))


async def evaluate_conversation(multi_session_idx) -> tuple[int, float, float]:
    """Answer and grade one dataset question using context retrieved from Zep.

    Searches the user's graph for edges and then nodes, composes the context,
    asks the model for an answer, grades it, and appends a record of the run
    to the module-level `zep_answers` list.

    Returns:
        (1 if graded correct else 0, seconds from the start of retrieval until
        the answer came back, seconds spent on the two graph searches).
    """

    def _cell(column):
        # Value of `column` for the row labelled `multi_session_idx`.
        return lme_dataset_df[column][multi_session_idx]

    # Zep user that holds this multi-session's ingested conversation
    user_id = "lme_s_experiment_user_" + str(multi_session_idx)

    question_id = _cell("question_id")
    question_type = _cell("question_type")
    gold_answer = _cell("answer")

    # The question date is prepended to the question text.
    question = "".join(("(date: ", _cell("question_date"), ") ", _cell("question")))

    # Both searches use the first 255 characters of the question as the query.
    search_query = question[0:255]

    start = time()
    edge_search = await zep.graph.search(
        user_id=user_id,
        reranker="cross_encoder",
        query=search_query,
        scope="edges",
        limit=20,
    )
    edges_results = edge_search.edges
    node_search = await zep.graph.search(
        user_id=user_id,
        reranker="rrf",
        query=search_query,
        scope="nodes",
        limit=20,
    )
    node_results = node_search.nodes
    retrieval_duration = time() - start

    context = compose_search_context(edges_results, node_results)
    # Count of single-space-separated chunks in the context
    context_len = len(context.split(" "))

    # Answer with the retrieved context; `duration` covers retrieval plus answering.
    hypothesis = await lme_response(oai_client, context, question)
    duration = time() - start

    grade = await lme_grader(
        oai_client, question, gold_answer, hypothesis, question_type
    )

    record = {
        "question_id": question_id,
        "hypothesis": hypothesis,
        "gold_answer": gold_answer,
        "context": context,
        "question_type": question_type,
        "context_len": context_len,
        "retrieval_duration": retrieval_duration,
        "duration": duration,
        "grade": grade,
    }
    zep_answers.append(record)

    return (1 if grade else 0), duration, retrieval_duration


default_results = {"Number correct": 0, "Number total": 0, "Accuracy": 0}
question_type_set = set(lme_dataset_df["question_type"].unique())
eval_results = {
    question_type: default_results.copy() for question_type in question_type_set
}

num_multi_sessions = 500

zep_answers = []
missed = []
score = 0
grades = []
durations = []
retrieval_durations = []

idx_start = 0
while idx_start < num_multi_sessions - 1:
    print(idx_start)
    results = await asyncio.gather(
        *[
            evaluate_conversation(multi_session_idx)
            for multi_session_idx in range(idx_start, idx_start + 5)
        ]
    )
    idx_start = idx_start + 5

    for grade, duration, retrieval_duration in results:
        grades.append(grade)
        durations.append(duration)
        retrieval_durations.append(retrieval_duration)

with open(zep_answers_file_path, "w") as jsonl_file:
    jsonl_file.write(json.dumps(zep_answers))

In [None]:
# Print every saved hypothesis next to its gold answer.
with open(zep_answers_file_path, "r") as jsonl_file:
    # The evaluation cell stores all records as a single JSON array, so the
    # whole file is parsed at once (a file object cannot be indexed).
    saved_answers = json.load(jsonl_file)

for record in saved_answers:
    # Records are appended as concurrent evaluations finish, so their order
    # need not match the dataframe; use the gold answer stored in the record.
    print(record["hypothesis"], record["gold_answer"])

In [None]:
######## LongMemEval  - evaluation baseline
baseline_answers_file_path = "./longmemeval_baseline_answers.jsonl"


async def evaluate_conversation(multi_session_idx) -> tuple[int, float]:
    """Baseline: answer and grade one question with the whole haystack as context.

    No retrieval is performed. Every message of every haystack session is
    concatenated (with its session date) into the context, the model is asked
    for an answer, the answer is graded, and a record is appended to the
    module-level `baseline_answers` list.

    Returns:
        (1 if graded correct else 0, seconds spent waiting for the answer).
    """
    question_id = lme_dataset_df["question_id"][multi_session_idx]
    question_type = lme_dataset_df["question_type"][multi_session_idx]

    question = (
        "(date: "
        + lme_dataset_df["question_date"][multi_session_idx]
        + ") "
        + lme_dataset_df["question"][multi_session_idx]
    )
    gold_answer = lme_dataset_df["answer"][multi_session_idx]

    multi_session = lme_dataset_df["haystack_sessions"].iloc[multi_session_idx]
    multi_session_dates = lme_dataset_df["haystack_dates"].iloc[multi_session_idx]

    date_format = "%Y/%m/%d (%a) %H:%M UTC"
    context_lines = []
    for session_idx, session in enumerate(multi_session):
        # All messages of a session share its date, so parse it once per session.
        session_date = datetime.strptime(
            multi_session_dates[session_idx] + " UTC", date_format
        ).replace(tzinfo=timezone.utc)
        for msg in session:
            context_lines.append(
                f"{msg['role']} (date: {session_date}): {msg['content']}\n"
            )
    context = "".join(context_lines)

    start = time()
    # Prompt an LLM with the full conversation history as context
    hypothesis = await lme_response(oai_client, context, question)

    duration = time() - start

    grade = await lme_grader(
        oai_client, question, gold_answer, hypothesis, question_type
    )

    baseline_answers.append(
        {
            "question_id": question_id,
            "hypothesis": hypothesis,
            "gold_answer": gold_answer,
            "context": context,
            "question_type": question_type,
            # The baseline does no retrieval; the key is kept so records have
            # the same fields as the Zep evaluation's, without depending on a
            # leftover global of the same name.
            "retrieval_duration": 0.0,
            "duration": duration,
            "grade": grade,
        }
    )

    if grade:
        return 1, duration
    else:
        print("id:", multi_session_idx, "answer:", hypothesis)
        return 0, duration


default_results = {"Number correct": 0, "Number total": 0, "Accuracy": 0}
question_type_set = set(lme_dataset_df["question_type"].unique())
eval_results = {
    question_type: default_results.copy() for question_type in question_type_set
}

num_multi_sessions = 500

baseline_answers = []

idx_start = 0
while idx_start < num_multi_sessions - 1:
    results = await asyncio.gather(
        *[
            evaluate_conversation(multi_session_idx)
            for multi_session_idx in range(idx_start, idx_start + 100)
        ]
    )
    idx_start = idx_start + 100

    for curr_grade, curr_duration in results:
        grades.append(curr_grade)
        durations.append(curr_duration)

    with open(zep_answers_file_path, "w") as jsonl_file:
        jsonl_file.write(json.dumps(zep_answers))

In [None]:
######## (Optional dataset investigation) Loop through multi-sessions and record multi-session lengths and session lengths by question type, and print sessions

num_multi_sessions = 500

question_type_set = set(lme_dataset_df["question_type"].unique())
# Per question type: number of sessions in each multi-session, and number of
# messages in each individual session.
multi_session_lengths = {q_type: [] for q_type in question_type_set}
session_lengths = {q_type: [] for q_type in question_type_set}

# Bound the loop by the dataframe length so a smaller dataset cannot raise IndexError.
for multi_session_idx in range(min(num_multi_sessions, len(lme_dataset_df))):
    multi_session = lme_dataset_df["haystack_sessions"].iloc[multi_session_idx]
    question_type = lme_dataset_df["question_type"].iloc[multi_session_idx]

    multi_session_lengths[question_type].append(len(multi_session))
    session_lengths[question_type].extend(len(session) for session in multi_session)

In [None]:
######## (Optional dataset investigation) Print out statistics by question type

print(lme_dataset_option)

# Iterate through each question type to print statistics about the possible multi-session lengths and session lengths
for question_type in question_type_set:
    print(f"******Question Type: {question_type}")

    type_multi_session_lengths = multi_session_lengths.get(question_type, [])
    type_session_lengths = session_lengths.get(question_type, [])

    print("Number of multi-sessions:", len(type_multi_session_lengths))
    print()

    # max/min/average are undefined for an empty list; say so explicitly
    # instead of silently swallowing the resulting exception.
    if not type_session_lengths:
        print("No sessions recorded for this question type.")
        continue

    print("Max session length:", max(type_session_lengths))
    print(
        "Avg session length:",
        sum(type_session_lengths) / len(type_session_lengths),
    )
    print("Min session length:", min(type_session_lengths))
    print("Number of sessions of length 0:", type_session_lengths.count(0))