In [None]:
######## Installations

# Install/upgrade the Zep Cloud and OpenAI SDKs used throughout this notebook.
# NOTE(review): pandas, python-dotenv and pydantic are imported below but are not
# installed here — confirm they are already available in the kernel environment.
!pip install zep-cloud openai --upgrade

In [None]:
######## Imports

import pandas as pd
from openai import AsyncOpenAI
from zep_cloud.client import AsyncZep
from zep_cloud import Message, EntityEdge, EntityNode
import os
from dotenv import load_dotenv
from pydantic import BaseModel, Field
import asyncio
from datetime import datetime, timezone

In [None]:
# Let python-dotenv populate the process environment before the keys are read.
load_dotenv()

# API credentials; each is None when the corresponding variable is not set.
ZEP_API_KEY = os.environ.get('ZEP_API_KEY')
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

In [None]:
######## Helper functions

def nicely_display_one_data_sample(msc_dataset_df, idx, max_items=20):
    """
    Print one sample of the MSC dataset in a readable, column-by-column format.

    Args:
        msc_dataset_df: The MSC dataset's dataframe (one row per sample).
        idx: Row label of the sample to show (between 0 and 499 for the full dataset).
        max_items: Upper bound on how many leading elements of a column's value
            are printed. Defaults to 20.

    Every column gets a "*****<column>" header. The "metadata" column prints the
    header only, "self_instruct" prints its question/answer pair, and every other
    column prints up to `max_items` leading elements of its value.
    """
    for key in msc_dataset_df.keys():
        keys_object = msc_dataset_df[key][idx]
        print(f"\n*****{key}")

        # Handle metadata and self instruct differently
        if key == "metadata":
            continue
        if key == "self_instruct":
            print("QUESTION (B's Message):", keys_object['B'])
            print("ANSWER (A's Message):", keys_object['A'])
            # The value is a question/answer mapping, not a sequence: there is
            # nothing to enumerate, so skip the positional loop explicitly.
            continue

        for i in range(max_items):
            try:
                print(keys_object[i])
            except (IndexError, KeyError, TypeError):
                # Ran past the end of the value, or the value cannot be indexed
                # by position. Other errors are real problems and propagate.
                break



def get_all_chat_sessions(msc_dataset_df, idx):
    """
    Collect every chat session of one MSC sample (idx between 0 and 499) as a list
    of plain dicts holding only what is needed for Zep.

    Earlier sessions come first, numbered from 1, each with its message texts and
    its "time_num" / "time_unit" / "time_back" fields. The newest dialog is appended
    last, always as session number 5, with its message texts only.
    """
    # Earlier sessions, in dataset order.
    sessions = [
        {
            "messages": [turn["text"] for turn in earlier["dialog"]],
            "session_num": number,
            "time_num": earlier["time_num"],
            "time_unit": earlier["time_unit"],
            "time_back": earlier["time_back"],
        }
        for number, earlier in enumerate(msc_dataset_df["previous_dialogs"][idx], start=1)
    ]

    # The most recent dialog carries no time fields and is always session #5.
    latest_turns = msc_dataset_df["dialog"][idx]
    sessions.append({
        "messages": [turn["text"] for turn in latest_turns],
        "session_num": 5,
    })

    return sessions



In [None]:
######## Download the eval dataset from the official HuggingFace Source
# Download this file and store it at locomo_eval/data/msc.jsonl (the path read by the load step below)
# https://huggingface.co/datasets/MemGPT/MSC-Self-Instruct


In [None]:
######## Load the eval dataset

# The file is JSON Lines (one JSON record per line), hence lines=True.
msc_dataset_df = pd.read_json(
    path_or_buf='locomo_eval/data/msc.jsonl',
    lines=True,
)

In [None]:
######## Start up Zep and OpenAI clients
# Async clients shared by the cells below; both API keys are read from the environment.
zep = AsyncZep(
    api_key=os.environ.get("ZEP_API_KEY"),
    base_url="https://api.development.getzep.com/api/v2",
)
oai_client = AsyncOpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

In [None]:
######## MSC Ingestion loop - ingest messages for each multi-session
async def ingest_msc(zep: AsyncZep, num_multi_sessions: int = 500, start_idx: int = 0):
    """
    Ingest the MSC conversations into Zep, one Zep user and session per sample.

    Args:
        zep: Async Zep client used for all calls.
        num_multi_sessions: Exclusive upper bound of the sample indices to ingest
            (500 covers indices 0..499).
        start_idx: First sample index to ingest. Raise it to resume a partially
            completed run without re-adding earlier users.

    For every sample the user "msc_experiment_user_<idx>" and the session
    "msc_experiment_session_<idx>" are created, then each message of each chat
    session is added one at a time, in order. Within a chat session, even-indexed
    messages are attributed to role "A" and odd-indexed ones to role "B".

    To re-run from a clean slate, delete the existing data first, e.g.
    `await zep.user.delete(user_id)` and `await zep.memory.delete(session_id)`.
    """
    for multi_session_idx in range(start_idx, num_multi_sessions):
        # Create a unique Zep user and session for this pair of speakers
        user_id = "msc_experiment_user_" + str(multi_session_idx)
        session_id = "msc_experiment_session_" + str(multi_session_idx)

        await zep.user.add(user_id=user_id)
        await zep.memory.add_session(
            user_id=user_id,
            session_id=session_id,
        )

        # Now add the messages as episodes to this user-session
        all_chat_sessions = get_all_chat_sessions(msc_dataset_df, multi_session_idx)
        for session in all_chat_sessions:
            for msg_idx, msg in enumerate(session["messages"]):
                # Speakers alternate inside a chat session, starting with A.
                role = "A" if msg_idx % 2 == 0 else "B"
                await zep.memory.add(
                    session_id=session_id,
                    messages=[Message(role=role, role_type="norole", content=msg)],
                )
    
# Run the ingestion with the shared Zep client (top-level await, as used in notebook cells).
await ingest_msc(zep)


In [None]:
######## Define prompts for Deep Memory Retrieval (DMR) eval
async def dmr_response(llm_client, context: str, question: str, model: str = 'gpt-4o-mini') -> str:
    """
    Ask the LLM to answer `question` as speaker A, using only the supplied context.

    Args:
        llm_client: Async OpenAI-style client exposing `chat.completions.create`.
        context: Text placed inside the <CONTEXT> section of the prompt.
        question: Text placed inside the <QUESTION> section of the prompt.
        model: Chat model name. Defaults to 'gpt-4o-mini', the same model the
            grader uses.

    Returns:
        The content of the first choice, or '' when the model returned no content.
    """
    system_prompt = """
        You are speaker A and should respond to all questions in the the first person perspective of A
        """

    prompt = f"""
            Your task is to briefly answer the question. You are given the following context from the previous conversation. If you don't know how to answer the question, abstain from answering.
                <CONTEXT>
                {context}
                </CONTEXT>
                <QUESTION>
                {question}
                </QUESTION>

            Respond with an ANSWER section containing your answer. As well as an EVIDENCE section containing the context that help you came to your conclusion, and an explanation of why that context is relevant.
            """

    # temperature=0 keeps the answers as repeatable as the API allows.
    response = await llm_client.chat.completions.create(
                model=model,
                messages=[{"role": "system", "content": system_prompt},
                        {"role": "user", "content": prompt}],
                temperature=0,
            )
    result = response.choices[0].message.content or ''

    return result

# Structured-output schema for the grader: a single boolean verdict.
# Passed as `response_format` to the parse call in dmr_grader.
class Grade(BaseModel):
  # True when the graded response is judged correct.
  is_correct: bool = Field(description='Whether or not the response is correct')

async def dmr_grader(llm_client, question: str, gold_answer: str, response: str, model: str = 'gpt-4o-mini') -> bool:
    """
    Ask the LLM whether `response` matches the gold answer for `question`.

    Args:
        llm_client: Async OpenAI-style client exposing `beta.chat.completions.parse`.
        question: The question, shown to the grader as speaker B's line.
        gold_answer: The reference answer.
        response: The model answer being graded, shown as speaker A's line.
        model: Chat model name used for grading. Defaults to 'gpt-4o-mini'.

    Returns:
        The grader's `is_correct` verdict. When the completion carries no parsed
        `Grade`, the response is counted as incorrect (False) instead of raising.
    """
    system_prompt = """
        You are an expert grader that determines if answers to questions match a gold standard answer
        """

    prompt = f"""         
    I will give you a question, a correct answer, and a response from a model. Please answer true if the response contains the correct answer or touches on the same topic. Otherwise, answer false. If the response is equivalent to the correct answer or contains all the intermediate steps to get the correct answer, you should also answer true. If the response only contains a subset of the information required by the answer, answer false.
            
    <QUESTION>
    B: {question}
    </QUESTION>
    <CORRECT ANSWER>
    {gold_answer}
    </CORRECT ANSWER>
    <RESPONSE>
    A: {response}
    </RESPONSE>
    """

    # Use a separate name for the API result so the `response` argument is not shadowed.
    completion = await llm_client.beta.chat.completions.parse(
                model=model,
                messages=[{"role": "system", "content": system_prompt},
                        {"role": "user", "content": prompt}],
                response_format=Grade,
                temperature=0,
            )
    parsed = completion.choices[0].message.parsed

    # No parsed verdict available: treat as "not correct" rather than crash the eval loop.
    if parsed is None:
        return False

    return parsed.is_correct

In [None]:
######## Baseline eval loop - for each multi-session, evaluate on summary and full context
def _format_transcript(chat_sessions) -> str:
    """Flatten chat sessions into 'A: ...' / 'B: ...' lines; speakers alternate per session, A first."""
    lines = []
    for session in chat_sessions:
        for msg_idx, msg in enumerate(session["messages"]):
            speaker = 'A' if msg_idx % 2 == 0 else 'B'
            lines.append(speaker + ': ' + msg + '\n')
    return ''.join(lines)


def _format_speaker_summary(tag, summary_set) -> str:
    """Wrap every fact of every per-session summary in <tag>...</tag>, one indented fact per line."""
    fact_lines = ''.join(
        '\n  ' + fact
        for session_summary in summary_set
        for fact in session_summary
    )
    return '<' + tag + '>' + fact_lines + '\n</' + tag + '>'


async def dmr_baseline(num_multi_sessions: int = 500):
    """
    Baseline DMR evaluation without Zep: answer each sample's question once with
    the full conversation as context and once with the dataset's speaker summaries
    as context, grade both answers, and print the aggregated results.

    Args:
        num_multi_sessions: Number of samples to evaluate, starting at index 0.
            Defaults to 500.

    Uses the notebook-level `msc_dataset_df` and `oai_client`. Prints the gold
    answer and both responses for every sample, then the `eval_results` dict.
    """
    eval_results = {
                    "total": num_multi_sessions,
                    "full_conversation_correct": 0,
                    "full_conversation_accuracy": 0,
                    "conversation_summary_correct": 0,
                    "conversation_summary_accuracy": 0,
                  }

    for multi_session_idx in range(num_multi_sessions):
        # Build contexts
        all_chat_sessions = get_all_chat_sessions(msc_dataset_df, multi_session_idx)
        full_conversation = _format_transcript(all_chat_sessions)

        speaker_a_summary_set = msc_dataset_df["summary_speaker_1"][multi_session_idx]
        speaker_b_summary_set = msc_dataset_df["summary_speaker_2"][multi_session_idx]
        conversation_summary = (
            _format_speaker_summary('SPEAKER A SUMMARY', speaker_a_summary_set)
            + '\n'
            + _format_speaker_summary('SPEAKER B SUMMARY', speaker_b_summary_set)
        )

        # Set question and golden answer
        question = msc_dataset_df['self_instruct'][multi_session_idx]["B"]
        gold_answer = msc_dataset_df['self_instruct'][multi_session_idx]["A"]

        # Prompt the LLM to answer the question
        full_conversation_response = await dmr_response(oai_client, full_conversation, question)
        conversation_summary_response = await dmr_response(oai_client, conversation_summary, question)

        print('GOLDEN ANSWER', gold_answer)
        print('FULL CONVERSATION RESPONSE: ', full_conversation_response)
        print('CONVERSATION SUMMARY RESPONSE: ', conversation_summary_response)

        # Grade responses
        full_conversation_grade = await dmr_grader(oai_client, question, gold_answer, full_conversation_response)
        conversation_summary_grade = await dmr_grader(oai_client, question, gold_answer, conversation_summary_response)

        if full_conversation_grade:
            eval_results["full_conversation_correct"] += 1
        if conversation_summary_grade:
            eval_results["conversation_summary_correct"] += 1

    # An empty run keeps the initial accuracies of 0 instead of dividing by zero.
    if num_multi_sessions:
        eval_results["full_conversation_accuracy"] = eval_results["full_conversation_correct"] / num_multi_sessions
        eval_results["conversation_summary_accuracy"] = eval_results["conversation_summary_correct"] / num_multi_sessions

    print(eval_results)
# Run the baseline evaluation (top-level await, as used in notebook cells).
await dmr_baseline()

In [None]:
######## Main eval loop - for each multi-session, evaluate Zep
# Context template filled in by compose_search_context via str.format;
# {facts} and {entities} are the two placeholders.
TEMPLATE = """
FACTS and ENTITIES represent relevant context to the current conversation.

# These are the most relevant facts and their valid date ranges
# format: FACT (Date range: from - to)
<FACTS>
{facts}
</FACTS>

# These are the most relevant entities
# ENTITY_NAME: entity summary
<ENTITIES>
{entities}
</ENTITIES>
"""

def format_edge_date_range(edge: EntityEdge) -> str:
    """
    Render an edge's validity window as "<valid_at> - <invalid_at>".

    A falsy `valid_at` is shown as 'date unknown' and a falsy `invalid_at` as
    'present'; the values are otherwise interpolated as-is.
    """
    start = edge.valid_at or 'date unknown'
    end = edge.invalid_at or 'present'
    return f"{start} - {end}"


def compose_search_context(edges: list[EntityEdge], nodes: list[EntityNode]) -> str:
    """
    Build the prompt context from graph search results.

    Args:
        edges: Fact edges; each contributes a "  - <fact> (<date range>)" line.
        nodes: Entity nodes; each contributes a "  - <name>: <summary>" line.

    Returns:
        TEMPLATE with the fact lines and entity lines filled in. A `None` value
        for either argument is treated as an empty list, so a search that yields
        no results produces an empty section instead of raising TypeError.
    """
    facts = [f'  - {edge.fact} ({format_edge_date_range(edge)})' for edge in (edges or [])]
    entities = [f'  - {node.name}: {node.summary}' for node in (nodes or [])]
    return TEMPLATE.format(facts='\n'.join(facts), entities='\n'.join(entities))


async def dmr_eval_session(zep: AsyncZep, idx: int) -> bool:
    """
    Evaluate one MSC sample with Zep-retrieved context.

    Searches the graph of user "msc_experiment_user_<idx>" for edges and nodes
    relevant to the sample's question, composes them into a context, has the LLM
    answer the question with that context, and grades the answer against the gold
    answer. Prints the details of the sample when the grade is falsy.

    Returns the grade produced by dmr_grader.
    """
    # Same user naming scheme as the ingestion step.
    user_id = f"msc_experiment_user_{idx}"

    # Question (B's message) and gold answer (A's message) for this sample.
    question = msc_dataset_df['self_instruct'][idx]["B"]
    gold_answer = msc_dataset_df['self_instruct'][idx]["A"]

    # Retrieve relevant facts (edges) first, then relevant entities (nodes).
    edge_search = await zep.graph.search(
        user_id=user_id,
        reranker='cross_encoder',
        query=question,
        scope='edges',
        limit=20,
    )
    edges_results = edge_search.edges
    node_search = await zep.graph.search(
        user_id=user_id,
        reranker='rrf',
        query=question,
        scope='nodes',
        limit=20,
    )
    node_results = node_search.nodes

    context = compose_search_context(edges_results, node_results)

    # Answer with the retrieved context, then grade the answer.
    response = await dmr_response(oai_client, context, question)
    grade = await dmr_grader(oai_client, question, gold_answer, response)

    # Dump everything needed to inspect a miss.
    if not grade:
        diagnostics = (
            ('IDX: ', idx),
            ('CONTEXT: ', context),
            ('QUESTION: ', question),
            ('GOLDEN ANSWER: ', gold_answer),
            ('RESPONSE: ', response),
            ('GRADE: ', grade),
        )
        for label, value in diagnostics:
            print(label, value)

    return grade
    
async def dmr_eval(zep: AsyncZep, num_multi_sessions: int = 500):
    """
    Run the Zep-backed DMR evaluation and print the aggregated results.

    Args:
        zep: Async Zep client passed through to dmr_eval_session.
        num_multi_sessions: Number of samples to evaluate, starting at index 0.
            Defaults to 500.

    Samples are evaluated one after another. The printed dict holds the number
    of correct answers, the total, and the accuracy (0 for an empty run).
    """
    grades: list[bool] = []
    for multi_session_idx in range(num_multi_sessions):
        grades.append(await dmr_eval_session(zep, multi_session_idx))

    correct = sum(1 for grade in grades if grade)

    eval_results = {
        "correct": correct,
        "total": num_multi_sessions,
        # Guard the empty run so it reports 0 instead of dividing by zero.
        "accuracy": correct / num_multi_sessions if num_multi_sessions else 0,
    }

    print(eval_results)
    
# Run the Zep-backed evaluation with the shared client (top-level await, as used in notebook cells).
await dmr_eval(zep)