In [None]:
######## Installations - BE SURE TO MAKE YOUR OWN LOCAL VENV FIRST
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
# NOTE(review): the versions are not pinned, and `utils` (imported in the next cell)
# is not installed here - presumably it is a local module next to this notebook.

%pip install pandas graphiti-core

In [None]:
######## Imports

import csv
import json
import os
from datetime import datetime

import pandas as pd
from utils import dict_to_entity, entity_to_dict, ingest_and_label_minidataset

from graphiti_core.llm_client import OpenAIClient
from graphiti_core.llm_client.config import LLMConfig
from graphiti_core.nodes import EntityNode

In [None]:
######## Load the eval dataset
# The path is relative, so it is resolved against the kernel's current working directory.
folder_path = './longmemeval_data/snippetized_data'
file_path = os.path.join(folder_path, 'longmemeval_oracle_snippetized.csv')
# Later cells read the `question_type`, `message_has_answer`, `num_previous_messages`,
# `message` and `previous_messages` columns of this frame.
lme_dataset_df = pd.read_csv(file_path)
# Preview the first rows as the cell output.
lme_dataset_df.head()

In [None]:
######## Methods

MAX_NUM_PREVIOUS_MESSAGES = 5


def filter_for_zep_labelling(
    df,
    question_type='single-session-user',
    num_previous_messages=None,
    max_rows=1,
):
    """
    Filters the dataset for the snippets we want to use for Zep labelling.

    A row is kept only when all three conditions hold:
      * its ``question_type`` equals ``question_type``,
      * its ``message_has_answer`` is True,
      * its ``num_previous_messages`` equals ``num_previous_messages``.

    Args:
        df: DataFrame with ``question_type``, ``message_has_answer`` and
            ``num_previous_messages`` columns. It is not modified.
        question_type: Question type to keep.
        num_previous_messages: Required number of previous messages. ``None``
            (the default) means ``MAX_NUM_PREVIOUS_MESSAGES``, read at call time.
        max_rows: Maximum number of matching rows to return, taken from the top.
            Defaults to 1; pass ``None`` to return every matching row.

    Returns:
        The matching rows, with their original index labels.
    """
    if num_previous_messages is None:
        num_previous_messages = MAX_NUM_PREVIOUS_MESSAGES

    # `== True` is intentional: it also yields False for missing values, which a bare
    # boolean mask would reject when the column contains NaN.
    keep = (
        (df['question_type'] == question_type)
        & (df['message_has_answer'] == True)  # noqa: E712
        & (df['num_previous_messages'] == num_previous_messages)
    )
    matching = df[keep]

    if max_rows is None:
        return matching
    return matching.head(max_rows)


def expand_previous_messages(df):
    """
    Expands the previous_messages column into separate columns.

    Each cell of ``previous_messages`` is a JSON string encoding a list of messages.
    The result has one ``previous_message_<k>`` column per position
    (k = 1..MAX_NUM_PREVIOUS_MESSAGES) holding the k-th decoded message, or None when
    the list is shorter than k. The ``previous_messages`` column itself is removed.

    The input frame is left untouched. Overwriting its ``previous_messages`` column
    with the decoded lists would make a second call on the same frame fail, because
    ``json.loads`` rejects lists, and would emit SettingWithCopyWarning when ``df`` is
    a filtered slice of another frame.

    Args:
        df: DataFrame with a ``previous_messages`` column of JSON strings.

    Returns:
        A new DataFrame with the original columns (minus ``previous_messages``)
        followed by the ``previous_message_<k>`` columns.
    """
    # Decode into a standalone Series instead of writing back into the caller's frame.
    decoded = df['previous_messages'].apply(json.loads)

    # `drop` returns a new frame, so the assignments below never reach `df`.
    expanded = df.drop('previous_messages', axis=1)

    for position in range(MAX_NUM_PREVIOUS_MESSAGES):
        expanded[f'previous_message_{position + 1}'] = decoded.apply(
            lambda messages, position=position: messages[position]
            if position < len(messages)
            else None
        )

    return expanded


def make_messages_readable(df):
    """
    Rewrites every message cell as a role banner, a blank line, then the text.

    The banner is the role surrounded by two spaces and ten ``|`` characters on each
    side. Each ``previous_message_<k>`` cell (a mapping with ``role`` and ``content``)
    and the ``message`` cell (whose role comes from ``message_role``) is replaced by
    that rendering; cells that are None stay None.

    The frame is modified in place and also returned.
    """
    bar = '|' * 10

    def render_previous(entry):
        # Previous messages carry their own role and content.
        if entry is None:
            return None
        return bar + f"  {entry['role']}  " + bar + '\n\n' + f"{entry['content']}"

    def render_current(record):
        # The current message keeps its role in a sibling column of the same row.
        if record['message'] is None:
            return None
        return bar + f"  {record['message_role']}  " + bar + '\n\n' + f"{record['message']}"

    for slot in range(1, MAX_NUM_PREVIOUS_MESSAGES + 1):
        column = f'previous_message_{slot}'
        df[column] = df[column].apply(render_previous)

    df['message'] = df.apply(render_current, axis=1)
    return df


def order_columns(df):
    """
    Returns the frame restricted to the labelling columns, in presentation order.

    The order is: identification and metadata columns, then ``previous_message_1``
    through ``previous_message_5``, then ``message``. Any other column is dropped,
    and a missing column raises KeyError.
    """
    metadata_columns = [
        'question_id',
        'question_type',
        'multisession_index',
        'session_index',
        'message_index_within_session',
        'message_index_across_sessions',
        'session_date',
        'message_role',
        'num_previous_messages',
        'message_has_answer',
    ]
    previous_message_columns = [f'previous_message_{slot}' for slot in range(1, 6)]

    return df[metadata_columns + previous_message_columns + ['message']]


def insert_answer_columns(df, num_prompt_instructions):
    """
    Inserts empty answer and "Done?" columns after every message column, in place.

    For each prompt instruction p (1..num_prompt_instructions) and each message
    column, two columns filled with '' are inserted directly after that message
    column: the answer column and then the "Done?" column, both tagged ``(p.k)``.
    k is the message position: 1..MAX_NUM_PREVIOUS_MESSAGES for
    ``previous_message_<k>`` and MAX_NUM_PREVIOUS_MESSAGES + 1 for ``message``.

    Instructions are processed from the highest number down, so the pairs for
    instruction 1 end up closest to the message column. Returns None.
    """
    # (message column, position) pairs: previous messages from last to first,
    # then the current message.
    anchors = [
        (f'previous_message_{slot}', slot) for slot in range(MAX_NUM_PREVIOUS_MESSAGES, 0, -1)
    ]
    anchors.append(('message', MAX_NUM_PREVIOUS_MESSAGES + 1))

    for instruction in range(num_prompt_instructions, 0, -1):
        for anchor, slot in anchors:
            tag = f'({instruction}.{slot})'
            new_columns = (
                f'Answer to Prompt Instruction {instruction} {tag}',
                f'Done?                {tag}',
            )
            # Look the anchor up again for every insert: earlier inserts shift positions.
            for offset, label in enumerate(new_columns, start=1):
                df.insert(loc=df.columns.get_loc(anchor) + offset, column=label, value='')


def insert_default_answers_round1(df):
    """
    Fills the prompt-instruction-1 answer columns with a default value, in place.

    The default is ``[$<role>$, ]``, where the role comes from the message the answer
    column belongs to: the ``role`` key of ``previous_message_<k>`` for positions
    1..MAX_NUM_PREVIOUS_MESSAGES ('' when that message is None), and the
    ``message_role`` column for the final position. Returns None.
    """

    def answer_column(slot):
        return f'Answer to Prompt Instruction 1 (1.{slot})'

    def default_answer(role):
        return f"[${role}$, ]"

    # Previous messages, visited from the last position down to the first.
    for slot in reversed(range(1, MAX_NUM_PREVIOUS_MESSAGES + 1)):
        df[answer_column(slot)] = df[f'previous_message_{slot}'].apply(
            lambda entry: '' if entry is None else default_answer(entry['role'])
        )

    # The current message sits one position after the previous messages.
    df[answer_column(MAX_NUM_PREVIOUS_MESSAGES + 1)] = df.apply(
        lambda record: default_answer(record['message_role']), axis=1
    )


def insert_example_row(df, num_prompt_instructions):
    """
    Puts a placeholder row, with 'EXAMPLE' in every column, at the top of the frame.

    The frame is modified in place and also returned. ``num_prompt_instructions`` is
    accepted but not used.
    """
    placeholder = dict.fromkeys(df.columns, 'EXAMPLE')

    # Add the row under label -1, shift every label up by one so it becomes 0,
    # then sort by label to bring it to the first position.
    df.loc[-1] = placeholder
    df.index = df.index + 1
    df.sort_index(inplace=True)

    return df


def transform_eval_minidataset(df, task_names=None):
    """
    Expands every snippet into one row per (message, task) pair.

    A snippet is one input row: a ``message`` plus ``previous_messages``, a JSON
    string encoding the list of messages that precede it. Each message of the
    snippet (the previous ones in order, then ``message`` itself) is paired with
    every task name, so a snippet with N previous messages produces
    ``(N + 1) * len(task_names)`` rows.

    Output columns:
      * ``snippet_index``: position of the snippet in ``df`` (0-based).
      * ``message_index_within_snippet``: position of the message in the snippet.
      * ``task_index`` / ``task_name``: position and name of the task.
      * ``snippet_message`` / ``snippet_previous_messages``: the snippet's original
        ``message`` and ``previous_messages`` values.
      * ``input_message``: the message this row is about. A previous message is
        re-serialised as JSON; the snippet's own message is passed through as-is.
      * ``input_previous_messages``: JSON list of the messages before ``input_message``.

    Args:
        df: DataFrame with ``message`` and ``previous_messages`` columns. It is not
            modified, and its other columns are ignored.
        task_names: Optional sequence of task names. Defaults to the six tasks
            listed in the function body.

    Returns:
        A new DataFrame with the columns above and a sequential index. An empty
        ``df`` yields an empty frame with the same columns.
    """
    if task_names is None:
        task_names = [
            'extract_nodes',
            'dedupe_nodes',
            'extract_edges',
            'dedupe_edges',
            'extract_edge_dates',
            'edge_invalidation',
        ]

    output_columns = [
        'snippet_index',
        'message_index_within_snippet',
        'task_index',
        'task_name',
        'snippet_message',
        'snippet_previous_messages',
        'input_message',
        'input_previous_messages',
    ]

    # Collect plain dicts and build the frame once; copying a full row Series for
    # every output row is much slower and only carries columns dropped at the end.
    records = []
    snippets = zip(df['message'], df['previous_messages'])
    for snippet_index, (snippet_message, snippet_previous_messages) in enumerate(snippets):
        previous_messages = json.loads(snippet_previous_messages)

        # (input_message, input_previous_messages) for each message of the snippet.
        inputs = [
            (json.dumps(message), json.dumps(previous_messages[:position]))
            for position, message in enumerate(previous_messages)
        ]
        # The snippet's own message comes last and keeps its original values.
        inputs.append((snippet_message, snippet_previous_messages))

        for message_index, (input_message, input_previous_messages) in enumerate(inputs):
            for task_index, task_name in enumerate(task_names):
                records.append(
                    {
                        'snippet_index': snippet_index,
                        'message_index_within_snippet': message_index,
                        'task_index': task_index,
                        'task_name': task_name,
                        'snippet_message': snippet_message,
                        'snippet_previous_messages': snippet_previous_messages,
                        'input_message': input_message,
                        'input_previous_messages': input_previous_messages,
                    }
                )

    # Passing `columns` fixes the column order and keeps the schema for empty input.
    return pd.DataFrame(records, columns=output_columns)

In [None]:
# Widen the pandas display limits so frames with many, long columns are shown in full:
# up to 100 columns, up to 100 rows, and up to 120 characters per cell.
pd.set_option(
    'display.max_columns', 100,
    'display.max_rows', 100,
    'display.max_colwidth', 120,
)

In [None]:
######## Filtering to only snippets/rows we want
# NOTE: as defined in this notebook, filter_for_zep_labelling returns at most one
# row by default, so the cells below work on a single snippet.
lme_dataset_df_filtered = filter_for_zep_labelling(lme_dataset_df)
lme_dataset_df_filtered.head()

In [None]:
#################### Create the eval CSV
# Work on a copy so the filtered frame stays available unchanged for other cells.
eval_minidataset = lme_dataset_df_filtered.copy()
# Expand the snippets into one row per message and per task.
eval_minidataset = transform_eval_minidataset(eval_minidataset)
# Print the number of rows and columns
print(f'Number of rows: {len(eval_minidataset)}')
print(f'Number of columns: {len(eval_minidataset.columns)}')
# NOTE(review): despite the heading, nothing is written to disk in this cell.
eval_minidataset.head(100)

In [None]:
# Insert gpt-4o-mini answers by doing ingestion in the right order and filling extra input columns as needed
# The API key is read from the OPENAI_API_KEY environment variable; os.getenv returns
# None when it is not set.
model_name = 'gpt-4o-mini'
llm_config = LLMConfig(
    api_key=os.getenv('OPENAI_API_KEY'),
    model=model_name,
)
llm_client = OpenAIClient(config=llm_config)
# Name of the column that receives the model output; a later cell reads it back by this name.
output_column_name = 'output_gpt4o_mini'
# Top-level `await` works here because the cell runs in a notebook (IPython) kernel.
# NOTE(review): presumably this issues LLM calls for every row - confirm cost before re-running.
eval_minidataset_labelled = await ingest_and_label_minidataset(
    llm_client, eval_minidataset, output_column_name
)

# Print the number of rows and columns
print(f'Number of rows: {len(eval_minidataset_labelled)}')
print(f'Number of columns: {len(eval_minidataset_labelled.columns)}')
eval_minidataset_labelled.head(100)

In [None]:
# Print the input message and the labelled output for one row of the labelled dataset.
# `index` is a 0-based row position (used with .iloc); change it to inspect another row.
index = 2
print('Input Message:')
print(eval_minidataset_labelled.iloc[index]['input_message'])
print('-' * 100)
# The output cell is parsed as JSON and iterated as a list of dicts.
cell_value = eval_minidataset_labelled.iloc[index][output_column_name]
cell_value_dicts = json.loads(cell_value)
# The loop variable must not be called `dict`: in a notebook it stays bound after the
# loop and would shadow the builtin `dict` in every cell run afterwards.
for cell_value_dict in cell_value_dicts:
    # Print only the 'fact' and 'name' values
    print(f"Fact: {cell_value_dict.get('fact', 'N/A')}, Name: {cell_value_dict.get('name', 'N/A')}")
    print('-' * 100)
    print('\n')

In [None]:
#################### Create the human labelling CSV


In [None]:
#################### Create the human labelling CSV (old)


# ######## Expanding the previous_messages column
# lme_dataset_df_filtered_human_labelling = expand_previous_messages(lme_dataset_df_filtered)
# lme_dataset_df_filtered_human_labelling.head()

# ######## Order the columns in the way we want them
# lme_dataset_df_filtered_human_labelling = order_columns(lme_dataset_df_filtered_human_labelling)
# lme_dataset_df_filtered_human_labelling.head()

# ######## Insert empty answer columns
# num_prompt_instructions = 1
# insert_answer_columns(lme_dataset_df_filtered_human_labelling, num_prompt_instructions)
# lme_dataset_df_filtered_human_labelling.head()

# ######## Insert default values for the answers
# insert_default_answers_round1(lme_dataset_df_filtered_human_labelling)
# lme_dataset_df_filtered_human_labelling.head()

# ######## Make the messages more readable
# lme_dataset_df_filtered_human_labelling = make_messages_readable(lme_dataset_df_filtered_human_labelling)
# lme_dataset_df_filtered_human_labelling.head(10)

# ######## Add example row to the top
# insert_example_row(lme_dataset_df_filtered_human_labelling, num_prompt_instructions)
# lme_dataset_df_filtered_human_labelling.head(10)

# ######## Save to csv
# lme_dataset_df_filtered_human_labelling.to_csv("lme_dataset_df_filtered_human_labelling.csv", index=False)