In [1]:
######## Installations

# Install the third-party packages imported below; %pip targets the running kernel's environment.
# NOTE(review): versions are unpinned, so a later re-run may pull different releases — consider pinning.
%pip install gdown pandas


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
######## Imports

import pandas as pd
import gdown
import tarfile
import os
import json
from datetime import datetime
import csv

In [3]:
######## Download the eval dataset from the official Google Drive source

file_id = '1zJgtYRFhOh5zDQzzatiddfjYhFSnyQ80'
url = f'https://drive.google.com/uc?id={file_id}'
folder_path = './longmemeval_data'
file_path = os.path.join(folder_path, 'longmemeval_data.tar.gz')

# Create the "./longmemeval_data/" directory if it doesn't exist yet.
# exist_ok=True makes this idempotent without a separate (racy) exists() check.
os.makedirs(folder_path, exist_ok=True)

# Download the compressed dataset
if not os.path.exists(file_path):
    downloaded_path = gdown.download(url, file_path, quiet=False)
    # NOTE(review): some gdown versions return None on failure instead of raising —
    # fail loudly here rather than with a confusing tarfile error further down.
    if downloaded_path is None:
        raise RuntimeError(f"Download of '{url}' to '{file_path}' failed.")
else:
    print(f"'{file_path}' already exists, skipping download.")

# Extract the tar.gz
if not os.path.exists(os.path.join(folder_path, 'longmemeval_oracle.json')):
    with tarfile.open(file_path, 'r:gz') as tar:
        # The archive comes from the network, so use the "data" extraction filter where
        # the running Python provides it; it rejects members that would escape
        # folder_path (absolute paths, "..", unsafe links).
        if hasattr(tarfile, 'data_filter'):
            tar.extractall(path=folder_path, filter='data')
        else:
            tar.extractall(path=folder_path)
else:
    print("'longmemeval_oracle.json' already exists, so skipping extraction.")

'./longmemeval_data/longmemeval_data.tar.gz' already exists, skipping download.
'longmemeval_oracle.json' already exists, so skipping extraction.


In [28]:
######## Load the eval dataset

# Path of the dataset variant to load; the file suffix can be _oracle, _s, or _m.
lme_dataset_option = os.path.join(folder_path, 'longmemeval_oracle.json')

# Parse the JSON file into a DataFrame, then preview its first rows as the cell output.
lme_dataset_df = pd.read_json(path_or_buf=lme_dataset_option)
lme_dataset_df.head(n=5)

In [57]:
######## Methods to save every snippet (a message plus its preceding messages) of a LongMemEval dataset file to a CSV file


# Column order of the snippet CSV. "message_has_answer" is appended for the oracle file only.
_SNIPPET_FIELDNAMES = (
    "question_id",
    "question_type",
    "multisession_index",
    "session_index",
    "message_index_within_session",
    "message_index_across_sessions",
    "session_date",
    "message_role",
    "message",
    "previous_messages",
    "num_previous_messages",
)


def _build_question_snippets(row, multisession_index, max_num_previous_messages, include_has_answer):
    """Build the chronologically ordered snippets for one dataset row (one question)."""
    # Pair every session with its parsed date so the sessions can be ordered in time.
    sessions_data = [
        {"session": session, "date": datetime.strptime(date, "%Y/%m/%d (%a) %H:%M")}
        for session, date in zip(row["haystack_sessions"], row["haystack_dates"])
    ]
    # Sort by date from earliest to latest (stable, so ties keep their original order).
    sessions_data.sort(key=lambda x: x["date"])

    question_snippets = []
    for session_index, session_and_date in enumerate(sessions_data):
        for message_index_within_session, message in enumerate(session_and_date["session"]):
            # One snippet is appended per message, so the list length is the running index.
            message_index_across_sessions = len(question_snippets)

            # Clamp to [0, messages seen so far]; the context window deliberately
            # reaches back across session boundaries.
            num_previous_messages = max(0, min(max_num_previous_messages, message_index_across_sessions))
            previous_snippets = question_snippets[message_index_across_sessions - num_previous_messages:]
            previous_messages_only = [
                {"role": previous_snippet["message_role"], "content": previous_snippet["message"]}
                for previous_snippet in previous_snippets
            ]

            snippet = {
                "question_id": row["question_id"],
                "question_type": row["question_type"],
                "multisession_index": multisession_index,
                "session_index": session_index,
                "message_index_within_session": message_index_within_session,
                "message_index_across_sessions": message_index_across_sessions,
                "session_date": session_and_date["date"],
                "message_role": message["role"],
                "message": message["content"],
                "previous_messages": previous_messages_only,
                "num_previous_messages": num_previous_messages,
            }

            if include_has_answer:
                # .get(): a turn without the label yields an empty CSV cell instead of a KeyError.
                snippet["message_has_answer"] = message.get("has_answer")

            question_snippets.append(snippet)

    return question_snippets


def snippetize_lme_dataset(lme_filename, max_num_previous_messages=5, data_dir=None):
    """
    Creates a csv where each row is a "snippet" from longmemeval. A snippet is a message and set of previous messages.

    Args:
        lme_filename: Name of the dataset JSON file inside the data directory. The output is
            written next to it, with ".json" replaced by "_snippetized.csv".
        max_num_previous_messages: Maximum number of preceding messages (counted across the
            question's date-sorted sessions) attached to each snippet. Negative values are
            treated as 0.
        data_dir: Directory holding the dataset file. Defaults to the notebook-level
            `folder_path` when omitted.

    The "message" and "previous_messages" columns are JSON-encoded. A "message_has_answer"
    column is added only when `lme_filename` is "longmemeval_oracle.json".
    """
    if data_dir is None:
        # Backward-compatible default: the directory configured in the download cell.
        data_dir = folder_path

    lme_dataset_df = pd.read_json(os.path.join(data_dir, lme_filename))
    include_has_answer = lme_filename == "longmemeval_oracle.json"

    all_snippets = []
    for index, row in lme_dataset_df.iterrows():
        all_snippets.extend(
            _build_question_snippets(row, index, max_num_previous_messages, include_has_answer)
        )

    # Explicit column list, so an empty dataset still produces a valid header-only CSV.
    fieldnames = list(_SNIPPET_FIELDNAMES)
    if include_has_answer:
        fieldnames.append("message_has_answer")

    filename = lme_filename.replace(".json", "_snippetized.csv")
    filepath = os.path.join(data_dir, filename)

    with open(filepath, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for snippet in all_snippets:
            # Serialize into a copy of the row so the in-memory snippets are not mutated.
            writer.writerow({
                **snippet,
                "message": json.dumps(snippet["message"]),
                "previous_messages": json.dumps(snippet["previous_messages"]),
            })




def snippetize_and_check(lme_filename, force=False):
    """
    Snippetize a dataset file unless its CSV already exists, then preview the CSV.

    Args:
        lme_filename: Name of the dataset JSON file inside the notebook-level `folder_path`.
        force: When True, regenerate the CSV even if one already exists (use this after
            changing the snippetization code, since an existing CSV is otherwise reused as-is).

    The CSV path is derived from the same notebook-level `folder_path` that
    `snippetize_lme_dataset` writes to, so the existence check and the write cannot diverge.
    """
    file_path = os.path.join(folder_path, lme_filename.replace(".json", "_snippetized.csv"))
    if force or not os.path.exists(file_path):
        print(f"Snippetizing {lme_filename}...")
        snippetize_lme_dataset(lme_filename)
    else:
        print(f"Skipping snippetization for {lme_filename} because it already exists.")

    # Check first few rows of the csv
    df = pd.read_csv(file_path)
    display(df.head())

In [56]:
# Snippetize the oracle dataset (skipped when its CSV already exists) and preview the CSV.
lme_filename = "longmemeval_oracle.json"
snippetize_and_check(lme_filename)


Skipping snippetization for longmemeval_oracle.json because it already exists.


Unnamed: 0,question_id,question_type,multisession_index,session_index,message_index_within_session,message_index_across_sessions,session_date,message_role,message,previous_messages,message_has_answer
0,gpt4_2655b836,temporal-reasoning,0,0,0,0,2023-04-10 14:47:00,user,"""I'm thinking of getting a car wax and detaili...",[],False
1,gpt4_2655b836,temporal-reasoning,0,0,1,1,2023-04-10 14:47:00,assistant,"""Choosing the right detailer can make all the ...","[{""role"": ""user"", ""content"": ""I'm thinking of ...",False
2,gpt4_2655b836,temporal-reasoning,0,0,2,2,2023-04-10 14:47:00,user,"""I've been doing some research and found a loc...","[{""role"": ""user"", ""content"": ""I'm thinking of ...",True
3,gpt4_2655b836,temporal-reasoning,0,0,3,3,2023-04-10 14:47:00,assistant,"""That's great to hear that the dealership was ...","[{""role"": ""user"", ""content"": ""I'm thinking of ...",False
4,gpt4_2655b836,temporal-reasoning,0,0,4,4,2023-04-10 14:47:00,user,"""I'll definitely ask those questions when I vi...","[{""role"": ""user"", ""content"": ""I'm thinking of ...",False


In [52]:
# Snippetize the "_s" dataset (skipped when its CSV already exists) and preview the CSV.
lme_filename = "longmemeval_s.json"
snippetize_and_check(lme_filename)

Snippetizing longmemeval_s.json...


Unnamed: 0,question_id,question_type,multisession_index,session_index,message_index_within_session,message_index_across_sessions,session_date,message_role,message,previous_messages
0,e47becba,single-session-user,0,0,0,0,2023-05-20 02:21:00,user,"""The farmer needs to transport a fox, a chicke...",[]
1,e47becba,single-session-user,0,0,1,1,2023-05-20 02:21:00,assistant,"""To solve this puzzle, the farmer can follow t...","[{""role"": ""user"", ""content"": ""The farmer needs..."
2,e47becba,single-session-user,0,1,0,2,2023-05-20 02:57:00,user,"""I'm trying to stay on top of my fitness goals...","[{""role"": ""user"", ""content"": ""The farmer needs..."
3,e47becba,single-session-user,0,1,1,3,2023-05-20 02:57:00,assistant,"""Congratulations on taking the first step (pun...","[{""role"": ""user"", ""content"": ""The farmer needs..."
4,e47becba,single-session-user,0,1,2,4,2023-05-20 02:57:00,user,"""I've been doing some yoga in the morning, and...","[{""role"": ""user"", ""content"": ""The farmer needs..."


In [53]:
# Snippetize the "_m" dataset (skipped when its CSV already exists) and preview the CSV.
lme_filename = "longmemeval_m.json"
snippetize_and_check(lme_filename)

Snippetizing longmemeval_m.json...


Unnamed: 0,question_id,question_type,multisession_index,session_index,message_index_within_session,message_index_across_sessions,session_date,message_role,message,previous_messages
0,7161e7e2,single-session-assistant,0,0,0,0,2023-05-20 00:04:00,user,"""Can you provide the technical details of how ...",[]
1,7161e7e2,single-session-assistant,0,0,1,1,2023-05-20 00:04:00,assistant,"""Yes, here are the technical details of how a ...","[{""role"": ""user"", ""content"": ""Can you provide ..."
2,7161e7e2,single-session-assistant,0,0,2,2,2023-05-20 00:04:00,user,"""Wow, it's amazing how much goes into launchin...","[{""role"": ""user"", ""content"": ""Can you provide ..."
3,7161e7e2,single-session-assistant,0,0,3,3,2023-05-20 00:04:00,assistant,"""Yes, achieving a stable orbit is a highly sop...","[{""role"": ""user"", ""content"": ""Can you provide ..."
4,7161e7e2,single-session-assistant,0,0,4,4,2023-05-20 00:04:00,user,"""It's crazy to think about how astronauts are ...","[{""role"": ""user"", ""content"": ""Can you provide ..."
