In [2]:
######## Installations - BE SURE TO MAKE YOUR OWN LOCAL VENV FIRST

# %pip (rather than !pip) installs into the environment of the running kernel.
# NOTE(review): versions are unpinned; consider pinning them for reproducible runs.
%pip install gdown pandas

Collecting gdown
  Using cached gdown-5.2.0-py3-none-any.whl.metadata (5.8 kB)
Collecting beautifulsoup4 (from gdown)
  Using cached beautifulsoup4-4.12.3-py3-none-any.whl.metadata (3.8 kB)
Collecting filelock (from gdown)
  Using cached filelock-3.16.1-py3-none-any.whl.metadata (2.9 kB)
Collecting requests[socks] (from gdown)
  Using cached requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting tqdm (from gdown)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting soupsieve>1.2 (from beautifulsoup4->gdown)
  Using cached soupsieve-2.6-py3-none-any.whl.metadata (4.6 kB)
Collecting charset-normalizer<4,>=2 (from requests[socks]->gdown)
  Downloading charset_normalizer-3.4.1-cp313-cp313-macosx_10_13_universal2.whl.metadata (35 kB)
Collecting idna<4,>=2.5 (from requests[socks]->gdown)
  Using cached idna-3.10-py3-none-any.whl.metadata (10 kB)
Collecting urllib3<3,>=1.21.1 (from requests[socks]->gdown)
  Downloading urllib3-2.3.0-py3-none-any.whl.metadata (6.5 kB)

In [3]:
######## Imports

import pandas as pd
import gdown
import tarfile
import os
import json
from datetime import datetime
import csv

In [4]:
######## Download the eval dataset from the official Google Drive source

file_id = '1zJgtYRFhOh5zDQzzatiddfjYhFSnyQ80'
url = f'https://drive.google.com/uc?id={file_id}'
folder_path = './longmemeval_data'
file_path = os.path.join(folder_path, 'longmemeval_data.tar.gz')

# Create the "./longmemeval_data/" directory if it doesn't exist yet.
# exist_ok=True makes this a no-op on re-runs and avoids the check-then-create race.
os.makedirs(folder_path, exist_ok=True)

# Download the compressed dataset (skipped when the archive is already on disk)
if not os.path.exists(file_path):
    gdown.download(url, file_path, quiet=False)
else:
    print(f"'{file_path}' already exists, skipping download.")

# Extract the tar.gz (skipped when the oracle file has already been extracted)
if not os.path.exists(os.path.join(folder_path, 'longmemeval_oracle.json')):
    with tarfile.open(file_path, 'r:gz') as tar:
        # The archive comes from the network, so prefer the 'data' extraction filter,
        # which refuses members that would be written outside folder_path.
        # Interpreters without extraction-filter support fall back to the plain call.
        if hasattr(tarfile, 'data_filter'):
            tar.extractall(path=folder_path, filter='data')
        else:
            tar.extractall(path=folder_path)
else:
    print("'longmemeval_oracle.json' already exists, so skipping extraction.")

'./longmemeval_data/longmemeval_data.tar.gz' already exists, skipping download.
'longmemeval_oracle.json' already exists, so skipping extraction.


In [5]:
######## Load the eval dataset

# Path of the dataset variant to preview, built from the folder_path set in the download cell
lme_dataset_option = os.path.join(folder_path, 'longmemeval_oracle.json') # Can be _oracle, _s, or _m
lme_dataset_df = pd.read_json(lme_dataset_option)
# Bare last expression so the notebook renders the first rows as the cell output
lme_dataset_df.head()

Unnamed: 0,question_id,question_type,question,answer,question_date,haystack_dates,haystack_session_ids,haystack_sessions,answer_session_ids
0,gpt4_2655b836,temporal-reasoning,What was the first issue I had with my new car...,GPS system not functioning correctly,2023/04/10 (Mon) 23:07,"[2023/04/10 (Mon) 17:50, 2023/04/10 (Mon) 14:4...","[answer_4be1b6b4_2, answer_4be1b6b4_3, answer_...","[[{'role': 'user', 'content': 'I'm thinking of...","[answer_4be1b6b4_2, answer_4be1b6b4_3, answer_..."
1,gpt4_2487a7cb,temporal-reasoning,"Which event did I attend first, the 'Effective...",'Data Analysis using Python' webinar,2023/05/28 (Sun) 06:47,"[2023/05/28 (Sun) 21:04, 2023/05/28 (Sun) 07:17]","[answer_1c6b85ea_1, answer_1c6b85ea_2]","[[{'role': 'user', 'content': 'I'm trying to g...","[answer_1c6b85ea_1, answer_1c6b85ea_2]"
2,gpt4_76048e76,temporal-reasoning,Which vehicle did I take care of first in Febr...,bike,2023/03/10 (Fri) 23:15,"[2023/03/10 (Fri) 22:50, 2023/03/10 (Fri) 08:11]","[answer_b535969f_2, answer_b535969f_1]","[[{'role': 'user', 'content': 'I'm thinking of...","[answer_b535969f_2, answer_b535969f_1]"
3,gpt4_2312f94c,temporal-reasoning,"Which device did I got first, the Samsung Gala...",Samsung Galaxy S22,2023/03/15 (Wed) 03:53,"[2023/03/15 (Wed) 00:56, 2023/03/15 (Wed) 10:31]","[answer_5328c3c2_1, answer_5328c3c2_2]","[[{'role': 'user', 'content': 'I'm planning a ...","[answer_5328c3c2_1, answer_5328c3c2_2]"
4,0bb5a684,temporal-reasoning,How many days before the team meeting I was pr...,7 days. 8 days (including the last day) is als...,2023/01/13 (Fri) 19:39,"[2023/01/13 (Fri) 18:07, 2023/01/13 (Fri) 21:38]","[answer_e936197f_1, answer_e936197f_2]","[[{'role': 'user', 'content': 'I'm preparing f...","[answer_e936197f_1, answer_e936197f_2]"


In [13]:
######## Method to save every message of the specified dataset file as a "snippet" row in a CSV file


def _iter_snippets(lme_dataset_df, include_has_answer, max_num_previous_messages):
    """
    Yield one CSV-ready snippet dict per message, question by question.

    Args:
        lme_dataset_df: DataFrame with the columns "question_id", "question_type",
            "haystack_sessions" and "haystack_dates".
        include_has_answer: when True, each snippet also carries "message_has_answer",
            read from message["has_answer"] (KeyError if a message lacks that key).
        max_num_previous_messages: upper bound on how many preceding messages are attached.

    Yields:
        dict: the snippet, with "message" and "previous_messages" already JSON-encoded.
    """
    for index, row in lme_dataset_df.iterrows():
        # Pair every session with its parsed timestamp.
        # NOTE: "%a" (weekday abbreviation) is parsed according to the current locale.
        sessions_data = [
            {
                "session": session,
                "date": datetime.strptime(date, "%Y/%m/%d (%a) %H:%M")
            }
            for session, date in zip(row['haystack_sessions'], row['haystack_dates'])
        ]

        # Sort by date from earliest to latest
        sessions_data.sort(key=lambda x: x["date"])

        # Messages of this question seen so far, in chronological order. The context
        # window deliberately runs across session boundaries within one question.
        messages_so_far = []
        for session_index, session_and_date in enumerate(sessions_data):
            for message_index_within_session, message in enumerate(session_and_date["session"]):
                message_index_across_sessions = len(messages_so_far)
                num_previous_messages = min(max_num_previous_messages, message_index_across_sessions)

                # Slice from an explicit start index: a negative "-0" slice would
                # return the whole list instead of an empty one.
                window_start = message_index_across_sessions - num_previous_messages
                previous_messages_only = [
                    {"role": previous["role"], "content": previous["content"]}
                    for previous in messages_so_far[window_start:]
                ]

                snippet = {
                    "question_id": row['question_id'],
                    "question_type": row["question_type"],
                    "multisession_index": index,
                    "session_index": session_index,
                    "message_index_within_session": message_index_within_session,
                    "message_index_across_sessions": message_index_across_sessions,
                    "session_date": session_and_date["date"],
                    "message": json.dumps(message),
                    "previous_messages": json.dumps(previous_messages_only),
                    "num_previous_messages": num_previous_messages,
                }

                if include_has_answer:
                    snippet["message_has_answer"] = message["has_answer"]

                yield snippet
                messages_so_far.append(message)


def snippetize_lme_dataset(lme_filename, max_num_previous_messages=5):
    """
    Creates a csv where each row is a "snippet" from longmemeval. A snippet is a message and set of previous messages.

    The input is read from ``<folder_path>/<lme_filename>`` (``folder_path`` is the
    notebook-level variable) and the output is written to
    ``<folder_path>/snippetized_data/<lme_filename with ".json" replaced by "_snippetized.csv">``.

    Args:
        lme_filename: name of the dataset JSON file inside ``folder_path``. Only
            "longmemeval_oracle.json" gets the extra "message_has_answer" column.
        max_num_previous_messages: maximum number of preceding messages (from the same
            question, across its sessions) stored with each message.

    Returns:
        None. The CSV file is the result.

    Raises:
        ValueError: if a haystack date does not match "%Y/%m/%d (%a) %H:%M".
        KeyError: if an expected column or message key is missing.
    """

    lme_dataset_option = os.path.join(folder_path, lme_filename)
    lme_dataset_df = pd.read_json(lme_dataset_option)

    include_has_answer = lme_filename == "longmemeval_oracle.json"

    # Explicit column list (same order as before) so that an empty dataset still
    # produces a valid, header-only CSV instead of failing on a missing first row.
    fieldnames = [
        "question_id",
        "question_type",
        "multisession_index",
        "session_index",
        "message_index_within_session",
        "message_index_across_sessions",
        "session_date",
        "message",
        "previous_messages",
        "num_previous_messages",
    ]
    if include_has_answer:
        fieldnames.append("message_has_answer")

    snippetized_folder = os.path.join(folder_path, "snippetized_data")
    os.makedirs(snippetized_folder, exist_ok=True)

    filename = lme_filename.replace(".json", "_snippetized.csv")
    filepath = os.path.join(snippetized_folder, filename)

    # Write to a temporary file and move it into place only once complete, so an
    # interrupted run never leaves a truncated CSV that looks like a finished one.
    tmp_filepath = filepath + ".tmp"
    try:
        with open(tmp_filepath, "w", newline="", encoding="utf-8") as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            # Rows are streamed straight to disk rather than collected in memory first.
            for snippet in _iter_snippets(lme_dataset_df, include_has_answer, max_num_previous_messages):
                writer.writerow(snippet)
        os.replace(tmp_filepath, filepath)
    except BaseException:
        # Also covers KeyboardInterrupt (kernel interrupt): drop the partial file, then re-raise.
        if os.path.exists(tmp_filepath):
            os.remove(tmp_filepath)
        raise




def snippetize_and_check(lme_filename):
    """
    Snippetize ``lme_filename`` unless its CSV already exists, then display the first 10 rows.

    The CSV location is derived from the notebook-level ``folder_path`` in the same
    way ``snippetize_lme_dataset`` derives it, so both always agree on where the
    file lives even if ``folder_path`` is changed.

    Args:
        lme_filename: name of the dataset JSON file, e.g. "longmemeval_oracle.json".
    """
    snippetized_folder = os.path.join(folder_path, "snippetized_data")
    file_path = os.path.join(snippetized_folder, lme_filename.replace(".json", "_snippetized.csv"))

    # An existing CSV is reused as-is; delete it to force regeneration.
    if not os.path.exists(file_path):
        print(f"Snippetizing {lme_filename}...")
        snippetize_lme_dataset(lme_filename)
    else:
        print(f"Skipping snippetization for {lme_filename} because it already exists.")

    # Check first few rows of the csv
    df = pd.read_csv(file_path)
    display(df.head(10))


In [14]:
# Oracle variant: the only filename for which snippetize_lme_dataset adds the
# "message_has_answer" column.
lme_filename = "longmemeval_oracle.json"
snippetize_and_check(lme_filename)

Snippetizing longmemeval_oracle.json...


Unnamed: 0,question_id,question_type,multisession_index,session_index,message_index_within_session,message_index_across_sessions,session_date,message,previous_messages,num_previous_messages,message_has_answer
0,gpt4_2655b836,temporal-reasoning,0,0,0,0,2023-04-10 14:47:00,"{""role"": ""user"", ""content"": ""I'm thinking of g...",[],0,False
1,gpt4_2655b836,temporal-reasoning,0,0,1,1,2023-04-10 14:47:00,"{""role"": ""assistant"", ""content"": ""Choosing the...","[{""role"": ""user"", ""content"": ""I'm thinking of ...",1,False
2,gpt4_2655b836,temporal-reasoning,0,0,2,2,2023-04-10 14:47:00,"{""role"": ""user"", ""content"": ""I've been doing s...","[{""role"": ""user"", ""content"": ""I'm thinking of ...",2,True
3,gpt4_2655b836,temporal-reasoning,0,0,3,3,2023-04-10 14:47:00,"{""role"": ""assistant"", ""content"": ""That's great...","[{""role"": ""user"", ""content"": ""I'm thinking of ...",3,False
4,gpt4_2655b836,temporal-reasoning,0,0,4,4,2023-04-10 14:47:00,"{""role"": ""user"", ""content"": ""I'll definitely a...","[{""role"": ""user"", ""content"": ""I'm thinking of ...",4,False
5,gpt4_2655b836,temporal-reasoning,0,0,5,5,2023-04-10 14:47:00,"{""role"": ""assistant"", ""content"": ""Advanced pai...","[{""role"": ""user"", ""content"": ""I'm thinking of ...",5,False
6,gpt4_2655b836,temporal-reasoning,0,0,6,6,2023-04-10 14:47:00,"{""role"": ""user"", ""content"": ""I'll definitely a...","[{""role"": ""assistant"", ""content"": ""Choosing th...",5,False
7,gpt4_2655b836,temporal-reasoning,0,0,7,7,2023-04-10 14:47:00,"{""role"": ""assistant"", ""content"": ""Congratulati...","[{""role"": ""user"", ""content"": ""I've been doing ...",5,False
8,gpt4_2655b836,temporal-reasoning,0,0,8,8,2023-04-10 14:47:00,"{""role"": ""user"", ""content"": ""That's really hel...","[{""role"": ""assistant"", ""content"": ""That's grea...",5,False
9,gpt4_2655b836,temporal-reasoning,0,0,9,9,2023-04-10 14:47:00,"{""role"": ""assistant"", ""content"": ""Congratulati...","[{""role"": ""user"", ""content"": ""I'll definitely ...",5,False


In [15]:
# "_s" variant: written without the "message_has_answer" column.
lme_filename = "longmemeval_s.json"
snippetize_and_check(lme_filename)

Snippetizing longmemeval_s.json...


Unnamed: 0,question_id,question_type,multisession_index,session_index,message_index_within_session,message_index_across_sessions,session_date,message,previous_messages,num_previous_messages
0,e47becba,single-session-user,0,0,0,0,2023-05-20 02:21:00,"{""role"": ""user"", ""content"": ""The farmer needs ...",[],0
1,e47becba,single-session-user,0,0,1,1,2023-05-20 02:21:00,"{""role"": ""assistant"", ""content"": ""To solve thi...","[{""role"": ""user"", ""content"": ""The farmer needs...",1
2,e47becba,single-session-user,0,1,0,2,2023-05-20 02:57:00,"{""role"": ""user"", ""content"": ""I'm trying to sta...","[{""role"": ""user"", ""content"": ""The farmer needs...",2
3,e47becba,single-session-user,0,1,1,3,2023-05-20 02:57:00,"{""role"": ""assistant"", ""content"": ""Congratulati...","[{""role"": ""user"", ""content"": ""The farmer needs...",3
4,e47becba,single-session-user,0,1,2,4,2023-05-20 02:57:00,"{""role"": ""user"", ""content"": ""I've been doing s...","[{""role"": ""user"", ""content"": ""The farmer needs...",4
5,e47becba,single-session-user,0,1,3,5,2023-05-20 02:57:00,"{""role"": ""assistant"", ""content"": ""Yoga is an e...","[{""role"": ""user"", ""content"": ""The farmer needs...",5
6,e47becba,single-session-user,0,1,4,6,2023-05-20 02:57:00,"{""role"": ""user"", ""content"": ""That's really hel...","[{""role"": ""assistant"", ""content"": ""To solve th...",5
7,e47becba,single-session-user,0,1,5,7,2023-05-20 02:57:00,"{""role"": ""assistant"", ""content"": ""Foam rolling...","[{""role"": ""user"", ""content"": ""I'm trying to st...",5
8,e47becba,single-session-user,0,1,6,8,2023-05-20 02:57:00,"{""role"": ""user"", ""content"": ""I've also been tr...","[{""role"": ""assistant"", ""content"": ""Congratulat...",5
9,e47becba,single-session-user,0,1,7,9,2023-05-20 02:57:00,"{""role"": ""assistant"", ""content"": ""Monitoring y...","[{""role"": ""user"", ""content"": ""I've been doing ...",5


In [16]:
# "_m" variant: written without the "message_has_answer" column.
lme_filename = "longmemeval_m.json"
snippetize_and_check(lme_filename)

Snippetizing longmemeval_m.json...


Unnamed: 0,question_id,question_type,multisession_index,session_index,message_index_within_session,message_index_across_sessions,session_date,message,previous_messages,num_previous_messages
0,7161e7e2,single-session-assistant,0,0,0,0,2023-05-20 00:04:00,"{""role"": ""user"", ""content"": ""Can you provide t...",[],0
1,7161e7e2,single-session-assistant,0,0,1,1,2023-05-20 00:04:00,"{""role"": ""assistant"", ""content"": ""Yes, here ar...","[{""role"": ""user"", ""content"": ""Can you provide ...",1
2,7161e7e2,single-session-assistant,0,0,2,2,2023-05-20 00:04:00,"{""role"": ""user"", ""content"": ""Wow, it's amazing...","[{""role"": ""user"", ""content"": ""Can you provide ...",2
3,7161e7e2,single-session-assistant,0,0,3,3,2023-05-20 00:04:00,"{""role"": ""assistant"", ""content"": ""Yes, achievi...","[{""role"": ""user"", ""content"": ""Can you provide ...",3
4,7161e7e2,single-session-assistant,0,0,4,4,2023-05-20 00:04:00,"{""role"": ""user"", ""content"": ""It's crazy to thi...","[{""role"": ""user"", ""content"": ""Can you provide ...",4
5,7161e7e2,single-session-assistant,0,0,5,5,2023-05-20 00:04:00,"{""role"": ""assistant"", ""content"": ""Astronauts u...","[{""role"": ""user"", ""content"": ""Can you provide ...",5
6,7161e7e2,single-session-assistant,0,1,0,6,2023-05-20 00:16:00,"{""role"": ""user"", ""content"": ""What about aliena...","[{""role"": ""assistant"", ""content"": ""Yes, here a...",5
7,7161e7e2,single-session-assistant,0,1,1,7,2023-05-20 00:16:00,"{""role"": ""assistant"", ""content"": ""Alienation i...","[{""role"": ""user"", ""content"": ""Wow, it's amazin...",5
8,7161e7e2,single-session-assistant,0,1,2,8,2023-05-20 00:16:00,"{""role"": ""user"", ""content"": ""Why you did not m...","[{""role"": ""assistant"", ""content"": ""Yes, achiev...",5
9,7161e7e2,single-session-assistant,0,1,3,9,2023-05-20 00:16:00,"{""role"": ""assistant"", ""content"": ""I apologize ...","[{""role"": ""user"", ""content"": ""It's crazy to th...",5
