In [None]:
######## Installations - BE SURE TO MAKE YOUR OWN LOCAL VENV FIRST

%pip install gdown pandas

In [None]:
######## Imports

import csv
import json
import os
import tarfile
from datetime import datetime

import gdown
import pandas as pd

In [None]:
######## Download the eval dataset from the official Google Drive source

file_id = '1zJgtYRFhOh5zDQzzatiddfjYhFSnyQ80'
url = f'https://drive.google.com/uc?id={file_id}'
folder_path = './longmemeval_data'
file_path = os.path.join(folder_path, 'longmemeval_data.tar.gz')

# Create the "./longmemeval_data/" directory if it doesn't exist yet
# (exist_ok avoids the separate exists-check and makes re-running the cell safe)
os.makedirs(folder_path, exist_ok=True)

# Download the compressed dataset
if not os.path.exists(file_path):
    gdown.download(url, file_path, quiet=False)
else:
    print(f"'{file_path}' already exists, skipping download.")

# Extract the tar.gz
if not os.path.exists(os.path.join(folder_path, 'longmemeval_oracle.json')):
    with tarfile.open(file_path, 'r:gz') as tar:
        # The archive comes from the network, so use the 'data' extraction filter
        # where the running Python provides it: it refuses members that would be
        # written outside of folder_path (absolute paths, '..', escaping links).
        if hasattr(tarfile, 'data_filter'):
            tar.extractall(path=folder_path, filter='data')
        else:
            # Older Python without extraction filters: keep the previous behaviour
            tar.extractall(path=folder_path)
else:
    print("'longmemeval_oracle.json' already exists, so skipping extraction.")

In [None]:
######## Load the eval dataset

# Path of the dataset variant to load; the file suffix can be _oracle, _s, or _m
lme_dataset_option = os.path.join(folder_path, 'longmemeval_oracle.json')

# Parse the JSON file into a DataFrame and preview its first rows
lme_dataset_df = pd.read_json(lme_dataset_option)
lme_dataset_df.head()

In [None]:
######## Functions to save every snippet of the specified multi-session dataset to a CSV file and preview the result


def _iter_question_snippets(row, multisession_index, max_num_previous_messages, include_has_answer):
    """
    Yields one snippet dict per message of a single question, walking its sessions in chronological order.
    """
    # Pair every session with its parsed timestamp, then sort from earliest to latest
    sessions_data = [
        {'session': session, 'date': datetime.strptime(date, '%Y/%m/%d (%a) %H:%M')}
        for session, date in zip(row['haystack_sessions'], row['haystack_dates'])
    ]
    sessions_data.sort(key=lambda x: x['date'])

    # Role/content of every message already emitted for this question, across all of its sessions
    history = []
    for session_index, session_and_date in enumerate(sessions_data):
        for message_index_within_session, message in enumerate(session_and_date['session']):
            message_index_across_sessions = len(history)
            num_previous_messages = min(max_num_previous_messages, message_index_across_sessions)

            snippet = {
                'question_id': row['question_id'],
                'question_type': row['question_type'],
                'multisession_index': multisession_index,
                'session_index': session_index,
                'message_index_within_session': message_index_within_session,
                'message_index_across_sessions': message_index_across_sessions,
                'session_date': session_and_date['date'],
                'message': message,
                # Slice from an explicit start index: history[-0:] would return the whole list
                'previous_messages': history[message_index_across_sessions - num_previous_messages :],
                'num_previous_messages': num_previous_messages,
            }
            if include_has_answer:
                snippet['message_has_answer'] = message['has_answer']
            yield snippet

            # Only role and content are carried forward as context for later messages
            history.append({'role': message['role'], 'content': message['content']})


def snippetize_lme_dataset(lme_filename, max_num_previous_messages=5):
    """
    Creates a csv where each row is a "snippet" from longmemeval. A snippet is a message and set of previous messages.

    Reads `lme_filename` from the notebook-level `folder_path` and writes
    `<folder_path>/snippetized_data/<name>_snippetized.csv`. The `message` and `previous_messages`
    columns are stored as JSON strings. `max_num_previous_messages` caps how many earlier messages
    (counted across all sessions of the same question) are attached to each snippet.

    Rows are streamed to a temporary file that is renamed into place only once everything has been
    written, so a failed or interrupted run never leaves a partial CSV behind under the final name.
    An empty dataset produces a header-only CSV.
    """

    lme_dataset_option = os.path.join(folder_path, lme_filename)
    lme_dataset_df = pd.read_json(lme_dataset_option)

    # The per-message answer flag is only exported for the oracle file
    include_has_answer = lme_filename == 'longmemeval_oracle.json'

    # Fixed header (instead of the first row's keys) so an empty dataset still yields a valid CSV
    fieldnames = [
        'question_id',
        'question_type',
        'multisession_index',
        'session_index',
        'message_index_within_session',
        'message_index_across_sessions',
        'session_date',
        'message',
        'previous_messages',
        'num_previous_messages',
    ]
    if include_has_answer:
        fieldnames.append('message_has_answer')

    snippetized_folder = os.path.join(folder_path, 'snippetized_data')
    os.makedirs(snippetized_folder, exist_ok=True)

    filename = lme_filename.replace('.json', '_snippetized.csv')
    filepath = os.path.join(snippetized_folder, filename)
    temp_filepath = filepath + '.tmp'

    try:
        # Explicit UTF-8 so the file does not depend on the platform's default encoding
        with open(temp_filepath, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            for index, row in lme_dataset_df.iterrows():
                for snippet in _iter_question_snippets(
                    row, index, max_num_previous_messages, include_has_answer
                ):
                    # Serialise the nested structures into a new dict; the snippet itself is left untouched
                    writer.writerow(
                        {
                            **snippet,
                            'message': json.dumps(snippet['message']),
                            'previous_messages': json.dumps(snippet['previous_messages']),
                        }
                    )
        os.replace(temp_filepath, filepath)
    except BaseException:
        # Clean up the partial file (also on KeyboardInterrupt), then let the error propagate
        if os.path.exists(temp_filepath):
            os.remove(temp_filepath)
        raise


def snippetize_and_check(lme_filename, num_preview_rows=10):
    """
    Snippetizes `lme_filename` unless its CSV already exists, then displays the first rows of that CSV.

    The CSV location is derived from the notebook-level `folder_path`, the same base folder that
    `snippetize_lme_dataset` writes under, so the existence check and the writer cannot drift apart.
    `num_preview_rows` is the number of rows read from the CSV and displayed.
    """
    snippetized_folder = os.path.join(folder_path, 'snippetized_data')
    file_path = os.path.join(snippetized_folder, lme_filename.replace('.json', '_snippetized.csv'))
    if not os.path.exists(file_path):
        print(f'Snippetizing {lme_filename}...')
        snippetize_lme_dataset(lme_filename)
    else:
        print(f'Skipping snippetization for {lme_filename} because it already exists.')

    # Check first few rows of the csv; nrows avoids parsing the whole file just for a preview
    df = pd.read_csv(file_path, nrows=num_preview_rows)
    display(df)

In [8]:
######## Snippetize the oracle dataset (skipped when its CSV already exists) and preview the first rows
lme_filename = 'longmemeval_oracle.json'
snippetize_and_check(lme_filename)

Skipping snippetization for longmemeval_oracle.json because it already exists.


Unnamed: 0,question_id,question_type,multisession_index,session_index,message_index_within_session,message_index_across_sessions,session_date,message,previous_messages,num_previous_messages,message_has_answer
0,gpt4_2655b836,temporal-reasoning,0,0,0,0,2023-04-10 14:47:00,"{""role"": ""user"", ""content"": ""I'm thinking of g...",[],0,False
1,gpt4_2655b836,temporal-reasoning,0,0,1,1,2023-04-10 14:47:00,"{""role"": ""assistant"", ""content"": ""Choosing the...","[{""role"": ""user"", ""content"": ""I'm thinking of ...",1,False
2,gpt4_2655b836,temporal-reasoning,0,0,2,2,2023-04-10 14:47:00,"{""role"": ""user"", ""content"": ""I've been doing s...","[{""role"": ""user"", ""content"": ""I'm thinking of ...",2,True
3,gpt4_2655b836,temporal-reasoning,0,0,3,3,2023-04-10 14:47:00,"{""role"": ""assistant"", ""content"": ""That's great...","[{""role"": ""user"", ""content"": ""I'm thinking of ...",3,False
4,gpt4_2655b836,temporal-reasoning,0,0,4,4,2023-04-10 14:47:00,"{""role"": ""user"", ""content"": ""I'll definitely a...","[{""role"": ""user"", ""content"": ""I'm thinking of ...",4,False
5,gpt4_2655b836,temporal-reasoning,0,0,5,5,2023-04-10 14:47:00,"{""role"": ""assistant"", ""content"": ""Advanced pai...","[{""role"": ""user"", ""content"": ""I'm thinking of ...",5,False
6,gpt4_2655b836,temporal-reasoning,0,0,6,6,2023-04-10 14:47:00,"{""role"": ""user"", ""content"": ""I'll definitely a...","[{""role"": ""assistant"", ""content"": ""Choosing th...",5,False
7,gpt4_2655b836,temporal-reasoning,0,0,7,7,2023-04-10 14:47:00,"{""role"": ""assistant"", ""content"": ""Congratulati...","[{""role"": ""user"", ""content"": ""I've been doing ...",5,False
8,gpt4_2655b836,temporal-reasoning,0,0,8,8,2023-04-10 14:47:00,"{""role"": ""user"", ""content"": ""That's really hel...","[{""role"": ""assistant"", ""content"": ""That's grea...",5,False
9,gpt4_2655b836,temporal-reasoning,0,0,9,9,2023-04-10 14:47:00,"{""role"": ""assistant"", ""content"": ""Congratulati...","[{""role"": ""user"", ""content"": ""I'll definitely ...",5,False


In [None]:
######## Snippetize the `longmemeval_s.json` variant (skipped when its CSV already exists) and preview the first rows
lme_filename = 'longmemeval_s.json'
snippetize_and_check(lme_filename)

In [None]:
######## Snippetize the `longmemeval_m.json` variant (skipped when its CSV already exists) and preview the first rows
lme_filename = 'longmemeval_m.json'
snippetize_and_check(lme_filename)