In [1]:
import os
import requests
import pandas as pd
from datasets import load_dataset, Dataset, DatasetDict
import random
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
def prepare_MSC(dataset_path:str) -> pd.DataFrame:
    print(dataset_path)
    df = pd.read_json(dataset_path, lines=True)

    multi_session = []
    for idx in range(len(df)): # number of data
        for sess in range(len(df['sessions'][idx])): # number of session
            multi_session_personas1 = df['sessions'][idx][sess]['personas'][0]['text']
            multi_session_personas2 = df['sessions'][idx][sess]['personas'][1]['text']

            multi_session_dialogue = []
            multi_session_speaker = []

            # number of turn
            for turn in range(len(df['sessions'][idx][sess]['dialogue'])):
                multi_session_dialogue.append(df['sessions'][idx][sess]['dialogue'][turn]['text'])
                multi_session_speaker.append(df['sessions'][idx][sess]['dialogue'][turn]['speaker'])

            multi_session.append(['MSC', idx, sess, multi_session_personas1, multi_session_personas2, multi_session_dialogue, multi_session_speaker])

    data = pd.DataFrame(multi_session, columns=['dataset', 'dialoug_id', 'session_id', 'persona1', 'persona2', 'dialogue', 'speaker'])
    return data

In [10]:
dir_path= '../data/downloads'
MSC={}
for split in ['train', 'validation', 'test']:
    MSC[split] = prepare_MSC(f'{dir_path}/MSC/{split}.json')
    MSC[split].to_json(f'../data/downloads/MSC/prerocessed_{split}.jsonl', orient='records', lines=True)

../data/downloads/MSC/train.json
../data/downloads/MSC/validation.json
../data/downloads/MSC/test.json


In [26]:
def upload_to_huggingface(dataset: pd.DataFrame, save_name:str):
    raw_train = Dataset.from_pandas(dataset['train'])
    raw_valid = Dataset.from_pandas(dataset['validation'])
    raw_test = Dataset.from_pandas(dataset['test'])
    concat_dataset = DatasetDict({'train': raw_train, 'validation': raw_valid, 'test': raw_test})
    concat_dataset.push_to_hub(save_name)

huggingface_user_name='nayohan'
upload_to_huggingface(MSC, f'{huggingface_user_name}/multi-session-chat')

Creating parquet from Arrow format: 100%|██████████| 8/8 [00:00<00:00, 375.34ba/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:02<00:00,  2.37s/it]
Deleting unused files from dataset repository: 100%|██████████| 1/1 [00:00<00:00,  3.43it/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 296.58ba/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:01<00:00,  1.87s/it]
Deleting unused files from dataset repository: 100%|██████████| 1/1 [00:01<00:00,  1.31s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 282.31ba/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:01<00:00,  1.95s/it]
Deleting unused files from dataset repository: 100%|██████████| 1/1 [00:00<00:00,  3.42it/s]
Downloading metadata: 100%|██████████| 897/897 [00:00<00:00, 7.01MB/s]


In [13]:

def _task_dialouge_generation(preprocessed_dataset:pd.DataFrame) -> pd.DataFrame:
    df = preprocessed_dataset

    task_dsg = []
    for idx in range(len(df)): # number of data
        multi_turn_dialogue = []
        n_turn = len(df['dialogue'][idx])

        for turn in range(n_turn):
            row = f"{df['speaker'][idx][turn]}: {df['dialogue'][idx][turn]} \n"
            multi_turn_dialogue.append(row)

        rand_idx = random.randint(2, turn)
        multi_turn_dialogue_part = multi_turn_dialogue[:rand_idx]

        last_response = multi_turn_dialogue_part[-1]
        last_spaker = multi_turn_dialogue_part[-1].split(':')[0]
        multi_turn_dialogue_part[-1] = last_spaker + ': ###\n'
        context = ''.join(multi_turn_dialogue_part)

        prompt = f"""You will be shown a {len(multi_turn_dialogue_part)} turn dialogues between {df['speaker'][idx][0]} and {df['speaker'][idx][1]}. Please read and understand given Dialogue Session, then complete the task under the guidance of Task Introduction.\n\n"""
        main_context = "```\nDialogue Session:\n" + context + "\n```\n\n"
        Task_Introduction = """```\nTask Introduction:\nAfter reading the entire Dialogue Session, please create an appropriate response.\n```\n Task Result:"""

        input = prompt + main_context + Task_Introduction
        output = last_response
        task_dsg.append([input, output])
    return pd.DataFrame(task_dsg, columns=['input', 'output'])


download_path= '../data/downloads'
task_save_path = '../data/tasks'
task_list = ["DG"]

task_generator = DialogueRelateTaskGenerator()
for task in task_list:
    os.makedirs(f'{task_save_path}/{task}', exist_ok=True)
    dataset_dict={}
    for split in ['train', 'validation', 'test']:
        dataset_dict[split] = _task_dialouge_generation(MSC[split])
        dataset_dict[split].to_json(f'{task_save_path}/{task}/{split}.jsonl', orient='records', lines=True)
    # upload_to_huggingface(dataset_dict, f'{huggingface_user_name}/{task}')

In [None]:
import random

def task_dialouge_span_generation(preprocessed_dataset:pd.DataFrame) -> pd.DataFrame:
    df = preprocessed_dataset

    task_dsg = []
    for idx in range(len(df)): # number of data
        multi_turn_dialogue = []
        n_turn = len(df['dialogue'][idx])

        for turn in range(n_turn):
            row = f"{df['speaker'][idx][turn]}: {df['dialogue'][idx][turn]} \n"
            multi_turn_dialogue.append(row)

        rand_idx = random.randint(0, turn)
        dialogue_span = multi_turn_dialogue[rand_idx]
        multi_turn_dialogue[rand_idx] = multi_turn_dialogue[rand_idx].split(':')[0] + ': ###\n'
        context = ''.join(multi_turn_dialogue)

        prompt = f"""You will be shown a {n_turn}turn conversation between {df['speaker'][idx][0]} and {df['speaker'][idx][1]}.
Please read and understand given Dialogue Session, then complete the task under the guidance of Task Introduction.\n\n"""

        Task_Introduction = """
        ```
        Task Introduction:
        After reading the entire conversation, please create an appropriate conversation in the parts marked ###.
        ```
        """

        input = prompt + "```\n Dialogue Session:\n" + context + "\n```\n\n" + Task_Introduction
        output = dialogue_span
        task_dsg.append([input, output])

    return pd.DataFrame(task_dsg, columns=['input', 'output'])

T_DSG = task_dialouge_span_generation(MSC['train'])
T_DSG

In [2]:
def prepare_ConversationChronicle(dataset_path:str) -> pd.DataFrame:
    print(dataset_path)
    df = pd.read_json(dataset_path, lines=True)
    df = df.sample(frac=0.01, random_state=2023).reset_index()

    multi_session = []
    for idx in range(len(df)): # number of data
        column_name = ["first", "second", "third", "fourth", "fifth"]
        for sess_num, c_name in enumerate(column_name): # number of session
            data_id = df['dataID'][idx]
            relationship = df['relationship'][idx]
            time_interval = df['time_interval'][idx][sess_num]
            summarization = df['summary'][idx][sess_num]
            dialogue = df[f'{c_name}_session_dialogue'][sess_num]
            speaker = df[f'{c_name}_session_dialogue'][sess_num]
            multi_session.append(['CC', data_id, idx, sess_num, relationship, time_interval, summarization, dialogue, speaker])

    data = pd.DataFrame(multi_session, columns=['dataset', 'data_id', 'dialogue_id', 'session_id', 'relationship', 'time_interval', 'summarization', 'dialogue', 'speaker'])
    return data

download_path= '../data/downloads'
dataset_name="conversation_chronicles"
MSC={}
for split in ['train', 'validation', 'test']:
    MSC[split] = prepare_ConversationChronicle(f'{download_path}/{dataset_name}/{split}.json')
    MSC[split].to_json(f'{download_path}/{dataset_name}/prerocessed_{split}.jsonl', orient='records', lines=True)
# upload_to_huggingface(MSC, f'{huggingface_user_name}/conversation-chronicles')

../data/downloads/conversation_chronicles/train.json
../data/downloads/conversation_chronicles/validation.json
../data/downloads/conversation_chronicles/test.json


In [11]:
df = MSC['train']

num_partitions =5
rows_per_partitoins = len(df) // num_partitions

sub_dataframes = [df.iloc[i:i+rows_per_partitoins] for i in range(0, len(df), rows_per_partitoins)]
len(sub_dataframes)

5

In [3]:
df = pd.read_json(f'{download_path}/{dataset_name}/{split}.json', lines=True)
df = df.sample(frac=0.01, random_state=2023)
df['dataID']

8648     episode-150540
15597     episode-31403
6810     episode-120874
17882    episode-210994
3789     episode-161650
              ...      
10953    episode-255156
224       episode-18218
1772     episode-189067
19178    episode-226276
2636     episode-199463
Name: dataID, Length: 200, dtype: object

In [5]:
data = load_dataset('nayohan/conversation_chronicles')
data

DatasetDict({
    train: Dataset({
        features: ['dataset', 'data_id', 'dialogue_id', 'session_id', 'relationship', 'time_interval', 'summarization', 'dialogue', 'speaker'],
        num_rows: 40000
    })
    validation: Dataset({
        features: ['dataset', 'data_id', 'dialogue_id', 'session_id', 'relationship', 'time_interval', 'summarization', 'dialogue', 'speaker'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['dataset', 'data_id', 'dialogue_id', 'session_id', 'relationship', 'time_interval', 'summarization', 'dialogue', 'speaker'],
        num_rows: 5000
    })
})

In [6]:
df = pd.DataFrame(data['train'])
df['relationship'].value_counts()

relationship
Classmates             13040
Neighbors               9885
Co-workers              5685
Mentee and Mentor       3230
Husband and Wife        2865
Parent and Child        1360
Patient and Doctor      1225
Employee and Boss       1085
Student and Teacher     1060
Athlete and Coach        565
Name: count, dtype: int64

In [7]:
df['time_interval'].value_counts()

time_interval
Start                      8000
A few weeks after          6536
A few months after         6433
A few days after           6417
A few hours after          6323
A couple of years after    5897
A couple of years           394
Name: count, dtype: int64

In [61]:
dataset_name = "conversation_chronicles"
dataset_path = f"/home/uj-user/Yo/hybrid-ltm/data/downloads/{dataset_name}"

dataset_dict = {}
for split in ['train', 'validation', 'test']: 
    dataset_dict[split]=pd.read_json(f'{dataset_path}/{split}.json', lines=True)
    dataset_dict[split]=dataset_dict[split].sample(frac=0.05, random_state=2023).reset_index()
    dataset_dict[split].to_json(f'{dataset_path}/sampled_{split}.jsonl', orient='records', lines=True, index=False)

ValueError: 'index=False' is only valid when 'orient' is 'split' or 'table'

In [52]:
def _task_multi_session_dialouge_generation(preprocessed_dataset:pd.DataFrame) -> pd.DataFrame:
    df = preprocessed_dataset
    df = df[df['time_interval']=="Start"].reset_index()
    
    task_dsg = []
    for idx in  tqdm(range(len(df)), 5): # number of data
        
        multi_session_dialogue = []
        for i in range(idx, idx+4):
            multi_turn_dialogue = []
            n_turn = len(df['dialogue'][idx])
            for turn in range(n_turn):
                row = f"{df['speaker'][idx][turn]}: {df['dialogue'][idx][turn]} \n"
                multi_turn_dialogue.append(row)
            multi_session_dialouge.append(multi_turn_dialogue)



        rand_idx = random.randint(2, turn)
        multi_turn_dialogue_part = multi_turn_dialogue[:rand_idx]
        last_response = multi_turn_dialogue_part[-1]
        last_spaker = multi_turn_dialogue_part[-1].split(':')[0]
        multi_turn_dialogue_part[-1] = last_spaker + ': ###\n'
        context = ''.join(multi_turn_dialogue_part)

        prompt = f"""You will be shown a 5 session dialogues between {df['speaker'][idx][0]} and {df['speaker'][idx][1]}. Please read and understand given Dialogue Sessions, then complete the task under the guidance of Task Introduction.\n\n"""
        
        main_context_list = []
        for  i in range(5):
            main_context_list.append(f"```\nDialogue Session #{i}:\n" + context + "\n```\n\n")
        main_context = ''.join(main_context_list)
        task_introduction = """```\nTask Introduction:\nAfter reading the entire Dialogue Session, please create an appropriate response.\n```\n Task Result:"""

        input = prompt + main_context + task_introduction
        output = last_response
        task_dsg.append([input, output])
    return pd.DataFrame(task_dsg, columns=['input', 'output'])
        
download_path= '../data/downloads'
task_save_path = '../data/tasks'
task_list = ["MS_DG"]

MSC = load_dataset(f'{huggingface_user_name}/conversation_chronicles')
for task in task_list:
    os.makedirs(f'{task_save_path}/{task}', exist_ok=True)
    dataset_dict={}
    for split in ['train', 'validation', 'test']:
        dataset_dict[split] = _task_multi_session_dialouge_generation(pd.DataFrame(MSC[split]))
        dataset_dict[split].to_json(f'{task_save_path}/{task}/{split}.jsonl', orient='records', lines=True)
    # upload_to_huggingface(dataset_dict, f'{huggingface_user_name}/{task}')

100%|██████████| 8000/8000 [00:00<00:00, 11252.18it/s]
100%|██████████| 1000/1000 [00:00<00:00, 11542.73it/s]
100%|██████████| 1000/1000 [00:00<00:00, 11278.92it/s]


In [21]:
import random 

random.randint(2, 5)

5

In [2]:
import pandas as pd


df = pd.DataFrame(['asdf', 'asdf','asdf', 'asdf','asdf', 'asdf','asdf', 'asdf','asdf', 'asdf',])
df.split

Unnamed: 0,0
0,asdf
1,asdf
2,asdf
3,asdf
4,asdf
5,asdf
6,asdf
7,asdf
8,asdf
9,asdf
