In [2]:
import pandas as pd
import numpy as np
import json
import os

In [6]:
# Dataset locations used by the rest of the notebook (these are paths, not model
# hyper-parameters, although the dict keeps its original `hparams` name).
# The corpus root defaults to the original local checkout; set the
# TFM_DIALOGUES_DIR environment variable to run against a copy stored elsewhere.
DIALOGUES_DIR = os.environ.get(
    'TFM_DIALOGUES_DIR',
    'C:\\Users\\danie\\Documents\\TFM\\tfm\\data\\dialogues',
)
hparams = {
  # The empty last component makes os.path.join end each path with a separator,
  # exactly like the literal paths this cell used to contain.
  'traindev_data_path': os.path.join(DIALOGUES_DIR, 'dstc2_traindev', ''),
  'test_data_path': os.path.join(DIALOGUES_DIR, 'dstc2_test', ''),
}

In [4]:
def _list_subdirectories(parent_path):
    """Return the full paths of the directories directly under `parent_path`, sorted by name."""
    candidates = (os.path.join(parent_path, name) for name in sorted(os.listdir(parent_path)))
    # Stray files (READMEs, OS metadata, ...) are ignored: only directories can hold sessions.
    return [path for path in candidates if os.path.isdir(path)]


def _build_session_rows(session_id, log_data, label_data):
    """Flatten one session's parsed log/label JSON into alternating user and system rows."""
    system_turns = [
        (turn['output']['transcript'], turn['output']['dialog-acts'])
        for turn in log_data['turns']
    ]
    # The placeholder shifts the user turns by one, so that every system turn is
    # paired with the user turn that came just before it (none for the first one).
    user_turns = [("", None, None)] + [
        (turn['transcription'], turn['audio-file'], turn['semantics'])
        for turn in label_data['turns']
    ]

    rows = []
    # zip() stops at the shorter list: when the log and the label hold the same
    # number of turns, the last label turn has no system turn after it and is
    # therefore not emitted (same behaviour as before).
    paired_turns = zip(system_turns, user_turns)
    for pair_index, ((system_transcript, dialog_acts), (user_transcript, audio_file, semantics)) in enumerate(paired_turns):
        is_session_start = pair_index == 0
        rows.append({
            'session_id': session_id,
            'turn_index': 2 * pair_index,
            'from': "user",
            'transcript': "<user><start>" if is_session_start else "<user>" + user_transcript,
            'dialog_acts': dialog_acts,  # User's dialog act set to the next system's dialog act
            'audio_file': audio_file,  # None for the session-start placeholder
            'semantics': semantics,  # None for the session-start placeholder
        })
        rows.append({
            'session_id': session_id,
            'turn_index': 2 * pair_index + 1,
            'from': "system",
            'transcript': "<sys>" + system_transcript,
            'dialog_acts': None,
            'audio_file': None,
            'semantics': None,
        })
    return rows


def load_dstc2_from_disk(traindev_data_path, test_data_path):
    """Load the dialogues stored under two directory trees into a flat list of turn rows.

    Each of the two paths must contain a ``data`` directory laid out as
    ``data/<folder>/<session_id>/{log.json,label.json}``. Sessions missing either
    file are skipped. Folders and sessions are visited in sorted order (train/dev
    first, then test) so the result does not depend on the file system's listing order.

    Args:
        traindev_data_path: Directory holding the train/dev ``data`` directory
            (with or without a trailing path separator).
        test_data_path: Directory holding the test ``data`` directory
            (with or without a trailing path separator).

    Returns:
        A list of dicts with the keys ``session_id``, ``turn_index``, ``from``
        ("user" or "system"), ``transcript``, ``dialog_acts``, ``audio_file`` and
        ``semantics``. Within a session, user rows have even ``turn_index``
        values and system rows have odd ones.

    Raises:
        OSError: If one of the ``data`` directories cannot be listed.
        KeyError: If a log/label file lacks one of the fields read above.
    """
    data = []

    # os.path.join instead of appending 'data\\': works on every OS and no longer
    # requires the caller to end the path with a separator.
    all_data_folders = (
        _list_subdirectories(os.path.join(traindev_data_path, 'data'))
        + _list_subdirectories(os.path.join(test_data_path, 'data'))
    )

    for folder_path in all_data_folders:
        print(f"Loading data from {folder_path}...")

        for session_id in sorted(os.listdir(folder_path)):
            log_path = os.path.join(folder_path, session_id, 'log.json')
            label_path = os.path.join(folder_path, session_id, 'label.json')
            if not (os.path.exists(log_path) and os.path.exists(label_path)):
                continue

            # JSON is read as UTF-8 explicitly rather than with the platform default encoding.
            with open(log_path, 'r', encoding='utf-8') as f:
                log_data = json.load(f)
            with open(label_path, 'r', encoding='utf-8') as f:
                label_data = json.load(f)

            data.extend(_build_session_rows(session_id, log_data, label_data))

    return data

In [7]:
# Parse both DSTC2 splits from disk and wrap the resulting turn records in a DataFrame.
dataset = load_dstc2_from_disk(
    traindev_data_path=hparams['traindev_data_path'],
    test_data_path=hparams['test_data_path'],
)
df = pd.DataFrame(data=dataset)

# Print the number of rows, then show the first rows as the cell's output.
print(df.shape[0])
df.head()

Loading data from C:\Users\danie\Documents\TFM\tfm\data\dialogues\dstc2_traindev\data\Mar13_S0A0...
Loading data from C:\Users\danie\Documents\TFM\tfm\data\dialogues\dstc2_traindev\data\Mar13_S0A1...
Loading data from C:\Users\danie\Documents\TFM\tfm\data\dialogues\dstc2_traindev\data\Mar13_S1A0...
Loading data from C:\Users\danie\Documents\TFM\tfm\data\dialogues\dstc2_traindev\data\Mar13_S1A1...
Loading data from C:\Users\danie\Documents\TFM\tfm\data\dialogues\dstc2_test\data\Mar13_S2A0...
Loading data from C:\Users\danie\Documents\TFM\tfm\data\dialogues\dstc2_test\data\Mar13_S2A1...
51002


Unnamed: 0,session_id,turn_index,from,transcript,dialog_acts,audio_file,semantics
0,voip-00d76b791d-20130327_010416,0,user,<user><start>,"[{'slots': [], 'act': 'welcomemsg'}]",,
1,voip-00d76b791d-20130327_010416,1,system,"<sys>Hello , welcome to the Cambridge restaura...",,,
2,voip-00d76b791d-20130327_010416,2,user,<user>expensive restaurant in the south part o...,"[{'slots': [['slot', 'food']], 'act': 'request'}]",pt344x_0000993_0001219.wav,"{'json': [{'slots': [['pricerange', 'expensive..."
3,voip-00d76b791d-20130327_010416,3,system,<sys>What kind of food would you like?,,,
4,voip-00d76b791d-20130327_010416,4,user,<user>any,"[{'slots': [['name', 'the good luck chinese fo...",pt344x_0001649_0001680.wav,"{'json': [{'slots': [['this', 'dontcare']], 'a..."


In [9]:
# Count the distinct session ids (missing values, if any, are not counted).
n_dialogues = df['session_id'].nunique(dropna=True)
print(f"Number of dialogues: {n_dialogues}")

Number of dialogues: 3235


In [10]:
# Write the flattened dialogue turns to CSV, leaving out the DataFrame's positional index.
df.to_csv(
    path_or_buf='C:\\Users\\danie\\Documents\\TFM\\tfm\\data\\output_data\\1_dialogues.csv',
    index=False,
)