In [None]:
!pip install datasets



In [None]:
import pandas as pd
import numpy as np
import json
import os

from sklearn.preprocessing import LabelEncoder

from transformers import AutoTokenizer

from huggingface_hub import notebook_login
from datasets import Dataset, load_dataset

In [None]:
# Interactive Hugging Face Hub login; renders the login widget shown in the cell output.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## 1) Data loading and processing


In [None]:
# Load the DSTC2 dialogue turns from the Hugging Face Hub.
dataset = load_dataset('danielroncel/dstc2_dialogues')

In [None]:
# Work on the 'train' split as a pandas DataFrame (one row per dialogue turn).
df = dataset['train'].to_pandas()
df.head()

Unnamed: 0,session_id,turn_index,from,transcript,dialog_acts,audio_file,semantics
0,voip-00d76b791d-20130327_010416,0,user,<user><start>,"[{'slots': [], 'act': 'welcomemsg'}]",,
1,voip-00d76b791d-20130327_010416,1,system,"<sys>Hello , welcome to the Cambridge restaura...",,,
2,voip-00d76b791d-20130327_010416,2,user,<user>expensive restaurant in the south part o...,"[{'slots': [['slot', 'food']], 'act': 'request'}]",pt344x_0000993_0001219.wav,"{'json': [{'slots': [['pricerange', 'expensive..."
3,voip-00d76b791d-20130327_010416,3,system,<sys>What kind of food would you like?,,,
4,voip-00d76b791d-20130327_010416,4,user,<user>any,"[{'slots': [['name', 'the good luck chinese fo...",pt344x_0001649_0001680.wav,"{'json': [{'slots': [['this', 'dontcare']], 'a..."


In [None]:
# Load the transcriptions of the user audio files.
# NOTE(review): this rebinds `dataset`, so the dialogue dataset loaded earlier is
# no longer reachable under that name; `df` must already have been built from it.
dataset = load_dataset('danielroncel/dstc2_dialogues_transcription')

In [None]:
# Rename the key columns so they match the names used in `df`, which lets both
# frames be merged on ('session_id', 'audio_file').
df_transcript = (
    dataset['train']
    .to_pandas()
    .rename(columns={'path': 'audio_file', 'session_ids': 'session_id'})
)
df_transcript.head()

Unnamed: 0,session_id,audio_file,transcription
0,voip-00d76b791d-20130327_010416,pt344x_0000993_0001219.wav,EXPENSIVE RESTAURANT IN THE SOUTH PARLOF TOWN
1,voip-00d76b791d-20130327_010416,pt344x_0001649_0001680.wav,ANY
2,voip-00d76b791d-20130327_010416,pt344x_0002674_0002736.wav,ADDRESS
3,voip-00d76b791d-20130327_010416,pt344x_0003533_0003611.wav,PIBL FOO
4,voip-00d76b791d-20130327_010416,pt344x_0004453_0004531.wav,NK YOU GOOD BYE


In [None]:
# Left-join the transcriptions onto the turns. The row count is printed before
# and after the merge so that accidental row duplication would be visible.
print(len(df))
df = df.merge(df_transcript, on=['session_id', 'audio_file'], how='left')
print(len(df))
df.head()

51002
51002


Unnamed: 0,session_id,turn_index,from,transcript,dialog_acts,audio_file,semantics,transcription
0,voip-00d76b791d-20130327_010416,0,user,<user><start>,"[{'slots': [], 'act': 'welcomemsg'}]",,,
1,voip-00d76b791d-20130327_010416,1,system,"<sys>Hello , welcome to the Cambridge restaura...",,,,
2,voip-00d76b791d-20130327_010416,2,user,<user>expensive restaurant in the south part o...,"[{'slots': [['slot', 'food']], 'act': 'request'}]",pt344x_0000993_0001219.wav,"{'json': [{'slots': [['pricerange', 'expensive...",EXPENSIVE RESTAURANT IN THE SOUTH PARLOF TOWN
3,voip-00d76b791d-20130327_010416,3,system,<sys>What kind of food would you like?,,,,
4,voip-00d76b791d-20130327_010416,4,user,<user>any,"[{'slots': [['name', 'the good luck chinese fo...",pt344x_0001649_0001680.wav,"{'json': [{'slots': [['this', 'dontcare']], 'a...",ANY


In [None]:
# Sanity check: every user turn other than turn 0 must have a transcription
# after the merge.
cond_user = df['from'] == 'user'
cond_turn_0 = df['turn_index'] == 0
cond_no_transcript = df['transcription'].isna()
assert (cond_user & ~cond_turn_0 & cond_no_transcript).sum() == 0

In [None]:
# process transcriptions
def process_transcriptions(row):
  """Build the processed transcription text for a single turn.

  Args:
    row: mapping / Series with the keys 'from', 'turn_index' and 'transcription'.

  Returns:
    - system turns: the transcription value unchanged (typically missing);
    - user turn 0: the fixed marker '<user><start>';
    - any other user turn: '<user>' followed by the lower-cased transcription.

  Raises:
    AssertionError: if a user turn other than turn 0 has no transcription.
  """
  speaker = row['from']
  turn_number = row['turn_index']
  asr_text = row['transcription']

  # System turns are not transcribed; hand back whatever value is stored.
  if speaker == 'system':
    return asr_text

  # The opening user turn is a synthetic start marker, not real speech.
  if turn_number == 0:
    return '<user><start>'

  assert pd.notna(asr_text)
  return '<user>' + asr_text.lower()

# Row-wise application; the result is stored in a separate helper column.
df['transcription_processed'] = df.apply(process_transcriptions, axis=1)
df.head()

Unnamed: 0,session_id,turn_index,from,transcript,dialog_acts,audio_file,semantics,transcription,transcription_processed
0,voip-00d76b791d-20130327_010416,0,user,<user><start>,"[{'slots': [], 'act': 'welcomemsg'}]",,,,<user><start>
1,voip-00d76b791d-20130327_010416,1,system,"<sys>Hello , welcome to the Cambridge restaura...",,,,,
2,voip-00d76b791d-20130327_010416,2,user,<user>expensive restaurant in the south part o...,"[{'slots': [['slot', 'food']], 'act': 'request'}]",pt344x_0000993_0001219.wav,"{'json': [{'slots': [['pricerange', 'expensive...",EXPENSIVE RESTAURANT IN THE SOUTH PARLOF TOWN,<user>expensive restaurant in the south parlof...
3,voip-00d76b791d-20130327_010416,3,system,<sys>What kind of food would you like?,,,,,
4,voip-00d76b791d-20130327_010416,4,user,<user>any,"[{'slots': [['name', 'the good luck chinese fo...",pt344x_0001649_0001680.wav,"{'json': [{'slots': [['this', 'dontcare']], 'a...",ANY,<user>any


In [None]:
def replace_by_asr_transcript(row):
  """Pick the text to use as the turn's transcript.

  Args:
    row: mapping / Series with 'transcription_processed' and 'transcript'.

  Returns:
    The processed transcription when it is present, otherwise the original
    'transcript' value of the row.
  """
  processed = row['transcription_processed']
  return processed if pd.notna(processed) else row['transcript']

# Overwrites the existing `transcript` column with the selected text.
df['transcript'] = df.apply(replace_by_asr_transcript, axis=1)
df.head()

Unnamed: 0,session_id,turn_index,from,transcript,dialog_acts,audio_file,semantics,transcription,transcription_processed
0,voip-00d76b791d-20130327_010416,0,user,<user><start>,"[{'slots': [], 'act': 'welcomemsg'}]",,,,<user><start>
1,voip-00d76b791d-20130327_010416,1,system,"<sys>Hello , welcome to the Cambridge restaura...",,,,,
2,voip-00d76b791d-20130327_010416,2,user,<user>expensive restaurant in the south parlof...,"[{'slots': [['slot', 'food']], 'act': 'request'}]",pt344x_0000993_0001219.wav,"{'json': [{'slots': [['pricerange', 'expensive...",EXPENSIVE RESTAURANT IN THE SOUTH PARLOF TOWN,<user>expensive restaurant in the south parlof...
3,voip-00d76b791d-20130327_010416,3,system,<sys>What kind of food would you like?,,,,,
4,voip-00d76b791d-20130327_010416,4,user,<user>any,"[{'slots': [['name', 'the good luck chinese fo...",pt344x_0001649_0001680.wav,"{'json': [{'slots': [['this', 'dontcare']], 'a...",ANY,<user>any


In [None]:
# Remove the two transcription helper columns from the frame.
df = df.drop(['transcription', 'transcription_processed'], axis=1)
df.head()

Unnamed: 0,session_id,turn_index,from,transcript,dialog_acts,audio_file,semantics
0,voip-00d76b791d-20130327_010416,0,user,<user><start>,"[{'slots': [], 'act': 'welcomemsg'}]",,
1,voip-00d76b791d-20130327_010416,1,system,"<sys>Hello , welcome to the Cambridge restaura...",,,
2,voip-00d76b791d-20130327_010416,2,user,<user>expensive restaurant in the south parlof...,"[{'slots': [['slot', 'food']], 'act': 'request'}]",pt344x_0000993_0001219.wav,"{'json': [{'slots': [['pricerange', 'expensive..."
3,voip-00d76b791d-20130327_010416,3,system,<sys>What kind of food would you like?,,,
4,voip-00d76b791d-20130327_010416,4,user,<user>any,"[{'slots': [['name', 'the good luck chinese fo...",pt344x_0001649_0001680.wav,"{'json': [{'slots': [['this', 'dontcare']], 'a..."


In [None]:
# Remove the welcome turn (turn_index == 0) of every dialogue.
print(len(df))

cond = df['turn_index'] != 0
# drop=False keeps the previous row labels as a new 'index' column.
df = df[cond].reset_index(drop=False)
print(len(df))
df.head()

51002
47767


Unnamed: 0,index,session_id,turn_index,from,transcript,dialog_acts,audio_file,semantics
0,1,voip-00d76b791d-20130327_010416,1,system,"<sys>Hello , welcome to the Cambridge restaura...",,,
1,2,voip-00d76b791d-20130327_010416,2,user,<user>expensive restaurant in the south parlof...,"[{'slots': [['slot', 'food']], 'act': 'request'}]",pt344x_0000993_0001219.wav,"{'json': [{'slots': [['pricerange', 'expensive..."
2,3,voip-00d76b791d-20130327_010416,3,system,<sys>What kind of food would you like?,,,
3,4,voip-00d76b791d-20130327_010416,4,user,<user>any,"[{'slots': [['name', 'the good luck chinese fo...",pt344x_0001649_0001680.wav,"{'json': [{'slots': [['this', 'dontcare']], 'a..."
4,5,voip-00d76b791d-20130327_010416,5,system,<sys>the good luck chinese food takeaway is a ...,,,


In [None]:
print(f"{df['session_id'].nunique()} dialogues")
print(f"{len(df)} rows")

3235 dialogues
47767 rows


In [None]:
def eval_if_not_none(x):
  """Parse the string form of a Python literal into the object it describes.

  Args:
    x: a string such as "[{'slots': [], 'act': 'inform'}]", a missing value
      (None / NaN), or an object that has already been parsed.

  Returns:
    The parsed literal for string input; any non-string input is returned
    unchanged. This keeps missing values as they are and makes re-running the
    calling cell harmless.

  Raises:
    ValueError / SyntaxError: if the string is not a valid Python literal.
  """
  import ast  # local import so the cell does not depend on the import cell

  if not isinstance(x, str):
    return x

  # The strings come from a downloaded dataset: literal_eval only accepts
  # literals, whereas eval() would execute arbitrary expressions.
  return ast.literal_eval(x)

# Both columns hold string representations; convert them into Python objects.
df['dialog_acts'] = df['dialog_acts'].apply(eval_if_not_none)
df['semantics'] = df['semantics'].apply(eval_if_not_none)

In [None]:
# Collect every dialogue act name that occurs in the `dialog_acts` column.
possible_acts = set()
for turn_acts in df['dialog_acts']:
  # Only rows that actually hold a list of acts contribute.
  if type(turn_acts) == list:
    possible_acts.update(entry['act'] for entry in turn_acts)

possible_acts

{'canthelp',
 'canthelp.exception',
 'confirm-domain',
 'expl-conf',
 'impl-conf',
 'inform',
 'offer',
 'repeat',
 'reqmore',
 'request',
 'select'}

In [None]:
# Acts present in the data but missing from the list handled by get_labels
# (an empty set means every act is covered).
possible_acts - set(['welcomemsg', 'offer', 'reqmore',
                    'confirm-domain', 'repeat', 'inform',
                     'impl-conf', 'expl-conf', 'select', 'canthelp',
                     'canthelp.exception', 'request'])

set()

In [None]:
# Acts in the handled list that never occur in the data.
set(['welcomemsg', 'offer', 'reqmore',
                    'confirm-domain', 'repeat', 'inform',
                     'impl-conf', 'expl-conf', 'select', 'canthelp',
                     'canthelp.exception', 'request']) - possible_acts

{'welcomemsg'}

In [None]:
# ctr = 0
# for dialog_act in df['dialog_acts']:

#   if type(dialog_act) != list:
#     continue

#   for d in dialog_act:

#     if d['act'] == 'request':
#       print(dialog_act)
#       ctr += 1

In [None]:
def get_labels(dialog_acts):
  """Turn a list of dialogue acts into a single label string.

  Args:
    dialog_acts: list of dicts with the keys 'act' and 'slots'; any value that
      is not a list is treated as "no acts".

  Returns:
    The distinct labels, sorted and joined with '_'. Slot-less acts contribute
    the act name, 'request' contributes 'request|<second slot element>', the
    remaining known acts contribute '<act>|<first slot element>'. Acts outside
    the known lists are ignored. Returns "" when `dialog_acts` is not a list.
  """
  if type(dialog_acts) is not list:
    return ""

  slotless_acts = ('welcomemsg', 'reqmore', 'confirm-domain', 'repeat')
  slot_name_acts = ('offer', 'inform', 'impl-conf', 'expl-conf', 'select',
                    'canthelp', 'canthelp.exception')

  unique_labels = set()
  for entry in dialog_acts:
    act_name, slot_pairs = entry['act'], entry['slots']

    if act_name in slotless_acts:
      unique_labels.add(act_name)
    elif act_name in slot_name_acts:
      # The slot name is the first element of each slot pair.
      unique_labels.update(act_name + '|' + pair[0] for pair in slot_pairs)
    elif act_name == 'request':
      # Requests look like ['slot', <name>]: the requested name comes second.
      unique_labels.update(act_name + '|' + pair[1] for pair in slot_pairs)

  return '_'.join(sorted(unique_labels))

In [None]:
# One label string per row, derived from the row's dialogue acts.
df['label'] = df['dialog_acts'].apply(get_labels)
df.head()

Unnamed: 0,index,session_id,turn_index,from,transcript,dialog_acts,audio_file,semantics,label
0,1,voip-00d76b791d-20130327_010416,1,system,"<sys>Hello , welcome to the Cambridge restaura...",,,,
1,2,voip-00d76b791d-20130327_010416,2,user,<user>expensive restaurant in the south parlof...,"[{'slots': [['slot', 'food']], 'act': 'request'}]",pt344x_0000993_0001219.wav,"{'json': [{'slots': [['pricerange', 'expensive...",request|food
2,3,voip-00d76b791d-20130327_010416,3,system,<sys>What kind of food would you like?,,,,
3,4,voip-00d76b791d-20130327_010416,4,user,<user>any,"[{'slots': [['name', 'the good luck chinese fo...",pt344x_0001649_0001680.wav,"{'json': [{'slots': [['this', 'dontcare']], 'a...",inform|area_inform|food_inform|pricerange_offe...
4,5,voip-00d76b791d-20130327_010416,5,system,<sys>the good luck chinese food takeaway is a ...,,,,


In [None]:
# List user turns that ended up without a label (an empty frame is expected).
cond_1 = df['from'] == 'user'
cond_2 = df['label'].isna() | (df['label'] == '')
df[cond_1 & cond_2]

Unnamed: 0,index,session_id,turn_index,from,transcript,dialog_acts,audio_file,semantics,label


In [None]:
def add_db_query_tokens(row):
    """Append database-query marker tokens to the transcript of a row.

    Args:
        row: mapping / Series with the keys 'label' and 'transcript'.

    Returns:
        The same `row` object. When the label contains "offer" the transcript
        gets the suffix "<API_call><DB_result>"; otherwise, when it contains
        "canthelp", the suffix "<API_call><no_DB_result>". Rows matching
        neither are returned untouched.

    The suffix is only added when the transcript does not already end with it,
    so re-running the cell that applies this function does not duplicate the
    markers.
    """
    label = row['label']

    # Substring test on the joined label string; "offer" takes precedence.
    if "offer" in label:
        suffix = "<API_call><DB_result>"
    elif "canthelp" in label:
        suffix = "<API_call><no_DB_result>"
    else:
        return row

    transcript = f"{row['transcript']}"
    if not transcript.endswith(suffix):
        row['transcript'] = transcript + suffix
    return row

# Apply the function to each row; the frame is rebuilt from the returned rows.
df = df.apply(add_db_query_tokens, axis=1)

df.head()

Unnamed: 0,index,session_id,turn_index,from,transcript,dialog_acts,audio_file,semantics,label
0,1,voip-00d76b791d-20130327_010416,1,system,"<sys>Hello , welcome to the Cambridge restaura...",,,,
1,2,voip-00d76b791d-20130327_010416,2,user,<user>expensive restaurant in the south parlof...,"[{'slots': [['slot', 'food']], 'act': 'request'}]",pt344x_0000993_0001219.wav,"{'json': [{'slots': [['pricerange', 'expensive...",request|food
2,3,voip-00d76b791d-20130327_010416,3,system,<sys>What kind of food would you like?,,,,
3,4,voip-00d76b791d-20130327_010416,4,user,<user>any<API_call><DB_result>,"[{'slots': [['name', 'the good luck chinese fo...",pt344x_0001649_0001680.wav,"{'json': [{'slots': [['this', 'dontcare']], 'a...",inform|area_inform|food_inform|pricerange_offe...
4,5,voip-00d76b791d-20130327_010416,5,system,<sys>the good luck chinese food takeaway is a ...,,,,


In [None]:
# Pretrained GPT-2 tokenizer; extra marker tokens are registered in the next cells.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

In [None]:
# new tokens
new_tokens = ["<sys>", "<user>", "<DA_pred>"]

# Keep only the tokens that are not in the vocabulary yet. An ordered list is
# used instead of a set: the ids handed out when the tokens are added follow
# the order of this list, and set iteration order for strings can change
# between kernel restarts (string hash randomisation), which would make the
# token ids irreproducible.
known_tokens = set(tokenizer.vocab.keys())
new_tokens = [token for token in new_tokens if token not in known_tokens]

# NOTE(review): add_db_query_tokens appends "<API_call>", "<DB_result>" and
# "<no_DB_result>" to the transcripts, but they are not registered here and are
# therefore split into sub-word pieces — confirm whether that is intended.
new_tokens

{'<DA_pred>', '<sys>', '<user>'}

In [None]:
# Register the new marker tokens with the tokenizer.
tokenizer.add_tokens(list(new_tokens))

# Add a padding token (required by the padding='longest' calls further down).
# The value shown as cell output is the return value of this last call.
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

1

In [None]:
df.head()

Unnamed: 0,index,session_id,turn_index,from,transcript,dialog_acts,audio_file,semantics,label
0,1,voip-00d76b791d-20130327_010416,1,system,"<sys>Hello , welcome to the Cambridge restaura...",,,,
1,2,voip-00d76b791d-20130327_010416,2,user,<user>expensive restaurant in the south parlof...,"[{'slots': [['slot', 'food']], 'act': 'request'}]",pt344x_0000993_0001219.wav,"{'json': [{'slots': [['pricerange', 'expensive...",request|food
2,3,voip-00d76b791d-20130327_010416,3,system,<sys>What kind of food would you like?,,,,
3,4,voip-00d76b791d-20130327_010416,4,user,<user>any<API_call><DB_result>,"[{'slots': [['name', 'the good luck chinese fo...",pt344x_0001649_0001680.wav,"{'json': [{'slots': [['this', 'dontcare']], 'a...",inform|area_inform|food_inform|pricerange_offe...
4,5,voip-00d76b791d-20130327_010416,5,system,<sys>the good luck chinese food takeaway is a ...,,,,


In [None]:
def concat_last_9_turns(row, column='transcript', max_rows=18, frame=None):
  """Concatenate the most recent turns of a dialogue, up to and including `row`.

  NOTE: a second function with this same name is defined further down in the
  notebook (for 'speaker_text'); whichever definition cell ran last is the one
  that `df.apply(concat_last_9_turns, ...)` will use.

  Args:
    row: mapping / Series providing 'session_id' and 'turn_index'.
    column: name of the text column whose values are concatenated.
    max_rows: positive number of trailing rows to keep. The default of 18
      covers 9 user turns plus 9 system turns.
    frame: DataFrame containing all turns; defaults to the notebook-level `df`.

  Returns:
    The `column` values of the selected rows, joined in ascending
    'turn_index' order ('' when no row matches).
  """
  turns = df if frame is None else frame

  # Every call scans the whole frame, so applying this to all rows is quadratic.
  same_session = turns['session_id'] == row['session_id']
  not_later = turns['turn_index'] <= row['turn_index']
  history = turns[same_session & not_later].sort_values(by='turn_index', ascending=True)

  # Keep only the tail of the dialogue.
  history = history.iloc[-max_rows:]

  # str.join gives the same text as Series.sum() for strings, but returns ''
  # instead of 0 for an empty selection and does not depend on how pandas
  # reduces string columns.
  return ''.join(history[column])

In [None]:
# Uses the concat_last_9_turns definition from the cell directly above
# (the transcript-based one); run that cell first.
df['chat_history_last_9'] = df.apply(concat_last_9_turns, axis=1)

df.head()

Unnamed: 0,index,session_id,turn_index,from,transcript,dialog_acts,audio_file,semantics,label,chat_history_last_9
0,1,voip-00d76b791d-20130327_010416,1,system,"<sys>Hello , welcome to the Cambridge restaura...",,,,,"<sys>Hello , welcome to the Cambridge restaura..."
1,2,voip-00d76b791d-20130327_010416,2,user,<user>expensive restaurant in the south parlof...,"[{'slots': [['slot', 'food']], 'act': 'request'}]",pt344x_0000993_0001219.wav,"{'json': [{'slots': [['pricerange', 'expensive...",request|food,"<sys>Hello , welcome to the Cambridge restaura..."
2,3,voip-00d76b791d-20130327_010416,3,system,<sys>What kind of food would you like?,,,,,"<sys>Hello , welcome to the Cambridge restaura..."
3,4,voip-00d76b791d-20130327_010416,4,user,<user>any<API_call><DB_result>,"[{'slots': [['name', 'the good luck chinese fo...",pt344x_0001649_0001680.wav,"{'json': [{'slots': [['this', 'dontcare']], 'a...",inform|area_inform|food_inform|pricerange_offe...,"<sys>Hello , welcome to the Cambridge restaura..."
4,5,voip-00d76b791d-20130327_010416,5,system,<sys>the good luck chinese food takeaway is a ...,,,,,"<sys>Hello , welcome to the Cambridge restaura..."


In [None]:
# Tokenize all histories in a single batch; padding='longest' pads every row to
# the length of the longest history in the batch.
chat_history_last_9_tokenized = tokenizer(list(df['chat_history_last_9'].values), padding='longest')

# Store one list of token ids and one attention mask per row.
df['chat_history_last_9_tokenized'] = chat_history_last_9_tokenized['input_ids']
df['attention_mask'] = chat_history_last_9_tokenized['attention_mask']

df.head()

Unnamed: 0,index,session_id,turn_index,from,transcript,dialog_acts,audio_file,semantics,label,chat_history_last_9,chat_history_last_9_tokenized,attention_mask
0,1,voip-00d76b791d-20130327_010416,1,system,"<sys>Hello , welcome to the Cambridge restaura...",,,,,"<sys>Hello , welcome to the Cambridge restaura...","[50257, 15496, 837, 7062, 284, 262, 14457, 707...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,2,voip-00d76b791d-20130327_010416,2,user,<user>expensive restaurant in the south parlof...,"[{'slots': [['slot', 'food']], 'act': 'request'}]",pt344x_0000993_0001219.wav,"{'json': [{'slots': [['pricerange', 'expensive...",request|food,"<sys>Hello , welcome to the Cambridge restaura...","[50257, 15496, 837, 7062, 284, 262, 14457, 707...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,3,voip-00d76b791d-20130327_010416,3,system,<sys>What kind of food would you like?,,,,,"<sys>Hello , welcome to the Cambridge restaura...","[50257, 15496, 837, 7062, 284, 262, 14457, 707...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,4,voip-00d76b791d-20130327_010416,4,user,<user>any<API_call><DB_result>,"[{'slots': [['name', 'the good luck chinese fo...",pt344x_0001649_0001680.wav,"{'json': [{'slots': [['this', 'dontcare']], 'a...",inform|area_inform|food_inform|pricerange_offe...,"<sys>Hello , welcome to the Cambridge restaura...","[50257, 15496, 837, 7062, 284, 262, 14457, 707...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,5,voip-00d76b791d-20130327_010416,5,system,<sys>the good luck chinese food takeaway is a ...,,,,,"<sys>Hello , welcome to the Cambridge restaura...","[50257, 15496, 837, 7062, 284, 262, 14457, 707...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [None]:
def get_speaker_text(row):
  """Build a speaker-marker string as long (in tokens) as the turn's transcript.

  Args:
    row: mapping / Series with the keys 'transcript' and 'from'.

  Returns:
    '<user>' or '<sys>' repeated once per token of the tokenized transcript.

  Raises:
    Exception: if 'from' is neither 'user' nor 'system'.
  """
  text = row['transcript']
  speaker = row['from']

  # Number of token ids the notebook-level tokenizer produces for this turn.
  n_tokens = len(tokenizer(text)['input_ids'])

  if speaker == 'user':
    marker = '<user>'
  elif speaker == 'system':
    marker = '<sys>'
  else:
    raise Exception(f"Unexpected source {speaker}")

  return marker * n_tokens

# One speaker-marker string per turn, aligned token by token with the transcript.
df['speaker_text'] = df.apply(get_speaker_text, axis=1)

df.head()

Unnamed: 0,index,session_id,turn_index,from,transcript,dialog_acts,audio_file,semantics,label,chat_history_last_9,chat_history_last_9_tokenized,attention_mask,speaker_text
0,1,voip-00d76b791d-20130327_010416,1,system,"<sys>Hello , welcome to the Cambridge restaura...",,,,,"<sys>Hello , welcome to the Cambridge restaura...","[50257, 15496, 837, 7062, 284, 262, 14457, 707...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",<sys><sys><sys><sys><sys><sys><sys><sys><sys><...
1,2,voip-00d76b791d-20130327_010416,2,user,<user>expensive restaurant in the south parlof...,"[{'slots': [['slot', 'food']], 'act': 'request'}]",pt344x_0000993_0001219.wav,"{'json': [{'slots': [['pricerange', 'expensive...",request|food,"<sys>Hello , welcome to the Cambridge restaura...","[50257, 15496, 837, 7062, 284, 262, 14457, 707...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",<user><user><user><user><user><user><user><use...
2,3,voip-00d76b791d-20130327_010416,3,system,<sys>What kind of food would you like?,,,,,"<sys>Hello , welcome to the Cambridge restaura...","[50257, 15496, 837, 7062, 284, 262, 14457, 707...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",<sys><sys><sys><sys><sys><sys><sys><sys><sys>
3,4,voip-00d76b791d-20130327_010416,4,user,<user>any<API_call><DB_result>,"[{'slots': [['name', 'the good luck chinese fo...",pt344x_0001649_0001680.wav,"{'json': [{'slots': [['this', 'dontcare']], 'a...",inform|area_inform|food_inform|pricerange_offe...,"<sys>Hello , welcome to the Cambridge restaura...","[50257, 15496, 837, 7062, 284, 262, 14457, 707...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",<user><user><user><user><user><user><user><use...
4,5,voip-00d76b791d-20130327_010416,5,system,<sys>the good luck chinese food takeaway is a ...,,,,,"<sys>Hello , welcome to the Cambridge restaura...","[50257, 15496, 837, 7062, 284, 262, 14457, 707...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",<sys><sys><sys><sys><sys><sys><sys><sys><sys><...


In [None]:
def concat_last_9_turns(row, column='speaker_text', max_rows=18, frame=None):
  """Concatenate the speaker markers of the most recent turns of a dialogue.

  NOTE: this redefines a function of the same name declared earlier in the
  notebook (there it concatenates 'transcript'); after this cell has run, the
  earlier behaviour is only available through `column='transcript'`.

  Args:
    row: mapping / Series providing 'session_id' and 'turn_index'.
    column: name of the text column whose values are concatenated.
    max_rows: positive number of trailing rows to keep. The default of 18
      covers 9 user turns plus 9 system turns.
    frame: DataFrame containing all turns; defaults to the notebook-level `df`.

  Returns:
    The `column` values of the selected rows, joined in ascending
    'turn_index' order ('' when no row matches).
  """
  turns = df if frame is None else frame

  # Every call scans the whole frame, so applying this to all rows is quadratic.
  same_session = turns['session_id'] == row['session_id']
  not_later = turns['turn_index'] <= row['turn_index']
  history = turns[same_session & not_later].sort_values(by='turn_index', ascending=True)

  # Keep only the tail of the dialogue.
  history = history.iloc[-max_rows:]

  # str.join gives the same text as Series.sum() for strings, but returns ''
  # instead of 0 for an empty selection and does not depend on how pandas
  # reduces string columns.
  return ''.join(history[column])

In [None]:
# Uses the concat_last_9_turns definition from the cell directly above
# (the speaker_text-based one); run that cell first.
df['speaker_text_last_9'] = df.apply(concat_last_9_turns, axis=1)
df.head()

Unnamed: 0,index,session_id,turn_index,from,transcript,dialog_acts,audio_file,semantics,label,chat_history_last_9,chat_history_last_9_tokenized,attention_mask,speaker_text,speaker_text_last_9
0,1,voip-00d76b791d-20130327_010416,1,system,"<sys>Hello , welcome to the Cambridge restaura...",,,,,"<sys>Hello , welcome to the Cambridge restaura...","[50257, 15496, 837, 7062, 284, 262, 14457, 707...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",<sys><sys><sys><sys><sys><sys><sys><sys><sys><...,<sys><sys><sys><sys><sys><sys><sys><sys><sys><...
1,2,voip-00d76b791d-20130327_010416,2,user,<user>expensive restaurant in the south parlof...,"[{'slots': [['slot', 'food']], 'act': 'request'}]",pt344x_0000993_0001219.wav,"{'json': [{'slots': [['pricerange', 'expensive...",request|food,"<sys>Hello , welcome to the Cambridge restaura...","[50257, 15496, 837, 7062, 284, 262, 14457, 707...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",<user><user><user><user><user><user><user><use...,<sys><sys><sys><sys><sys><sys><sys><sys><sys><...
2,3,voip-00d76b791d-20130327_010416,3,system,<sys>What kind of food would you like?,,,,,"<sys>Hello , welcome to the Cambridge restaura...","[50257, 15496, 837, 7062, 284, 262, 14457, 707...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",<sys><sys><sys><sys><sys><sys><sys><sys><sys>,<sys><sys><sys><sys><sys><sys><sys><sys><sys><...
3,4,voip-00d76b791d-20130327_010416,4,user,<user>any<API_call><DB_result>,"[{'slots': [['name', 'the good luck chinese fo...",pt344x_0001649_0001680.wav,"{'json': [{'slots': [['this', 'dontcare']], 'a...",inform|area_inform|food_inform|pricerange_offe...,"<sys>Hello , welcome to the Cambridge restaura...","[50257, 15496, 837, 7062, 284, 262, 14457, 707...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",<user><user><user><user><user><user><user><use...,<sys><sys><sys><sys><sys><sys><sys><sys><sys><...
4,5,voip-00d76b791d-20130327_010416,5,system,<sys>the good luck chinese food takeaway is a ...,,,,,"<sys>Hello , welcome to the Cambridge restaura...","[50257, 15496, 837, 7062, 284, 262, 14457, 707...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",<sys><sys><sys><sys><sys><sys><sys><sys><sys><...,<sys><sys><sys><sys><sys><sys><sys><sys><sys><...


In [None]:
# Tokenize the speaker-marker strings the same way as the chat histories
# (single batch, padded to the longest row).
speaker_text_last_9_tokenized = tokenizer(list(df['speaker_text_last_9'].values), padding='longest')

df['speaker_text_last_9_tokenized'] = speaker_text_last_9_tokenized['input_ids']

df.head()

Unnamed: 0,index,session_id,turn_index,from,transcript,dialog_acts,audio_file,semantics,label,chat_history_last_9,chat_history_last_9_tokenized,attention_mask,speaker_text,speaker_text_last_9,speaker_text_last_9_tokenized
0,1,voip-00d76b791d-20130327_010416,1,system,"<sys>Hello , welcome to the Cambridge restaura...",,,,,"<sys>Hello , welcome to the Cambridge restaura...","[50257, 15496, 837, 7062, 284, 262, 14457, 707...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",<sys><sys><sys><sys><sys><sys><sys><sys><sys><...,<sys><sys><sys><sys><sys><sys><sys><sys><sys><...,"[50257, 50257, 50257, 50257, 50257, 50257, 502..."
1,2,voip-00d76b791d-20130327_010416,2,user,<user>expensive restaurant in the south parlof...,"[{'slots': [['slot', 'food']], 'act': 'request'}]",pt344x_0000993_0001219.wav,"{'json': [{'slots': [['pricerange', 'expensive...",request|food,"<sys>Hello , welcome to the Cambridge restaura...","[50257, 15496, 837, 7062, 284, 262, 14457, 707...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",<user><user><user><user><user><user><user><use...,<sys><sys><sys><sys><sys><sys><sys><sys><sys><...,"[50257, 50257, 50257, 50257, 50257, 50257, 502..."
2,3,voip-00d76b791d-20130327_010416,3,system,<sys>What kind of food would you like?,,,,,"<sys>Hello , welcome to the Cambridge restaura...","[50257, 15496, 837, 7062, 284, 262, 14457, 707...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",<sys><sys><sys><sys><sys><sys><sys><sys><sys>,<sys><sys><sys><sys><sys><sys><sys><sys><sys><...,"[50257, 50257, 50257, 50257, 50257, 50257, 502..."
3,4,voip-00d76b791d-20130327_010416,4,user,<user>any<API_call><DB_result>,"[{'slots': [['name', 'the good luck chinese fo...",pt344x_0001649_0001680.wav,"{'json': [{'slots': [['this', 'dontcare']], 'a...",inform|area_inform|food_inform|pricerange_offe...,"<sys>Hello , welcome to the Cambridge restaura...","[50257, 15496, 837, 7062, 284, 262, 14457, 707...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",<user><user><user><user><user><user><user><use...,<sys><sys><sys><sys><sys><sys><sys><sys><sys><...,"[50257, 50257, 50257, 50257, 50257, 50257, 502..."
4,5,voip-00d76b791d-20130327_010416,5,system,<sys>the good luck chinese food takeaway is a ...,,,,,"<sys>Hello , welcome to the Cambridge restaura...","[50257, 15496, 837, 7062, 284, 262, 14457, 707...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",<sys><sys><sys><sys><sys><sys><sys><sys><sys><...,<sys><sys><sys><sys><sys><sys><sys><sys><sys><...,"[50257, 50257, 50257, 50257, 50257, 50257, 502..."


In [None]:
# Token ids and speaker ids are meant to line up position by position, so both
# sequences must be equally long in every row. The lengths are compared row by
# row because summed differences could cancel each other out.
history_lengths = df['chat_history_last_9_tokenized'].apply(len)
speaker_lengths = df['speaker_text_last_9_tokenized'].apply(len)
assert (history_lengths == speaker_lengths).all()

In [None]:
# Add <DA_pred>
DA_pred_token = tokenizer('<DA_pred>')['input_ids'][0]
sys_token = tokenizer('<sys>')['input_ids'][0]

# Only rows whose history does not already end with <DA_pred> are extended, so
# running this cell again does not append the marker a second time.
needs_da_pred = [ids[-1:] != [DA_pred_token]
                 for ids in df['chat_history_last_9_tokenized']]

# New lists are built and assigned back, instead of mutating the stored lists
# through `apply` purely for its side effect. The three columns are extended
# together so they stay aligned: <DA_pred> as token, <sys> as its speaker and
# 1 as its attention-mask entry.
# NOTE(review): the sequences were padded with padding='longest' before this
# step, so the marker is placed after any padding — confirm this is intended.
for column, token in [('chat_history_last_9_tokenized', DA_pred_token),
                      ('speaker_text_last_9_tokenized', sys_token),
                      ('attention_mask', 1)]:
  df[column] = [ids + [token] if extend else ids
                for ids, extend in zip(df[column], needs_da_pred)]

df.head()

Unnamed: 0,index,session_id,turn_index,from,transcript,dialog_acts,audio_file,semantics,label,chat_history_last_9,chat_history_last_9_tokenized,attention_mask,speaker_text,speaker_text_last_9,speaker_text_last_9_tokenized
0,1,voip-00d76b791d-20130327_010416,1,system,"<sys>Hello , welcome to the Cambridge restaura...",,,,,"<sys>Hello , welcome to the Cambridge restaura...","[50257, 15496, 837, 7062, 284, 262, 14457, 707...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",<sys><sys><sys><sys><sys><sys><sys><sys><sys><...,<sys><sys><sys><sys><sys><sys><sys><sys><sys><...,"[50257, 50257, 50257, 50257, 50257, 50257, 502..."
1,2,voip-00d76b791d-20130327_010416,2,user,<user>expensive restaurant in the south parlof...,"[{'slots': [['slot', 'food']], 'act': 'request'}]",pt344x_0000993_0001219.wav,"{'json': [{'slots': [['pricerange', 'expensive...",request|food,"<sys>Hello , welcome to the Cambridge restaura...","[50257, 15496, 837, 7062, 284, 262, 14457, 707...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",<user><user><user><user><user><user><user><use...,<sys><sys><sys><sys><sys><sys><sys><sys><sys><...,"[50257, 50257, 50257, 50257, 50257, 50257, 502..."
2,3,voip-00d76b791d-20130327_010416,3,system,<sys>What kind of food would you like?,,,,,"<sys>Hello , welcome to the Cambridge restaura...","[50257, 15496, 837, 7062, 284, 262, 14457, 707...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",<sys><sys><sys><sys><sys><sys><sys><sys><sys>,<sys><sys><sys><sys><sys><sys><sys><sys><sys><...,"[50257, 50257, 50257, 50257, 50257, 50257, 502..."
3,4,voip-00d76b791d-20130327_010416,4,user,<user>any<API_call><DB_result>,"[{'slots': [['name', 'the good luck chinese fo...",pt344x_0001649_0001680.wav,"{'json': [{'slots': [['this', 'dontcare']], 'a...",inform|area_inform|food_inform|pricerange_offe...,"<sys>Hello , welcome to the Cambridge restaura...","[50257, 15496, 837, 7062, 284, 262, 14457, 707...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",<user><user><user><user><user><user><user><use...,<sys><sys><sys><sys><sys><sys><sys><sys><sys><...,"[50257, 50257, 50257, 50257, 50257, 50257, 502..."
4,5,voip-00d76b791d-20130327_010416,5,system,<sys>the good luck chinese food takeaway is a ...,,,,,"<sys>Hello , welcome to the Cambridge restaura...","[50257, 15496, 837, 7062, 284, 262, 14457, 707...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",<sys><sys><sys><sys><sys><sys><sys><sys><sys><...,<sys><sys><sys><sys><sys><sys><sys><sys><sys><...,"[50257, 50257, 50257, 50257, 50257, 50257, 502..."


In [None]:
# Drop the system rows: only user turns are kept from here on.
df = df[df['from'] == 'user'].reset_index(drop=True)

In [None]:
df['semantics'].sample(10).values.tolist()

[{'json': [{'slots': [], 'act': 'reqalts'}], 'cam': 'reqalts()'},
 {'json': [{'slots': [['pricerange', 'cheap']], 'act': 'inform'},
   {'slots': [['area', 'north']], 'act': 'inform'}],
  'cam': 'inform(area=north,pricerange=cheap,type=restaurant,task=find)'},
 {'json': [{'slots': [['food', 'spanish']], 'act': 'inform'}],
  'cam': 'inform(food=spanish)'},
 {'json': [{'slots': [['slot', 'food']], 'act': 'request'}],
  'cam': 'request(food)'},
 {'json': [{'slots': [], 'act': 'reqalts'},
   {'slots': [['food', 'gastropub']], 'act': 'inform'}],
  'cam': 'reqalts(food=gastropub)'},
 {'json': [{'slots': [['this', 'dontcare']], 'act': 'inform'}],
  'cam': 'inform(=dontcare)'},
 {'json': [], 'cam': 'null()'},
 {'json': [{'slots': [['slot', 'addr']], 'act': 'request'},
   {'slots': [['slot', 'phone']], 'act': 'request'}],
  'cam': 'request(addr,phone)'},
 {'json': [{'slots': [], 'act': 'negate'}], 'cam': 'negate()'},
 {'json': [{'slots': [['pricerange', 'expensive']], 'act': 'inform'}],
  'cam':

In [None]:
# Collect every act name that occurs in the 'json' part of the semantics.
set_semantics_acts = set()
for semantics_dict in df['semantics']:

    # Rows without semantics are skipped.
    if semantics_dict is None:
      continue

    semantics_list = semantics_dict['json']

    for semantics in semantics_list:
      set_semantics_acts.add(semantics['act'])
set_semantics_acts

{'ack',
 'affirm',
 'bye',
 'confirm',
 'deny',
 'hello',
 'inform',
 'negate',
 'repeat',
 'reqalts',
 'reqmore',
 'request',
 'restart',
 'thankyou'}

In [None]:
# Acts present in the data but missing from the list handled by
# get_semantics_label (an empty set means every act is covered).
set_semantics_acts - set(['thankyou', 'restart', 'reqmore', 'reqalts', 'repeat', 'negate',
                          'hello', 'bye', 'affirm', 'ack', 'inform', 'deny', 'confirm',
                          'request',])

set()

In [None]:
# Acts in the handled list that never occur in the data.
set(['thankyou', 'restart', 'reqmore', 'reqalts', 'repeat', 'negate',
                          'hello', 'bye', 'affirm', 'ack', 'inform', 'deny', 'confirm',
                          'request',]) - set_semantics_acts

set()

In [None]:
# set_semantics_acts = set()
# for semantics_dict in df['semantics']:

#     if semantics_dict is None:
#       continue

#     semantics_list = semantics_dict['json']

#     for semantics in semantics_list:
#       # set_semantics_acts.add(semantics['act'])
#       if semantics['act'] == 'confirm':
#         print(semantics_list)

In [None]:
def get_semantics_label(semantics_dict):
  """Build a single string label from one turn's semantics annotation.

  Args:
    semantics_dict: None, or a mapping whose 'json' entry is a sequence of
      items, each carrying an 'act' name and a 'slots' sequence of pairs.

  Returns:
    "" when `semantics_dict` is None. Otherwise the distinct per-act labels,
    sorted and joined with '_', e.g. 'inform|area_inform|pricerange'.
    Each item contributes:
      * the bare act name for thankyou/restart/reqmore/reqalts/repeat/
        negate/hello/bye/affirm/ack;
      * 'act|<first element>' of every slot pair for inform/deny/confirm;
      * 'act|<second element>' of every slot pair for request.
    Items with any other act contribute nothing.
  """
  # No annotation for this turn -> empty label.
  if semantics_dict is None:
    return ""

  plain_acts = ('thankyou', 'restart', 'reqmore', 'reqalts', 'repeat', 'negate',
                'hello', 'bye', 'affirm', 'ack')
  slot_first_acts = ('inform', 'deny', 'confirm')
  slot_second_acts = ('request',)

  unique_labels = set()

  for entry in semantics_dict['json']:
    act_name = entry['act']
    slot_pairs = entry['slots']

    if act_name in plain_acts:
      unique_labels.add(act_name)
      continue

    # Choose which element of each slot pair names the slot for this act.
    if act_name in slot_first_acts:
      position = 0
    elif act_name in slot_second_acts:
      position = 1
    else:
      # Acts outside the three groups are ignored.
      continue

    unique_labels.update(act_name + '|' + pair[position] for pair in slot_pairs)

  # Sorting makes the joined label independent of annotation order.
  return '_'.join(sorted(unique_labels))

In [None]:
# Derive one label string per row from its 'semantics' entry.
df['label_semantics'] = df['semantics'].map(get_semantics_label)
df.head()

Unnamed: 0,index,session_id,turn_index,from,transcript,dialog_acts,audio_file,semantics,label,chat_history_last_9,chat_history_last_9_tokenized,attention_mask,speaker_text,speaker_text_last_9,speaker_text_last_9_tokenized,label_semantics
0,2,voip-00d76b791d-20130327_010416,2,user,<user>expensive restaurant in the south parlof...,"[{'slots': [['slot', 'food']], 'act': 'request'}]",pt344x_0000993_0001219.wav,"{'json': [{'slots': [['pricerange', 'expensive...",request|food,"<sys>Hello , welcome to the Cambridge restaura...","[50257, 15496, 837, 7062, 284, 262, 14457, 707...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",<user><user><user><user><user><user><user><use...,<sys><sys><sys><sys><sys><sys><sys><sys><sys><...,"[50257, 50257, 50257, 50257, 50257, 50257, 502...",inform|area_inform|pricerange
1,4,voip-00d76b791d-20130327_010416,4,user,<user>any<API_call><DB_result>,"[{'slots': [['name', 'the good luck chinese fo...",pt344x_0001649_0001680.wav,"{'json': [{'slots': [['this', 'dontcare']], 'a...",inform|area_inform|food_inform|pricerange_offe...,"<sys>Hello , welcome to the Cambridge restaura...","[50257, 15496, 837, 7062, 284, 262, 14457, 707...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",<user><user><user><user><user><user><user><use...,<sys><sys><sys><sys><sys><sys><sys><sys><sys><...,"[50257, 50257, 50257, 50257, 50257, 50257, 502...",inform|this
2,6,voip-00d76b791d-20130327_010416,6,user,<user>address<API_call><DB_result>,"[{'slots': [['name', 'the good luck chinese fo...",pt344x_0002674_0002736.wav,"{'json': [{'slots': [['slot', 'addr']], 'act':...",inform|addr_offer|name,"<sys>Hello , welcome to the Cambridge restaura...","[50257, 15496, 837, 7062, 284, 262, 14457, 707...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",<user><user><user><user><user><user><user><use...,<sys><sys><sys><sys><sys><sys><sys><sys><sys><...,"[50257, 50257, 50257, 50257, 50257, 50257, 502...",request|addr
3,8,voip-00d76b791d-20130327_010416,8,user,<user>pibl foo<API_call><DB_result>,"[{'slots': [['name', 'the good luck chinese fo...",pt344x_0003533_0003611.wav,"{'json': [{'slots': [['slot', 'food']], 'act':...",inform|area_inform|pricerange_offer|name,"<sys>Hello , welcome to the Cambridge restaura...","[50257, 15496, 837, 7062, 284, 262, 14457, 707...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",<user><user><user><user><user><user><user><use...,<sys><sys><sys><sys><sys><sys><sys><sys><sys><...,"[50257, 50257, 50257, 50257, 50257, 50257, 502...",request|food
4,12,voip-00d76b791d-20130327_011116,2,user,<user>tbanahi's restaurant in the westpart of ...,"[{'slots': [['slot', 'food']], 'act': 'request'}]",pt344x_0001082_0001294.wav,"{'json': [{'slots': [['food', 'lebanese']], 'a...",request|food,"<sys>Hello , welcome to the Cambridge restaura...","[50257, 15496, 837, 7062, 284, 262, 14457, 707...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",<user><user><user><user><user><user><user><use...,<sys><sys><sys><sys><sys><sys><sys><sys><sys><...,"[50257, 50257, 50257, 50257, 50257, 50257, 502...",inform|area_inform|food


In [None]:
# Restrict the frame to the columns that are still needed.
cols_to_keep = [
    'session_id',
    'turn_index',
    'audio_file',
    'transcript',
    'chat_history_last_9',
    'chat_history_last_9_tokenized',
    'speaker_text_last_9_tokenized',
    'attention_mask',
    'label_semantics',
    'label',
]
df = df.loc[:, cols_to_keep]

df.head()

Unnamed: 0,session_id,turn_index,audio_file,transcript,chat_history_last_9,chat_history_last_9_tokenized,speaker_text_last_9_tokenized,attention_mask,label_semantics,label
0,voip-00d76b791d-20130327_010416,2,pt344x_0000993_0001219.wav,<user>expensive restaurant in the south parlof...,"<sys>Hello , welcome to the Cambridge restaura...","[50257, 15496, 837, 7062, 284, 262, 14457, 707...","[50257, 50257, 50257, 50257, 50257, 50257, 502...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",inform|area_inform|pricerange,request|food
1,voip-00d76b791d-20130327_010416,4,pt344x_0001649_0001680.wav,<user>any<API_call><DB_result>,"<sys>Hello , welcome to the Cambridge restaura...","[50257, 15496, 837, 7062, 284, 262, 14457, 707...","[50257, 50257, 50257, 50257, 50257, 50257, 502...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",inform|this,inform|area_inform|food_inform|pricerange_offe...
2,voip-00d76b791d-20130327_010416,6,pt344x_0002674_0002736.wav,<user>address<API_call><DB_result>,"<sys>Hello , welcome to the Cambridge restaura...","[50257, 15496, 837, 7062, 284, 262, 14457, 707...","[50257, 50257, 50257, 50257, 50257, 50257, 502...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",request|addr,inform|addr_offer|name
3,voip-00d76b791d-20130327_010416,8,pt344x_0003533_0003611.wav,<user>pibl foo<API_call><DB_result>,"<sys>Hello , welcome to the Cambridge restaura...","[50257, 15496, 837, 7062, 284, 262, 14457, 707...","[50257, 50257, 50257, 50257, 50257, 50257, 502...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",request|food,inform|area_inform|pricerange_offer|name
4,voip-00d76b791d-20130327_011116,2,pt344x_0001082_0001294.wav,<user>tbanahi's restaurant in the westpart of ...,"<sys>Hello , welcome to the Cambridge restaura...","[50257, 15496, 837, 7062, 284, 262, 14457, 707...","[50257, 50257, 50257, 50257, 50257, 50257, 502...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",inform|area_inform|food,request|food


In [None]:
# Remove rows whose label contains 'welcomemsg'

In [None]:
# le = LabelEncoder()
# df['label_encoded'] = le.fit_transform(df['label'])

# df.head()

In [None]:
# le = LabelEncoder()
# df['label_semantics_encoded'] = le.fit_transform(df['label_semantics'])

# df.head()

In [None]:
# Flag rows whose 'label' contains the literal text 'welcomemsg', then keep
# the remaining rows. The two prints report the row count before and after.
contains_welcome = df['label'].str.contains('welcomemsg', regex=False)
cond = ~contains_welcome

print(len(df))
df = df.loc[cond]
print(len(df))

df.head()

22266
22266


Unnamed: 0,session_id,turn_index,audio_file,transcript,chat_history_last_9,chat_history_last_9_tokenized,speaker_text_last_9_tokenized,attention_mask,label_semantics,label
0,voip-00d76b791d-20130327_010416,2,pt344x_0000993_0001219.wav,<user>expensive restaurant in the south parlof...,"<sys>Hello , welcome to the Cambridge restaura...","[50257, 15496, 837, 7062, 284, 262, 14457, 707...","[50257, 50257, 50257, 50257, 50257, 50257, 502...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",inform|area_inform|pricerange,request|food
1,voip-00d76b791d-20130327_010416,4,pt344x_0001649_0001680.wav,<user>any<API_call><DB_result>,"<sys>Hello , welcome to the Cambridge restaura...","[50257, 15496, 837, 7062, 284, 262, 14457, 707...","[50257, 50257, 50257, 50257, 50257, 50257, 502...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",inform|this,inform|area_inform|food_inform|pricerange_offe...
2,voip-00d76b791d-20130327_010416,6,pt344x_0002674_0002736.wav,<user>address<API_call><DB_result>,"<sys>Hello , welcome to the Cambridge restaura...","[50257, 15496, 837, 7062, 284, 262, 14457, 707...","[50257, 50257, 50257, 50257, 50257, 50257, 502...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",request|addr,inform|addr_offer|name
3,voip-00d76b791d-20130327_010416,8,pt344x_0003533_0003611.wav,<user>pibl foo<API_call><DB_result>,"<sys>Hello , welcome to the Cambridge restaura...","[50257, 15496, 837, 7062, 284, 262, 14457, 707...","[50257, 50257, 50257, 50257, 50257, 50257, 502...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",request|food,inform|area_inform|pricerange_offer|name
4,voip-00d76b791d-20130327_011116,2,pt344x_0001082_0001294.wav,<user>tbanahi's restaurant in the westpart of ...,"<sys>Hello , welcome to the Cambridge restaura...","[50257, 15496, 837, 7062, 284, 262, 14457, 707...","[50257, 50257, 50257, 50257, 50257, 50257, 502...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",inform|area_inform|food,request|food


In [None]:
# Convert the processed DataFrame into a Hugging Face `Dataset`.
# The resulting features include an extra '__index_level_0__' column
# alongside the DataFrame's own columns.
dataset = Dataset.from_pandas(df)
dataset

Dataset({
    features: ['session_id', 'turn_index', 'audio_file', 'transcript', 'chat_history_last_9', 'chat_history_last_9_tokenized', 'speaker_text_last_9_tokenized', 'attention_mask', 'label_semantics', 'label', '__index_level_0__'],
    num_rows: 22266
})

In [None]:
# Drop the '__index_level_0__' column so that only the columns selected
# from the DataFrame remain in the dataset.
dataset = dataset.remove_columns('__index_level_0__')
dataset

Dataset({
    features: ['session_id', 'turn_index', 'audio_file', 'transcript', 'chat_history_last_9', 'chat_history_last_9_tokenized', 'speaker_text_last_9_tokenized', 'attention_mask', 'label_semantics', 'label'],
    num_rows: 22266
})

In [None]:
# Before running this, delete what is currently uploaded to this dataset repo.
dataset.push_to_hub('danielroncel/dstc2_dialogues_transcription_processed')

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/23 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/danielroncel/dstc2_dialogues_transcription_processed/commit/7faf11be01eeae3589dcd21e8fda71e8470a8557', commit_message='Upload dataset', commit_description='', oid='7faf11be01eeae3589dcd21e8fda71e8470a8557', pr_url=None, pr_revision=None, pr_num=None)