In [1]:
# Pull a sample of utterances for each dialogue-act tag from the Switchboard
# corpus and export it as an Excel spreadsheet.

import os
# Move the working directory up two levels before importing convokit.
# NOTE(review): presumably this makes the repository's own convokit package
# importable -- confirm against the repo layout.
os.chdir("../../")
import convokit
import numpy as np
import pandas as pd
import re
from collections import defaultdict


In [2]:
from convokit import meta_index
# Move into the Switchboard dataset directory so the relative corpus path
# below resolves. This is relative to the current working directory, which an
# earlier cell already changed.
os.chdir("datasets/switchboard-corpus")

# Build the corpus from the relative path ./corpus.
# NOTE(review): the argument is a local path, so this appears to load from
# disk rather than download -- confirm against the convokit.Corpus API.
corpus = convokit.Corpus(filename = "./corpus")



In [3]:
# Flatten utterance-, speaker- and conversation-level metadata into one
# record per utterance.
utterance_ids = corpus.get_utterance_ids()
convo_ids = corpus.get_conversation_ids()

# The conversation id is the run of digits directly in front of a hyphen,
# e.g. "4325" in the utterance id "4325-0".
conv_id_pattern = re.compile(r"\d*(?=-)")

rows = []
for uid in utterance_ids:
    utterance = corpus.utterances[uid]
    conv_id = conv_id_pattern.search(uid).group()

    # Merge order matters: on a key clash, later sources overwrite earlier
    # ones (utterance meta < text < speaker meta < conversation meta).
    record = dict(utterance.meta)
    record['text'] = utterance.text
    record.update(utterance.user.meta)
    record.update(corpus.meta['metadata'][conv_id])
    rows.append(record)

In [4]:
# Assemble the per-utterance records into a DataFrame indexed by utterance id,
# then preview the first ten rows.
metadata = pd.DataFrame(data=rows, index=utterance_ids)
metadata.head(n=10)

Unnamed: 0,birth_year,dialect_area,education,filename,from_caller,length,pos,prompt,sex,tag,talk_day,text,to_caller,topic_description,trees
4325-0,1962,WESTERN,2,4/sw4325,1632,5,Okay/UH ./.,FIND OUT WHAT CRITERIA THE OTHER CALLER WOULD ...,FEMALE,o,1992-03-23,Okay. /,1519,CHILD CARE,"[[[Okay], [.], [E_S]]]"
4325-1,1962,WESTERN,2,4/sw4325,1632,5,"So/UH ,/,",FIND OUT WHAT CRITERIA THE OTHER CALLER WOULD ...,FEMALE,qw,1992-03-23,"{D So, }",1519,CHILD CARE,"[[[['So']], [,], [[['What'], ['kind']], [['of'..."
4325-2,1971,SOUTH MIDLAND,1,4/sw4325,1632,5,"[ I/PRP ] guess/VBP ,/,",FIND OUT WHAT CRITERIA THE OTHER CALLER WOULD ...,FEMALE,qy^d,1992-03-23,"[ [ I guess, +",1519,CHILD CARE,"[[[[['\\[']], [[['\\[']], [[['I']], [['guess']..."
4325-3,1962,WESTERN,2,4/sw4325,1632,5,[ What/WP kind/NN ] of/IN [ experience/NN ] d...,FIND OUT WHAT CRITERIA THE OTHER CALLER WOULD ...,FEMALE,+,1992-03-23,"What kind of experience [ do you, + do you ] h...",1519,CHILD CARE,"[[[['So']], [,], [[['What'], ['kind']], [['of'..."
4325-4,1971,SOUTH MIDLAND,1,4/sw4325,1632,5,"[ I/PRP ] think/VBP ,/, uh/UH ,/, [ I/PRP ] w...",FIND OUT WHAT CRITERIA THE OTHER CALLER WOULD ...,FEMALE,+,1992-03-23,"I think, ] + {F uh, } I wonder ] if that worke...",1519,CHILD CARE,"[[[[['\\[']], [[['\\[']], [[['I']], [['guess']..."
4325-5,1962,WESTERN,2,4/sw4325,1632,5,Does/VBZ [ it/PRP ] say/VB [ something/NN ] ?/.,FIND OUT WHAT CRITERIA THE OTHER CALLER WOULD ...,FEMALE,qy,1992-03-23,Does it say something? /,1519,CHILD CARE,"[[[Does], [['it']], [['say'], [['something']]]..."
4325-6,1971,SOUTH MIDLAND,1,4/sw4325,1632,5,[ I/PRP ] think/VBP [ it/PRP ] usually/RB doe...,FIND OUT WHAT CRITERIA THE OTHER CALLER WOULD ...,FEMALE,sd,1992-03-23,I think it usually does. /,1519,CHILD CARE,"[[[['I']], [['think'], [['0'], [[['it']], [['u..."
4325-7,1971,SOUTH MIDLAND,1,4/sw4325,1632,5,"[ You/PRP ] might/MD try/VB ,/, uh/UH ,/,",FIND OUT WHAT CRITERIA THE OTHER CALLER WOULD ...,FEMALE,ad,1992-03-23,"You might try, {F uh, } /",1519,CHILD CARE,"[[[['You']], [['might'], [['try'], [','], [['u..."
4325-8,1971,SOUTH MIDLAND,1,4/sw4325,1632,5,"[ I/PRP ] do/VBP n't/RB know/VB ,/,",FIND OUT WHAT CRITERIA THE OTHER CALLER WOULD ...,FEMALE,h,1992-03-23,"I don't know, /",1519,CHILD CARE,"[[[['I']], [['do'], [""n't""], [['know']]], [,],..."
4325-9,1971,SOUTH MIDLAND,1,4/sw4325,1632,5,hold/VB [ it/PRP ] down/RB [ a/DT little/RB ...,FIND OUT WHAT CRITERIA THE OTHER CALLER WOULD ...,FEMALE,ad,1992-03-23,"hold it down a little longer, /",1519,CHILD CARE,"[[[['*']], [['hold'], [['it']], [['down']], [[..."


In [5]:
# Lookup table from dialogue-act tag to a human-readable name, used to add a
# descriptive column next to each utterance's tag.
# https://web.stanford.edu/~jurafsky/ws97/manual.august1.html
#
# Some tags are omitted from tag_dict. See the above link for these.
# Tags without an entry here get no name when the table is applied with
# Series.map (the result is NaN for those rows).
tag_dict = {
    'sd': 'Statement-non-opinion',
    'b': 'Acknowledge (Backchannel)',
    'sv': 'Statement-opinion',
    'aa': 'Agree/Accept',
    # This entry used to share the key '%' with 'Uninterpretable' below, so it
    # was silently overwritten (a dict literal keeps only the last duplicate).
    # It is keyed as '%-' here, following the SWBD-DAMSL manual's notation for
    # abandoned utterances.
    # NOTE(review): confirm that the corpus actually emits '%-' for these.
    '%-': 'Abandoned or Turn-Exit',
    'ba': 'Appreciation',
    'qy': 'Yes-No-Question',
    'x': 'Non-verbal',
    'ny': 'Yes answers',
    'fc': 'Conventional-closing',
    '%': 'Uninterpretable',
    'qw': 'Wh-Question',
    'nn': 'No answers',
    'bk': 'Response Acknowledgement',
    'h': 'Hedge',
    'qy^d': 'Declarative Yes-No-Question',
    'fo_o_fw_by_bc': 'Other',
    'bh': 'Backchannel in question form',
    '^q': 'Quotation',
    'bf': 'Summarize/reformulate',
    'na': 'Affirmative non-yes answers',
    'ad': 'Action-directive',
    '^2': 'Collaborative Completion',
    'b^m': 'Repeat-phrase',
    'qo': 'Open-Question',
    'qh': 'Rhetorical-Questions',
    '^h': 'Hold before answer/agreement',
    'ar': 'Reject',
    'ng': 'Negative non-no answers',
    'br': 'Signal-non-understanding',
    'no': 'Other answers',
    'fp': 'Conventional-opening',
    'qrr': 'Or-Clause',
    'arp_nd': 'Dispreferred answers',
    't3': '3rd-party-talk',
    'oo_co_cc': 'Offers, Options, Commits',
    't1': 'Self-talk',
    'bd': 'Downplayer',
    'aap_am': 'Maybe/Accept-part',
    '^g': 'Tag-Question',
    'qw^d': 'Declarative Wh-Question',
    'fa': 'Apology',
    'ft': 'Thanking',
    '+': 'Segment (multi-utterance)',
}


In [6]:
# Look up the descriptive name for every utterance's tag and store it as a
# new 'tag_name' column; tags that are not in tag_dict come out as NaN.
tag_names = metadata['tag'].map(tag_dict)
metadata['tag_name'] = tag_names

In [7]:
# Keep only the columns needed for the export, in this order.
export_columns = ['tag_name', 'tag', 'text']
metadata = metadata[export_columns]

In [8]:
# Number of examples of each dialogue act to pull
sample_number = 10

# Fixed seed for the shuffle below, so that re-running the notebook exports
# the same sample instead of a different one every time.
sample_seed = 0

# First drop every tag group with fewer than `sample_number` occurrences, so
# that each remaining tag can contribute a full set of examples.
metadata_filtered = metadata.groupby('tag').filter(lambda x: len(x) >= sample_number)

# Shuffle all remaining rows, then keep the first `sample_number` rows of each
# tag. This is equivalent to sampling without replacement within each group.
metadata_output = (
    metadata_filtered
    .sample(frac=1, random_state=sample_seed)
    .groupby('tag')
    .head(sample_number)
)

In [11]:
# Write the sampled utterances to an Excel workbook; the relative path is
# resolved against the current working directory.
output_path = "Sample_of_dialogue_acts.xlsx"
metadata_output.to_excel(output_path)

In [12]:
# Total number of sampled rows (sanity check on the export size).
metadata_output.shape[0]

940