### Code to Convert the Switchboard dataset into Convokit format

In [1]:
import os
os.chdir("../../") # import convokit
from convokit import Corpus, User, Utterance
os.chdir("datasets/switchboard-corpus") # then come back for swda
from swda import Transcript
import glob

ModuleNotFoundError: No module named 'swda'

#### Create Users

Each caller is considered a user, and there are a total of 440 different callers in this dataset. Each user is identified by a numerical id, and the metadata for each user includes the following information:

- Gender (`sex`, str): MALE or FEMALE
- Education (`education`, int): 0, 1, 2, 3, 9
- Birth Year (`birth_year`, int): YYYY
- Dialect Area (`dialect_area`, str): MIXED, NEW ENGLAND, NORTH MIDLAND, NORTHERN, NYC, SOUTH MIDLAND, SOUTHERN, UNK, WESTERN

In [2]:
# Locate every Switchboard utterance file, then record the metadata of both
# callers of each transcript, keyed by the caller id as a string.
files = glob.glob("./swda/*/sw_*.utt.csv") # Switchboard utterance files
user_meta = {}

for path in files:
    transcript = Transcript(path, './swda/swda-metadata.csv')
    # The Transcript exposes the same four fields for each side of the call,
    # distinguished only by the "from_caller" / "to_caller" attribute prefix.
    for side in ("from_caller", "to_caller"):
        caller_id = str(getattr(transcript, side))
        user_meta[caller_id] = {
            "sex": getattr(transcript, side + "_sex"),
            "education": getattr(transcript, side + "_education"),
            "birth_year": getattr(transcript, side + "_birth_year"),
            "dialect_area": getattr(transcript, side + "_dialect_area"),
        }

Create a User object for each unique user in the dataset

In [3]:
# One User per caller id; the collected caller metadata becomes the User's meta.
corpus_users = {
    caller_id: User(name=caller_id, meta=caller_meta)
    for caller_id, caller_meta in user_meta.items()
}

Check number of users in the dataset

In [4]:
# Sanity check: number of distinct callers found across all transcripts.
user_count = len(corpus_users)
print("Number of users in the data = {}".format(user_count))

Number of users in the data = 440


In [5]:
# Show the metadata attached to one user (caller id 1632) as an example.
example_user = corpus_users['1632']
example_user.meta

{'sex': 'FEMALE',
 'education': 2,
 'birth_year': 1962,
 'dialect_area': 'WESTERN'}

#### Create Utterances

Utterances are found in the `utterances` attribute of each Transcript object, and the text of each utterance is in its `text` field. There are 221,616 utterances in total.

Each Utterance object has the following fields:

- id (str): the unique id of the utterance
- user (User): the User giving the utterance
- root (str): id of the root utterance of the conversation
- reply_to (str): id of the utterance this replies to
- timestamp: timestamp of the utterance (not applicable in Switchboard)
- text (str): text of the utterance
- meta (dict): metadata of the utterance
    - tag (str): the DAMSL act-tag of the utterance
    - pos (str): the part-of-speech tagged portion of the utterance
    - trees (list of nltk Tree): parse tree(s) of the utterance

In [6]:
# Build one convokit Utterance per Switchboard utterance, keyed by its id
# ("<conversation_no>-<transcript_index>").
utterance_corpus = {}

for path in files:
    transcript = Transcript(path, './swda/swda-metadata.csv')

    # All utterances of a transcript share one root id: "<conversation_no>-0"
    root = str(transcript.conversation_no) + "-0"

    # Id of the latest utterance seen so far from each side of the call
    last_from_a = None
    last_from_b = None

    for utt in transcript.utterances:
        idx = str(utt.conversation_no) + "-" + str(utt.transcript_index)

        # Caller A is the "from" caller, anyone else is the "to" caller.
        # An utterance replies to the most recent utterance of the *other*
        # caller (None if the other caller has not spoken yet).
        if 'A' in utt.caller:
            last_from_a = idx
            user = str(transcript.from_caller)
            reply_to = last_from_b
        else:
            last_from_b = idx
            user = str(transcript.to_caller)
            reply_to = last_from_a

        # Act-tag, POS and parse-tree annotations travel as utterance metadata
        meta = {
            'tag': utt.act_tag,
            'pos': utt.pos,
            'trees': utt.trees,
        }

        # Switchboard has no timestamps, hence None in the timestamp slot
        utterance_corpus[idx] = Utterance(
            idx, corpus_users[user], root, reply_to, None, utt.text, meta
        )

In [7]:
# Corpus() takes a plain list of utterances; the ids are no longer needed as keys.
utterance_list = list(utterance_corpus.values())

Check number of utterances in the dataset

In [8]:
# Sanity check: number of utterances collected across all transcripts.
utterance_count = len(utterance_corpus)
print("Number of utterances in the data = {}".format(utterance_count))

Number of utterances in the data = 221616


In [9]:
# Display one converted Utterance object (id '4325-2') as an example.
example_utterance = utterance_corpus['4325-2']
example_utterance

Utterance({'id': '4325-2', 'user': User([('name', '1519')]), 'root': '4325-0', 'reply_to': '4325-1', 'timestamp': None, 'text': '[ [ I guess, +', 'meta': {'tag': 'qy^d', 'pos': '[ I/PRP ] guess/VBP ,/,', 'trees': [Tree('S', [Tree('EDITED', [Tree('RM', [Tree('-DFL-', ['\\['])]), Tree('EDITED', [Tree('RM', [Tree('-DFL-', ['\\['])]), Tree('S', [Tree('NP-SBJ', [Tree('PRP', ['I'])]), Tree('VP-UNF', [Tree('VBP', ['guess'])])]), Tree(',', [',']), Tree('IP', [Tree('-DFL-', ['\\+'])])]), Tree('S', [Tree('NP-SBJ', [Tree('PRP', ['I'])]), Tree('VP-UNF', [Tree('VBP', ['think'])])]), Tree(',', [',']), Tree('RS', [Tree('-DFL-', ['\\]'])]), Tree('IP', [Tree('-DFL-', ['\\+'])])]), Tree('INTJ', [Tree('UH', ['uh'])]), Tree(',', [',']), Tree('NP-SBJ', [Tree('PRP', ['I'])]), Tree('VP', [Tree('VBP', ['wonder']), Tree('RS', [Tree('-DFL-', ['\\]'])]), Tree('SBAR', [Tree('IN', ['if']), Tree('S', [Tree('NP-SBJ', [Tree('DT', ['that'])]), Tree('VP', [Tree('VBD', ['worked'])])])])]), Tree('.', ['.']), Tree('-DFL-'

#### Create corpus from list of utterances

In [10]:
# Assemble the corpus from the utterance list and report how many
# conversations it contains.
switchboard_corpus = Corpus(utterances=utterance_list, version=1)
conversation_ids = switchboard_corpus.get_conversation_ids()
print("number of conversations in the dataset = {}".format(len(conversation_ids)))

number of conversations in the dataset = 1155


#### Update corpus-level metadata

In [11]:
# Gather per-conversation information from each transcript, keyed by the
# conversation number, and store it in the corpus-level metadata.
switchboard_meta = {}
for path in files:
    transcript = Transcript(path, './swda/swda-metadata.csv')
    switchboard_meta[transcript.conversation_no] = {
        'filename': transcript.ptd_basename,
        'talk_day': transcript.talk_day,
        'topic_description': transcript.topic_description,
        'length': transcript.length,
        'prompt': transcript.prompt,
    }

switchboard_corpus.meta['metadata'] = switchboard_meta
switchboard_corpus.meta['name'] = "The Switchboard Dialog Act Corpus"

In [12]:
# Show the metadata stored for conversation 4325 as an example.
conversation_meta = switchboard_corpus.meta['metadata']
conversation_meta[4325]

{'filename': '4/sw4325',
 'talk_day': datetime.datetime(1992, 3, 23, 0, 0),
 'topic_description': 'CHILD CARE',
 'length': 5,
 'prompt': 'FIND OUT WHAT CRITERIA THE OTHER CALLER WOULD USE IN SELECTING CHILD CARE SERVICES FOR A PRESCHOOLER.  IS IT EASY OR DIFFICULT TO FIND SUCH CARE?'}

#### Save created corpus

In [13]:
# Write the corpus to disk under the name "corpus", relative to the current directory.
switchboard_corpus.dump(
    "corpus",
    base_path="./",
)

Check if available info from dataset can be viewed directly

In [14]:
from convokit import meta_index

# Read the index of the saved corpus and display it as the cell output.
corpus_index = meta_index(filename="./corpus")
corpus_index

{'utterances-index': {'tag': "<class 'str'>",
  'pos': "<class 'str'>",
  'trees': "<class 'list'>"},
 'users-index': {'sex': "<class 'str'>",
  'education': "<class 'int'>",
  'birth_year': "<class 'int'>",
  'dialect_area': "<class 'str'>"},
 'conversations-index': {},
 'overall-index': {'metadata': 'bin', 'name': "<class 'str'>"},
 'version': 1}

In [15]:
# Reload the saved corpus from disk and print its summary statistics to
# confirm that the dump round-trips.
test_corpus = Corpus(
    filename="./corpus",
)
test_corpus.print_summary_stats()

Number of Users: 440
Number of Utterances: 221616
Number of Conversations: 1155
