# Preprocess data by utterance, not by turn

The original SWBD corpus organizes its annotation by utterance; the SWBD NXT corpus uses turns. I have been using turns heretofore, but since the audio is organized by the original Switchboard system, I'm going to have to switch to using utterances. 

Utterances are also kept from becoming too long, so they will likely be easier to process together with the audio.

#### Step 1:

Find conversation numbers in SWBD NXT. SWBD NXT is a subset of SWBD, and we only want to find Kaldi feats for conversations that are in both. 

In [1]:
import os 
import xml.etree.ElementTree as ET

# NITE XML namespace in ElementTree's '{uri}' prefix form; prepend it to an
# attribute's local name (e.g. NITE + 'id') to look up namespaced attributes.
NITE = '{http://nite.sourceforge.net/}'
# Directory holding the '<conv>.<speaker>.phonwords.xml' files.
PHON_DIR = '/afs/inf.ed.ac.uk/group/corpora/large/switchboard/nxt/xml/phonwords'
# Directory holding the '<conv>.<speaker>.accents.xml' files.
ACC_DIR = '/afs/inf.ed.ac.uk/group/corpora/large/switchboard/nxt/xml/accent'
# Speaker sides; used to build the per-speaker file names above.
SPEAKERS = ['A','B']

In [2]:
# A conversation ID is the part of an accent-file name before the first '.'.
# Each conversation has one file per speaker, so a set comprehension is used
# to collapse the duplicates before sorting.
convs = sorted({fname.split('.')[0] for fname in os.listdir(ACC_DIR)})
convs[:3]

['sw2018', 'sw2060', 'sw2107']

#### Step 2: 

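For each conversation, iterate through the phonwords, grouping them by the `msstate` field (which identifies the utterance). Store them in a dictionary where key = the `msstate` utterance ID (e.g. `sw2018A-ms98-a-0001`, which encodes conversation, speaker, and utterance number) and value = a list of `(phonword, accent)` tuples, with accent = 1 if the word is referenced by an accent annotation and 0 otherwise. (Storing `(phonword, id)` tuples instead was also considered; the code below stores the accent label.)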

In [3]:
def reg_orth(orth):
    """Normalise a phonword's orthography into a plain lowercase token.

    Steps, in order:
      1. lowercase;
      2. strip leading/trailing '-', then '{', then '}' (three separate
         passes, so an inner '-' exposed by removing a brace is kept);
      3. unwrap laughter markup: '[laughter-word]' becomes 'word';
      4. replace a known '[mispronounced/intended]' token with the intended
         word.

    Args:
        orth: the raw 'orth' string.

    Returns:
        The normalised token; tokens matching no rule are returned as
        lowercased/stripped.
    """
    intended_word = {
        '[row/wow]': 'wow',
        '[mot/lot]': 'lot',
        '[trest/test]': 'test',
        '[adamnet/adapted]': 'adapted',
        '[storly/story]': 'story',
        '[unconvenient/inconvenient]': 'inconvenient',
        '[dib/bit]': 'bit',
        '[tack/talking]': 'talking',
        '[banding/banning]': 'banning',
        '[ruther/rather]': 'rather',
        '[shrip/shrimp]': 'shrimp',
    }
    laughter_prefix = '[laughter-'

    token = orth.lower().strip('-').strip('{').strip('}')
    if laughter_prefix in token:
        # Drop the opening marker and every closing bracket.
        token = token.replace(laughter_prefix, '').replace(']', '')

    return intended_word.get(token, token)


In [4]:
# Maps each utterance ID (a phonword's 'msstate' value) to the list of
# (normalised orthography, accent flag) pairs of its words, in file order.
utterances = {}

for conv in convs:
    for sp in SPEAKERS:
        # IDs of every phonword pointed to by an <accent> element on this side.
        # The ID is the text between the last '(' and the final character of
        # the child's 'href' attribute.
        accent_path = os.path.join(ACC_DIR, '.'.join((conv, sp, 'accents', 'xml')))
        accented_ids = {
            pointer.attrib['href'].split('(')[-1][:-1]
            for accent in ET.parse(accent_path).getroot().findall('accent')
            for pointer in accent
        }

        phon_path = os.path.join(PHON_DIR, '.'.join((conv, sp, 'phonwords', 'xml')))
        for pw in ET.parse(phon_path).getroot().findall('phonword'):
            token = reg_orth(pw.attrib['orth'])
            accent_flag = 1 if pw.attrib[NITE + 'id'] in accented_ids else 0
            utt_id = pw.attrib['msstate']
            if utt_id == '':
                # TODO Fix this to salvage stuff that doesn't have an msstate value
                continue
            utterances.setdefault(utt_id, []).append((token, accent_flag))
    
#utterances            

#### Step 3: Print out file for training

Format for each line:

`utterance_id <TAB> kaldi_handle <TAB> tokens <TAB> labels`
    
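The Kaldi handle is the key used in `feats.scp`. It is formatted slightly differently from the utterance ID and uses the utterance's start and end timestamps (in hundredths of a second, zero-padded to six digits) instead of the utterance number; e.g. `sw2018A-ms98-a-0037` becomes `sw02018-A_017785-017911`.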

In [5]:
swbd_dir = '/afs/inf.ed.ac.uk/group/corpora/large/switchboard/switchboard1/transcriptions/swb_ms98_transcriptions'

# Parsed transcripts keyed by file path, so each transcript file is read once
# instead of once per utterance. Entries are never refreshed: if a transcript
# changes on disk during the session, clear this dict.
_TRANSCRIPT_TIMES = {}


def _load_utt_times(transcript_path):
    """Return ``{utterance_id: (start, end)}`` for one transcript file.

    Each line is split on whitespace; the first three fields are taken as the
    utterance ID, start time and end time (times are kept as strings). Lines
    with fewer than three fields (e.g. blank lines) are skipped.
    """
    cached = _TRANSCRIPT_TIMES.get(transcript_path)
    if cached is not None:
        return cached

    time_dict = {}
    with open(transcript_path, 'r') as f:
        for line in f:
            fields = line.split()[:3]
            if len(fields) < 3:
                continue
            time_dict[fields[0]] = (fields[1], fields[2])

    _TRANSCRIPT_TIMES[transcript_path] = time_dict
    return time_dict


def kaldify_utt(utt, transcript_root=None):
    """Convert an utterance ID into the matching Kaldi handle.

    Example: ``'sw2018A-ms98-a-0037'`` -> ``'sw02018-A_017785-017911'``.

    The start and end times are looked up in the transcript file
    ``<root>/<utt[2:4]>/<utt[2:6]>/<conv+speaker>-ms98-a-trans.text``.

    Args:
        utt: utterance ID such as ``'sw2018A-ms98-a-0037'``. The slicing
            assumes 'sw' followed by a four-digit conversation number and a
            one-letter speaker side before the first '-'.
        transcript_root: directory containing the transcript tree; defaults
            to the module-level ``swbd_dir``.

    Returns:
        ``'sw0<conversation>-<speaker>_<start>-<end>'``, with both times
        formatted by ``kaldify_time``.

    Raises:
        OSError: if the transcript file cannot be opened.
        KeyError: if ``utt`` is not listed in the transcript file.
    """
    root = swbd_dir if transcript_root is None else transcript_root

    subdir = utt[2:4]
    conversation = utt[2:6]
    conv_sp = utt.split('-')[0]
    sp = conv_sp[-1]

    filename = '-'.join([conv_sp, 'ms98', 'a', 'trans']) + '.text'
    transcript_path = os.path.join(root, subdir, conversation, filename)

    start_time, end_time = _load_utt_times(transcript_path)[utt]
    start = kaldify_time(start_time)
    end = kaldify_time(end_time)
    return 'sw0' + conversation + '-' + sp + '_' + start + '-' + end

def kaldify_time(time):
    """Format a timestamp in seconds as the time field of a Kaldi handle.

    The time is converted to hundredths of a second, rounded to an integer
    and left-padded with zeros to at least six characters
    (e.g. ``'177.85'`` -> ``'017785'``).

    Args:
        time: timestamp in seconds, as a string or number accepted by
            ``float()``.

    Returns:
        The zero-padded centisecond string.
    """
    # Round half up rather than using round(): the built-in rounds exact
    # halves to the nearest even integer, so 1.125 s would become 112 instead
    # of 113 and the resulting handle would not match the feats.scp key.
    # NOTE(review): assumes feats.scp keys come from the standard Kaldi swbd
    # data prep, which computes int(100*t + 0.5) -- confirm against the recipe.
    centiseconds = int(float(time) * 100 + 0.5)
    return str(centiseconds).rjust(6, '0')
    

kaldify_utt('sw2018A-ms98-a-0037')

'sw02018-A_017785-017911'

In [6]:
datafile = 'data/utterances.txt'

# One line per utterance:
#   utterance_id <TAB> kaldi_handle <TAB> space-joined tokens <TAB> space-joined labels
with open(datafile, 'w') as out:
    for utt_id, pairs in utterances.items():
        words = ' '.join(word for word, flag in pairs)
        flags = ' '.join(str(flag) for word, flag in pairs)
        fields = (utt_id, kaldify_utt(utt_id), words, flags)
        out.write('\t'.join(fields) + '\n')

In [7]:
datafile = 'data/utterances_text_only.txt'

# Text-only variant, one line per utterance:
#   space-joined tokens <TAB> space-joined labels
# The Kaldi handle is not part of this format, so no transcript lookup is
# performed here.
with open(datafile, 'w') as f:
    for pairs in utterances.values():
        tokens = [tup[0] for tup in pairs]
        labels = [str(tup[1]) for tup in pairs]
        f.write(' '.join(tokens) + '\t' + ' '.join(labels) + '\n')

In [8]:
utterances

{'sw2018A-ms98-a-0001': [('hello', 1), ('this', 1), ('is', 0), ('lois', 1)],
 'sw2018A-ms98-a-0003': [('and', 1),
  ('um', 0),
  ('i', 1),
  ('called', 1),
  ('you', 0),
  ('know', 1),
  ('from', 0),
  ('that', 0),
  ('the', 1),
  ('the', 0),
  ('ti', 1),
  ('database', 1),
  ('calling', 1),
  ('instructions', 1)],
 'sw2018A-ms98-a-0005': [('yeah', 0),
  ('this', 1),
  ('is', 0),
  ('about', 1),
  ('changes', 1),
  ('in', 0),
  ('women', 1),
  ('in', 0),
  ('the', 0)],
 'sw2018A-ms98-a-0006': [('uh', 0),
  ("there's", 0),
  ('really', 1),
  ('a', 0),
  ('lot', 1),
  ("isn't", 1),
  ('there', 1),
  ('i', 1),
  ('mean', 0),
  ('there', 0),
  ('really', 1),
  ('is', 1)],
 'sw2018A-ms98-a-0008': [('oh', 0),
  ('i', 0),
  ('guess', 1),
  ('the', 0),
  ('work', 1),
  ('force', 0),
  ('would', 0),
  ('be', 1),
  ('the', 0),
  ('main', 1),
  ("wouldn't", 1),
  ('it', 0),
  ('it', 1),
  ('uh', 0)],
 'sw2018A-ms98-a-0010': [('okay', 1)],
 'sw2018A-ms98-a-0012': [('all', 1),
  ('right', 0),
  ('a