# Data preparation

#### Current format: 
Tokens each have an xml entry in their phonwords file. 
Accents each have an xml entry in their accents file.
Turns have are in their own files, and are recorded as time spans. The time span of a phonword is somewhere inside the time span of a turn.

Note: Each file is per-speaker, not per-turn or per-conversation.

#### Desired format:
One turn per line, whitespace-separated tokens, tab, binary string with 1 for nuclear pitch accent.

##### Find files:

In [57]:
import os

data_dir = '/afs/inf.ed.ac.uk/group/corpora/large/switchboard/nxt/xml'


dialog_num = 'sw2018'
users = ('A','B')
turn_files = [os.path.join(data_dir,'turns','.'.join([dialog_num,user,'turns','xml'])) for user in users]
acc_files = [os.path.join(data_dir,'accent','.'.join([dialog_num,user,'accents','xml'])) for user in users]
wd_files = [os.path.join(data_dir,'phonwords','.'.join([dialog_num,user,'phonwords','xml'])) for user in users]

In [82]:
import numpy as np
import xml.etree.ElementTree as ET
nite = '{http://nite.sourceforge.net/}'

accent_dict = {'nuclear':1,
               'plain':0,
               'pre-nuclear':0}

words = []
words_i = []
ids = []
times = []
speaker = []
wd_to_i = {}
i_to_wd = {}
id_to_acc = {}
counter = 0
for i,wd_file in enumerate(wd_files):
    if os.path.exists(acc_files[i]): # only pay attention to ones that have accent files
        wd_tree = ET.parse(wd_file)
        wd_root = wd_tree.getroot()
        for phonword in wd_root.findall('phonword'):
            orth = phonword.attrib['orth']
            id_num = phonword.attrib[nite+'id']
            start_time = phonword.attrib[nite+'start']
            if not orth in wd_to_i:
                wd_to_i[orth] = counter
                i_to_wd[counter] = orth
                counter += 1
            words.append(wd_to_i[orth])
            ids.append(id_num) # TODO since these ids are unique, I can make a lookup table for them too for speed
            times.append(start_time)
            if i == 0:
                speaker.append('A')
            else:
                speaker.append('B')
                
        acc_tree = ET.parse(acc_files[i])
        acc_root = acc_tree.getroot()
        for acc in acc_root.findall('accent'):
            for chld in acc:
                acc_id = chld.attrib['href'].split('(')[-1][:-1]
                id_to_acc[acc_id]=accent_dict[acc.attrib['type']]
    else:
        print('no accent file found')
        
            
print([i_to_wd[i] for i in words[:5]])


['hello', 'this', 'is', 'Lois', 'and']


### Make np array of accents

In [113]:
words = np.array(words)
times = np.array(times) 
ordering = np.argsort(times)

ordering

array([524,   0, 525, ..., 703, 142, 143])

In [99]:
accents = np.zeros(words.shape)

for i in range(words.shape[0]):
    id_num = ids[i]
    if id_num in id_to_acc:
        accents[i] = id_to_acc[id_num]
    else:
        accents[i] = 0
    
accents

array([1., 0., 0., ..., 1., 0., 1.])

### Iterate through turns, writing the final form out turn by turn

In [112]:
turns_a = [(child.attrib[nite+'id'], child.attrib[nite+'start'], child.attrib[nite+'end']) for child in ET.parse(turn_files[0]).getroot()]
turns_b = [(child.attrib[nite+'id'], child.attrib[nite+'start'], child.attrib[nite+'end']) for child in ET.parse(turn_files[1]).getroot()]
turns_b[0]

('t1', '0.29625', '0.76')

In [None]:
for i in range(words.shape[0]):
    