In [10]:
import json
import os

In [27]:
def get_start_end(d):
    """Return the ``(start, end)`` step pair described by record ``d``.

    ``start`` is ``d['start_step']`` when that key is present, otherwise
    ``d['step']``.  ``end`` is ``d['end_step']`` when that key is present,
    otherwise ``start + 1``.

    Raises:
        KeyError: if ``d`` has neither ``'start_step'`` nor ``'step'``.
    """
    if 'start_step' in d:
        first = d['start_step']
    else:
        first = d['step']

    if 'end_step' in d:
        return first, d['end_step']
    # No explicit end: the record covers exactly one step.
    return first, first + 1
    
def get_steps(word_data, state_data):
    """Find the overall step range for every (participant, task) pair.

    Each record in ``word_data`` and ``state_data`` is read for its
    ``'participant'`` and ``'task'`` values, and for its step span as
    returned by ``get_start_end``.

    Returns:
        A dict mapping ``(participant, task)`` to a two-element list
        ``[smallest start, largest end]`` taken over both inputs.
    """
    bounds = {}
    for source in (word_data, state_data):
        for key in source:
            record = source[key]
            pair = (record['participant'], record['task'])
            if pair not in bounds:
                # Sentinels that any real step value will replace.
                bounds[pair] = [float('inf'), -float('inf')]
            lo, hi = get_start_end(record)
            span = bounds[pair]
            span[0] = min(lo, span[0])
            span[1] = max(hi, span[1])
    return bounds

In [28]:
def get_aligned(steps, word_data, state_data):
    """Lay word and state records out on a per-step timeline.

    Parameters:
        steps: dict mapping ``(participant, task)`` to ``[min_step, max_step]``
            (the structure returned by ``get_steps``).
        word_data: dict of word records; each record has ``'participant'``
            and ``'task'`` plus the step keys understood by ``get_start_end``.
        state_data: dict of state records with the same keys.

    Returns:
        A dict mapping ``(participant, task)`` to a list of two-element slots
        ``[word_record_or_None, state_record_or_None]``.  The list has
        ``max_step - min_step + 1`` slots and slot ``i`` corresponds to step
        ``min_step + i``.  When several records cover the same step, the one
        iterated last wins.
    """
    aligned = {}

    # NOTE(review): record ends are exclusive (``range(s, e)`` below), so when
    # ``steps`` was computed from this same data the final slot created by
    # ``mx + 1`` is never filled here -- confirm the extra slot is intended.
    for key in steps:
        mn, mx = steps[key]
        aligned[key] = [[None, None] for _ in range(mn, mx + 1)]

    # Column 0 of each slot holds the word record, column 1 the state record.
    # Both kinds of record go through get_start_end so that a word record
    # carrying only 'step' (which get_steps already accepts) is handled the
    # same way as a state record instead of raising KeyError.
    for column, data in enumerate((word_data, state_data)):
        for k in data:
            d = data[k]
            first_step = steps[(d['participant'], d['task'])][0]
            s, e = get_start_end(d)
            slots = aligned[(d['participant'], d['task'])]
            for i in range(s, e):
                slots[int(i - first_step)][column] = d

    return aligned

In [38]:
def write_aligned(fname, aligned, steps, objs=False):
    """Write the aligned timeline to ``fname`` as tab-separated text.

    Parameters:
        fname: path of the file to create (overwritten if it exists).
        aligned: dict mapping ``(participant, task)`` to a list of
            ``[word_record_or_None, state_record_or_None]`` slots.
        steps: dict mapping ``(participant, task)`` to a two-element
            sequence whose first item is the step number of slot 0.
        objs: when true, the ``start_obj`` / ``end_obj`` columns are taken
            from the state record; otherwise they are written as ``NA``.

    The first line is a header.  Each following line has the columns
    participant, task, step, word, lemma, pos, z, start_obj, end_obj, with
    ``NA`` for anything missing.  The ``z`` column holds the state record's
    ``'z'`` value when present, else its ``'state'`` value.  Within one
    (participant, task), a slot whose word/state columns equal those of the
    previously written row is skipped, so only changes are emitted.
    """
    hdrs = ['participant', 'task', 'step', 'word',
            'lemma', 'pos', 'z', 'start_obj', 'end_obj']

    # ``with`` guarantees the file is closed even if a record is malformed.
    with open(fname, 'w') as out:
        # Terminate the header line; without the newline the first data row
        # would be appended to the header.
        out.write('\t'.join(hdrs) + '\n')

        for p, t in aligned:
            start, _ = steps[(p, t)]
            last = None
            for i, d in enumerate(aligned[(p, t)]):
                word, state = d[0], d[1]

                if word is not None:
                    word_lst = [word['word'], word['lemma'], word['pos']]
                else:
                    word_lst = ['NA', 'NA', 'NA']

                if state is not None:
                    label = state['z'] if 'z' in state else state['state']
                    state_lst = ['%s' % label]
                    if objs:
                        state_lst += ['%s' % state['start_obj'],
                                      '%s' % state['end_obj']]
                    else:
                        state_lst += ['NA', 'NA']
                else:
                    state_lst = ['NA', 'NA', 'NA']

                desc = '%s\t%s' % ('\t'.join(word_lst), '\t'.join(state_lst))
                # Collapse runs of identical consecutive rows.
                if desc != last:
                    out.write("%s\t%s\t%s\t%s\n" % (p, t, start + i, desc))
                    last = desc

In [40]:
raw_dir = 'raw_data/may/states'

# The word annotations are identical for every state file, so parse them once
# up front instead of re-reading the file on each loop iteration.
# NOTE(review): words are read from 'april' while states and outputs use
# 'may' -- confirm the mixed directories are intentional.
with open('raw_data/april/words.json') as word_file:
    words = json.load(word_file)

# Align the shared word data against each state file in raw_dir and write one
# output file per state file.
for f in os.listdir(raw_dir):
    print(f)
    with open(os.path.join(raw_dir, f)) as state_file:
        states = json.load(state_file)

    # splitext strips only the final extension, so a name that contains other
    # dots (e.g. a fractional hyperparameter) keeps its full stem.
    fname = os.path.splitext(f)[0]
    outfile = 'aligned_data/may/%s.txt'%fname

    steps = get_steps(words, states)
    aligned = get_aligned(steps, words, states)
    write_aligned(outfile, aligned, steps)

partial_states_kappa=10.json
partial_states_kappa=100.json
partial_states_kappa=1000.json
partial_states_kappa=200.json
partial_states_kappa=50.json
partial_states_kappa=500.json
partial_states_latentdim=2_beta=0_KMeans=100.json
partial_states_latentdim=2_beta=0_KMeans=20.json
partial_states_latentdim=2_beta=0_KMeans=200.json
partial_states_latentdim=2_beta=0_KMeans=50.json
partial_states_latentdim=3_beta=0_KMeans=100.json
partial_states_latentdim=3_beta=0_KMeans=20.json
partial_states_latentdim=3_beta=0_KMeans=200.json
partial_states_latentdim=3_beta=0_KMeans=50.json
partial_states_latentdim=4_beta=0_KMeans=100.json
partial_states_latentdim=4_beta=0_KMeans=20.json
partial_states_latentdim=4_beta=0_KMeans=200.json
partial_states_latentdim=4_beta=0_KMeans=50.json
