This notebook reads the transcripts, extracts routines from them with dialign, and exports the resulting routine, utterance and token tables.

In [1]:
import pathlib as pl
import re
import subprocess

import numpy as np
import pandas as pd
from spacy.lang.en import English

from read_utils import read_tables

In [2]:
def natural_sort(l):
    """Return the strings in `l` as a new list sorted in natural order.

    Runs of ASCII digits are compared by their integer value and all other
    text is compared case-insensitively, so 'a2' sorts before 'a10'.
    """
    def alphanum_key(key):
        # The capturing group makes re.split keep the digit runs, giving
        # alternating text / number chunks for each string.
        return [int(chunk) if chunk.isdigit() else chunk.lower()
                for chunk in re.split('([0-9]+)', key)]

    return sorted(l, key=alphanum_key)

## Define paths.

In [3]:
# Inputs.
data_dir = pl.Path('../data')
transcripts_dir = data_dir.joinpath('transcripts')

dialign_dir = pl.Path('dialign-1.0')
dialign_jar_file = dialign_dir.joinpath('dialign.jar')

output_dir = pl.Path('../outputs')
interm_dir = output_dir.joinpath('intermediate')
task_features_file = interm_dir.joinpath(
    'log_features/justhink19_log_features_task_level.csv')

# Outputs.
processed_data_dir = pl.Path('../processed_data')
dialign_inputs_dir = processed_data_dir.joinpath('dialign_inputs')
dialign_outputs_dir = processed_data_dir.joinpath('dialign_outputs')
routines_dir = processed_data_dir.joinpath('routines')
utterances_dir = processed_data_dir.joinpath('utterances')
tokens_dir = processed_data_dir.joinpath('tokens')

# Create any output directory that does not exist yet.
dirs = [
    dialign_inputs_dir, dialign_outputs_dir,
    routines_dir,
    utterances_dir, tokens_dir,
]
for d in dirs:
    if not d.exists():
        # parents=True so that a fresh checkout, where processed_data_dir
        # itself is missing, does not fail with FileNotFoundError.
        d.mkdir(parents=True, exist_ok=True)
        print('Created {}'.format(d))


# Summary metric tables in the dialign output directory
# (presumably written by dialign itself; verify the file names against
# the dialign version in use).
synthesis_dep_file = dialign_outputs_dir.joinpath(
    'metrics-speaker-dependent.tsv')
# This previously repeated the speaker-dependent file name.
synthesis_indep_file = dialign_outputs_dir.joinpath(
    'metrics-speaker-independent.tsv')

## Define task-specific referents.

In [4]:
# Words that name the nodes; these are the task-specific referents.
node_words = {
    'basel', 'luzern', 'zurich', 'bern', 'zermatt',
    'interlaken', 'montreux', 'neuchatel', 'gallen', 'davos',
}

# The node names are the only task words; note that this binds the same
# set object, not a copy.
task_words = node_words

print(len(task_words), sorted(task_words))

10 ['basel', 'bern', 'davos', 'gallen', 'interlaken', 'luzern', 'montreux', 'neuchatel', 'zermatt', 'zurich']


## Load data.

### Read transcripts.

In [5]:
# Load the transcript tables found in transcripts_dir. The result is used
# below as a mapping from team number to that team's transcript DataFrame.
transcript_dfs = read_tables(transcripts_dir, form='transcript')

Reading transcript files from ../data/transcripts.
transcript 10 files found.
File justhink19_transcript_07 belongs to team  7
File justhink19_transcript_08 belongs to team  8
File justhink19_transcript_09 belongs to team  9
File justhink19_transcript_10 belongs to team 10
File justhink19_transcript_11 belongs to team 11
File justhink19_transcript_17 belongs to team 17
File justhink19_transcript_18 belongs to team 18
File justhink19_transcript_20 belongs to team 20
File justhink19_transcript_28 belongs to team 28
File justhink19_transcript_47 belongs to team 47
Transcript of  7 has  639 utterances
Transcript of  8 has  669 utterances
Transcript of  9 has  810 utterances
Transcript of 10 has  469 utterances
Transcript of 11 has  567 utterances
Transcript of 17 has  325 utterances
Transcript of 18 has  359 utterances
Transcript of 20 has  507 utterances
Transcript of 28 has  348 utterances
Transcript of 47 has  396 utterances


### Refine task and transcript durations.

#### Compute speaking durations (transcript end times) from transcripts.

In [6]:
# End time of each team's transcript: the earliest start of an utterance
# marked '(omitted)' if there is one, otherwise the end of the last row.
end_times = dict()

for team_no, transcript_df in transcript_dfs.items():
    omitted = transcript_df[transcript_df['utterance'] == '(omitted)']
    if len(omitted) == 0:
        end_times[team_no] = transcript_df.iloc[-1]['end']
    else:
        end_times[team_no] = omitted['start'].min()

end_times

{7: 1573.338,
 8: 1570.4229999999998,
 9: 2160.63,
 10: 1384.955,
 11: 1580.5079999999998,
 17: 1075.073,
 18: 1804.0870000000002,
 20: 1155.259,
 28: 669.247,
 47: 1178.847}

#### Print the total transcribed duration in hours.

In [7]:
# Per-team end times converted from seconds to hours, then totalled.
# The two-step division is kept so the floating-point result is unchanged.
values = [seconds / 60 / 60 for seconds in end_times.values()]
sum(values)

3.9312130555555553

### Slice the transcripts by their inferred duration. 
Sometimes the talk continues after the task ends, and some of that talk was also transcribed; we omit it.
This happens specifically when the team fails, i.e. the time is up and we intervene.

In [8]:
# Keep only the utterances that end no later than the team's end time.
# Re-assigning existing keys while iterating does not resize the dict.
for team_no, transcript_df in transcript_dfs.items():
    within_task = transcript_df['end'] <= end_times[team_no]
    transcript_dfs[team_no] = transcript_df[within_task]
    
# # A quick check.
# transcript_dfs[7].tail(), end_times[7]

## Generate inputs for dialign to extract routines.

### Define a tokeniser. 
Create a tokeniser with the default settings for English, including punctuation rules and exceptions.

In [9]:
# Blank English pipeline: default punctuation rules and tokenizer exceptions.
nlp = English()
# Use the tokenizer the pipeline has already built for itself. It is the
# same default tokenizer, but unlike Defaults.create_tokenizer (spaCy v2
# only) this attribute is available in both spaCy v2 and v3.
tokeniser = nlp.tokenizer

Define a function that tokenises the utterances for dialign (as per the dialign input format).

In [10]:
def tokenise_utterances(df, tokeniser):
    """Return a copy of `df` whose 'utterance' column is tokenised.

    Each utterance is passed to `tokeniser`, and the `.text` of the
    resulting tokens is joined with single spaces. The input table is
    left unmodified.
    """
    tokenised_df = df.copy()
    tokenised_df['utterance'] = [
        ' '.join(token.text for token in tokeniser(utterance))
        for utterance in df['utterance']
    ]
    return tokenised_df

### Define an exporter for dialign. 

In [11]:
def export_for_dialign(df, file):
    """Write `df` to `file`, one '<interlocutor>\\t<utterance>' line per row.

    Rows are written in table order; `file` is overwritten if it exists.
    """
    speaker_turns = zip(df['interlocutor'], df['utterance'])
    with open(str(file), 'w') as out:
        for interlocutor, utterance in speaker_turns:
            print('{}\t{}'.format(interlocutor, utterance), file=out)

### Rework the transcripts for dialign: obtain simpler transcripts (tokenised and interlocutors A & B only).

In [12]:
print('Reworking the transcripts to input into dialign...')
utterance_dfs = dict()
for team_no, transcript_df in transcript_dfs.items():
    # Only the utterances of interlocutors A and B are kept.
    is_a_or_b = transcript_df['interlocutor'].isin(['A', 'B'])

    # Tokenising returns a copy, so the transcript itself is untouched.
    simple_df = tokenise_utterances(transcript_df[is_a_or_b], tokeniser)

    # Renumber the remaining utterances from zero.
    utterance_dfs[team_no] = simple_df.assign(
        utterance_no=range(len(simple_df)))

print('Done!')

Reworking the transcripts to input into dialign...
Done!


### Export the transcripts in dialign input format.

In [13]:
print('Exporting the transcripts in dialign input format...')

for team_no, utterance_df in utterance_dfs.items():
    # One input file per team, in the dialign inputs directory.
    dialogue_file = dialign_inputs_dir.joinpath(
        'justhink19_dialogue_{:02d}.tsv'.format(team_no))

    export_for_dialign(utterance_df, dialogue_file)

    print('Written for team {:2d} to {}'.format(team_no, dialogue_file))

print('Done!')

Exporting the transcripts in dialign input format...
Written for team  7 to ../processed_data/dialign_inputs/justhink19_dialogue_07.tsv
Written for team  8 to ../processed_data/dialign_inputs/justhink19_dialogue_08.tsv
Written for team  9 to ../processed_data/dialign_inputs/justhink19_dialogue_09.tsv
Written for team 10 to ../processed_data/dialign_inputs/justhink19_dialogue_10.tsv
Written for team 11 to ../processed_data/dialign_inputs/justhink19_dialogue_11.tsv
Written for team 17 to ../processed_data/dialign_inputs/justhink19_dialogue_17.tsv
Written for team 18 to ../processed_data/dialign_inputs/justhink19_dialogue_18.tsv
Written for team 20 to ../processed_data/dialign_inputs/justhink19_dialogue_20.tsv
Written for team 28 to ../processed_data/dialign_inputs/justhink19_dialogue_28.tsv
Written for team 47 to ../processed_data/dialign_inputs/justhink19_dialogue_47.tsv
Done!


## Run dialign.

In [14]:
# Human-readable form of the command, printed for the record.
cmd = 'java -jar {} -i {} -o {}'.format(
    dialign_jar_file.resolve(),
    dialign_inputs_dir.resolve(),
    dialign_outputs_dir.resolve())
print(cmd)

# The same command as an argument list: it is run without a shell, so
# paths containing spaces or shell metacharacters reach java intact.
dialign_args = [
    'java', '-jar', str(dialign_jar_file.resolve()),
    '-i', str(dialign_inputs_dir.resolve()),
    '-o', str(dialign_outputs_dir.resolve()),
]

print('Running for dialogues...')
dialign_run = subprocess.run(
    dialign_args,
    stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
    universal_newlines=True)
# Show whatever the process wrote (stdout and stderr interleaved).
if dialign_run.stdout:
    print(dialign_run.stdout, end='')
# Stop here on a non-zero exit status (raises CalledProcessError) instead
# of failing later when the expected output files cannot be read.
dialign_run.check_returncode()
print('Done!')

java -jar /home/utku/playground/justhink-dialogue-and-actions-corpus/tools/dialign-1.0/dialign.jar -i /home/utku/playground/justhink-dialogue-and-actions-corpus/processed_data/dialign_inputs -o /home/utku/playground/justhink-dialogue-and-actions-corpus/processed_data/dialign_outputs
Running for dialogues...
Done!


## Read routine tables.
i.e. shared expression lexicons as termed by dialign.

In [15]:
routine_dfs = dict()

for team_no in sorted(transcript_dfs):
    # The routine table written into the dialign output directory
    # for this team's dialogue file.
    routine_file = dialign_outputs_dir.joinpath(
        'justhink19_dialogue_{:02d}_tsv-lexicon.tsv'.format(team_no))
    df = pd.read_csv(str(routine_file), sep='\t')
    print('Read for team {:02d}: {} routines'.format(team_no, len(df)))

    # Count how many distinct task words occur among the tokens of each
    # routine's surface form.
    referent_counts = list()
    for surface_form in df['Surface Form']:
        surface_tokens = [t.text for t in tokeniser(surface_form)]
        referent_counts.append(
            sum(1 for word in task_words if word in surface_tokens))
    df.insert(3, 'task_spec_referent_count', referent_counts)

    # Parse the comma-separated 'Turns' string into a list of
    # integer utterance numbers.
    df['utterances'] = [list(map(int, turns.split(', ')))
                        for turns in df['Turns']]

    routine_dfs[team_no] = df

# # Example/debugging.
# team_no = 18
# routine_dfs[team_no].head(3)

Read for team 07: 384 routines
Read for team 08: 420 routines
Read for team 09: 533 routines
Read for team 10: 226 routines
Read for team 11: 371 routines
Read for team 17: 149 routines
Read for team 18: 149 routines
Read for team 20: 287 routines
Read for team 28: 194 routines
Read for team 47: 223 routines


## Filter for routines with task-specific referents.

In [16]:
# Keep only the routines that mention at least one task word; the helper
# count column is dropped afterwards (drop returns a new table).
for team_no in routine_dfs:
    counted_df = routine_dfs[team_no]
    has_referent = counted_df['task_spec_referent_count'] > 0
    routine_dfs[team_no] = counted_df[has_referent].drop(
        columns='task_spec_referent_count')

## Rework routine instances with token positions.

### Construct token tables.

In [17]:
token_dfs = dict()
for team_no, utterance_df in utterance_dfs.items():
    # Attach each utterance's whitespace-separated tokens as a list, then
    # give every token its own row (other column values are repeated).
    per_token_df = utterance_df.assign(
        token=[u.split() for u in utterance_df['utterance']]
    ).explode('token')

    # Number the token rows consecutively across the whole dialogue.
    per_token_df.insert(2, 'token_no', range(len(per_token_df)))

    token_dfs[team_no] = per_token_df

df = token_dfs[7].copy()
df.head()

Unnamed: 0,team_no,utterance_no,token_no,start,end,interlocutor,utterance,token
1,7,0,0,42.525,43.004,A,okay .,okay
1,7,0,1,42.525,43.004,A,okay .,.
2,7,1,2,47.732,48.615,A,i am doing this .,i
2,7,1,3,47.732,48.615,A,i am doing this .,am
2,7,1,4,47.732,48.615,A,i am doing this .,doing


In [18]:
def find_sub_list(sl, l):
    """Locate every occurrence of the list `sl` inside the list `l`.

    Returns a list of (first_index, last_index) pairs, both inclusive, in
    order of appearance. Overlapping occurrences are all reported.
    Adapted from https://stackoverflow.com/a/17870684
    """
    width = len(sl)
    matches = []
    for start, element in enumerate(l):
        # Cheap first-element check before comparing the whole slice.
        if element == sl[0] and l[start:start + width] == sl:
            matches.append((start, start + width - 1))

    return matches

# # Try.
# greeting = ['hello', 'my', 'name', 'is', 'bob',
#             'how', 'are', 'you', 'my', 'name', 'is']
# print(find_sub_list(['my', 'name', 'is'], greeting))

### Find routine expressions' subutterance numbers from utterance numbers.

In [19]:
def get_start_indices(subutterance, u_no, u_df, t_df=None):
    """Find the token numbers at which a routine starts in one utterance.

    Parameters
    ----------
    subutterance : str
        The routine expression, as whitespace-separated tokens.
    u_no : int
        Number of the utterance to search in.
    u_df : pandas.DataFrame
        Utterance table with 'utterance_no' and 'utterance' columns.
    t_df : pandas.DataFrame, optional
        Token table with 'utterance_no' and 'token_no' columns. When
        omitted, the module-level variable `t_df` is used, which is what
        callers that pass only three arguments rely on.

    Returns
    -------
    list
        The 'token_no' of the first token of each occurrence of the
        expression in the utterance, in order of appearance.

    Raises
    ------
    AssertionError
        If `u_no` does not match exactly one row of `u_df`, or if the
        expression does not occur in that utterance.
    """
    if t_df is None:
        # Backward compatibility: fall back to the token table that the
        # calling cell leaves in the module namespace.
        t_df = globals()['t_df']

    # Find the utterance (row) with that utterance no; it must be unique.
    utterance_rows = u_df[u_df.utterance_no == u_no]
    if len(utterance_rows) != 1:
        raise AssertionError(
            'Expected exactly one utterance at {}, found {}'.format(
                u_no, len(utterance_rows)))
    utterance = utterance_rows.iloc[0]['utterance']

    # Find the occurrences of the routine in the utterance, token-wise.
    indices = find_sub_list(subutterance.split(), utterance.split())
    if len(indices) == 0:
        raise AssertionError(
            'Could not find subutterance "{}" at utterance "{}" ({})'.format(
                subutterance, utterance, u_no))

    # Token number of the first token of this utterance.
    offset = t_df[t_df.utterance_no == u_no].iloc[0]['token_no']

    # Shift the within-utterance start positions by that offset.
    return [start + offset for start, _ in indices]

In [24]:
for team_no, routine_df in routine_dfs.items():
    print('Finding routine token indices for team {:2d}'.format(
        team_no))

    u_df = utterance_dfs[team_no]
    # This module-level name must stay `t_df`: get_start_indices is called
    # below without a token table and reads `t_df` from the notebook
    # namespace.
    t_df = token_dfs[team_no]

    tokens_list, priming_list, establish_list = [], [], []
    for _, routine in routine_df.iterrows():
        surface_form = routine['Surface Form']
        turn_nos = routine['utterances']

        # Start tokens of every occurrence, over all of the routine's turns.
        occurrence_tokens = list()
        for turn_no in turn_nos:
            occurrence_tokens.extend(
                get_start_indices(surface_form, turn_no, u_df))
        tokens_list.append(occurrence_tokens)

        # Priming token: first occurrence in the first listed utterance.
        priming_list.append(
            get_start_indices(surface_form, turn_nos[0], u_df)[0])

        # Establishment token: first occurrence in the establishment turn.
        establish_list.append(
            get_start_indices(
                surface_form, routine['Establishment turn'], u_df)[0])

    # Columns are added in place on the tables held in routine_dfs.
    routine_df['tokens'] = tokens_list
    routine_df['priming_token'] = priming_list
    routine_df['establish_token'] = establish_list


print('Done!')

routine_dfs[7].head()

Finding routine token indices for team  7
Finding routine token indices for team  8
Finding routine token indices for team  9
Finding routine token indices for team 10
Finding routine token indices for team 11
Finding routine token indices for team 17
Finding routine token indices for team 18
Finding routine token indices for team 20
Finding routine token indices for team 28
Finding routine token indices for team 47
Done!


Unnamed: 0,Freq.,Free Freq.,Size,Surface Form,Establishment turn,Spanning,Priming,First Speaker,Turns,utterances,tokens,priming_token,establish_token
0,3,3,6,go to mount saint gallen .,508,145,2,A,"364, 395, 508","[364, 395, 508]","[2023, 2164, 2776]",2023,2776
1,2,2,6,"mount montreux , mount montreux .",380,114,1,A,"267, 380","[267, 380]","[1525, 2084]",1525,2084
2,6,3,5,to mount saint gallen .,508,251,5,A,"258, 287, 316, 364, 395, 508","[258, 287, 316, 364, 395, 508]","[1478, 1627, 1806, 2024, 2165, 2777]",1478,2777
3,3,3,5,from mount davos to mount,559,21,2,B,"539, 541, 559","[539, 541, 559]","[2933, 2949, 3046]",2933,3046
4,3,3,5,mount davos to mount zermatt,559,329,1,B,"259, 559, 587","[259, 559, 587]","[1483, 3047, 3197]",1483,3047


## Export routine tables.

In [21]:
print('Exporting routine tables...')

for team_no, routine_df in routine_dfs.items():
    routine_file = routines_dir.joinpath(
        'justhink19_routines_{:02d}.csv'.format(team_no))

    # Note: the table is tab-separated even though the extension is .csv.
    routine_df.to_csv(routine_file, index=False, sep='\t')

    print('Exported routines for {:2d} to {}'.format(team_no, routine_file))

print('Done!')

Exporting routine tables...
Exported routines for  7 to ../processed_data/routines/justhink19_routines_07.csv
Exported routines for  8 to ../processed_data/routines/justhink19_routines_08.csv
Exported routines for  9 to ../processed_data/routines/justhink19_routines_09.csv
Exported routines for 10 to ../processed_data/routines/justhink19_routines_10.csv
Exported routines for 11 to ../processed_data/routines/justhink19_routines_11.csv
Exported routines for 17 to ../processed_data/routines/justhink19_routines_17.csv
Exported routines for 18 to ../processed_data/routines/justhink19_routines_18.csv
Exported routines for 20 to ../processed_data/routines/justhink19_routines_20.csv
Exported routines for 28 to ../processed_data/routines/justhink19_routines_28.csv
Exported routines for 47 to ../processed_data/routines/justhink19_routines_47.csv
Done!


## Export the simplified transcripts ("utterances").

In [22]:
print('Exporting tokenised filtered transcripts (utterances)')
for team_no, utterance_df in utterance_dfs.items():
    utterance_file = utterances_dir.joinpath(
        'justhink19_utterances_{:02d}.csv'.format(team_no))

    # Tab-separated, floats written with three decimals.
    utterance_df.to_csv(
        utterance_file, index=False, float_format='%.3f', sep='\t')

    print('Exported utterances for {:2d} to {}'.format(
        team_no, utterance_file))

print('Done!')

Exporting tokenised filtered transcripts (utterances)
Exported utterances for  7 to ../processed_data/utterances/justhink19_utterances_07.csv
Exported utterances for  8 to ../processed_data/utterances/justhink19_utterances_08.csv
Exported utterances for  9 to ../processed_data/utterances/justhink19_utterances_09.csv
Exported utterances for 10 to ../processed_data/utterances/justhink19_utterances_10.csv
Exported utterances for 11 to ../processed_data/utterances/justhink19_utterances_11.csv
Exported utterances for 17 to ../processed_data/utterances/justhink19_utterances_17.csv
Exported utterances for 18 to ../processed_data/utterances/justhink19_utterances_18.csv
Exported utterances for 20 to ../processed_data/utterances/justhink19_utterances_20.csv
Exported utterances for 28 to ../processed_data/utterances/justhink19_utterances_28.csv
Exported utterances for 47 to ../processed_data/utterances/justhink19_utterances_47.csv
Done!


## Export the token tables.

In [23]:
for team_no, token_df in token_dfs.items():
    token_file = tokens_dir.joinpath(
        'justhink19_tokens_{:02d}.csv'.format(team_no))

    # Tab-separated, floats written with three decimals.
    token_df.to_csv(token_file, index=False, float_format='%.3f', sep='\t')

    print('Exported tokens for {:2d} to {}'.format(team_no, token_file))

print('Done!')

Exported tokens for  7 to ../processed_data/tokens/justhink19_tokens_07.csv
Exported tokens for  8 to ../processed_data/tokens/justhink19_tokens_08.csv
Exported tokens for  9 to ../processed_data/tokens/justhink19_tokens_09.csv
Exported tokens for 10 to ../processed_data/tokens/justhink19_tokens_10.csv
Exported tokens for 11 to ../processed_data/tokens/justhink19_tokens_11.csv
Exported tokens for 17 to ../processed_data/tokens/justhink19_tokens_17.csv
Exported tokens for 18 to ../processed_data/tokens/justhink19_tokens_18.csv
Exported tokens for 20 to ../processed_data/tokens/justhink19_tokens_20.csv
Exported tokens for 28 to ../processed_data/tokens/justhink19_tokens_28.csv
Exported tokens for 47 to ../processed_data/tokens/justhink19_tokens_47.csv
Done!
