This notebook reads transcripts, extracts routines, and exports routines.

In [1]:
import pathlib as pl
import re
import subprocess

import numpy as np
import pandas as pd
from spacy.lang.en import English

from read_utils import read_tables

In [2]:
def natural_sort(l):
    """Return the items of `l` as a new list sorted in natural order.

    Each item is split into alternating text and digit runs; digit runs are
    compared as integers and text runs case-insensitively, so that e.g.
    'file2' sorts before 'file10'.
    """
    def _natural_key(item):
        # The capturing group makes re.split keep the digit runs in the
        # result instead of discarding them as separators.
        chunks = re.split('([0-9]+)', item)
        return [int(chunk) if chunk.isdigit() else chunk.lower()
                for chunk in chunks]

    return sorted(l, key=_natural_key)

## Define paths.

In [3]:
# Inputs (all relative to the notebook's working directory).
data_dir = pl.Path('../data')
transcripts_dir = data_dir / 'transcripts'

# Location of the dialign tool and its runnable jar.
dialign_dir = pl.Path('dialign-1.0')
dialign_jar_file = dialign_dir / 'dialign.jar'

output_dir = pl.Path('../outputs')
interm_dir = output_dir / 'intermediate'
task_features_file = (
    interm_dir / 'log_features/justhink19_log_features_task_level.csv')

# Outputs: dialign input/output folders and the exported routine tables.
processed_data_dir = pl.Path('../processed_data')
dialign_inputs_dir = processed_data_dir / 'dialign_inputs'
dialign_outputs_dir = processed_data_dir / 'dialign_outputs'
routines_dir = processed_data_dir / 'routines'

# Output directories that must exist before anything is written to them.
dirs = [
    dialign_inputs_dir, dialign_outputs_dir,
    routines_dir,
]
for d in dirs:
    if not d.exists():
        # parents=True also creates a missing parent directory, so the cell
        # works on a fresh checkout where `../processed_data` does not exist
        # yet; exist_ok=True tolerates the directory appearing in between
        # the check and the creation.
        d.mkdir(parents=True, exist_ok=True)
        print('Created {}'.format(d))


# Paths of the two dialign synthesis (metrics) files inside the dialign
# output directory: one with speaker-dependent and one with
# speaker-independent measures.
synthesis_dep_file = dialign_outputs_dir.joinpath(
    'metrics-speaker-dependent.tsv')
# Fixed copy-paste slip: this used to point at the speaker-dependent file too.
synthesis_indep_file = dialign_outputs_dir.joinpath(
    'metrics-speaker-independent.tsv')

Created ../processed_data/dialign_inputs
Created ../processed_data/dialign_outputs
Created ../processed_data/routines


## Define task-specific referents.

In [4]:
# Lower-case node names that count as task-specific referents.
node_words = {
    'basel', 'bern', 'davos', 'gallen', 'interlaken',
    'luzern', 'montreux', 'neuchatel', 'zermatt', 'zurich',
}

# The task-specific vocabulary currently consists of the node names only
# (this is the same set object, not a copy).
task_words = node_words

print(len(task_words), sorted(task_words))

10 ['basel', 'bern', 'davos', 'gallen', 'interlaken', 'luzern', 'montreux', 'neuchatel', 'zermatt', 'zurich']


## Load data.

### Read transcripts.

In [5]:
# Read the transcript tables found under `transcripts_dir`.
# NOTE(review): `read_tables` is a project helper that is not shown here.
# The cells below use its result as a dict mapping an integer team number to
# that team's transcript DataFrame, with 'utterance', 'interlocutor', 'start'
# and 'end' columns -- confirm against read_utils.
transcript_dfs = read_tables(transcripts_dir, form='transcript')

Reading transcript files from ../data/transcripts.
transcript 10 files found.
File justhink19_transcript_07 belongs to team  7
File justhink19_transcript_08 belongs to team  8
File justhink19_transcript_09 belongs to team  9
File justhink19_transcript_10 belongs to team 10
File justhink19_transcript_11 belongs to team 11
File justhink19_transcript_17 belongs to team 17
File justhink19_transcript_18 belongs to team 18
File justhink19_transcript_20 belongs to team 20
File justhink19_transcript_28 belongs to team 28
File justhink19_transcript_47 belongs to team 47
Transcript of  7 has  639 utterances
Transcript of  8 has  669 utterances
Transcript of  9 has  810 utterances
Transcript of 10 has  469 utterances
Transcript of 11 has  567 utterances
Transcript of 17 has  325 utterances
Transcript of 18 has  359 utterances
Transcript of 20 has  507 utterances
Transcript of 28 has  348 utterances
Transcript of 47 has  396 utterances


### Refine task and transcript durations.

#### Compute speaking durations from transcripts.

In [6]:
# End time of the transcribed task for each team, keyed by team number.
end_times = {}

for team_no, transcript_df in transcript_dfs.items():
    omitted_mask = transcript_df['utterance'] == '(omitted)'
    if omitted_mask.any():
        # Cut at the start of the earliest utterance marked '(omitted)'.
        end_times[team_no] = transcript_df.loc[omitted_mask, 'start'].min()
    else:
        # Nothing was omitted: the transcript ends with its last row.
        end_times[team_no] = transcript_df.iloc[-1]['end']

end_times

{7: 1573.338,
 8: 1570.4229999999998,
 9: 2160.63,
 10: 1384.955,
 11: 1580.5079999999998,
 17: 1075.073,
 18: 1804.0870000000002,
 20: 1155.259,
 28: 669.247,
 47: 1178.847}

#### Print the total transcribed duration in hours.

In [7]:
# Convert each team's end time to hours (two divisions by 60), then show
# the total over all teams as the cell output.
values = [end_time / 60 / 60 for end_time in end_times.values()]
sum(values)

3.9312130555555553

### Slice the transcripts by their inferred duration. 
Sometimes the team keeps talking after the task ends, and some of that talk was also transcribed; we omit it.
This happens specifically when the team fails (i.e. time is up) and we intervene.

In [8]:
# Keep only the utterances that end no later than the team's end time.
# Only existing keys are reassigned, so updating the dict while iterating
# over it is safe.
for team_no, transcript_df in transcript_dfs.items():
    ends_within_task = transcript_df['end'] <= end_times[team_no]
    transcript_dfs[team_no] = transcript_df[ends_within_task]

## Generate inputs for dialign to extract routines.

### Define a tokenizer. 
Create a Tokenizer with the default settings for English, including punctuation rules and exceptions.

In [9]:
# Blank English pipeline; only its tokenizer is used in this notebook.
nlp = English()
# `nlp.tokenizer` is the tokenizer the pipeline builds from the English
# defaults (punctuation rules and exceptions). It replaces
# `nlp.Defaults.create_tokenizer(nlp)`, which was removed in spaCy v3, and
# works in both spaCy v2 and v3.
tokenizer = nlp.tokenizer

Define a tokeniser method for dialign (as per dialign input format). 

In [10]:
def tokenise_utterances(df, tokenizer):
    """Return a copy of `df` whose utterances are space-separated tokens.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with an 'utterance' column of texts. It is not modified.
    tokenizer : callable
        Called with one utterance; must return an iterable of tokens that
        each expose the token string as `.text`.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` in which every utterance is replaced by its token texts
        joined with single spaces.
    """
    tokenised_df = df.copy()
    tokenised_df['utterance'] = [
        ' '.join(token.text for token in tokenizer(utterance))
        for utterance in tokenised_df['utterance']
    ]
    return tokenised_df

### Define an exporter for dialign. 

In [11]:
def export_for_dialign(df, file):
    """Write `df` to `file` as one tab-separated line per row.

    Each line holds the row's 'interlocutor' value, a tab, and the row's
    'utterance' value. The file is opened in text mode and overwritten.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with 'interlocutor' and 'utterance' columns.
    file : str or path-like
        Destination path; it is converted with `str()` before opening.
    """
    with open(str(file), 'w') as out:
        for _, record in df.iterrows():
            line = '{}\t{}'.format(record['interlocutor'], record['utterance'])
            out.write(line + '\n')

### Export the transcripts for dialign.

In [12]:
print('Exporting the transcripts in dialign input format...')

for team_no, transcript_df in transcript_dfs.items():
    # Keep the utterances of interlocutors A and B only, then tokenise them.
    is_a_or_b = transcript_df['interlocutor'].isin(['A', 'B'])
    dialogue_df = tokenise_utterances(transcript_df[is_a_or_b], tokenizer)

    # One input file per team, named after the zero-padded team number.
    file = dialign_inputs_dir.joinpath(
        'justhink19_dialogue_{:02d}.tsv'.format(team_no))
    export_for_dialign(dialogue_df, file)

    print('Written task {:2d} to {}'.format(team_no, file))

print('Done!')

Exporting the transcripts in dialign input format...
Written task  7 to ../processed_data/dialign_inputs/justhink19_dialogue_07.tsv
Written task  8 to ../processed_data/dialign_inputs/justhink19_dialogue_08.tsv
Written task  9 to ../processed_data/dialign_inputs/justhink19_dialogue_09.tsv
Written task 10 to ../processed_data/dialign_inputs/justhink19_dialogue_10.tsv
Written task 11 to ../processed_data/dialign_inputs/justhink19_dialogue_11.tsv
Written task 17 to ../processed_data/dialign_inputs/justhink19_dialogue_17.tsv
Written task 18 to ../processed_data/dialign_inputs/justhink19_dialogue_18.tsv
Written task 20 to ../processed_data/dialign_inputs/justhink19_dialogue_20.tsv
Written task 28 to ../processed_data/dialign_inputs/justhink19_dialogue_28.tsv
Written task 47 to ../processed_data/dialign_inputs/justhink19_dialogue_47.tsv
Done!


### Export the transcripts in full, compliant with dialign inputs/indices.
e.g. for aligning with actions.
The tables contain additional information like start and end times of the utterances.

## Run dialign.

In [13]:
# Arguments are passed as a list (no shell), so paths that contain spaces or
# shell metacharacters reach java unchanged.
cmd_args = [
    'java', '-jar', str(dialign_jar_file.resolve()),
    '-i', str(dialign_inputs_dir.resolve()),
    '-o', str(dialign_outputs_dir.resolve()),
]
# Human-readable form of the command, kept for logging.
cmd = 'java -jar {} -i {} -o {}'.format(
    dialign_jar_file.resolve(),
    dialign_inputs_dir.resolve(),
    dialign_outputs_dir.resolve())
print(cmd)

print('Running for dialogues...')
# Capture the tool's output and print it here so that it is shown in the
# notebook; stderr is merged into stdout to keep the messages in order.
result = subprocess.run(
    cmd_args,
    stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
    universal_newlines=True)
if result.stdout:
    print(result.stdout, end='')
# Raise CalledProcessError on a non-zero exit status instead of printing
# 'Done!' after a failed run.
result.check_returncode()
print('Done!')

java -jar /home/utku/playground/justhink-dialogue-and-actions-corpus/tools/dialign-1.0/dialign.jar -i /home/utku/playground/justhink-dialogue-and-actions-corpus/processed_data/dialign_inputs -o /home/utku/playground/justhink-dialogue-and-actions-corpus/processed_data/dialign_outputs
Running for dialogues...
Done!


## Read routine tables.
i.e. shared expression lexicons as termed by dialign.

In [14]:
# Routine (shared expression lexicon) table of each team, keyed by team number.
routine_dfs = dict()

for team_no in sorted(transcript_dfs):
    routine_file = dialign_outputs_dir.joinpath(
        'justhink19_dialogue_{:02d}_tsv-lexicon.tsv'.format(team_no))
    lexicon_df = pd.read_csv(str(routine_file), sep='\t')
    print('Read for team {:02d}: {} routines'.format(team_no, len(lexicon_df)))

    # For every routine, count how many distinct task words occur among its
    # tokens. Matching is on whole tokens, not on substrings.
    referent_counts = []
    for surface_form in lexicon_df['Surface Form']:
        tokens = [token.text for token in tokenizer(surface_form)]
        referent_counts.append(
            sum(1 for word in task_words if word in tokens))
    lexicon_df.insert(3, 'task_spec_referent_count', referent_counts)

    # Parse the comma-separated 'Turns' string into a list of integers.
    lexicon_df['utterance_no_list'] = [
        [int(number) for number in turns.split(', ')]
        for turns in lexicon_df['Turns']
    ]

    routine_dfs[team_no] = lexicon_df

Read for team 07: 384 routines
Read for team 08: 420 routines
Read for team 09: 533 routines
Read for team 10: 226 routines
Read for team 11: 371 routines
Read for team 17: 149 routines
Read for team 18: 149 routines
Read for team 20: 287 routines
Read for team 28: 194 routines
Read for team 47: 223 routines


## Filter for routines with task-specific referents.

In [15]:
# Keep only the routines that contain at least one task-specific referent,
# then drop the helper count column.
for team_no, df in routine_dfs.items():
    # A table without the count column has already been filtered by an
    # earlier run of this cell; skip it so that re-running is a no-op
    # instead of raising AttributeError.
    if 'task_spec_referent_count' not in df.columns:
        continue
    df = df[df['task_spec_referent_count'] > 0]
    df = df.drop('task_spec_referent_count', axis=1)
    routine_dfs[team_no] = df

## Export routine tables.

In [16]:
print('Exporting routine tables...')

for team_no in routine_dfs:
    # One file per team, named after the zero-padded team number.
    file = routines_dir.joinpath(
        'justhink19_routines_{:02d}.csv'.format(team_no))

    # Note: the content is tab-separated although the extension is .csv.
    routine_dfs[team_no].to_csv(file, index=False, sep='\t')

    print('Exported routines for {:2d} to {}'.format(team_no, file))

print('Done!')

Exporting routine tables...
Exported routines for  7 to ../processed_data/routines/justhink19_routines_07.csv
Exported routines for  8 to ../processed_data/routines/justhink19_routines_08.csv
Exported routines for  9 to ../processed_data/routines/justhink19_routines_09.csv
Exported routines for 10 to ../processed_data/routines/justhink19_routines_10.csv
Exported routines for 11 to ../processed_data/routines/justhink19_routines_11.csv
Exported routines for 17 to ../processed_data/routines/justhink19_routines_17.csv
Exported routines for 18 to ../processed_data/routines/justhink19_routines_18.csv
Exported routines for 20 to ../processed_data/routines/justhink19_routines_20.csv
Exported routines for 28 to ../processed_data/routines/justhink19_routines_28.csv
Exported routines for 47 to ../processed_data/routines/justhink19_routines_47.csv
Done!
