This notebook merges the transcripts with the event logs to produce a combined corpus of dialogue and actions, which can then be processed further, e.g., to detect follow-up actions.

In [1]:
import pathlib as pl
import pandas as pd

from read_utils import read_tables

## Define paths.

In [2]:
# Input locations: raw transcripts and logs live side by side under ../data.
data_dir = pl.Path('../data')
transcripts_dir = data_dir / 'transcripts'
logs_dir = data_dir / 'logs'

# Output locations: the merged corpus is written under ../processed_data.
processed_data_dir = pl.Path('../processed_data')
corpus_dir = processed_data_dir / 'corpus'

# Create every missing output directory (including parents) and report
# each one that was actually created.
for d in (corpus_dir,):
    if d.exists():
        continue
    d.mkdir(parents=True)
    print('Created {}'.format(d))

## Load data.

### Load logs.

In [3]:
# Read the log tables found under logs_dir. Later cells look the result up by
# team number (log_dfs[team_no]); read_tables comes from read_utils and is not
# shown here -- presumably it returns a dict of DataFrames, TODO confirm.
log_dfs = read_tables(logs_dir, form='log')

Reading log files from ../data/logs.
log 39 files found.
File justhink19_log_06 belongs to team  6
File justhink19_log_07 belongs to team  7
File justhink19_log_08 belongs to team  8
File justhink19_log_09 belongs to team  9
File justhink19_log_10 belongs to team 10
File justhink19_log_11 belongs to team 11
File justhink19_log_12 belongs to team 12
File justhink19_log_14 belongs to team 14
File justhink19_log_15 belongs to team 15
File justhink19_log_16 belongs to team 16
File justhink19_log_17 belongs to team 17
File justhink19_log_18 belongs to team 18
File justhink19_log_19 belongs to team 19
File justhink19_log_20 belongs to team 20
File justhink19_log_21 belongs to team 21
File justhink19_log_22 belongs to team 22
File justhink19_log_23 belongs to team 23
File justhink19_log_25 belongs to team 25
File justhink19_log_26 belongs to team 26
File justhink19_log_27 belongs to team 27
File justhink19_log_28 belongs to team 28
File justhink19_log_29 belongs to team 29
File justhink19_log

### Load transcripts.

In [4]:
# Read the transcript tables found under transcripts_dir. Later cells look the
# result up by team number (transcript_dfs[team_no]) and iterate over it with
# sorted(); read_tables comes from read_utils and is not shown here --
# presumably it returns a dict of DataFrames, TODO confirm.
transcript_dfs = read_tables(transcripts_dir, form='transcript')

Reading transcript files from ../data/transcripts.
transcript 10 files found.
File justhink19_transcript_07 belongs to team  7
File justhink19_transcript_08 belongs to team  8
File justhink19_transcript_09 belongs to team  9
File justhink19_transcript_10 belongs to team 10
File justhink19_transcript_11 belongs to team 11
File justhink19_transcript_17 belongs to team 17
File justhink19_transcript_18 belongs to team 18
File justhink19_transcript_20 belongs to team 20
File justhink19_transcript_28 belongs to team 28
File justhink19_transcript_47 belongs to team 47
Transcript of  7 has  639 utterances
Transcript of  8 has  669 utterances
Transcript of  9 has  810 utterances
Transcript of 10 has  469 utterances
Transcript of 11 has  567 utterances
Transcript of 17 has  325 utterances
Transcript of 18 has  359 utterances
Transcript of 20 has  507 utterances
Transcript of 28 has  348 utterances
Transcript of 47 has  396 utterances


In [5]:
def combine_log_with_transcript(log_df, transcript_df):
    '''Merge a log table and a transcript table into one time-ordered table.

    Every log event becomes a zero-length interval (start == end == time) and
    every utterance becomes a row whose verb is 'says', so that both kinds of
    rows share the same subject / verb / object layout. Neither input table
    is modified.

    Parameters
    ----------
    log_df : pandas.DataFrame
        Event log. Must contain 'time' and 'event_no' (both are dropped once
        'time' has been copied into 'start' and 'end'). It is expected to
        provide 'team_no', 'attempt_no', 'turn_no', 'subject', 'verb' and
        'object'.
    transcript_df : pandas.DataFrame
        Transcript. Its 'interlocutor' and 'utterance' columns are renamed to
        'subject' and 'object'. It is expected to provide 'utterance_no',
        'start' and 'end'.

    Returns
    -------
    pandas.DataFrame
        Rows of both tables sorted by 'start', with a fresh 0..n-1 index and
        the columns team_no, attempt_no, turn_no, utterance_no, start, end,
        subject, verb, object. The four *_no columns are integers; rows that
        come from the log have utterance_no == -1.

    Raises
    ------
    KeyError
        If log_df lacks 'time' or 'event_no', or if one of the output
        columns is present in neither table.
    AssertionError
        If any value is still missing after the fill steps.
    '''
    # Log events are instantaneous: give them a zero-length [start, end] span
    # so they can be ordered together with the utterances.
    events = (
        log_df
        .assign(start=log_df['time'], end=log_df['time'])
        .drop(columns=['time', 'event_no'])
    )

    # Express each utterance as "<subject> says <object>".
    utterances = (
        transcript_df
        .rename(columns={'interlocutor': 'subject', 'utterance': 'object'})
        .assign(verb='says')
    )

    # A stable sort keeps rows with equal start times in concatenation order
    # (log rows first, each table in its original order). The default sort is
    # not stable, and tie order matters for the fills below.
    df = (
        pd.concat([events, utterances], ignore_index=True)
        .sort_values('start', kind='mergesort')
        .reset_index(drop=True)
    )

    # Fill the unknown attempt no etc. of the rows coming from the transcript:
    # first backward, to propagate a new attempt/turn to the utterances in
    # that turn, and then forward for utterances after the last log event.
    # (.bfill()/.ffill() replace the deprecated fillna(method=...).)
    id_cols = ['team_no', 'attempt_no', 'turn_no']
    df[id_cols] = df[id_cols].bfill().ffill()

    # Rows coming from the log have no utterance number; mark them with -1.
    df['utterance_no'] = df['utterance_no'].fillna(-1).astype(int)

    # Make sure there are no NaNs.
    assert not df.isnull().values.any(), \
        'combined table still contains missing values'

    # Convert attempt no etc. to integer types (the NaNs introduced by the
    # concatenation forced them to float).
    df = df.astype({col: int for col in id_cols})

    # Reorder the columns.
    ordered_cols = [
        'team_no', 'attempt_no', 'turn_no', 'utterance_no',
        'start', 'end',
        'subject', 'verb', 'object',
    ]
    return df[ordered_cols]


# Sanity check: merge a single team's tables and preview the first rows
# before processing every team.
team_no = 28
log_df, transcript_df = log_dfs[team_no], transcript_dfs[team_no]

combine_log_with_transcript(log_df, transcript_df).head()

Unnamed: 0,team_no,attempt_no,turn_no,utterance_no,start,end,subject,verb,object
0,28,1,1,-1,0.296,0.296,R,shows,observe gesture
1,28,1,1,-1,0.365,0.365,R,says,"so, ann and bob, let's start building the trac..."
2,28,1,1,-1,33.409,33.409,A,presses,help (enabled)
3,28,1,1,0,40.0,41.161,A,says,"okay , so"
4,28,1,1,1,40.58,45.036,B,says,so we have to connect all the places with trac...


## Combine the transcript and logs for each team.

In [6]:
# Build one merged table per team that has a transcript, keyed by team number.
corpus_dfs = {}
for team_no in sorted(transcript_dfs):
    print(f'Process team {team_no:2d} ...')

    log_df, transcript_df = log_dfs[team_no], transcript_dfs[team_no]
    df = combine_log_with_transcript(log_df, transcript_df)
    corpus_dfs[team_no] = df

print('Done!')

Process team  7 ...
Process team  8 ...
Process team  9 ...
Process team 10 ...
Process team 11 ...
Process team 17 ...
Process team 18 ...
Process team 20 ...
Process team 28 ...
Process team 47 ...
Done!


## Export the corpus tables to files.

In [7]:
# Write each team's merged table to its own file, in ascending team order.
for team_no in sorted(corpus_dfs):
    # Target file, with a zero-padded two-digit team number.
    file = corpus_dir / f'justhink19_corpus_{team_no:02d}.csv'
    print(f'Save team {team_no:2d} to {file}')

    # Tab-separated, floats with three decimals, no index column.
    corpus_dfs[team_no].to_csv(
        file, sep='\t', float_format='%.3f', index=False)

Save team  7 to ../processed_data/corpus/justhink19_corpus_07.csv
Save team  8 to ../processed_data/corpus/justhink19_corpus_08.csv
Save team  9 to ../processed_data/corpus/justhink19_corpus_09.csv
Save team 10 to ../processed_data/corpus/justhink19_corpus_10.csv
Save team 11 to ../processed_data/corpus/justhink19_corpus_11.csv
Save team 17 to ../processed_data/corpus/justhink19_corpus_17.csv
Save team 18 to ../processed_data/corpus/justhink19_corpus_18.csv
Save team 20 to ../processed_data/corpus/justhink19_corpus_20.csv
Save team 28 to ../processed_data/corpus/justhink19_corpus_28.csv
Save team 47 to ../processed_data/corpus/justhink19_corpus_47.csv
