In [1]:
import argparse
import logging
import numpy as np
import os
import random
import pandas as pd
from tqdm.notebook import tqdm_notebook as tqdm
logger = logging.getLogger(__name__)

import sys
sys.path.insert(1, '../src/')
from BNC import Corpus

In [2]:
data_path = '../../../erp/data/bnc2014spoken-xml/spoken/'
out_path = '../../data/BNC-2014/two-speakers/analysis/dialign-format/'
n_speakers = 2

## Load Spoken BNC (Love et al., 2017)

In [3]:
logger.info('Load corpus from {}'.format(data_path))
corpus = Corpus(
    untagged_path=os.path.join(data_path, "untagged"),
    tagged_path=os.path.join(data_path, "tagged")
)

  0%|          | 0/1 [00:00<?, ?it/s]

Number of XML files: 1251


1251it [03:55,  5.32it/s]                    


## Split corpus for finetuning and analysis 
In the next cell, you can load the splits. The code to obtain the splits is provided below the next cell.

In [4]:
with open('../../data/BNC-2014/two-speakers/finetuning_ids.csv', 'r') as f:
    lines = f.readlines()
    assert len(lines) == 1
    finetuning_ids = lines[0].split(',')
    print(len(finetuning_ids))
    
with open('../../data/BNC-2014/two-speakers/analysis_ids.csv', 'r') as f:
    lines = f.readlines()
    assert len(lines) == 1
    analysis_ids = lines[0].split(',')
    print(len(analysis_ids))

435
187


In [18]:
n_words_in_turn = []
n_words_in_dial = []
n_turns_in_dial = []

for d_id in finetuning_ids + analysis_ids:
    _n_words = 0
    _n_turns = 0
    for utt in corpus.conversations[d_id].utterances:
        _len = len(utt.tokens)
        n_words_in_turn.append(_len)
        _n_words += _len
        _n_turns += 1
        
    n_words_in_dial.append(_n_words)
    n_turns_in_dial.append(_n_turns)


In [19]:
for _list in [n_turns_in_dial, n_words_in_dial, n_words_in_turn]:
    print(np.mean(_list), np.std(_list), np.median(_list), np.min(_list), np.max(_list))


736.1864951768489 599.347456472185 541.5 67 4859
7752.729903536978 5596.089452337154 6102.0 819 39575
10.530931977602489 15.0506085700599 6.0 0 982


In [5]:
n_words_in_turn = []
n_words_in_dial = []
n_turns_in_dial = []

for dial_id in contexts:
    _n_words = 0
    _n_turns = 0
    
    for _, turn in contexts[dial_id].items():
        _len = len(turn[2].split(' '))
        
        n_words_in_turn.append(_len)
        _n_words += _len
        _n_turns += 1
        
    n_words_in_dial.append(_n_words)
    n_turns_in_dial.append(_n_turns)


NameError: name 'contexts' is not defined

### Convert analysis split to `dialign` format

#### Without POS

In [9]:
for conv_id in tqdm(analysis_ids):
    tsv_content = ''
    for u in corpus.conversations[conv_id].utterances:
        if not u.sentence:
            continue
        
      # TODO! is it ok to just skip these utterances?
        if u.speaker_id in ['UNKFEMALE', 'UNKMALE', 'UNKMULTI']:
            continue
            
        new_line = '{}:\t{}\n'.format(u.speaker_id, u.sentence.strip().lower())
        tsv_content += new_line
    
    filename = '{}.tsv'.format(conv_id)
    with open(os.path.join(out_path, 'nopos', filename), 'w') as f_out:
        f_out.write(tsv_content)
        

  0%|          | 0/187 [00:00<?, ?it/s]

#### With POS

In [36]:
for conv_id in tqdm(analysis_ids):
    tsv_content = ''
    for u in corpus.conversations[conv_id].utterances:
        if not u.sentence:
            continue
        
      # TODO! is it ok to just skip these utterances?
        if u.speaker_id in ['UNKFEMALE', 'UNKMALE', 'UNKMULTI']:
            continue
        
        sentence = ''
        for t in u.tokens:
            sentence += '{}#{} '.format(t.form.strip().lower(), t.word_class)
            
        new_line = '{}:\t{}\n'.format(u.speaker_id, sentence.strip())
        tsv_content += new_line
    
    filename = '{}.tsv'.format(conv_id)
    with open(os.path.join(out_path, 'pos', filename), 'w') as f_out:
        f_out.write(tsv_content)
        

  0%|          | 0/187 [00:00<?, ?it/s]

### Store finetuning and analysis split to csv format for LM training and evaluation

### Prepare analysis data for learning rate selection

In [6]:
random.seed(42)
out_path_lr = '../../data/BNC-2014/two-speakers/analysis/lr/'


In [7]:
lr_selection_ids = np.random.choice(analysis_ids, 20).tolist()
lr_selection_ids


['SV2V',
 'SFET',
 'SF2F',
 'SZBR',
 'S6ZU',
 'SG2E',
 'SAXQ',
 'S87R',
 'SRRQ',
 'SQ63',
 'SLRY',
 'SNRP',
 'SJV7',
 'SDEX',
 'S38F',
 'S9WZ',
 'S9N4',
 'SZBR',
 'SRDJ',
 'SDEX']

In [12]:
for dial in lr_selection_ids:
    
    dial_dataset = ''
    
    for u in corpus.conversations[dial].utterances:
        if not u.sentence:
            continue
        new_line = '{}\n'.format(u.sentence.strip())
        dial_dataset += new_line
    
    filename = '{}.txt'.format(dial)
    with open(os.path.join(out_path_lr, filename), 'w') as f_out:
        f_out.write(dial_dataset)
    
    
    remaining_ids = lr_selection_ids[:]
    remaining_ids.remove(dial)
    eval_dataset = ''
    
    for eval_dial in lr_selection_ids:
        for u in corpus.conversations[dial].utterances:
            if not u.sentence:
                continue
            new_line = '{}\n'.format(u.sentence.strip())
            eval_dataset += new_line
    
    filename = '{}-eval.txt'.format(dial)
    with open(os.path.join(out_path_lr, filename), 'w') as f_out:
        f_out.write(eval_dataset)


### TODO: Save finetuning and analysis dialogues as dataframes for info density estimates

---