# Prepare BNC Corpus

This notebook downloads and processes the Audio BNC corpus (http://www.phon.ox.ac.uk/AudioBNC).

We use the words from the phonemic transcriptions to group the phonemes into words and use alignment with the orthographic transcriptions to group the words into utterances.

In [1]:
import pandas as pd
from pathlib import Path
import re
import requests
import sys

from bnc_to_ipa import convert_bnc_to_ipa

sys.path.append('../..')
from corpus_phonemizer import phonemize_utterances

from tqdm import tqdm

# URLs of the two index files. Each index is downloaded later in the notebook and split on
# newlines; every non-empty line is then fetched as the URL of one transcript file.
ORTHOGRAPHIC_TRANSCRIPTS_REPO = "http://bnc.phon.ox.ac.uk/filelist-html.txt"
PHONEMIC_TRANSCRIPTS_REPO = "http://bnc.phon.ox.ac.uk/filelist-textgrid.txt"

## Downloading BNC

We start by downloading the transcripts from BNC. We then work through the transcripts and extract the orthographic utterances.

In [2]:
# Fetch the index of orthographic transcript pages and split it into one path per line
transcript_index = requests.get(ORTHOGRAPHIC_TRANSCRIPTS_REPO).text
transcript_paths = transcript_index.split('\n')

# Maps a tape reference ('<section>_<recording number>') to its list of cleaned utterances
orthographic_utterances = {}

for path in tqdm(transcript_paths, 'Getting orthographic utterances'):
    if path == '':
        continue  # skip blank entries in the index

    # The section name is the file name without its extension
    section = path.split('/')[-1].split('.')[0]
    tape_ref = None  # stays None until a tape heading has been seen on this page

    for line in requests.get(path).text.split('\n'):
        # Headings look like "<h4>1 (Tape XXXXXX)</h4>"; the leading recording number
        # (the "1" here) is what gets combined with the section name
        if '(Tape' in line:
            recording_number = line.split('<h4>')[1].split(' ')[0].strip()
            tape_ref = f'{section}_{recording_number}'
            continue

        # Pages without tape headings are treated as a single recording numbered 1
        if 'Undivided text' in line:
            tape_ref = f'{section}_1'
            continue

        stripped = line.strip()
        if tape_ref is None or not stripped.startswith('['):
            continue

        # Keep the text before the first '<', then drop everything up to the first ']'
        text_before_markup = stripped.split('<')[0]
        utterance = ']'.join(text_before_markup.split(']')[1:]).strip()
        utterance = re.sub(r'\[.*?\]', '', utterance)    # Remove annotations in square brackets
        utterance = re.sub(r'[^\w\s\']', '', utterance)  # Remove punctuation
        utterance = re.sub(r'\s{2,}', ' ', utterance)    # Remove extra spaces
        utterance = utterance.strip().lower()            # Trim and make lowercase

        if utterance != '':
            orthographic_utterances.setdefault(tape_ref, []).append(utterance)

# NOTE: len() of the dict is the number of tape references (keys), not the number of
# individual utterances stored in the lists.
print(f'Got {len(orthographic_utterances)} orthographic utterances')

Getting orthographic utterances: 100%|██████████| 909/909 [00:49<00:00, 18.45it/s]

Got 4370 orthographic utterances





## Extracting phonemes and words

We then work through the textgrid files of AudioBNC to extract the phonemes and words, aligning them according to the linebreaks in the orthographic transcription. We then convert the phonemes from BNC's representation to IPA.

In [3]:
# Tries to match the formatting of TextGrid words in BNC to the orthographic words in BNC
def clean_textgrid_words(words):
    """Join TextGrid word tokens into one lowercase line resembling the orthographic text.

    Marker tokens (pauses and the curly-brace tags listed below) are dropped, the
    remaining tokens are joined with single spaces, and a fixed list of split forms is
    merged back together (e.g. "GON NA" -> "GONNA", " 'S" -> "'S").

    Args:
        words: iterable of word tokens as they appear in the TextGrid word tier.

    Returns:
        The cleaned line as a lowercase string; '' when no tokens remain.
    """
    ignored_tokens = ('sp', '{OOV}', '{LG}', '{GAP_ANONYMIZATION}', '{CG}', '{XX}')
    word_line = ' '.join(word for word in words if word not in ignored_tokens)

    # Replacements are applied in order, so more specific patterns must come before any
    # pattern that is a prefix of them. "DUN N NO" has to run before "DUN N"; otherwise
    # "DUN N NO" is rewritten to "DUN XXXXN NO" first and the "DUNNO" rule can never fire.
    replacements = (
        (" 'S", "'S"),
        (" 'VE", "'VE"),
        ("GON NA", "GONNA"),
        ("DUN N NO", "DUNNO"),
        # NOTE(review): retained from the original code; the purpose of the "XXXXN"
        # placeholder is not evident from this file -- confirm whether it is still wanted.
        ("DUN N", "DUN XXXXN"),
        ("DU N NO", "DUNNO"),
        (" N IT", "NIT"),
        ("GOT TA", "GOTTA"),
        ("WAN NA", "WANNA"),
    )
    for split_form, merged_form in replacements:
        word_line = word_line.replace(split_form, merged_form)

    return word_line.strip().lower()

# Get the paths to the phonemic transcriptions
grid_paths = requests.get(PHONEMIC_TRANSCRIPTS_REPO).text.split('\n')

# Parallel output lists: index k of each list describes the same utterance
phone_lines = []               # IPA phones, words separated by ' WORD_BOUNDARY '
word_lines = []                # cleaned TextGrid words of the utterance
orthographic_word_lines = []   # orthographic line the utterance was aligned with

# Number of tapes whose TextGrid still had words left after the last orthographic line was
# matched. Initialised once, before the loop, so that the summary printed at the end covers
# every file (resetting it per file would report only the last file processed).
num_errors = 0

for path in tqdm(grid_paths, 'Getting utterances'):
    if path == '':
        continue
    tape_ref = '_'.join(path.split('.')[-2].split('_')[-2:]) # Extracts e.g. 'KDP_1' from 'http://bnc.phon.ox.ac.uk/data/021A-C0897X0004XX-AAZZP0_000406_KDP_1.TextGrid'
    if not tape_ref in orthographic_utterances:
        raise ValueError('No orthographic words for tape {}'.format(tape_ref))
    orthographic_words = orthographic_utterances[tape_ref]

    # Download and read file
    text = requests.get(path).text.split('\n')

    # Get the phones and words
    phones = []
    words = []
    i = 0

    # Get to the phones
    while not text[i].startswith('"phone"'):
        i += 1
    i += 1

    # Get all phones: every quoted line up to the next '"IntervalTier"' is a label, and the
    # line two above each label is read as that interval's start time
    while i < len(text):
        while not text[i].startswith('"'):
            i += 1
        if text[i].startswith('"IntervalTier"'):
            break
        phone = text[i].strip()[1:-1]
        start_time = float(text[i-2].strip())
        phones.append((phone, start_time))
        i += 1

    # Get to the words
    while not text[i].startswith('"word"'):
        i += 1
    i += 1

    # Get all words (same layout as the phone tier)
    while i < len(text):
        while not text[i].startswith('"'):
            i += 1
        if text[i].startswith('"IntervalTier"'):
            break
        word = text[i].strip()[1:-1]
        start_time = float(text[i-2].strip())
        words.append((word, start_time))
        i += 1

    # Get the phones for each word, and add an utterance boundary if the word aligns with a whole line of orthographic words
    phones_in_word = []
    phone_line = ''

    # A new word begins once a phone's start time reaches the start of words[current_word_index];
    # starting at 1 means phones before the second word's start belong to the first word
    current_word_index = 1
    start_word_index = 0           # first word of the utterance currently being built
    orthographic_words_index = 0   # next orthographic line to be matched
    num_orthographic_words = len(orthographic_words)

    # Iterate through phones, using words to determine word boundaries and aligning with orthographic words to determine utterance boundaries
    for phone, start_time in phones:
        if current_word_index >= len(words):
            break
        # Check for start of new word
        if start_time >= words[current_word_index][1]:
            if phones_in_word != []:
                phone_line = phone_line + ' '.join(convert_bnc_to_ipa(phones_in_word)) + ' WORD_BOUNDARY '
                phones_in_word = []
            # Check if start of new utterance: compare the words gathered so far with the
            # next orthographic line
            word_line = clean_textgrid_words([word[0] for word in words[start_word_index : current_word_index]])
            orthographic_word_line = orthographic_words[orthographic_words_index]
            # Accept the match when the candidate is non-empty, the lengths differ by fewer
            # than 3 characters, and the first character and last two characters agree
            if word_line.strip() != '' and word_line.strip() != ' ' and orthographic_words_index < num_orthographic_words and abs(len(orthographic_word_line) - len(word_line)) < 3 and (orthographic_word_line[0] == word_line[0] and orthographic_word_line[-2:] == word_line[-2:]): # Allow for a bit of leeway
                phone_lines.append(phone_line)
                word_lines.append(word_line)
                orthographic_word_lines.append(orthographic_word_line)
                phone_line = ''
                orthographic_words_index += 1
                start_word_index = current_word_index
                if orthographic_words_index >= len(orthographic_words):
                    # Every orthographic line is used up; leftover TextGrid words count as a mismatch
                    remaining_words = clean_textgrid_words([word[0] for word in words[current_word_index:]])
                    if remaining_words != '':
                        num_errors += 1
                    break
            current_word_index += 1
        # Ignore pause markers and other non-phones
        if phone in ['sil', 'ns', 'sp', 'lg', 'cg', 'ls', 'br', 'ns1q']:
            continue
        phones_in_word.append(phone)

    # Add the last utterance
    # NOTE(review): word_line and orthographic_word_line hold whatever the most recent
    # boundary check left in them, which is not necessarily a matched pair -- confirm that
    # this is the intended pairing for the trailing phones.
    if phones_in_word != []:
        phone_line = phone_line + ' '.join(convert_bnc_to_ipa(phones_in_word)) + ' WORD_BOUNDARY'
        phone_lines.append(phone_line)
        word_lines.append(word_line)
        orthographic_word_lines.append(orthographic_word_line)

# Remove empty lines. Done once after all files: the three lists are only appended to inside
# the loop, so this gives the same result as purging after each file without rescanning the
# accumulated lists every iteration.
empty_indices = [i for i, word_line in enumerate(word_lines) if word_line.strip() == '']
for i in reversed(empty_indices):
    del phone_lines[i]
    del word_lines[i]
    del orthographic_word_lines[i]

print(f'Got {len(phone_lines)} utterances')
print(f'Got {len(word_lines)} word lines')
print(f'Got {len(orthographic_word_lines)} orthographic word lines')
print(f'Got {sum([1 for word_line in word_lines if word_line.strip() == ""])} empty word lines')
print(f'Got {num_errors} mismatched word lines')

Getting utterances: 100%|██████████| 3273/3273 [45:07<00:00,  1.21it/s]  

Got 288879 utterances
Got 288879 word lines
Got 288879 orthographic word lines
Got 0 empty word lines
Got 0 mismatched word lines





## Converting orthographic lines to phonemes

In order to facilitate validation of the corpus phonemizer tool, we convert the orthographic text into phonemes.

In [4]:
import os

# Default location of the espeak-ng library on the original author's machine. setdefault
# leaves an already-exported PHONEMIZER_ESPEAK_LIBRARY untouched, so the notebook also runs
# on machines where the library lives elsewhere; the hard-coded path is only a fallback.
os.environ.setdefault('PHONEMIZER_ESPEAK_LIBRARY', "/opt/local/lib/libespeak-ng.dylib")

# Phonemize the cleaned TextGrid word lines so they can be compared with the BNC phones
phonemized_orthographic_lines = phonemize_utterances(
    word_lines,
    backend='phonemizer',
    language='en-gb',
    keep_word_boundaries=True,
    verbose=False,
    use_folding=True,
    allow_possibly_faulty_word_boundaries=True,
    preserve_punctuation=False,
)



In [5]:
# Print nums: all four collections are expected to have the same length
for label, collection in [
    ('Num orthographic:', orthographic_word_lines),
    ('Num TextGrid:', word_lines),
    ('Num phones:', phone_lines),
    ('Num phonemized orthographic:', phonemized_orthographic_lines),
]:
    print(label, len(collection))
print()

# Print the first entry of each collection to check them side by side
print('Example:')
for label, collection in [
    ('Orthographic:', orthographic_word_lines),
    ('TextGrid:', word_lines),
    ('Phones:', phone_lines),
    ('Phonemized:', phonemized_orthographic_lines),
]:
    print(label, collection[0])


Num orthographic: 288879
Num TextGrid: 288879
Num phones: 288879
Num phonemized orthographic: 288879

Example:
Orthographic: right then i'll just put it down here nine teas your not actually shooting till tuesday are you
TextGrid: right then i'll just put it down here nine teas your not actually shooting till tuesday are you
Phones: r aɪ t WORD_BOUNDARY ð ɛ n WORD_BOUNDARY aɪ l WORD_BOUNDARY d̠ʒ ʌ s t WORD_BOUNDARY p ʌ t WORD_BOUNDARY ɪ t WORD_BOUNDARY d aʊ n WORD_BOUNDARY h ɪ r WORD_BOUNDARY n aɪ n WORD_BOUNDARY t i: z WORD_BOUNDARY j ɔ: WORD_BOUNDARY n ɑ: t WORD_BOUNDARY a k ʃ ə l i: WORD_BOUNDARY ʃ u: t ɪ ŋ WORD_BOUNDARY t ɪ l WORD_BOUNDARY t j u: z d eɪ WORD_BOUNDARY ɚ WORD_BOUNDARY j u: WORD_BOUNDARY 
Phonemized: ɹ aɪ tʰ WORD_BOUNDARY ð e n WORD_BOUNDARY aɪ l WORD_BOUNDARY d̠ʒ ʌ s tʰ WORD_BOUNDARY pʰ ʊ tʰ WORD_BOUNDARY ɪ tʰ WORD_BOUNDARY d aʊ n WORD_BOUNDARY h ɪə WORD_BOUNDARY n aɪ n WORD_BOUNDARY tʰ iː z WORD_BOUNDARY j ɔː WORD_BOUNDARY n ɒ tʰ WORD_BOUNDARY æ kʰ t̠ʃ uː ə l i WORD

### Folding

We put together a simple folding map in order to try to match the two phoneme inventories. Most changes are simple one-to-one mappings made by comparing the phonemes missing from the `phonemized_orthographic_lines` to the phonemes missing from the `phone_lines`. The mapping is very similar to the folding mapping used for the `en-gb` accent to match the Phoible inventory. Many changes are simply to correct the long vowel symbol `:` to be `ː`.

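We also make a few additional changes based on the analysis in the cells that follow the map (for example, merging the split diphthong `e ə` into `eə`); these are listed at the end of the map.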


In [6]:
# Substring replacements applied to every phone line, in insertion order. Keys with a
# trailing space ('t ', 'ɛ ', 'a ') only match when the symbol is immediately followed by a
# space, so the same character is left alone when another character follows it directly.
folding_map = {
    'u:' : 'uː',
    'i:' : 'iː',
    'g' : 'ɡ',
    'oɪ' : 'ɔɪ',
    'ɔ:' : 'ɔː',
    'p' : 'pʰ',
    't ' : 'tʰ ',
    'k' : 'kʰ',
    # Left disabled, so 'ɑ:' passes through the folding unchanged:
    # 'ɑ:' : 'aː',
    'ɛ ' : 'e ',
    'a ' : 'æ ',
    'r' : 'ɹ',
    'ə:' : 'ɜː',

    # Fixing diphthongs (runs after 'ɛ ' -> 'e ', so it also catches a folded 'ɛ ə')
    'e ə' : 'eə',
}


def fold_phone_line(line):
    """Return `line` with every entry of `folding_map` applied in order."""
    for source, target in folding_map.items():
        line = line.replace(source, target)
    return line


folded_phone_lines = [fold_phone_line(original_line) for original_line in phone_lines]

In [7]:
def get_vocabulary(lines):
    """Count how often each whitespace-separated token occurs across `lines`.

    Args:
        lines: iterable of strings.

    Returns:
        dict mapping each token to its number of occurrences, in order of first appearance.
    """
    token_counts = {}
    for text in lines:
        for symbol in text.split():
            token_counts[symbol] = token_counts.get(symbol, 0) + 1
    return token_counts

# Token inventories of the folded BNC phones and of the phonemizer output
phone_vocabulary = get_vocabulary(folded_phone_lines)
phonemized_vocabulary = get_vocabulary(phonemized_orthographic_lines)

# Symbols that occur on one side only
unseen = phone_vocabulary.keys() - phonemized_vocabulary.keys()
unknown = phonemized_vocabulary.keys() - phone_vocabulary.keys()

# Attach the occurrence count to each one-sided symbol before printing
unseen_counts = {}
for phone in unseen:
    unseen_counts[phone] = phone_vocabulary[phone]

unknown_counts = {}
for phone in unknown:
    unknown_counts[phone] = phonemized_vocabulary[phone]

print('Phonemes in original but not phonemized:', unseen_counts)
print('Phonemes in phonemized but not original:', unknown_counts)


Phonemes in original but not phonemized: {'ɚ': 73612, 'ɑ:': 130579}
Phonemes in phonemized but not original: {'ɐ': 83565, 'x': 4, 'ɡʲ': 2, 'r': 4, 'ɑ̃': 12, 'i': 84046, 'ɑː': 45585, 'ɬ': 11, 'aː': 18, 'n̩': 1462, 'ɔ': 13, 'ʊə': 3034}


In [8]:
# For each phonemized word containing 'ɐ', tally the character found at the same string
# position in the corresponding folded BNC word ('' when that word is too short).
counts = {}
for line_number in range(len(phonemized_orthographic_lines)):
    phonemized_words = phonemized_orthographic_lines[line_number].split('WORD_BOUNDARY')
    bnc_words = folded_phone_lines[line_number].split('WORD_BOUNDARY')

    # Only compare lines where both sides split into the same number of words
    if len(phonemized_words) != len(bnc_words):
        continue

    for phonemized_word, bnc_word in zip(phonemized_words, bnc_words):
        position = phonemized_word.find('ɐ')
        if position == -1:
            continue
        if position < len(bnc_word):
            counterpart = bnc_word[position]
        else:
            counterpart = ''
        counts[counterpart] = counts.get(counterpart, 0) + 1

print(counts)

{'ə': 34806, 'e': 4064, 'ɑ': 311, 'æ': 6250, 'ɚ': 456, ' ': 105, 'm': 23, 'p': 8, 'ɪ': 3, 'd': 2, 'ɡ': 21, 'ɹ': 3, 'b': 1, 'j': 2, '': 1, 'ʰ': 1, 'ɔ': 4, 'ʒ': 1}


## Saving dataset

Finally, we save all lines as a dataset.

In [9]:
# Column name -> list of lines; all lists are parallel. Note that the 'orthographic'
# column holds the cleaned TextGrid word lines (word_lines).
dataset = {'orthographic': word_lines, 'original_phonemic': phone_lines, 'folded_phonemic': folded_phone_lines, 'phonemized_orthographic': phonemized_orthographic_lines}
# Use final 10,000 utterances as test set
train_dataset = {k: v[:-10000] for k, v in dataset.items()}
test_dataset = {k: v[-10000:] for k, v in dataset.items()}

print(f'Got {len(train_dataset["orthographic"])} training utterances')
print(f'Got {len(test_dataset["orthographic"])} test utterances')

# Write to files; create the output directory first so that to_csv does not fail on a
# fresh checkout where it does not exist yet
output_dir = Path('BNC-dataset')
output_dir.mkdir(parents=True, exist_ok=True)
df = pd.DataFrame(train_dataset)
df.to_csv(output_dir / 'train.csv', index=False)
df = pd.DataFrame(test_dataset)
df.to_csv(output_dir / 'test.csv', index=False)


Got 278879 training utterances
Got 10000 test utterances
