In [1]:
import os

# Hide every GPU from CUDA-aware libraries. This is set *before* importing
# SERDatasets so the setting is already in place in case that import (or
# anything it pulls in) initialises CUDA.
os.environ['CUDA_VISIBLE_DEVICES'] = ''

from SERDatasets import ImprovDatasetConstructor

# Dataset object used by the rest of the notebook; the cells below read its
# `labels` and `individual_annotators` mappings.
data = ImprovDatasetConstructor(dataset_save_location='../prepared_datasets/improv_dataset.pk')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import re
# Per-speaker tallies: every annotator label attached to the speaker's
# utterances (duplicates kept here) and the number of utterances per speaker.
# Key order is M01..M06 followed by F01..F06.
speaker_ids = [f'{g}0{i}' for g in ['M', 'F'] for i in range(1, 7)]
speaker_to_annotators = {speaker: [] for speaker in speaker_ids}
speaker_to_datasize = {speaker: 0 for speaker in speaker_ids}
for utt_id in data.labels:
    annotators = data.labels[utt_id]['annotators']
    # The speaker tag sits between two dashes in the utterance id; capture
    # group 1 is the tag itself, without the surrounding dashes.
    match = re.search(r'-([FM]0[1-6])-', utt_id)
    if match is None:
        # Fail with the offending id instead of an opaque AttributeError on None.
        raise ValueError(f'Cannot find a speaker tag in utterance id {utt_id!r}')
    speaker = match.group(1)
    speaker_to_annotators[speaker].extend(annotators)
    speaker_to_datasize[speaker] += 1


In [3]:
# Collapse each speaker's annotator list into a set of unique annotator ids.
speaker_to_annotators = {
    speaker: set(annotator_ids)
    for speaker, annotator_ids in speaker_to_annotators.items()
}

In [4]:
import re
# For every annotator, collect the set of speakers whose utterances they rated.
annotator_to_speaker = {}
for annotator in data.individual_annotators:
    annotator_to_speaker[annotator] = {
        # Capture group 1 is the speaker tag without the surrounding dashes.
        re.search(r'-([FM]0[1-6])-', utt_id).group(1)
        for utt_id in data.individual_annotators[annotator]
    }

# Annotators tied to exactly one speaker; they are excluded from the coverage
# target used when the split is searched for further down.
ignored_annotators = {
    annotator: speakers
    for annotator, speakers in annotator_to_speaker.items()
    if len(speakers) == 1
}
ignored_annotators

{'c3rguqctj3qv57avvd1auph3g3-p3b': {'M01'},
 'p2ej6pr8f950rulpdkrqea4db0-p3b': {'F06'},
 '3bgvfacsgsv5khlph0vrg30d63-p3b': {'F02'},
 'e77a0ejejogqba4eoo9evci6b6-p3b': {'F05'},
 '6dq25lmn3tpsi59p858c673u46-p3b': {'F02'},
 '17qnfnhcn28ep46hidcfoftu52-p3b': {'F06'},
 'quhtr8erbtdq8uk8neu4ahbjk4-p3b': {'M01'},
 'A3KBQJ36JL7G7C-p1': {'M03'},
 'mdth1cd9c7534ld062tkhpkch2-p3b': {'F01'},
 'qgvtj408e2mlis9qbcolldr3a6-p3b': {'F03'},
 '54pqi818compujpopuqos7ql21-p3a': {'F05'},
 'l86bd511okn032pitt5j30ert5-p2': {'F03'},
 'jtdie0b8b5thslb9lecebf4dv6-p3b': {'M05'},
 'AAE9H4ZIKQS37-p1': {'M05'},
 'k00rpsnvq46626i5erckirdeq7-p3b': {'F01'},
 'ev7mo9kkj8bu6lfbdmd0b7vot1-p3a': {'F01'},
 'n8d09c9g0dk3ol976v91i9n4i5-p3a': {'F04'},
 'l0oehm30h1ebfgi2675pscrpn0-p2': {'M01'},
 'cc5dkr7ovud56jkedobb7d4480-p2': {'M01'},
 'A2AMH6PII3Z5JA-p1': {'F02'},
 'A7K79O7QPK86A-p_spon': {'F05'},
 '62fjv2tr1s1fpbu5lak5vuhea2-p3b': {'M03'},
 'kepht27gr2gbnd51co93fm0dv2-p3b': {'M03'},
 'A3DE3ZWIPQO5BG-p1': {'F01'},
 'A2WHA7XE

In [13]:
from tqdm import tqdm
import random

# Do this recursively to select best split 
def get_annotators(selected_speakers, uncovered_annotators, selected_datasize, depth=0, check_test_set=False):
    """Recursively extend the training speakers to cover as many annotators as possible.

    Reads the module-level `data`, `speaker_to_annotators` and
    `speaker_to_datasize`.

    Args:
        selected_speakers: list of speaker ids already placed in the training split.
        uncovered_annotators: set of annotator ids not yet covered by `selected_speakers`.
        selected_datasize: number of utterances belonging to `selected_speakers`.
        depth: recursion depth; controls the progress bar (depth 0) and the
            summary print (depth 0 and 1).
        check_test_set: when True, the speakers left out of training must admit
            a split accepted by `select_val_test_split`.

    Returns:
        False when the training split would exceed 70% of all utterances or no
        acceptable validation/test split exists for the remaining speakers.
        Otherwise a 4-tuple `(speakers, uncovered, datasize, test_set)`, where
        `test_set` is the set of remaining speakers (check_test_set=False) or
        the value returned by `select_val_test_split` (check_test_set=True).
    """
    # Base case -- training split larger than 70% of the dataset. Checked first
    # because it is cheap, while the validation/test search below is recursive.
    if selected_datasize/len(data.labels) > 0.7:
        return False

    can_valid_test_set_be_found = set(speaker_to_annotators.keys()) - set(selected_speakers)
    if check_test_set:
        can_valid_test_set_be_found = select_val_test_split(can_valid_test_set_be_found)

    # Base case -- nothing usable is left for validation/test
    if not can_valid_test_set_be_found:
        return False

    # Base case -- no uncovered annotators remain
    if not len(uncovered_annotators):
        return selected_speakers, uncovered_annotators, selected_datasize, can_valid_test_set_be_found

    # Base case -- selected all speakers
    if len(selected_speakers) == len(speaker_to_annotators):
        return selected_speakers, uncovered_annotators, selected_datasize, can_valid_test_set_be_found

    # Recursive case -- try every admissible next speaker and keep the extension
    # that leaves the fewest annotators uncovered (ties: the larger training split).
    best_score = None
    best_speakers = None
    best_size = None
    best_test_set = None
    choices = list(speaker_to_annotators.keys())
    random.shuffle(choices)  # candidate order only matters for breaking exact ties
    loop_value = tqdm(choices) if depth < 1 else choices
    last_chosen = selected_speakers[-1] if len(selected_speakers) else ''
    # M01 and F02 are never added to the training split (hard-coded exclusion).
    unavailable = selected_speakers + ['M01', 'F02']
    for speaker in loop_value:
        if speaker in unavailable:
            continue
        if ('M' in speaker and 'M' in last_chosen) or ('F' in speaker and 'F' in last_chosen):
            continue # Alternate between choosing a male and female speaker to ensure gender balance in train/val/test
        res = get_annotators(selected_speakers + [speaker], uncovered_annotators - speaker_to_annotators[speaker], selected_datasize + speaker_to_datasize[speaker], depth+1, check_test_set=check_test_set)
        if not res:
            continue # False means too much data was selected or no validation/test split exists
        possible_speakers, possible_score, possible_size, valid_test_set = res
        if not valid_test_set:
            continue
        if (
            best_speakers is None
            or len(possible_score) < len(best_score)
            or (len(possible_score) == len(best_score) and possible_size > best_size)
        ):
            best_speakers = possible_speakers
            best_score = possible_score
            best_size = possible_size
            best_test_set = valid_test_set
    if best_speakers is None:
        # No extension was acceptable: fall back to the current selection.
        best_speakers = selected_speakers
        best_size = selected_datasize
        best_score = uncovered_annotators
        best_test_set = can_valid_test_set_be_found
    if depth < 2: # Only print this for the parent and sub-parent
        # set().union(...) also copes with an empty speaker list.
        covered_annotators = set().union(*[speaker_to_annotators[speaker] for speaker in best_speakers])
        print('Given', selected_speakers, f'best choice leaves {len(best_score)} annotators uncovered and covers {len(covered_annotators)}. Train split size: {100*best_size/len(data.labels):.2f}%. Validation/Test split: {best_test_set}')
    return best_speakers, best_score, best_size, best_test_set

def select_val_test_split(possible_speakers, selected_speakers=([],[]), datasizes=(0,0)):
    """Recursively assign speakers to validation and test, one pair per call.

    Reads the module-level `data` and `speaker_to_datasize`.

    Args:
        possible_speakers: speakers not yet assigned to validation or test.
        selected_speakers: pair `(val_speakers, test_speakers)` chosen so far.
        datasizes: pair `(val_size, test_size)` of utterance counts so far.

    Returns:
        False when no acceptable assignment exists; otherwise the tuple
        `(selected_speakers, datasizes, leftover_speakers)` whose validation
        and test sizes are closest to each other.
    """
    def undersized(size):
        # A split must hold at least 15% of all utterances.
        return size/len(data.labels) < 0.15

    def same_gender(speaker, previous):
        # Speaker ids carry an 'M' or 'F'; an empty `previous` never matches.
        return ('M' in speaker and 'M' in previous) or ('F' in speaker and 'F' in previous)

    val_speakers, test_speakers = selected_speakers
    val_size, test_size = datasizes
    remaining = len(possible_speakers)

    # Base case -- no speaker left: both splits must already be large enough
    if remaining == 0:
        if undersized(val_size) or undersized(test_size):
            return False
        return selected_speakers, datasizes, possible_speakers

    # Base case -- exactly one speaker remaining
    if remaining == 1:
        val_short = undersized(val_size)
        test_short = undersized(test_size)
        if val_short and test_short:
            return False
        # A single undersized split is acceptable only if the leftover speaker
        # would be enough to bring it up to the minimum size.
        for is_short, size in ((val_short, val_size), (test_short, test_size)):
            if is_short and undersized(size + speaker_to_datasize[next(iter(possible_speakers))]):
                return False
        return selected_speakers, datasizes, possible_speakers

    # Recursive case -- try every (validation, test) pair of distinct speakers.
    # Alternate between male and female speakers within each split to keep the
    # genders balanced.
    prev_val = val_speakers[-1] if val_speakers else ''
    prev_test = test_speakers[-1] if test_speakers else ''
    val_candidates = [
        spk for spk in possible_speakers
        if spk not in val_speakers and not same_gender(spk, prev_val)
    ]
    test_candidates = [
        spk for spk in possible_speakers
        if spk not in test_speakers and not same_gender(spk, prev_test)
    ]

    winner = None
    smallest_gap = 9999
    for val_speaker in val_candidates:
        for test_speaker in test_candidates:
            if test_speaker == val_speaker:
                continue
            outcome = select_val_test_split(
                set(possible_speakers) - {val_speaker, test_speaker},
                (val_speakers + [val_speaker], test_speakers + [test_speaker]),
                (val_size + speaker_to_datasize[val_speaker], test_size + speaker_to_datasize[test_speaker]),
            )
            if not outcome:
                continue # This pairing cannot be completed into an acceptable split
            gap = abs(outcome[1][0] - outcome[1][1])
            if gap < smallest_gap:
                winner = outcome
                smallest_gap = gap

    # False signals that no valid validation/test split was found
    return winner if winner is not None else False

# Search for the training speakers that cover as many of the remaining
# annotators as possible, starting from a fixed initial speaker.
# get_annotators shuffles its candidate list; seeding makes that order repeatable.
random.seed(0)
initial_speaker = 'F05' # Hard-coded starting speaker to build the train/test sets around
selected_speakers = [initial_speaker]
# Annotators tied to a single speaker are left out of the coverage target.
uncovered_annotators = set(data.individual_annotators.keys()) - set(ignored_annotators.keys()) - set(speaker_to_annotators[initial_speaker])
selected_datasize = speaker_to_datasize[initial_speaker]
res = get_annotators(selected_speakers, uncovered_annotators, selected_datasize, check_test_set=True)
print('--- res')
print(res)

 25%|██▌       | 3/12 [00:00<00:02,  4.10it/s]

Given ['F05', 'M04'] best choice leaves 5 annotators uncovered and covers 1434. Train split size: 69.34%. Validation/Test split: ((['F04', 'M05'], ['M01', 'F02']), (1300, 1287), set())


 58%|█████▊    | 7/12 [00:01<00:01,  4.92it/s]

Given ['F05', 'M05'] best choice leaves 5 annotators uncovered and covers 1430. Train split size: 68.48%. Validation/Test split: ((['M03', 'F04'], ['M01', 'F02']), (1373, 1287), set())


 67%|██████▋   | 8/12 [00:02<00:01,  3.28it/s]

Given ['F05', 'M03'] best choice leaves 5 annotators uncovered and covers 1434. Train split size: 69.34%. Validation/Test split: ((['F04', 'M05'], ['M01', 'F02']), (1300, 1287), set())


 75%|███████▌  | 9/12 [00:02<00:01,  2.53it/s]

Given ['F05', 'M06'] best choice leaves 5 annotators uncovered and covers 1434. Train split size: 69.34%. Validation/Test split: ((['F04', 'M05'], ['M01', 'F02']), (1300, 1287), set())


100%|██████████| 12/12 [00:03<00:00,  3.29it/s]

Given ['F05', 'M02'] best choice leaves 5 annotators uncovered and covers 1434. Train split size: 69.34%. Validation/Test split: ((['F04', 'M05'], ['M01', 'F02']), (1300, 1287), set())
Given ['F05'] best choice leaves 5 annotators uncovered and covers 1434. Train split size: 69.34%. Validation/Test split: ((['F04', 'M05'], ['M01', 'F02']), (1300, 1287), set())
--- res
(['F05', 'M04', 'F03', 'M02', 'F01', 'M06', 'F06', 'M03'], {'5gcbrspms2hdl9do2n528pfal5-p3b', 'A36L397WEN8VRN-p_spon', '7s9qdc35a9mfoedulqepo1bip6-p3b', 'A1RWNYJA5X25YH-p1', 'qrbki0m9qs4p788ipamfb1r254-p3a'}, 5851, ((['F04', 'M05'], ['M01', 'F02']), (1300, 1287), set()))





# Final data split: 
Train: F01,F03,F05,F06,M02,M03,M04,M06
Validation: F02,M01
Test: F04,M05

Doesn't cover 5 annotators that are shared between at least two speakers, and doesn't cover annotators that occur for only one speaker.