# Silver Standard handling
Correct the silver standard and split it into smaller chunks to efficiently perform pretraining on it. 

When exporting the silver standard data, some labeling irregularities were introduced. These can easily be fixed automatically:

In [None]:
# NOTE(review): this is the only path below '../data/data/'; the two label
# paths use '../data/' directly -- confirm the doubled directory is intended.
silver_standard_data = '../data/data/pos_silver_samples_cor_data.txt'
silver_standard_labels = '../data/pos_silver_samples_cor_labels.txt'
silver_standard_labels_cor = '../data/pos_silver_samples_corrected_labels.txt'


def _correct_label_sequence(labels):
    """Repair orphaned 'I-software' labels in one label sequence.

    An 'I-software' label is only kept when it directly follows a
    'B-software' or another kept 'I-software'; otherwise it is replaced
    by 'O'. All other labels are passed through unchanged so that the
    result always has exactly one label per input label. A label other
    than 'B-software'/'I-software' ends the current software span.

    Returns a tuple ``(corrected_labels, was_changed)``.
    """
    corrected = []
    was_changed = False
    inside_software = False
    for label in labels:
        if label == 'I-software' and not inside_software:
            # An inside-tag without a preceding begin-tag: downgrade to 'O'.
            corrected.append('O')
            was_changed = True
        else:
            corrected.append(label)
            inside_software = label in ('B-software', 'I-software')
    return corrected, was_changed


# Number of label lines in which at least one label had to be replaced.
error_count = 0
with open(silver_standard_data, 'r') as data_file, open(silver_standard_labels, 'r') as labels_file, open(silver_standard_labels_cor, 'w') as labels_file_cor:
    # Walk both files in lockstep; processing stops at the end of the shorter one.
    for data_line, labels_line in zip(data_file, labels_file):
        tokens = data_line.split()
        # Only labels that have a matching token are considered. Blank lines
        # yield an empty sequence and are written out as an empty line
        # instead of raising an IndexError.
        labels = labels_line.split()[:len(tokens)]
        corrected, was_changed = _correct_label_sequence(labels)

        # Output format: every label is preceded by a single space.
        output_labels = ''.join(' ' + label for label in corrected) + '\n'
        if was_changed:
            error_count += 1
            print("Replace")
            print(labels_line)
            print(output_labels)
            print(len(data_line.split()))
            print(len(output_labels.split()))

        labels_file_cor.write(output_labels)

Next, we have to look up how much silver data is available overall. 
Since the dataset is extremely large, it is not feasible to train on all articles in each epoch. 
Instead, in each epoch we train on all positive samples together with randomly sampled negative samples, and we repeat this until all negative samples have been seen.

In [None]:
# Maps each silver standard file path to the list of its lines
# (line endings are kept, exactly as read from the file).
buckets = {
    '../data/pos_silver_samples_cor_data.txt': [],
    '../data/pos_silver_samples_cor_labels.txt': [],
    '../data/neg_silver_samples_data.txt': []
}

# Load every file completely into its bucket.
for path, lines in buckets.items():
    with open(path, 'r') as handle:
        lines.extend(handle)

# Report the number of lines found in each file.
for lines in buckets.values():
    print(len(lines))

Based on the size of our sets, we decided to split them into 12 sets, where each positive sample is always accompanied by one negative sample.

In [None]:
import random
random.seed(1)

NUM_SILVER_TRAIN = 12

# `buckets` is keyed by the exact paths the files were loaded from
# ('../data/...'); indexing it with 'data/...' raises a KeyError.
pos_data = buckets['../data/pos_silver_samples_cor_data.txt']
pos_labels = buckets['../data/pos_silver_samples_cor_labels.txt']
neg_data = buckets['../data/neg_silver_samples_data.txt']

# Each set contains every positive sample, each followed by one randomly
# drawn negative sample. Drawn negatives are removed from the pool, so a
# negative sample is never written to more than one set.
# NOTE(review): outputs are written below 'data/' relative to the working
# directory, while the inputs were read from '../data/' -- confirm.
for i in range(NUM_SILVER_TRAIN):
    print("Creating set {}".format(i))
    with open('data/merged_silver_standard_data_ep{}.txt'.format(i), 'w') as merged_d, open('data/merged_silver_standard_labels_ep{}.txt'.format(i), 'w') as merged_l:
        for idx, (x, y) in enumerate(zip(pos_data, pos_labels)):
            if idx % 50000 == 0:
                print("At index {}".format(idx))
            merged_d.write(x)
            merged_l.write(y)
            if not neg_data:
                # Fail with a clear message instead of randint's "empty range".
                raise ValueError("Ran out of negative samples while creating set {}".format(i))
            index_to_pop = random.randint(0, len(neg_data) - 1)
            text_string = neg_data.pop(index_to_pop)
            merged_d.write(text_string)
            # A negative sample has no software mention: one 'O' per token.
            merged_l.write(" " + "O " * len(text_string.split()) + '\n')


In [None]:
# Number of negative samples still left in the pool. The key has to match
# the path the bucket was created with ('../data/...').
len(buckets['../data/neg_silver_samples_data.txt'])

# Perform a split for optimizing the pre-training

This split is intended to just test how well the model is able to learn on the silver standard and to predict on the silver standard. 
For this purpose the silver standard is split into a train and test set. 

In [None]:
import random
random.seed(1)

size_of_silver_set = 25000

# `buckets` is keyed by the exact paths the files were loaded from
# ('../data/...'); indexing it with 'data/...' raises a KeyError.
pos_data = buckets['../data/pos_silver_samples_cor_data.txt']
pos_labels = buckets['../data/pos_silver_samples_cor_labels.txt']
neg_data = buckets['../data/neg_silver_samples_data.txt']

# Draw `size_of_silver_set` random positive/negative pairs for the test set.
# The drawn samples are removed from the buckets, so they cannot show up
# again in anything that is built from the buckets afterwards.
# NOTE(review): outputs are written below 'data/' relative to the working
# directory, while the inputs were read from '../data/' -- confirm.
print("Creating the silver devel set")
with open('data/merged_silver_opt_test_data.txt', 'w') as merged_d, open('data/merged_silver_opt_test_labels.txt', 'w') as merged_l:
    for idx in range(size_of_silver_set):
        if idx % 50000 == 0:
            print("At index {}".format(idx))
        if not pos_data or not neg_data:
            # Fail with a clear message instead of randint's "empty range".
            raise ValueError("Ran out of samples after {} test pairs".format(idx))
        pos_index_to_pop = random.randint(0, len(pos_data) - 1)
        neg_index_to_pop = random.randint(0, len(neg_data) - 1)
        # Text and labels of a positive sample share the same index.
        pos_text_string = pos_data.pop(pos_index_to_pop)
        pos_text_label = pos_labels.pop(pos_index_to_pop)
        merged_d.write(pos_text_string)
        merged_l.write(pos_text_label)
        neg_text_string = neg_data.pop(neg_index_to_pop)
        merged_d.write(neg_text_string)
        # A negative sample has no software mention: one 'O' per token.
        merged_l.write(" " + "O " * len(neg_text_string.split()) + '\n')

NUM_SILVER_TRAIN = 14

# `buckets` is keyed by the exact paths the files were loaded from
# ('../data/...'); indexing it with 'data/...' raises a KeyError.
pos_data = buckets['../data/pos_silver_samples_cor_data.txt']
pos_labels = buckets['../data/pos_silver_samples_cor_labels.txt']
neg_data = buckets['../data/neg_silver_samples_data.txt']

# Each training set contains every positive sample still in the bucket,
# each followed by one randomly drawn negative sample. Drawn negatives are
# removed from the pool, so they are not reused in a later set.
# NOTE(review): outputs are written below 'data/' relative to the working
# directory, while the inputs were read from '../data/' -- confirm.
for i in range(NUM_SILVER_TRAIN):
    print("Creating set {}".format(i))
    with open('data/merged_silver_opt_train_data_ep{}.txt'.format(i), 'w') as merged_d, open('data/merged_silver_opt_train_labels_ep{}.txt'.format(i), 'w') as merged_l:
        for idx, (x, y) in enumerate(zip(pos_data, pos_labels)):
            if idx % 50000 == 0:
                print("At index {}".format(idx))
            merged_d.write(x)
            merged_l.write(y)
            if not neg_data:
                # Fail with a clear message instead of randint's "empty range".
                raise ValueError("Ran out of negative samples while creating set {}".format(i))
            index_to_pop = random.randint(0, len(neg_data) - 1)
            text_string = neg_data.pop(index_to_pop)
            merged_d.write(text_string)
            # A negative sample has no software mention: one 'O' per token.
            merged_l.write(" " + "O " * len(text_string.split()) + '\n')

In [None]:
# `buckets` is keyed by the exact paths the files were loaded from
# ('../data/...'); indexing it with 'data/...' raises a KeyError.
pos_data = buckets['../data/pos_silver_samples_cor_data.txt']
pos_labels = buckets['../data/pos_silver_samples_cor_labels.txt']
neg_data = buckets['../data/neg_silver_samples_data.txt']

# (Re-)create only the training sets 12 and 13, using the negative samples
# that are still left in the pool.
# NOTE(review): outputs are written below 'data/' relative to the working
# directory, while the inputs were read from '../data/' -- confirm.
for i in range(12, 14):
    print("Creating set {}".format(i))
    with open('data/merged_silver_opt_train_data_ep{}.txt'.format(i), 'w') as merged_d, open('data/merged_silver_opt_train_labels_ep{}.txt'.format(i), 'w') as merged_l:
        for idx, (x, y) in enumerate(zip(pos_data, pos_labels)):
            if idx % 50000 == 0:
                print("At index {}".format(idx))
            merged_d.write(x)
            merged_l.write(y)
            if not neg_data:
                # Fail with a clear message instead of randint's "empty range".
                raise ValueError("Ran out of negative samples while creating set {}".format(i))
            index_to_pop = random.randint(0, len(neg_data) - 1)
            text_string = neg_data.pop(index_to_pop)
            merged_d.write(text_string)
            # A negative sample has no software mention: one 'O' per token.
            merged_l.write(" " + "O " * len(text_string.split()) + '\n')