# Silver Standard handling
Split the SSC into smaller chunks so that pretraining on it can be performed efficiently.
First we check how much silver data is available overall.
Since the dataset is extremely large, it's not feasible to train on all articles in every epoch.
Instead, each epoch uses all positive samples together with an equal number of randomly drawn negative samples (drawn without replacement), and epochs are added until all negative samples have been seen.


In [None]:
# Map every SSC input file to the list of raw lines read from it
# (line endings are kept, exactly as the file iterator yields them).
buckets = {
    path: []
    for path in (
        '../../SoSciSoCi-SSC/data/SSC_pos_samples_data.txt',
        '../../SoSciSoCi-SSC/data/SSC_pos_samples_labels.txt',
        '../../SoSciSoCi-SSC/data/SSC_neg_samples_data.txt',
        '../../SoSciSoCi-SSC/data/SSC_neg_samples_labels.txt',
    )
}

# Read each file completely into its bucket.
for path, lines in buckets.items():
    with open(path, 'r') as handle:
        lines.extend(handle)

# Report the number of lines per file, in the order the files are listed above.
for lines in buckets.values():
    print(len(lines))

Based on the sizes of the positive and negative sets, we decided to split the data into 12 sets, in each of which every positive sample is accompanied by exactly one negative sample.

In [None]:
import random
random.seed(1)

NUM_SILVER_TRAIN = 12

# Short aliases for the in-memory sample lists held in `buckets`.
pos_texts = buckets['../../SoSciSoCi-SSC/data/SSC_pos_samples_data.txt']
pos_tags = buckets['../../SoSciSoCi-SSC/data/SSC_pos_samples_labels.txt']
neg_texts = buckets['../../SoSciSoCi-SSC/data/SSC_neg_samples_data.txt']

# Write one data/label file pair per epoch. Each pair contains every positive
# sample, each one directly followed by a negative sample drawn at random.
for epoch in range(NUM_SILVER_TRAIN):
    print("Creating set {}".format(epoch))
    data_path = '../data/merged_SSC_data_ep{}.txt'.format(epoch)
    labels_path = '../data/merged_SSC_labels_ep{}.txt'.format(epoch)
    with open(data_path, 'w') as merged_d, open(labels_path, 'w') as merged_l:
        for idx, (pos_text, pos_tag_line) in enumerate(zip(pos_texts, pos_tags)):
            if idx % 50000 == 0:
                print("At index {}".format(idx))
            merged_d.write(pos_text)
            merged_l.write(pos_tag_line)
            # Draw without replacement: the chosen negative is removed from the
            # list, so it cannot appear again in this or any later epoch.
            # random.randint raises ValueError once the list is empty.
            neg_text = neg_texts.pop(random.randint(0, len(neg_texts) - 1))
            merged_d.write(neg_text)
            # Negative samples get one "O" tag per whitespace-separated token,
            # with a leading blank and a trailing newline.
            merged_l.write(" " + "O " * len(neg_text.split()) + '\n')


A small number of negative samples are left over after this sampling:

In [None]:
# Number of negative samples that were never drawn into any merged set.
# The key must be the same path under which the negative data was stored in
# `buckets`; any other key raises KeyError.
len(buckets['../../SoSciSoCi-SSC/data/SSC_neg_samples_data.txt'])

# Perform a split for optimizing the pre-training

This split is only intended to test how well the model can learn from, and predict on, the silver standard.
For this purpose, the silver standard is split into a train set and a test set.

In [None]:
import random
random.seed(1)

size_of_silver_set = 25000

# NOTE(review): this cell removes samples from `buckets` in place, and the
# earlier epoch-building cell also pops from the same negative list.
# Presumably `buckets` is reloaded before running this section - confirm.
print("Creating the silver devel set")
pos_texts = buckets['../../SoSciSoCi-SSC/data/SSC_pos_samples_data.txt']
pos_tags = buckets['../../SoSciSoCi-SSC/data/SSC_pos_samples_labels.txt']
neg_texts = buckets['../../SoSciSoCi-SSC/data/SSC_neg_samples_data.txt']

with open('../data/merged_SSC_opt_test_data.txt', 'w') as merged_d, open('../data/merged_SSC_opt_test_labels.txt', 'w') as merged_l:
    for idx in range(size_of_silver_set):
        if idx % 50000 == 0:
            print("At index {}".format(idx))
        # Both indices are drawn before anything is removed; the positive
        # index is shared by the text and label lists to keep them aligned.
        pos_pick = random.randint(0, len(pos_texts) - 1)
        neg_pick = random.randint(0, len(neg_texts) - 1)
        held_out_text = pos_texts.pop(pos_pick)
        held_out_tags = pos_tags.pop(pos_pick)
        merged_d.write(held_out_text)
        merged_l.write(held_out_tags)
        held_out_neg = neg_texts.pop(neg_pick)
        merged_d.write(held_out_neg)
        # Negative samples get one "O" tag per whitespace-separated token,
        # with a leading blank and a trailing newline.
        merged_l.write(" " + "O " * len(held_out_neg.split()) + '\n')

NUM_SILVER_TRAIN = 14

# Short aliases for the sample lists still held in `buckets`.
remaining_pos_texts = buckets['../../SoSciSoCi-SSC/data/SSC_pos_samples_data.txt']
remaining_pos_tags = buckets['../../SoSciSoCi-SSC/data/SSC_pos_samples_labels.txt']
remaining_neg_texts = buckets['../../SoSciSoCi-SSC/data/SSC_neg_samples_data.txt']

# Write one train data/label file pair per epoch: every positive sample still
# in the bucket, each one directly followed by a randomly drawn negative.
for epoch in range(NUM_SILVER_TRAIN):
    print("Creating set {}".format(epoch))
    train_data_path = '../data/merged_SSC_opt_train_data_ep{}.txt'.format(epoch)
    train_labels_path = '../data/merged_SSC_opt_train_labels_ep{}.txt'.format(epoch)
    with open(train_data_path, 'w') as merged_d, open(train_labels_path, 'w') as merged_l:
        for idx, (pos_text, pos_tag_line) in enumerate(zip(remaining_pos_texts, remaining_pos_tags)):
            if idx % 50000 == 0:
                print("At index {}".format(idx))
            merged_d.write(pos_text)
            merged_l.write(pos_tag_line)
            # Draw without replacement; random.randint raises ValueError once
            # the negative list has been emptied.
            pick = random.randint(0, len(remaining_neg_texts) - 1)
            neg_text = remaining_neg_texts.pop(pick)
            merged_d.write(neg_text)
            # One "O" tag per whitespace-separated token of the negative
            # sample, with a leading blank and a trailing newline.
            token_count = len(neg_text.split())
            merged_l.write(" " + "O " * token_count + '\n')