# Managing the data used for training and evaluating the model

In [None]:
import random
import math
import re
import json
import pickle
import pandas as pd

from os import listdir, remove, makedirs
from os.path import join, exists

random.seed(42)

In [None]:
# Load the predefined mapping from split name ('train' / 'devel' / 'test')
# to the article files that belong to that split.
with open('../data/SoSciSoCi_train_dev_test_split.json', 'r') as json_file:
    raw_split = json_file.read()
    train_test_split_unique = json.loads(raw_split)

We first have to clean up the SoSci data a little; afterwards we can split it into the distinct train, devel and test sets.

In [None]:
def _write_sentence(tokens, labels, text_file, labels_file):
    """Write one sentence and its labels, each as a single line.

    Every token (and every label) is followed by one space, so a non-empty
    line ends with a trailing space and an empty sentence produces an empty
    line. This is the exact format the original string concatenation emitted.
    """
    text_file.write(''.join(token + ' ' for token in tokens) + '\n')
    labels_file.write(''.join(label + ' ' for label in labels) + '\n')


def clean_up_sosci(name, include_pos_samples=False):
    """Split a BIO-annotated corpus into train/devel/test text and label files.

    Reads ``../data/<name>_bio.txt``, where each document starts with a
    ``-DOCSTART-:<file name>`` line, each following non-blank line holds a
    token and its label separated by whitespace, and a blank line ends a
    sentence. Each sentence is written as one line of space-separated tokens
    to the ``*_data.txt`` file and one line of space-separated labels to the
    ``*_labels.txt`` file of the split its document is listed under in the
    module-level ``train_test_split_unique`` mapping.

    Args:
        name: Corpus prefix used to build the input and output file names.
        include_pos_samples: If true, the sentences of
            ``../data/positive_samples_bio.txt`` are appended to the training
            files and the training files are named ``*_train_with_pos_*``
            instead of ``*_train_no_pos_*``. The positive sample file is only
            opened when this flag is set.

    Raises:
        IndexError: If a non-blank, non-DOCSTART line has fewer than two
            whitespace-separated fields.
    """
    # Local import keeps the block self-contained.
    from contextlib import ExitStack

    train_name = '_train_with_pos_' if include_pos_samples else '_train_no_pos_'

    with ExitStack() as stack:
        def _open(file_name, mode):
            return stack.enter_context(open('../data/' + file_name, mode))

        # Open the inputs first so that a missing input fails before any
        # output file is created or truncated.
        read_file = _open('{}_bio.txt'.format(name), 'r')
        pos_sample_file = (_open('positive_samples_bio.txt', 'r')
                           if include_pos_samples else None)

        train_files = (_open('{}{}data.txt'.format(name, train_name), 'w'),
                       _open('{}{}labels.txt'.format(name, train_name), 'w'))
        devel_files = (_open('{}_devel_data.txt'.format(name), 'w'),
                       _open('{}_devel_labels.txt'.format(name), 'w'))
        test_files = (_open('{}_test_data.txt'.format(name), 'w'),
                      _open('{}_test_labels.txt'.format(name), 'w'))
        # Checked in this order, matching the original if/elif chain.
        split_outputs = (('train', train_files),
                         ('devel', devel_files),
                         ('test', test_files))

        # (text_file, labels_file) of the current document's split, or None
        # while the current document is unknown / no document has started.
        target = None
        tokens, labels = [], []

        for line in read_file:
            if line.startswith('-DOCSTART-'):
                # A sentence not terminated by a blank line still belongs to
                # the previous document; do not let it leak into this one.
                if tokens and target is not None:
                    _write_sentence(tokens, labels, *target)
                tokens, labels = [], []

                current_file = line.split(':')[1].rstrip('\n')
                # Reset first so an unknown document can never inherit the
                # split of the document before it.
                target = None
                for split_name, files in split_outputs:
                    if current_file in train_test_split_unique[split_name]:
                        target = files
                        break
                else:
                    print("Unknown file '{}'. This should be an error."
                          .format(current_file))
            elif not line.strip():
                # Blank line: end of sentence. An empty sentence (e.g. the
                # blank line right after a DOCSTART line) is still written as
                # an empty line, as before.
                if target is not None:
                    _write_sentence(tokens, labels, *target)
                tokens, labels = [], []
            else:
                sep = line.split()
                tokens.append(sep[0])
                labels.append(sep[1])

        # Flush the last sentence if the file does not end with a blank line.
        if tokens and target is not None:
            _write_sentence(tokens, labels, *target)

        if pos_sample_file is not None:
            tokens, labels = [], []
            for line in pos_sample_file:
                if line.startswith('-DOCSTART-'):
                    continue
                if not line.strip():
                    _write_sentence(tokens, labels, *train_files)
                    tokens, labels = [], []
                else:
                    sep = line.split()
                    tokens.append(sep[0])
                    labels.append(sep[1])
            # Same end-of-file flush for the positive samples.
            if tokens:
                _write_sentence(tokens, labels, *train_files)

In [None]:
# Re-seed the RNG, then build the train/devel/test files for the SoSciSoCi
# corpus with the positive samples appended to the training set.
random.seed(42)
clean_up_sosci('SoSciSoCi', include_pos_samples=True)

## Handling the data used for reasoning

In [None]:
# Partition the reasoning articles into NUM_SETS roughly equal chunks and
# pickle each chunk's list of file names to reasoning_sets/reasoning_set_<i>.p.
NUM_SETS = 40
# listdir returns entries in arbitrary order; sort so that the partition is
# the same on every run and platform.
reasoning_files = sorted(listdir('../data/R_loading/SENTS'))
print("Reasoning on {} articles.".format(len(reasoning_files)))

makedirs('reasoning_sets', exist_ok=True)

# Round up so that all files are covered; trailing sets may be smaller or empty.
files_per_set = math.ceil(len(reasoning_files) / NUM_SETS)

for i in range(NUM_SETS):
    files = reasoning_files[i * files_per_set:(i + 1) * files_per_set]
    # Use a context manager so every pickle file is flushed and closed.
    with open("reasoning_sets/reasoning_set_{}.p".format(i), "wb") as set_file:
        pickle.dump(files, set_file)