In [None]:
import os
import random

import numpy as np
import pandas as pd

In [None]:
# Switch the working directory to the parent folder: the cells below use paths
# relative to it ('data/...' for input, 'output/...' for results).
# NOTE: re-running this cell moves up one more level each time, so run it only
# once per kernel session.
%cd ..

In [None]:
# Read each dataset and drop its duplicate records in one step.
# Nagtegaal and Hall: keep only the rows whose `duplicate_record_id` is empty.
nudging = pd.read_csv('data/Nagtegaal_2019.csv')
nudging = nudging.loc[nudging['duplicate_record_id'].isna()]

software = pd.read_csv('data/Hall_2012.csv')
software = software.loc[software['duplicate_record_id'].isna()]

# Brouwer: rows sharing title, abstract and author count as duplicates;
# the first occurrence is kept.
brouwer = pd.read_csv('data/Brouwer_2019.csv').drop_duplicates(
    subset=['title', 'abstract', 'author']
)

In [None]:
# Give the Brouwer frame the column names the sampling code relies on:
# the label column becomes `label_included`, and every row gets a
# 1-based running `record_id`.
brouwer.rename(columns={'included': 'label_included'}, inplace=True)
brouwer['record_id'] = range(1, len(brouwer) + 1)

In [None]:
def generate_sample_sets(df, output_name, sample_sizes = [200,400], n_prior_sets = 1, shuffle_seed = 10):
    """Write down-sampled copies of a labelled dataset, each led by one prior pair.

    The included (``label_included == 1``) and excluded (``label_included == 0``)
    records are shuffled with a fixed seed. The first ``n_prior_sets`` records of
    each class are set aside as priors; prior set ``i`` is the i-th included
    record followed by the i-th excluded record. For every requested sample size
    the remaining records are down-sampled so that the share of included records
    matches the share in the remaining pool, and one CSV file is written per
    (sample size, prior set) combination.

    Parameters
    ----------
    df : pandas.DataFrame
        Labelled records; must contain a ``label_included`` column.
    output_name : str
        File name prefix. Files are written to
        ``output/preprocessed_data/<output_name>_s<sample_size>_p<prior_no>.csv``.
    sample_sizes : iterable of int
        Number of non-prior records per file. When more records are requested
        than are available, all available records are used. The default list is
        only iterated, never modified.
    n_prior_sets : int
        Number of distinct prior pairs, i.e. number of files per sample size.
    shuffle_seed : int
        Seed passed to ``random.seed`` before shuffling.

    Raises
    ------
    ValueError
        If ``df`` has fewer than ``n_prior_sets`` included or excluded records.

    Notes
    -----
    Each file holds the prior pair in its first two rows, then the sampled
    included records, then the sampled excluded records. ``record_id`` is
    renumbered 1..n in that order, and the frame index (0..n-1) is written as
    the first, unnamed CSV column.
    """
    # create list of included and excluded indices
    incl = list(df[df.label_included == 1].index)
    excl = list(df[df.label_included == 0].index)

    # fail early with a clear message instead of an IndexError further down
    if n_prior_sets > min(len(incl), len(excl)):
        raise ValueError(
            'n_prior_sets={} requires that many included and excluded records, '
            'but only {} included and {} excluded records are available'.format(
                n_prior_sets, len(incl), len(excl)))

    # shuffle (seeded, so repeated runs select the same priors and samples)
    random.seed(shuffle_seed)
    random.shuffle(incl)
    random.shuffle(excl)

    # remove n_prior_sets relevant and irrelevant records from the set to use as priors
    priors_incl = df.loc[incl[0:n_prior_sets]]
    priors_excl = df.loc[excl[0:n_prior_sets]]
    incl = incl[n_prior_sets:]
    excl = excl[n_prior_sets:]

    # create dataframes containing the prior combinations;
    # .iloc[[i]] keeps a one-row DataFrame, so column dtypes are preserved
    prior_dfs = [
        pd.concat([priors_incl.iloc[[i]], priors_excl.iloc[[i]]])
        for i in range(n_prior_sets)
    ]

    # number of relevant and irrelevant records in set
    n_incl = len(incl)
    n_excl = len(excl)
    n_total = n_incl + n_excl
    # an empty pool (everything was used as prior) yields prior-only files
    incl_fraction = n_incl / n_total if n_total else 0.0

    # make sure the target directory exists before writing into it
    output_prefix = 'output/preprocessed_data/' + output_name
    os.makedirs(os.path.dirname(output_prefix), exist_ok=True)

    # downsampling
    for sample_size in sample_sizes:
        sample_n_incl = int(np.round(incl_fraction * sample_size))
        sample_n_excl = sample_size - sample_n_incl

        df_incl = df.loc[incl[0:sample_n_incl]]
        df_excl = df.loc[excl[0:sample_n_excl]]

        for i, prior_df in enumerate(prior_dfs):
            # DataFrame.append no longer exists in pandas >= 2.0; concat is the replacement
            temp = pd.concat([prior_df, df_incl, df_excl]).reset_index(drop=True)
            # item assignment also works when df has no record_id column yet
            temp['record_id'] = list(range(1, len(temp) + 1))
            temp.to_csv(output_prefix + '_s{}_p{}.csv'.format(sample_size, str(i + 1)))

In [None]:
# write the nudging samples (Nagtegaal 2019) at sizes 200, 400, 800 and 1600
generate_sample_sets(
    df=nudging,
    output_name='nudging',
    sample_sizes=[200, 400, 800, 1600],
)

In [None]:
# write the software samples (Hall 2012), doubling the size from 200 up to 6400
generate_sample_sets(
    df=software,
    output_name='software',
    sample_sizes=[200, 400, 800, 1600, 3200, 6400],
)

In [None]:
# write the Brouwer 2019 samples: doubling sizes from 1600 to 25600, plus 48975
# NOTE(review): 48975 presumably equals the full de-duplicated dataset size - confirm
generate_sample_sets(
    df=brouwer,
    output_name='brouwer',
    sample_sizes=[1600, 3200, 6400, 12800, 25600, 48975],
)