In [16]:
import pandas as pd

# Base names of the input result files; the ".csv" extension and the
# directory are added where the files are read.
data_files = [
    'results-syn-hamming',
    'results-syn-leven',
    'results-syn-jaro',
    'results-syn-lcsstr',
    'results-syn-sorensen',
    'results-syn-jaccard',
    'results-syn-tversky',
]

# Collects one sampled dataframe per input data file.
composite_samples = []

# define function to calculate sample size and proportion using Slovin's formula
def get_sample_size_and_proportion(population_df, error_tolerance, stratification_variable):
    """Compute a sample size with Slovin's formula and the matching sampling fraction.

    Slovin's formula is ``n = N / (1 + N * e**2)`` where ``N`` is the
    population size and ``e`` the error tolerance.

    Parameters
    ----------
    population_df : pandas.DataFrame
        Population to sample from. Only ``len(population_df)`` is used.
    error_tolerance : float
        Margin of error ``e`` in Slovin's formula.
    stratification_variable : str
        Not used by the computation; kept so existing callers keep working.

    Returns
    -------
    dict
        ``{'size': n, 'proportion': n / N}``. ``size`` is the unrounded float
        produced by the formula. For an empty population both values are
        ``0.0``.
    """
    population_size = len(population_df)

    if population_size == 0:
        # n / N would be 0 / 0 here; an empty population has nothing to sample.
        sample_size = 0.0
        sample_proportion = 0.0
    else:
        sample_size = population_size / (1 + (population_size * (error_tolerance ** 2)))
        sample_proportion = sample_size / population_size

    print()
    print('sample size: ', sample_size)
    print()
    return {'size': sample_size, 'proportion': sample_proportion}

def get_sample(population_df, error_tolerance, stratification_variable, random_state=None):
    """Draw a stratified sample from each context-size band of ``population_df``.

    Steps:
      1. Drop rows whose ``similarity_score`` is 1.0 or higher (exact matches
         need no human judgement).
      2. Split the remainder into three bands by ``context_size``:
         low (<= 3), medium (> 3 and <= 6) and high (> 6).
      3. Drop rows whose ``source_token`` is ``'t'`` (the "'t' anomaly").
      4. Add a ``context_pair_to_compare`` column joining
         ``source_context_inline`` and ``target_context_inline`` with ``' | '``.
      5. For each band, get the sampling fraction from
         ``get_sample_size_and_proportion`` and sample that fraction from every
         group of ``stratification_variable``.
      6. Concatenate the three band samples with a fresh index.

    Row counts per band are printed after steps 2 and 3, and the number of
    unique context pairs per band after step 4.

    Parameters
    ----------
    population_df : pandas.DataFrame
        Must contain the columns ``similarity_score``, ``context_size``,
        ``source_token``, ``source_context_inline``, ``target_context_inline``
        and the column named by ``stratification_variable``. Not modified.
    error_tolerance : float
        Passed through to ``get_sample_size_and_proportion``.
    stratification_variable : str
        Column whose distinct values define the strata within each band.
    random_state : int, numpy.random.Generator/RandomState or None, optional
        Seed for the sampling. ``None`` (the default) keeps the sampling
        unseeded, i.e. different on every call.

    Returns
    -------
    pandas.DataFrame
        Low, medium and high band samples stacked in that order.
    """
    # remove all rows that have perfectly matching scores of 1
    candidates = population_df[population_df['similarity_score'] < 1.0]

    # partition data into low, medium and high context sizes (in that order)
    context_size = candidates['context_size']
    bands = [
        candidates[context_size <= 3],
        candidates[(context_size > 3) & (context_size <= 6)],
        candidates[context_size > 6],
    ]
    for band in bands:
        print(len(band))
    print()

    # remove 't' anomaly
    bands = [band[band['source_token'] != 't'] for band in bands]
    for band in bands:
        print(len(band))
    print()

    # Create a column with both contexts in a single string (to count unique
    # pairs). ``assign`` returns a new frame, so nothing is written onto a
    # filtered slice of the caller's dataframe.
    bands = [
        band.assign(
            context_pair_to_compare=band['source_context_inline'] + ' | ' + band['target_context_inline']
        )
        for band in bands
    ]
    for band in bands:
        print(band['context_pair_to_compare'].nunique())

    # Perform stratified sampling according to the stratification variable.
    # NOTE(review): a group whose size times the fraction rounds to zero
    # contributes no rows, so the realised sample can be smaller than the
    # Slovin sample size -- confirm this is intended.
    band_samples = []
    for band in bands:
        if band.empty:
            # Nothing to sample; also avoids a division by zero when
            # computing the proportion for an empty band.
            band_samples.append(band)
            continue
        stats = get_sample_size_and_proportion(band, error_tolerance, stratification_variable)
        band_samples.append(
            band.groupby(stratification_variable, group_keys=False).sample(
                frac=stats['proportion'], random_state=random_state
            )
        )

    # Concatenate samples (for low, medium and high context sizes) together
    return pd.concat(band_samples, ignore_index=True)



In [17]:
# # Cochran's formula (90% confidence gives very similar results to Slovin's 80% confidence)
# e = 0.1
# z = 1.645
# p = 0.5
# q = 1 - p
# print(q)

# n = ((z ** 2) * p * q) / (e ** 2)

# big_N = 50000

# # final sample size (Cochran)
# n_final = n / (1 + ((n - 1)/big_N))
# n_final

In [18]:
# calculate and save representative sample to file
error_tolerance = 0.15
stratification_variable_column = 'similarity_score'

# Start from an empty list every time this cell runs; appending to a list
# created in another cell would duplicate every sample on a re-run.
composite_samples = []

# NOTE: the sampling is not seeded here, so each run writes a different sample.
for data_file in data_files:
    print(data_file)
    print()

    df = pd.read_csv("data/results/syntactic/" + data_file + ".csv")
    sample = get_sample(df, error_tolerance, stratification_variable_column)
    composite_samples.append(sample)

# Stack the per-file samples, write them out, and preview the first rows.
final_sample = pd.concat(composite_samples, ignore_index=True)
final_sample.to_csv('data/results/syntactic/representative-sample.csv')
final_sample.head()

results-syn-hamming

16484
21187
30492

16412
21106
30384

8760
13944
22824

sample size:  44.324411915629135


sample size:  44.35105119934438


sample size:  44.37952792708577

results-syn-leven

16484
21187
30492

16412
21106
30384

8760
13944
22824

sample size:  44.324411915629135


sample size:  44.35105119934438


sample size:  44.37952792708577

results-syn-jaro

16484
21187
30492

16412
21106
30384

8760
13944
22824

sample size:  44.324411915629135


sample size:  44.35105119934438


sample size:  44.37952792708577

results-syn-lcsstr

16484
21187
30492

16412
21106
30384

8760
13944
22824

sample size:  44.324411915629135


sample size:  44.35105119934438


sample size:  44.37952792708577

results-syn-sorensen

16437
21165
30471

16365
21084
30363

8734
13936
22815

sample size:  44.32406811795376


sample size:  44.350953953596


sample size:  44.379483094417665

results-syn-jaccard

16437
21165
30471

16365
21084
30363

8734
13936
22815

sample size:  44.32406811795376


s

Unnamed: 0,source_token,source,target,context_size,source_context_words,source_context_words_nstopw,source_context_inline,source_context_inline_nstopw,source_context_span,target_context_words,target_context_words_nstopw,target_context_inline,target_context_inline_nstopw,target_context_span,similarity_metric,similarity_score,context_pair_to_compare
0,greta,w1,w4,3,"['eerlijk', 'interieur', 'van', 'brenns', 'hui...","['eerlijk', 'interieur', 'brenns', 'huis']",eerlijk interieur van greta brenns huis daar,"['eerlijk', 'interieur', 'greta', 'brenns', 'h...","[2533, 5]","['de', 'rol', 'van', 'brenn', 'vertolkte', 'sp...","['rol', 'brenn', 'vertolkte', 'speciale']",de rol van greta brenn vertolkte speciale,"['rol', 'greta', 'brenn', 'vertolkte', 'specia...","[2954, 5]",hamming,0.0,eerlijk interieur van greta brenns huis daar |...
1,publiek,w1,w3,3,"['heeft', 'omdat', 'uw', 'haar', 'niet', 'aans...",['aanstond'],heeft omdat uw publiek haar niet aanstond,"['publiek', 'aanstond']","[2701, 7]","['ik', 'ga', 'het', 'verwittigen', 'dar', 'er']","['ga', 'verwittigen', 'dar']",ik ga het publiek verwittigen dar er,"['ga', 'publiek', 'verwittigen', 'dar']","[2724, 7]",hamming,0.0,heeft omdat uw publiek haar niet aanstond | ik...
2,versiering,w1,w3,3,"['en', 'tevens', 'alle', 'vermijdt', 'want', '...","['tevens', 'vermijdt']",en tevens alle versiering vermijdt want die,"['tevens', 'versiering', 'vermijdt']","[1326, 10]","['prijkten', 'als', 'goedkoope', 'en', 'ter', ...","['prijkten', 'goedkoope', 'bevordering']",prijkten als goedkoope versiering en ter bevor...,"['prijkten', 'goedkoope', 'versiering', 'bevor...","[1576, 10]",hamming,0.0,en tevens alle versiering vermijdt want die | ...
3,film,w1,w2,1,"['sprekende', 'maar']",['sprekende'],sprekende film maar,"['sprekende', 'film']","[184, 4]","['de', 'ziet']",['ziet'],de film ziet,"['film', 'ziet']","[2920, 4]",hamming,0.0,sprekende film maar | de film ziet
4,greta,w1,w2,2,"['rol', 'van', 'brenn', 'vertolkte']","['rol', 'brenn', 'vertolkte']",rol van greta brenn vertolkte,"['rol', 'greta', 'brenn', 'vertolkte']","[2834, 5]","['niet', 'dat', 'brenn', 'de']",['brenn'],niet dat greta brenn de,"['greta', 'brenn']","[2897, 5]",hamming,0.0,rol van greta brenn vertolkte | niet dat greta...


In [None]:
# # hamming_df = df[df['similarity_metric'] == 'hamming']
# # hamming_df_low.loc[(hamming_df_low['context_size'] <= 3) & (df_nonexact_matches['similarity_score'] <= 0.25)]
# hamming_df_low_context_size = df[df['context_size'] <= 3]
# hamming_df_medium_context_size = df.loc[(df['context_size'] > 3) & (df['context_size'] <= 6)]
# hamming_df_high_context_size = df[df['context_size'] > 6]
# print(len(hamming_df_low_context_size))
# print(len(hamming_df_medium_context_size))
# print(len(hamming_df_high_context_size))
# print()
# # remove 't' anomaly
# hamming_df_low_context_size = hamming_df_low_context_size[hamming_df_low_context_size['source_token'] != 't']
# print(len(hamming_df_low_context_size))
# hamming_df_medium_context_size = hamming_df_medium_context_size[hamming_df_medium_context_size['source_token'] != 't']
# print(len(hamming_df_medium_context_size))
# hamming_df_high_context_size = hamming_df_high_context_size[hamming_df_high_context_size['source_token'] != 't']
# print(len(hamming_df_high_context_size))



In [None]:
# hamming_df_low_context_size["context_pair_to_compare"] = hamming_df_low_context_size["source_context_inline"] + ' | ' + hamming_df_low_context_size["target_context_inline"]
# hamming_df_medium_context_size["context_pair_to_compare"] = hamming_df_medium_context_size["source_context_inline"] + ' | ' + hamming_df_medium_context_size["target_context_inline"]
# hamming_df_high_context_size["context_pair_to_compare"] = hamming_df_high_context_size["source_context_inline"] + ' | ' + hamming_df_high_context_size["target_context_inline"]



In [None]:
# hamming_df_low_context_size

In [None]:
# print(hamming_df_low_context_size['context_pair_to_compare'].nunique())
# print(hamming_df_medium_context_size['context_pair_to_compare'].nunique())
# print(hamming_df_high_context_size['context_pair_to_compare'].nunique())

In [None]:
# def get_sample(population_df, error_tolerance, stratification_variable):
#     # Use Slovin's formula for sample size: n = N / (1 + Ne2)
#     sample_size = len(population_df) / (1 + (len(population_df)*(error_tolerance ** 2)))
#     sample_proportion = sample_size/len(population_df)
#     print('sample size: ', sample_size)
#     print('sample proportion: ', sample_proportion)
#     # Perform stratified sampling according to similarity score
#     sample = population_df.groupby(stratification_variable, group_keys=False).apply(lambda x: x.sample(frac=sample_proportion))
#     return sample

# error_tolerance = 0.2
# stratification_variable_column = 'similarity_score'
# low_context_size_sample = get_sample(hamming_df_low_context_size, error_tolerance, stratification_variable_column)
# medium_context_size_sample = get_sample(hamming_df_medium_context_size, error_tolerance, stratification_variable_column)
# high_context_size_sample = get_sample(hamming_df_high_context_size, error_tolerance, stratification_variable_column)

# print('low cxt sample size:', len(low_context_size_sample))
# print('medium cxt sample size:', len(low_context_size_sample))
# print('high cxt sample size:', len(low_context_size_sample))
# print()

In [None]:
# import random
 
# # Function to create a random list of '0'/'1'/'2' strings
# def rand_key(p):
   
#     # Variable to store the list
#     key1 = []
 
#     # Loop to find the string
#     # of desired length
#     for i in range(p):
         
#         # randint function to generate
#         # 0, 1 or 2 randomly and converting
#         # the result into str
#         temp = str(random.randint(0, 2))
 
#         # Append the random 0, 1 or 2
#         # to the final result
#         key1.append(temp)
         
#     return key1
 
# # Driver Code
# n = len(sample)
# sequence = rand_key(n)
# print("Desired length random binary string is: ", sequence)

In [None]:
# sample['match_human_judgement'] = sequence

In [None]:
# sample