# Prepare sampled data for manual labelling

## I. Extend spans to two word phrases [a postfix]

Spans sampled from the database cover only geographical terms. Extend each span so that it also covers the word preceding the term.

In [1]:
import os
import json
import copy
import random
import os.path

# Directory holding the input JSON files that the span-extension step reads.
input_dir = 'unlabelled/pos_terms_1000'
# Fail early if the directory is missing; the message names the notebook
# that has to be run first to create it.
assert os.path.exists(input_dir), \
    f'(!) Missing input dir {input_dir!r}. Please run "01_create_sampling_tasks.ipynb" before running this.'

In [2]:
# This runs on newly generated data
# Directory that receives the span-extended JSON files. It is created if it
# does not exist yet; exist_ok=True keeps re-running this cell from failing.
output_dir = 'unlabelled/pos_terms_1000_extended'
os.makedirs(output_dir, exist_ok=True)

# Finds the last word (a unit delimited by the space character) in a string
def lastWord(string):
    """Return the part of ``string`` that follows its final space character.

    Only the plain space ``" "`` acts as a separator; tabs, newlines and any
    other whitespace are treated as ordinary characters of a word.

    :param string: text to inspect
    :return: the trailing word; the whole string when it contains no space,
        and an empty string when the input is empty or ends with a space
    """
    # rpartition yields ('', '', string) when no space is present, so the
    # third element is always the wanted tail.
    _head, _separator, tail = string.rpartition(" ")
    return tail

# Extend all spans to cover one preceding word.
#
# For every task in every JSON file of `input_dir`, the first predicted span
# (predictions[0]['result'][0]['value']) is widened to the left so that it also
# covers the word immediately before it. The tasks are then written to
# `output_dir` as '<name>_extended.json'. Tasks whose span starts the sentence,
# has no preceding word, or is missing altogether are written out unchanged.
c = 0        # number of tasks that carry more than one predicted span
skipped = 0  # number of tasks without a usable first prediction / span

# Sorted for a deterministic processing order; entries that are not JSON files
# (e.g. checkpoint directories) are ignored instead of crashing open().
for file in sorted(os.listdir(input_dir)):
    if not file.endswith('.json'):
        continue
    print(file)
    with open(os.path.join(input_dir, file), 'r', encoding="utf-8") as f:
        data = json.load(f)
    print(len(data))
    for task in data:
        sentence = task['data']['text']
        try:
            results = task['predictions'][0]['result']
            span = results[0]['value']
            old_start = span['start']
            old_geo = span['text']
            if old_start != 0:
                # Text before the span, without the whitespace that separates
                # it from the span.
                before = sentence[:old_start].rstrip()
                prev_word = lastWord(before.strip())
                if prev_word:
                    # prev_word is a suffix of `before`, so the new offset is
                    # derived from real positions rather than assuming exactly
                    # one separating space. The new text reuses whatever
                    # actually sits between the previous word and the span.
                    new_start = len(before) - len(prev_word)
                    new_geo = sentence[new_start:old_start] + old_geo
                    # Both values are computed before the first write so that a
                    # failure cannot leave the span half-updated.
                    span['start'] = new_start
                    span['text'] = new_geo
        except (IndexError, KeyError, TypeError):
            # Some tasks have no predictions or an empty result list
            # ("IndexError: list index out of range"). They are kept in the
            # output unchanged, but counted so the omission is visible.
            skipped += 1
            continue
        if len(results) > 1:
            c += 1
    print(len(data))
    filename_stub = os.path.splitext(file)[0]
    output_path = os.path.join(output_dir, f'{filename_stub}_extended.json')
    with open(output_path, 'w', encoding="utf-8") as f:
        json.dump(data, f)

print(f'Tasks with more than one span: {c}; tasks left unchanged (no usable span): {skipped}')

pos_A_1000.json
975
975
pos_C_1000.json
819
819
pos_D_1000.json
965
965
pos_G_1000.json
864
864
pos_H_1000.json
991
991
pos_I_1000.json
198
198
pos_J_1000.json
976
976
pos_K_1000.json
941
941
pos_N_1000.json
877
877
pos_O_1000.json
556
556
pos_P_1000.json
979
979
pos_S_1000.json
993
993
pos_U_1000.json
687
687
pos_V_1000.json
990
990
pos_X_1000.json
25
25
pos_Y_1000.json
799
799
pos_Z_1000.json
953
953


## II. Pick 100 sentences from each sample

Start manual annotation from a smaller subset. Pick 100 samples from each 1000 sentence sample.

In [3]:
# This runs on old generated data, which was available
input_dir = 'unlabelled/pos_terms_1000_extended'

output_dir = 'unlabelled/pos_terms_100_extended'
os.makedirs(output_dir, exist_ok=True)

# More candidates than needed are drawn, because candidates that do not carry
# exactly one predicted span are discarded before truncating to the final size.
# Note: the output directory and file names contain a literal "100".
N_CANDIDATES = 130
N_FINAL = 100

# Important: fix seed
rnd = random.Random()
rnd.seed(1)
# The generator is shared by all files, so the files must be visited in a
# deterministic order for the samples to be reproducible; os.listdir alone
# returns entries in arbitrary order.
for file in sorted(os.listdir(input_dir)):
    if not file.endswith('.json'):
        continue
    print(file)
    with open(os.path.join(input_dir, file), 'r', encoding="utf-8") as f:
        data = json.load(f)
    max_sents = len(data)
    print(f'Initial sentences:    {max_sents}')
    samples = min(N_CANDIDATES, max_sents)
    print(f'Max sample size:      {samples}')
    uniqs = rnd.sample(range(0, max_sents), samples)
    assert len(set(uniqs)) == samples

    # Keep only the sampled tasks that have exactly one predicted span.
    new_data = []
    for i in uniqs:
        try:
            n_spans = len(data[i]['predictions'][0]['result'])
        except (IndexError, KeyError, TypeError):
            # A task without predictions has no span to annotate.
            n_spans = 0
        if n_spans == 1:
            new_data.append(data[i])
    final_data = new_data[:N_FINAL]

    print(f'Unique samples:       {len(new_data)}')
    print(f'Unique samples[final]:{len(final_data)}')
    filename_stub = file.replace("_1000_extended.json", "")
    output_path = os.path.join(output_dir, f'{filename_stub}_100_extended.json')
    # Write the truncated list, i.e. the same set whose size is reported above.
    with open(output_path, 'w', encoding="utf-8") as f:
        json.dump(final_data, f)
    print()

pos_A_1000_extended.json
Initial sentences:    1000
Max sample size:      130
Unique samples:       129
Unique samples[final]:100

pos_C_1000_extended.json
Initial sentences:    996
Max sample size:      130
Unique samples:       130
Unique samples[final]:100

pos_D_1000_extended.json
Initial sentences:    999
Max sample size:      130
Unique samples:       125
Unique samples[final]:100

pos_G_1000_extended.json
Initial sentences:    998
Max sample size:      130
Unique samples:       129
Unique samples[final]:100

pos_H_1000_extended.json
Initial sentences:    999
Max sample size:      130
Unique samples:       127
Unique samples[final]:100

pos_I_1000_extended.json
Initial sentences:    346
Max sample size:      130
Unique samples:       126
Unique samples[final]:100

pos_J_1000_extended.json
Initial sentences:    1000
Max sample size:      130
Unique samples:       130
Unique samples[final]:100

pos_K_1000_extended.json
Initial sentences:    998
Max sample size:      130
Unique samp

---

**Note**: this notebook contains refactored code for data preparation, but the original input data this code was created for is no longer fully available (because the data sampling seed is missing). Thus, the outputs printed in this notebook do not correspond exactly to the outputs of the original data preparation notebooks (which are distributed elsewhere).

---