In [1]:
# Annotation is based on Direct Assessment (DA):
# https://github.com/ygraham/direct-assessment

import pandas as pd
from txt2img import text2png
import random, os

In [12]:
def process(data_path, trap_path, lang, n_hits=6, data_per_hit=90, traps_per_hit=5):
    """Render annotation images for one language and write its HIT sheet.

    Both CSV files are read with pandas and must provide the columns
    ``id``, ``text1`` and ``text2``. For every row, ``text1`` is passed to
    ``text2png`` with the output path ``mturk/images/{lang}/src/{id}.png``
    and ``text2`` with ``mturk/images/{lang}/tgt/{id}.png`` (``text2png``
    comes from ``txt2img``; presumably it renders the text into that PNG
    file -- confirm against that module).

    Afterwards the ids are shuffled and grouped into ``n_hits`` rows. Each
    row holds ``data_per_hit`` regular ids, ``traps_per_hit`` trap ids whose
    id contains ``'SAME'`` and ``traps_per_hit`` trap ids whose id contains
    ``'DIFF'``, shuffled together. The rows are written to
    ``mturk/images/{lang}/turker.csv`` with columns ``text_0`` ..
    ``text_{k-1}`` where ``k = data_per_hit + 2 * traps_per_hit``; every
    cell is the id wrapped in single quotes.

    Args:
        data_path: Path of the CSV with the regular items.
        trap_path: Path of the CSV with the trap (quality-control) items.
        lang: Language code used in the output directory name.
        n_hits: Number of HIT rows to generate (default 6).
        data_per_hit: Regular items per HIT (default 90).
        traps_per_hit: Trap items of each kind (SAME / DIFF) per HIT
            (default 5).

    Raises:
        IndexError: If the inputs do not contain enough ids to fill every
            HIT completely.
    """
    data = pd.read_csv(data_path)
    trap = pd.read_csv(trap_path)

    out_dir = f'mturk/images/{lang}/'
    out_dir_src = f'mturk/images/{lang}/src/'
    out_dir_tgt = f'mturk/images/{lang}/tgt/'
    # makedirs creates missing parents and is a no-op for existing
    # directories, so a partially created tree from an earlier run is repaired
    # instead of being silently skipped.
    for directory in (out_dir_src, out_dir_tgt):
        os.makedirs(directory, exist_ok=True)

    def normalise(text):
        # Join the lines into one string, terminating every non-empty line
        # with a full stop so sentence boundaries survive the join.
        lines = [
            line if (not line or line.endswith('.')) else line + '.'
            for line in text.split('\n')
        ]
        return ' '.join(lines)

    def render(frame):
        # Source text is drawn in slate grey ('#708090'); the target text
        # uses text2png's default colour.
        for _, row in frame.iterrows():
            src_fname = out_dir_src + row['id'] + '.png'
            tgt_fname = out_dir_tgt + row['id'] + '.png'
            text2png(normalise(row['text1']), src_fname, fontfullpath="times.ttf",
                     color='#708090', fontsize=20)
            text2png(normalise(row['text2']), tgt_fname, fontfullpath="times.ttf",
                     fontsize=20)

    render(data)
    render(trap)

    # Build n_hits HITs, each containing data_per_hit + 2 * traps_per_hit tasks.
    data_ids = list(data['id'])
    trap_same = [t for t in trap['id'] if 'SAME' in t]
    trap_diff = [t for t in trap['id'] if 'DIFF' in t]
    # Shuffle order is kept (data, SAME, DIFF, then each HIT) so that a seeded
    # `random` yields the same sheet as before.
    random.shuffle(data_ids)
    random.shuffle(trap_same)
    random.shuffle(trap_diff)

    tasks_per_hit = data_per_hit + 2 * traps_per_hit
    cols = [f'text_{i}' for i in range(tasks_per_hit)]

    records = []
    for hit_idx in range(n_hits):
        data_start = hit_idx * data_per_hit
        trap_start = hit_idx * traps_per_hit
        hit_ids = (
            data_ids[data_start:data_start + data_per_hit]
            + trap_same[trap_start:trap_start + traps_per_hit]
            + trap_diff[trap_start:trap_start + traps_per_hit]
        )
        if len(hit_ids) < tasks_per_hit:
            raise IndexError(
                f'not enough ids to fill HIT {hit_idx} for {lang}: '
                f'got {len(hit_ids)} of {tasks_per_hit} tasks'
            )
        random.shuffle(hit_ids)
        # Ids are wrapped in single quotes in the sheet.
        records.append({col: "'" + item + "'" for col, item in zip(cols, hit_ids)})

    # Build the frame once from all records; DataFrame.append no longer exists
    # in pandas >= 2.0 and grew the frame quadratically.
    pd.DataFrame(records, columns=cols).to_csv(out_dir + 'turker.csv', index=False)


In [13]:
# Generate the images and the HIT sheet for every language, in a fixed order.
for lang_code in ('ID', 'ZH', 'FR', 'RU', 'ES', 'TR', 'DE'):
    process(
        f'mturk/samples/data_{lang_code}.csv',
        f'mturk/samples/trap_{lang_code}.csv',
        lang_code,
    )