In [28]:
# First, download the Liputan6, LCSTS, and MLSUM datasets.
# Split each test set into the folders named below.
# Reference summaries go in a folder named: gold or reference
# PG summaries go in a folder named: PG or pg_author
# BERT summaries go in a folder named: indoBertExtAbs or mBertAbs or bert_author
# Naming convention for each doc-summary: use a unique file name for each file.

import glob
import pandas as pd
import random

# One tuple per language, laid out as:
#   (test-set root directory, reference folder, PG folder, BERT folder)
ID = (
    '/Data/Liputan6_final/output_test',
    'reference', 'PG', 'indoBertExtAbs',
)
ZH = (
    '/Data/LCSTS2.0/LCSTS2.0/output_test',
    'reference', 'PG', 'mBertAbs',
)
FR = (
    '/Workspace/mlsum/data/fr/test',
    'gold', 'pg_author', 'bert_author',
)
RU = (
    '/Workspace/mlsum/data/ru/test',
    'gold', 'pg_author', 'bert_author',
)
ES = (
    '/Workspace/mlsum/data/es/test',
    'gold', 'pg_author', 'bert_author',
)
DE = (
    '/Workspace/mlsum/data/de/test',
    'gold', 'pg_author', 'bert_author',
)
TR = (
    '/Workspace/mlsum/data/tu/test',
    'gold', 'pg_author', 'bert_author',
)

In [29]:
# Per-language sampling sizes consumed by create_data.
TOTAL = 135      # reference files turned into system-vs-reference pairs
TRAP_SAME = 30   # reference files turned into TRAP-SAME pairs
TRAP_DIFF = 30   # TRAP-DIFFERENT pairs added on top of the TRAP-SAME ones

def read(fname):
    """Return the whole content of the text file ``fname`` as one string.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default locale, and the handle is closed as soon as the
    content has been read.

    Args:
        fname: Path of the file to read.

    Returns:
        The file content, including any trailing newline.
    """
    with open(fname, encoding='utf-8') as handle:
        return handle.read()

def _name_and_id(gold_path):
    """Return ``(file name, document id)`` for a reference-summary path."""
    fname = gold_path.split('/')[-1]
    return fname, fname.replace('_reference', '').replace('.txt', '')


def create_data(lang, path, gold_folder, pg_folder, bert_folder, use_oracle=False):
    """Build the evaluation pairs and the trap pairs for one language.

    The reference files found in ``path/gold_folder`` are shuffled and then
    split into three consecutive slices: the first ``TOTAL`` files produce the
    evaluation pairs, the next ``TRAP_SAME`` files produce the TRAP-SAME
    pairs, and whatever is left is the pool from which TRAP-DIFFERENT pairs
    are drawn. System summaries are looked up under the same file name with
    ``reference`` replaced by ``decoded``.

    Shuffling and pair drawing use the global ``random`` state; seed it before
    calling for a repeatable selection (the file list is sorted first so the
    shuffle input does not depend on directory listing order).

    Args:
        lang: Language tag used as the prefix of every generated id.
        path: Root directory that contains the summary folders.
        gold_folder: Sub-folder of ``path`` holding the reference summaries.
        pg_folder: Sub-folder of ``path`` holding the PG summaries.
        bert_folder: Sub-folder of ``path`` holding the BERT summaries.
        use_oracle: When true, the last line of the matching file in
            ``path/oracle`` plus a newline is prepended to the first text of
            every TRAP-SAME pair.

    Returns:
        ``(data, trap)``, two lists of ``(id, text1, text2)`` tuples.
        ``data`` holds four entries per sampled document (both text orders
        for PG and for BERT); ``trap`` holds the TRAP-SAME entries followed by
        TRAP-DIFFERENT entries, ``TRAP_SAME + TRAP_DIFF`` in total.

    Raises:
        ValueError: If TRAP-DIFFERENT pairs are needed but fewer than two
            reference files remain after the first two slices.
    """
    # Sorting makes the shuffle input independent of filesystem order.
    all_golds = sorted(glob.glob(path + '/' + gold_folder + '/*'))
    random.shuffle(all_golds)

    data = []
    for gold_path in all_golds[:TOTAL]:
        fname, ids = _name_and_id(gold_path)
        decoded_name = fname.replace('reference', 'decoded')
        ref = read(gold_path)
        # Each system contributes the pair in both orders; the id records
        # which side holds the system output.
        for system, folder in (('PG', pg_folder), ('BERT', bert_folder)):
            summary = read(path + '/' + folder + '/' + decoded_name)
            data.append((f"{lang}-{ids}-SYSTEM-REFERENCE-{system}", summary, ref))
            data.append((f"{lang}-{ids}-REFERENCE-SYSTEM-{system}", ref, summary))

    trap = []
    for gold_path in all_golds[TOTAL:TOTAL + TRAP_SAME]:
        fname, ids = _name_and_id(gold_path)
        ref = read(gold_path)
        noise = ''
        if use_oracle:
            oracle_path = path + '/oracle/' + fname.replace('reference', 'decoded')
            with open(oracle_path, encoding='utf-8') as oracle_file:
                noise = oracle_file.readlines()[-1] + '\n'
        trap.append((f"{lang}-{ids}-TRAP-SAME", noise + ref, ref))

    wanted = TRAP_SAME + TRAP_DIFF
    pool = all_golds[TOTAL + TRAP_SAME:]
    # Two distinct files are required per pair; without this guard a pool of
    # exactly one file would make the drawing loop spin forever.
    if len(trap) < wanted and len(pool) < 2:
        raise ValueError(
            f"{lang}: need at least 2 reference files beyond the first "
            f"{TOTAL + TRAP_SAME} to build TRAP-DIFFERENT pairs, found {len(pool)}"
        )

    while len(trap) < wanted:
        # random.sample draws two distinct positions, so the files differ.
        gold1, gold2 = random.sample(pool, 2)
        _, ids1 = _name_and_id(gold1)
        _, ids2 = _name_and_id(gold2)
        trap.append((f"{lang}-{ids1}_{ids2}-TRAP-DIFFERENT", read(gold1), read(gold2)))

    return data, trap

In [30]:
# Build the evaluation and trap pairs for every language. The oracle line is
# prepended to TRAP-SAME items for all languages except ZH.
data_ID, trap_ID = create_data('ID', *ID, use_oracle=True)
data_ZH, trap_ZH = create_data('ZH', *ZH, use_oracle=False)
data_FR, trap_FR = create_data('FR', *FR, use_oracle=True)
data_RU, trap_RU = create_data('RU', *RU, use_oracle=True)
data_ES, trap_ES = create_data('ES', *ES, use_oracle=True)
data_TR, trap_TR = create_data('TR', *TR, use_oracle=True)
data_DE, trap_DE = create_data('DE', *DE, use_oracle=True)

In [33]:
!mkdir mturk/samples

def save_to(d, fname):
    """Write ``(id, text1, text2)`` tuples to a CSV file.

    Args:
        d: Iterable of 3-item records: an id followed by two texts.
        fname: Destination path of the CSV file.

    The file has the columns ``id``, ``text1`` and ``text2`` and no index
    column.
    """
    records = [(doc_id, first, second) for doc_id, first, second in d]
    frame = pd.DataFrame(records, columns=['id', 'text1', 'text2'])
    frame.to_csv(fname, index=False)
    
# Write one data CSV and one trap CSV per language.
save_to(data_ID, 'mturk/samples/data_ID.csv')
save_to(trap_ID, 'mturk/samples/trap_ID.csv')
save_to(data_ZH, 'mturk/samples/data_ZH.csv')
save_to(trap_ZH, 'mturk/samples/trap_ZH.csv')
save_to(data_FR, 'mturk/samples/data_FR.csv')
save_to(trap_FR, 'mturk/samples/trap_FR.csv')
save_to(data_RU, 'mturk/samples/data_RU.csv')
save_to(trap_RU, 'mturk/samples/trap_RU.csv')
save_to(data_ES, 'mturk/samples/data_ES.csv')
save_to(trap_ES, 'mturk/samples/trap_ES.csv')
save_to(data_TR, 'mturk/samples/data_TR.csv')
save_to(trap_TR, 'mturk/samples/trap_TR.csv')
save_to(data_DE, 'mturk/samples/data_DE.csv')
save_to(trap_DE, 'mturk/samples/trap_DE.csv')

In [46]:
# Every language must yield the same number of data rows and of trap rows:
# a set of the lengths collapses to one element exactly when all are equal.
assert len({len(rows) for rows in (
    data_ID, data_ZH, data_FR, data_RU, data_ES, data_TR, data_DE)}) == 1
assert len({len(rows) for rows in (
    trap_ID, trap_ZH, trap_FR, trap_RU, trap_ES, trap_TR, trap_DE)}) == 1