In [1]:
from tqdm import tqdm, trange
import numpy as np

In [8]:
def process_file(input_file, output_file):
    """
    Convert a Delete & Generate data file into the format used for Delete,
    Retrieve and Generate training, keeping all the attribute words.

    Every input line must contain a "<START>" marker. The text before it is
    the content (the "<POS>", "<NEG>" and "<CON_START>" tags are removed);
    the text after it is the full sentence (the "<END>" tag is removed).
    The attribute words are the sentence tokens that do not occur among the
    content tokens, kept in sentence order (repeated tokens stay repeated).

    Each output line has the form
    "<ATTR_WORDS> attrs <CON_START> content <START> sentence <END>".

    input_file: string : Path of the input file
    output_file: string : Path of the output file

    Raises ValueError (naming the file and line number) when a line has no
    "<START>" marker.
    """
    with open(input_file) as fp:
        data = fp.read().splitlines()
    with open(output_file, "w") as out_fp:
        for line_no, x in enumerate(tqdm(data), start=1):
            temp = x.split("<START>")
            if len(temp) < 2:
                # Fail with the location of the bad line instead of a bare IndexError.
                raise ValueError(
                    "{}:{}: missing <START> marker: {!r}".format(input_file, line_no, x)
                )
            con = temp[0].replace("<POS>", "").replace("<NEG>", "").replace("<CON_START>", "")
            sen = temp[1].replace("<END>", "")
            # Set membership keeps the filter linear in the sentence length.
            content_vocab = set(con.split())
            att_tokens = [z for z in sen.split() if z not in content_vocab]
            att_words = " ".join(att_tokens)
            out_str = "<ATTR_WORDS> " + att_words + " <CON_START> " + con.strip() + " <START> " + sen.strip() + " <END>" + "\n"
            out_fp.write(out_str)
    

In [9]:
def process_file_v1(input_file, output_file, keep_ratio=0.7, seed=None):
    """
    Convert a Delete & Generate data file into the format used for Delete,
    Retrieve and Generate training, keeping only a random subset of the
    attribute words so that generation is more than just filling the blanks.

    Every input line must contain a "<START>" marker. The text before it is
    the content (the "<POS>", "<NEG>" and "<CON_START>" tags are removed);
    the text after it is the full sentence (the "<END>" tag is removed).
    The attribute words are the sentence tokens that do not occur among the
    content tokens.

    When a line has more than two attribute words, int(keep_ratio * n) of
    them are picked at random and written in that random order. With two or
    fewer attribute words, all of them are kept in sentence order.

    Each output line has the form
    "<ATTR_WORDS> attrs <CON_START> content <START> sentence <END>".

    input_file: string : Path of the input file
    output_file: string : Path of the output file
    keep_ratio: float : Fraction of attribute words to keep (default 0.7)
    seed: int or None : Seed for a private random generator, which makes the
        output reproducible. None (default) uses NumPy's global random state.

    Raises ValueError when keep_ratio is outside [0, 1] or when a line has no
    "<START>" marker.
    """
    if not 0.0 <= keep_ratio <= 1.0:
        raise ValueError("keep_ratio must be between 0 and 1, got {!r}".format(keep_ratio))
    # Both the np.random module and a RandomState instance expose shuffle();
    # a seeded run gets its own generator and leaves the global state untouched.
    rng = np.random if seed is None else np.random.RandomState(seed)

    with open(input_file) as fp:
        data = fp.read().splitlines()
    with open(output_file, "w") as out_fp:
        for line_no, x in enumerate(tqdm(data), start=1):
            temp = x.split("<START>")
            if len(temp) < 2:
                # Fail with the location of the bad line instead of a bare IndexError.
                raise ValueError(
                    "{}:{}: missing <START> marker: {!r}".format(input_file, line_no, x)
                )
            con = temp[0].replace("<POS>", "").replace("<NEG>", "").replace("<CON_START>", "")
            sen = temp[1].replace("<END>", "")
            # Set membership keeps the filter linear in the sentence length.
            content_vocab = set(con.split())
            att_words = [z for z in sen.split() if z not in content_vocab]

            if len(att_words) > 2:
                indx = np.arange(len(att_words))
                rng.shuffle(indx)
                n_keep = int(keep_ratio * len(att_words))
                att_words = " ".join(att_words[k] for k in indx[:n_keep])
            else:
                # Two or fewer attribute words: keep all of them.
                att_words = " ".join(att_words)
            out_str = "<ATTR_WORDS> " + att_words + " <CON_START> " + con.strip() + " <START> " + sen.strip() + " <END>" + "\n"
            out_fp.write(out_str)

In [12]:
#1st. make dir data/yelp/processed_files_with_bert_with_best_head/delete_retrieve_edit_model  for task
#!cd data/amazon/processed_files_with_bert_with_best_head/; mkdir delete_retrieve_edit_model; 
!cd data/amazon/processed_files_with_bert_with_best_head/; ls; 

delete_retrieve_edit_model  sentiment_dev_0.txt   sentiment_test_1.txt
reference_0.txt		    sentiment_dev_1.txt   sentiment_train_0.txt
reference_1.txt		    sentiment_test_0.txt  sentiment_train_1.txt


In [13]:
!cd data/amazon/processed_files_with_bert_with_best_head/; cat sentiment_train_0.txt sentiment_train_1.txt > sentiment_train.txt
!cd data/amazon/processed_files_with_bert_with_best_head/; cat sentiment_dev_0.txt sentiment_dev_1.txt > sentiment_dev.txt
!cd data/amazon/processed_files_with_bert_with_best_head/; cat sentiment_test_0.txt sentiment_test_1.txt > sentiment_test.txt

In [10]:
# Step 2: select the task; data_dir is derived from it.
# Alternative task names (uncomment exactly one assignment):
# task = "yelp"
# task = "imagecaption"
task = "amazon"
data_dir = "data/{}/processed_files_with_bert_with_best_head/".format(task)

In [14]:
# sentiment_train.txt is the concatenation of the two per-label files:
#   cat sentiment_train_0.txt sentiment_train_1.txt > sentiment_train.txt
# Convert the combined file, then label 1, then label 0, keeping all attribute words.
for split_suffix in ("", "_1", "_0"):
    process_file(
        data_dir + "sentiment_train" + split_suffix + ".txt",
        data_dir + "delete_retrieve_edit_model/sentiment_train" + split_suffix + "_all_attrs.txt",
    )

100%|██████████| 554997/554997 [00:03<00:00, 161354.83it/s]
100%|██████████| 277228/277228 [00:01<00:00, 150861.47it/s]
100%|██████████| 277769/277769 [00:01<00:00, 163126.33it/s]


In [15]:
# sentiment_test.txt is the concatenation of the two per-label files:
#   cat sentiment_test_0.txt sentiment_test_1.txt > sentiment_test.txt
# Convert the combined file, then label 1, then label 0, keeping all attribute words.
for split_suffix in ("", "_1", "_0"):
    process_file(
        data_dir + "sentiment_test" + split_suffix + ".txt",
        data_dir + "delete_retrieve_edit_model/sentiment_test" + split_suffix + "_all_attrs.txt",
    )

100%|██████████| 1000/1000 [00:00<00:00, 65522.69it/s]
100%|██████████| 500/500 [00:00<00:00, 72460.51it/s]
100%|██████████| 500/500 [00:00<00:00, 77500.07it/s]


In [16]:
# sentiment_dev.txt is the concatenation of the two per-label files:
#   cat sentiment_dev_0.txt sentiment_dev_1.txt > sentiment_dev.txt
# Each per-label input is written to the output with the same label (0 -> 0, 1 -> 1),
# as in the train and test cells.
process_file(data_dir+"sentiment_dev.txt",data_dir+"delete_retrieve_edit_model/sentiment_dev_all_attrs.txt")
process_file(data_dir+"sentiment_dev_0.txt",data_dir+"delete_retrieve_edit_model/sentiment_dev_0_all_attrs.txt")
process_file(data_dir+"sentiment_dev_1.txt",data_dir+"delete_retrieve_edit_model/sentiment_dev_1_all_attrs.txt")

100%|██████████| 2000/2000 [00:00<00:00, 61150.37it/s]
100%|██████████| 1015/1015 [00:00<00:00, 66765.24it/s]
100%|██████████| 985/985 [00:00<00:00, 75728.89it/s]


In [18]:
# Convert the train files with randomly sub-sampled attribute words:
# the combined file first, then label 1, then label 0.
for split_suffix in ("", "_1", "_0"):
    train_name = "sentiment_train" + split_suffix + ".txt"
    process_file_v1(data_dir + train_name, data_dir + "delete_retrieve_edit_model/" + train_name)

100%|██████████| 554997/554997 [00:05<00:00, 96641.18it/s] 
100%|██████████| 277228/277228 [00:02<00:00, 97415.44it/s]
100%|██████████| 277769/277769 [00:02<00:00, 95053.72it/s]


In [19]:
# Convert the test files with randomly sub-sampled attribute words:
# the combined file first, then label 1, then label 0.
for split_suffix in ("", "_1", "_0"):
    test_name = "sentiment_test" + split_suffix + ".txt"
    process_file_v1(data_dir + test_name, data_dir + "delete_retrieve_edit_model/" + test_name)

100%|██████████| 1000/1000 [00:00<00:00, 103044.03it/s]
100%|██████████| 500/500 [00:00<00:00, 110359.00it/s]
100%|██████████| 500/500 [00:00<00:00, 98397.79it/s]


In [20]:
# Convert the dev files with randomly sub-sampled attribute words.
# Each per-label input is written to the output with the same label (0 -> 0, 1 -> 1),
# as in the train and test cells.
process_file_v1(data_dir+"sentiment_dev.txt",data_dir+"delete_retrieve_edit_model/sentiment_dev.txt")
process_file_v1(data_dir+"sentiment_dev_0.txt",data_dir+"delete_retrieve_edit_model/sentiment_dev_0.txt")
process_file_v1(data_dir+"sentiment_dev_1.txt",data_dir+"delete_retrieve_edit_model/sentiment_dev_1.txt")

100%|██████████| 2000/2000 [00:00<00:00, 86698.58it/s]
100%|██████████| 1015/1015 [00:00<00:00, 87663.83it/s]
100%|██████████| 985/985 [00:00<00:00, 89474.37it/s]
