## Prepare Referece files using TFIDF for retrieving attributes


In [2]:
!pip install -U scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-0.22.2.post1-cp37-cp37m-manylinux1_x86_64.whl (7.1 MB)
[K     |████████████████████████████████| 7.1 MB 3.4 MB/s eta 0:00:01
[?25hCollecting joblib>=0.11
  Downloading joblib-0.14.1-py2.py3-none-any.whl (294 kB)
[K     |████████████████████████████████| 294 kB 80.7 MB/s eta 0:00:01
[?25hCollecting scipy>=0.17.0
  Using cached scipy-1.4.1-cp37-cp37m-manylinux1_x86_64.whl (26.1 MB)
Installing collected packages: joblib, scipy, scikit-learn
Successfully installed joblib-0.14.1 scikit-learn-0.22.2.post1 scipy-1.4.1


In [1]:
import pandas as pd
from tqdm import tqdm, trange
import numpy as np
import time
import torch
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
def read_file(path):
    print(path)
    with open(path, encoding="utf8", errors='ignore') as fp:
        lines = fp.read().splitlines()
    return lines

In [3]:
def clean_text(text):
    return text.replace("<POS>","").replace("<NEG>","").replace("<CON_START>","").replace("<START>","").replace("<END>","").strip()


In [4]:
#data_dir = "data/yelp/"
#data_dir = "./data/imagecaption/"
data_dir = "./data/amazon/"

# TRAIN
train0_org = read_file(data_dir+"sentiment_train_0.txt") # Training data of negative sentiment
train1_org = read_file(data_dir+"sentiment_train_1.txt") # Training data of positive sentiment

train0_processed = read_file(data_dir+"processed_files_with_bert_with_best_head/delete_retrieve_edit_model/sentiment_train_0_all_attrs.txt") # training data with content and attributes seperation
train1_processed = read_file(data_dir+"processed_files_with_bert_with_best_head/delete_retrieve_edit_model/sentiment_train_1_all_attrs.txt") # training data with content and attributes seperation

# REF
ref0_org = read_file(data_dir+"reference_0.txt") # Original Refrence_0 data
ref1_org = read_file(data_dir+"reference_1.txt") # Original Refrence_1 data

ref0_processed = read_file(data_dir+"processed_files_with_bert_with_best_head/reference_0.txt") # Reference data for delete_generate model
ref1_processed = read_file(data_dir+"processed_files_with_bert_with_best_head/reference_1.txt") # Reference data for delete_generate model

# DEV
dev0_org = read_file(data_dir+"sentiment_dev_0.txt") # deving data of negative sentiment
dev1_org = read_file(data_dir+"sentiment_dev_1.txt") # deving data of positive sentiment

dev0_processed = read_file(data_dir+"processed_files_with_bert_with_best_head/delete_retrieve_edit_model/sentiment_dev_0_all_attrs.txt") # deving data with content and attributes seperation
dev1_processed = read_file(data_dir+"processed_files_with_bert_with_best_head/delete_retrieve_edit_model/sentiment_dev_1_all_attrs.txt") # deving data with content and attributes seperation

# TEST
test0_org = read_file(data_dir+"sentiment_test_0.txt") # testing data of negative sentiment
test1_org = read_file(data_dir+"sentiment_test_1.txt") # testing data of positive sentiment

test0_processed = read_file(data_dir+"processed_files_with_bert_with_best_head/delete_retrieve_edit_model/sentiment_test_0.txt") # testing data with content and attributes seperation
test1_processed = read_file(data_dir+"processed_files_with_bert_with_best_head/delete_retrieve_edit_model/sentiment_test_1.txt") # testing data with content and attributes seperation



./data/amazon/sentiment_train_0.txt
./data/amazon/sentiment_train_1.txt
./data/amazon/processed_files_with_bert_with_best_head/delete_retrieve_edit_model/sentiment_train_0_all_attrs.txt
./data/amazon/processed_files_with_bert_with_best_head/delete_retrieve_edit_model/sentiment_train_1_all_attrs.txt
./data/amazon/reference_0.txt
./data/amazon/reference_1.txt
./data/amazon/processed_files_with_bert_with_best_head/reference_0.txt
./data/amazon/processed_files_with_bert_with_best_head/reference_1.txt
./data/amazon/sentiment_dev_0.txt
./data/amazon/sentiment_dev_1.txt
./data/amazon/processed_files_with_bert_with_best_head/delete_retrieve_edit_model/sentiment_dev_0_all_attrs.txt
./data/amazon/processed_files_with_bert_with_best_head/delete_retrieve_edit_model/sentiment_dev_1_all_attrs.txt
./data/amazon/sentiment_test_0.txt
./data/amazon/sentiment_test_1.txt
./data/amazon/processed_files_with_bert_with_best_head/delete_retrieve_edit_model/sentiment_test_0.txt
./data/amazon/processed_files_wit

In [10]:
# Get the Original Reference Sentence
ref0_org = [x.split("\t")[0] for x in ref0_org]
ref1_org = [x.split("\t")[0] for x in ref1_org]

In [11]:
# Get the Content of the Reference Sentences
ref0_con = [clean_text(x) for x in ref0_processed]
ref1_con = [clean_text(x) for x in ref1_processed]

In [13]:
ref0_org[:4], ref0_con[:4]

(["ever since joes has changed hands it 's just gotten worse and worse .",
  'there is definitely not enough room in that part of the venue .',
  'so basically tasted watered down .',
  "she said she 'd be back and disappeared for a few minutes ."],
 ["ever since joes has changed hands it ' s just gotten and . ever since joes has changed hands it ' s gotten and better .",
  'there is definitely enough in that part of the venue . there is so much in that part of the venue',
  "so basically tasted . it didn ' t taste down at all .",
  "she said she ' be back and for a few minutes . she said she ' be back , and didn ' t disappear at all ."])

In [5]:
def get_train_content(text):
    return text.split("<START>")[0].split("<CON_START>")[1].strip()

In [6]:
def get_train_attrs(text):
    return text.split("<CON_START>")[0].replace("<ATTR_WORDS>","").strip().split()

In [7]:
# MAY 16: GET the Content of the TEST Sentences
test0_con = [get_train_content(x) for x in test0_processed]
test1_con = [get_train_content(x) for x in test1_processed]

test_attrs_neg = [get_train_attrs(x) for x in test0_processed]
test_attrs_pos = [get_train_attrs(x) for x in test1_processed]

In [8]:
test0_org[:4], test0_con[:4], test_attrs_neg[:4]

(['if your bike had a kickstand on the plate it won t lock down . ',
  'they are thinner and retain memory than the older ones . ',
  'it makes a buzzing sound when devices are plugged in . ',
  'i could barely get through it they taste so nasty . '],
 ['your had a kickstand on the plate it won t down .',
  'they are and memory than the older ones .',
  'it makes a sound devices are plugged in .',
  'i could barely get it they so nasty .'],
 [['bike', 'if'],
  ['thinner', 'retain'],
  ['buzzing', 'when'],
  ['through', 'taste']])

In [9]:
get_train_attrs(train0_processed[0])

['moderate', 'attacks']

In [10]:
train0_processed[:4], train1_processed[:4]

(['<ATTR_WORDS> moderate attacks <CON_START> especially on where the are constant . <START> especially on moderate where the attacks are constant . <END>',
  '<ATTR_WORDS> looked drier <CON_START> i put this on my hair and flat ironed it and my than sand paper . <START> i put this on my hair and flat ironed it and my hair looked drier than sand paper . <END>',
  '<ATTR_WORDS> since three worst <CON_START> their price is times that they are about the value in headphones . <START> since their price is three times that they are about the worst value in headphones . <END>',
  '<ATTR_WORDS> personal touch <CON_START> just had to give it the . <START> just had to give it the personal touch . <END>'],
 ['<ATTR_WORDS> thing without any <CON_START> i open almost exclusively very small cans of tomato paste with this problems . <START> i open almost exclusively very small cans of tomato paste with this thing without any problems . <END>',
  '<ATTR_WORDS> opening handy lot <CON_START> it s made wi

In [11]:
# get content
train0_con = [get_train_content(x) for x in train0_processed]
train1_con = [get_train_content(x) for x in train1_processed]

In [26]:
train0_con[:4], train1_con[:4]

(['young people walking on the high wire the time .',
  'a woman and a dog sit on a tree stump wondering have 9 .',
  'the brown dog eats from a bowl on table as a black dog off for .',
  'child standing on sprinkler mat with spraying is trying to .'],
 ['the men when they are walking barefoot on a tightrope .',
  'an woman and her white dog siting on a tree stump a .',
  'a little dog jumps off bench where larger dog is eating .',
  'a kid standing in a sprinkler to in water .'])

In [12]:
# Fatch attributes from the training data
attrs_neg = [get_train_attrs(x) for x in train0_processed]
attrs_pos = [get_train_attrs(x) for x in train1_processed]

In [18]:
# Get TFIDF vectors for Training and Reference
"""
tfidf = TfidfVectorizer()
conts_vecs = tfidf.fit_transform(train0_con + train1_con)
conts_pos_vecs = conts_vecs[:len(train1_con)]
conts_neg_vecs = conts_vecs[len(train1_con):len(train1_con)+len(train0_con)]
conts_from_pos_ref_vecs = tfidf.transform(ref1_con)
conts_from_neg_ref_vecs = tfidf.transform(ref0_con)
"""

In [13]:
# INSTEAD OF ABOVE DO TFIDF for Training and Test !
# Get TFIDF vectors for Training and Reference
tfidf = TfidfVectorizer()
conts_vecs = tfidf.fit_transform(train0_con + train1_con)
conts_pos_vecs = conts_vecs[:len(train1_con)]
conts_neg_vecs = conts_vecs[len(train1_con):len(train1_con)+len(train0_con)]
conts_from_pos_test_vecs = tfidf.transform(test1_con)
conts_from_neg_test_vecs = tfidf.transform(test0_con)

In [14]:
check_pos = conts_from_pos_test_vecs[0]
#print([ a for a in dir(check_pos) if a[0] != "_"])
#check_pos.nnz

print(test1_con[0])
#print(check_pos.nnz)      # 7
#print(check_pos.data)     # [0.28597091 0.53041017 0.2618744  0.46515963 0.22145522 0.45394814 0.31116341]
#print(check_pos.indices)  # [9121 9113 8181 7417 4245 3899  535]
xx = check_pos.sorted_indices()
print(xx)
print(xx.indices)
#print(xx..
fnames = tfidf.get_feature_names()
print([fnames[a] for a in check_pos.indices])



i ve had this thermometer for num _ num .
  (0, 16489)	0.22769910539449292
  (0, 18931)	0.3199795769819059
  (0, 28740)	0.5166708241794645
  (0, 43443)	0.6336212042188881
  (0, 43641)	0.1932534598902485
  (0, 46700)	0.3741586788561707
[16489 18931 28740 43443 43641 46700]
['ve', 'this', 'thermometer', 'num', 'had', 'for']


(1, 9157)

#### AnnoyIndex is used to store the TFIDF vectors of training set and retrieve nearest neighbours of the reference content 

In [20]:
!pip install annoy

Collecting annoy
  Downloading annoy-1.16.3.tar.gz (644 kB)
[K     |████████████████████████████████| 644 kB 3.0 MB/s eta 0:00:01
[?25hBuilding wheels for collected packages: annoy
  Building wheel for annoy (setup.py) ... [?25ldone
[?25h  Created wheel for annoy: filename=annoy-1.16.3-cp37-cp37m-linux_x86_64.whl size=275501 sha256=352b4d48653d80445a26736f1ab5ee2218f6b7cc3d6da57324a84a69aaed44ae
  Stored in directory: /home/diego/.cache/pip/wheels/39/36/d4/ee348a7240ca3e8d1fcbf04ebe46d45f2879ccb094a40f5706
Successfully built annoy
Installing collected packages: annoy
Successfully installed annoy-1.16.3


In [15]:
from annoy import AnnoyIndex

In [16]:
train0_tree = AnnoyIndex(conts_neg_vecs.shape[-1])
train1_tree = AnnoyIndex(conts_pos_vecs.shape[-1])

  """Entry point for launching an IPython kernel.
  


In [17]:
print(conts_neg_vecs.shape)
print(conts_pos_vecs.shape)

(277769, 49424)
(277228, 49424)


In [19]:
# We have randomly selected training samples to control the memory usage

#for yelp  ... 180,000 and 270k
#neg_idxs = np.random.choice(conts_neg_vecs.shape[0], size=50000, replace=False)
#pos_idxs = np.random.choice(conts_pos_vecs.shape[0], size=50000, replace=False)

#for image caption  .. 6k / 6k train
#neg_idxs = np.random.choice(conts_neg_vecs.shape[0], size=6000, replace=False)
#pos_idxs = np.random.choice(conts_pos_vecs.shape[0], size=6000, replace=False)
 
#for amazon   ... 277kpos and 270k neg train
neg_idxs = np.random.choice(conts_neg_vecs.shape[0], size=50000, replace=False)
pos_idxs = np.random.choice(conts_pos_vecs.shape[0], size=50000, replace=False)

In [20]:
#for i in trange(conts_neg_vecs.shape[0]):
for i in trange(len(neg_idxs)):
    np_array = conts_neg_vecs[neg_idxs[i]].toarray()[0]
    train0_tree.add_item(i,np_array)

100%|██████████| 50000/50000 [05:39<00:00, 147.21it/s]


In [21]:
train0_tree.build(50)
train0_tree.save(data_dir+'tfidf_train0.ann')

True

In [31]:
#ref1_con[0:3], " ".join(attrs_neg[neg_idxs[0]])
#test1_con[0], " ".join(attrs_neg[neg_idxs[0]])   #("it ' s small yet they you at home .", 'mistake')
test1_con[1], " ".join(attrs_neg[neg_idxs[1]])

('i will be going back and this place !', "'s season _num_ attitude")

In [22]:
#mkdir tfidf folder in processed_files_with_bert_with_best_head/delete_retrieve_edit_model/
!cd data/amazon/processed_files_with_bert_with_best_head/delete_retrieve_edit_model/; mkdir tfidf; ls

sentiment_dev_0_all_attrs.txt	sentiment_test_all_attrs.txt
sentiment_dev_0.txt		sentiment_test.txt
sentiment_dev_1_all_attrs.txt	sentiment_train_0_all_attrs.txt
sentiment_dev_1.txt		sentiment_train_0.txt
sentiment_dev_all_attrs.txt	sentiment_train_1_all_attrs.txt
sentiment_dev.txt		sentiment_train_1.txt
sentiment_test_0_all_attrs.txt	sentiment_train_all_attrs.txt
sentiment_test_0.txt		sentiment_train.txt
sentiment_test_1_all_attrs.txt	tfidf
sentiment_test_1.txt


## may 16 TRAIN AND TEST 1

In [23]:
# Here we'd like to get attributes from the target class

# we'd like to take the positive examples content and give them negative attributes

with open(data_dir+"processed_files_with_bert_with_best_head/delete_retrieve_edit_model/tfidf/test_1.txt", "w") as out_fp:
    for i in range(conts_from_pos_test_vecs.shape[0]):   #for positive test sentences
        #if i < 10:
        x = conts_from_pos_test_vecs[i].toarray()[0]
        inx,dis = train0_tree.get_nns_by_vector(x, 1, include_distances=True)  #get negative attributes closes to it
        test_sen = test1_con[i]
        #ref_sen = processed_ref0[i].replace("<POS>","").replace("<NEG>","").replace("<CON_START>","").replace("<START>","")
        #print(dis,"\t",ref0_org[i], "\t" ,train1_data[inx[0]], train1_attr[inx[0]])

        a = conts_from_pos_test_vecs[i].sorted_indices()
        #print(type(a.tolil()))
        #print([ b for b in dir(a) if b[0] != "_"])

        pos_test_sent_words = [fnames[a] for a in conts_from_pos_test_vecs[i].indices]
        neg_train_words = [fnames[a] for a in conts_vecs[neg_idxs[inx[0]]].indices]
        #print(inx,neg_idxs[inx[0]], "Dist from ",pos_test_sent_words, "to ", neg_train_words,"=",dis )
        out_str = "<ATTR_WORDS> " + " ".join(attrs_neg[neg_idxs[inx[0]]]) + " <CON_START> " + test_sen.strip() + " <START>" + "\n"
        #print(out_str)
        out_fp.write(out_str)

In [27]:
"""
for i in trange(len(pos_idxs)):
    np_array = conts_pos_vecs[pos_idxs[i]].toarray()[0]
    train1_tree.add_item(i,np_array)
    
train1_tree.build(50)
"""
train1_tree.save(data_dir+'tfidf_train1.ann')

True

In [28]:
with open(data_dir+"processed_files_with_bert_with_best_head/delete_retrieve_edit_model/tfidf/test_0.txt", "w") as out_fp:
    for i in range(conts_from_neg_test_vecs.shape[0]):   #for negative test sentences
        x = conts_from_neg_test_vecs[i].toarray()[0]
        inx,dis = train1_tree.get_nns_by_vector(x, 1, include_distances=True)  #get positive attributes closes to it in train
        test_sen = test0_con[i]
        out_str = "<ATTR_WORDS> " + " ".join(attrs_pos[pos_idxs[inx[0]]]) + " <CON_START> " + test_sen.strip() + " <START>" + "\n"
        out_fp.write(out_str)

### TRAIN AND REFERENCE 1

In [29]:
with open(data_dir+"processed_files_with_bert_with_best_head/delete_retrieve_edit_model/tfidf/reference_1.txt", "w") as out_fp:
    for i in range(conts_from_pos_ref_vecs.shape[0]):
        x = conts_from_pos_ref_vecs[i].toarray()[0]
        inx,dis = train0_tree.get_nns_by_vector(x, 1, include_distances=True)
        ref_sen = ref1_con[i]
        #ref_sen = processed_ref0[i].replace("<POS>","").replace("<NEG>","").replace("<CON_START>","").replace("<START>","")
        #print(dis,"\t",ref0_org[i], "\t" ,train1_data[inx[0]], train1_attr[inx[0]])
        out_str = "<ATTR_WORDS> " + " ".join(attrs_neg[neg_idxs[inx[0]]]) + " <CON_START> " + ref_sen.strip() + " <START>" + "\n"
        print(out_str)
        out_fp.write(out_str)

<ATTR_WORDS> do n't <CON_START> it ' s small yet they you feel at home . it ' s small yet they you feel like a stranger . <START>

<ATTR_WORDS> miss <CON_START> i will be going back and this place ! i ' t be going back and suffering at this terrible place ! <START>

<ATTR_WORDS> worst 's to <CON_START> the drinks were affordable and a . the drinks were and half full . <START>

<ATTR_WORDS> to dry <CON_START> my husband got a ruben sandwich , he it . my husband got a sandwich , he it . <START>

<ATTR_WORDS> sadly not <CON_START> i signed up for their email and a coupon . i up for their email and spam . <START>

<ATTR_WORDS> no gloves <CON_START> i ' d definitely giving them a try . i ' d not giving them a try . <START>

<ATTR_WORDS> check other notice <CON_START> i highly recommend e & m . i highly e & m painting . <START>

<ATTR_WORDS> please hire n't slow <CON_START> otherwise a and we will go again . otherwise a experience and we will not go again . <START>

<ATTR_WORDS> _num_ minute

In [30]:
#for i in trange(conts_neg_vecs.shape[0]):
for i in trange(len(pos_idxs)):
    np_array = conts_pos_vecs[pos_idxs[i]].toarray()[0]
    train1_tree.add_item(i,np_array)

100%|██████████| 50000/50000 [01:07<00:00, 744.38it/s]


In [31]:
train1_tree.build(50)
train1_tree.save(data_dir+'tfidf_train1.ann')

True

In [32]:
with open(data_dir+"processed_files_with_bert_with_best_head/delete_retrieve_edit_model/tfidf/reference_0.txt", "w") as out_fp:
    for i in range(conts_from_neg_ref_vecs.shape[0]):
        x = conts_from_neg_ref_vecs[i].toarray()[0]
        inx,dis = train1_tree.get_nns_by_vector(x, 1, include_distances=True)
        ref_sen = ref0_con[i]
        out_str = "<ATTR_WORDS> " + " ".join(attrs_pos[pos_idxs[inx[0]]]) + " <CON_START> " + ref_sen.strip() + " <START>" + "\n"
        print(i, out_str)
        out_fp.write(out_str)

0 <ATTR_WORDS> also best <CON_START> ever since joes has changed hands it ' s just gotten and . ever since joes has changed hands it ' s gotten and better . <START>

1 <ATTR_WORDS> great place <CON_START> there is definitely enough in that part of the venue . there is so much in that part of the venue <START>

2 <ATTR_WORDS> fantastic fabulous <CON_START> so basically tasted . it didn ' t taste down at all . <START>

3 <ATTR_WORDS> love <CON_START> she said she ' be back and for a few minutes . she said she ' be back , and didn ' t disappear at all . <START>

4 <ATTR_WORDS> find honesty <CON_START> i ca ' t believe how this pharmacy is . this pharmacy is really . <START>

5 <ATTR_WORDS> great great fun <CON_START> just and took it the bill . just and put it on the bill . <START>

6 <ATTR_WORDS> love <CON_START> it is n ' t terrible , but it is ' t very either . it is n ' t perfect , but it is very . <START>

7 <ATTR_WORDS> w/ smile <CON_START> definitely that i could use my birthday gi