### Colab Setup

In [1]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('Not using a high-RAM runtime')
else:
  print('You are using a high-RAM runtime!')

Sun Oct 16 04:22:02 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import tensorflow_datasets as tfds
!pip3 install tensorflow==2.8
#!apt install --allow-change-held-packages libcudnn8=8.1.0.77-1+cuda11.2
!pip3 install --quiet "tensorflow-text==2.8.*"
import tensorflow_text as text
import tensorflow_hub as hub
import tensorflow as tf
import numpy as np
import time
from tensorflow_text import HubModuleTokenizer
tfhub_handle_preprocess = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'

!pip install -q tf-models-official==2.4.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[K     |████████████████████████████████| 1.1 MB 29.3 MB/s 
[K     |████████████████████████████████| 43 kB 2.4 MB/s 
[K     |████████████████████████████████| 1.1 MB 59.6 MB/s 
[K     |████████████████████████████████| 352 kB 87.9 MB/s 
[K     |████████████████████████████████| 99 kB 11.2 MB/s 
[K     |████████████████████████████████| 238 kB 81.5 MB/s 
[K     |████████████████████████████████| 1.3 MB 69.9 MB/s 
[?25h  Building wheel for py-cpuinfo (setup.py) ... [?25l[?25hdone
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


### Build dataset


In [4]:
import tensorflow_datasets as tfds
import tensorflow_hub as hub
import tensorflow as tf
import numpy as np
import time
from tensorflow_text import HubModuleTokenizer
tfhub_handle_preprocess = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'


In [35]:
bert_preprocess_model = hub.KerasLayer(tfhub_handle_preprocess)

def prepare_bert_input(sentences, seq_len):
  start = time.time()
  segments = []
  if len(sentences)>=64:
    for i in range(int(len(sentences)-1/64)+1):
      segment = bert_preprocess_model(sentences[64*i:min(len(sentences),64*(i+1))]) 
      segments.append(segment['input_word_ids'])
    segment_concat = tf.concat(segments,axis=0)
  else:
    segment_concat = bert_preprocess_model(sentences)['input_word_ids']
  print('time for tokenizing', time.time()-start)
  return segment_concat

def prepare_CNN_input(sentences, seq_len):
  start = time.time()
  segments = []
  if len(sentences)>=64:
    for i in range(int(len(sentences)-1/64)+1):
      segment = bert_preprocess_model(sentences[64*i:min(len(sentences),64*(i+1))]) 
      segments.append(segment)
    #print(segments)
    segments_0 = tf.concat([segment['input_word_ids'] for segment in segments],axis=0)
  else:
    segments_0 = bert_preprocess_model(sentences)['input_word_ids'] 
  print('time for tokenizing', time.time()-start)
  return segments_0


In [36]:
from official.nlp.bert import tokenization

bert_model = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3', trainable=True)
vocab_file = bert_model.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_model.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

In [7]:
MAX_SEQ_LEN = 128

ds_train = tfds.load('ag_news_subset', split='train', shuffle_files=False)
ds_test = tfds.load('ag_news_subset', split='test', shuffle_files=False)


train_sentences_list = []
y_train_orig = []
val_sentences_list = []
y_val_orig = []
test_sentences_list = []
y_test_orig = []
test_sentences_list2 = []
y_test2_orig = []

for count,x in enumerate(ds_train):
  if count<50000:
    train_sentences_list.append(x['description'].numpy().decode("utf-8") )
    y_train_orig.append([x['label']])
  elif count<55000:
    val_sentences_list.append(x['description'].numpy().decode("utf-8") )
    y_val_orig.append([x['label']])

for count,x in enumerate(ds_test):
  if count<20000:
    test_sentences_list.append(x['description'].numpy().decode("utf-8") )
    y_test_orig.append([x['label']])
  else:
    test_sentences_list2.append(x['description'].numpy().decode("utf-8") )
    y_test2_orig.append([x['label']])
# 2) encode sentences following the BERT specifications
base_dir = '/content/drive/MyDrive/tracin/agnews/'
preprocessed = True

if not preprocessed:
  x_val = prepare_bert_input(val_sentences_list, MAX_SEQ_LEN)
  x_test = prepare_bert_input(test_sentences_list, MAX_SEQ_LEN)
  x_train = prepare_bert_input(train_sentences_list, MAX_SEQ_LEN)
  y_train_orig = np.array(y_train_orig)
  y_val_orig = np.array(y_val_orig)
  y_test_orig = np.array(y_test_orig)

  y_train = np.sum(y_train_orig,axis=1,keepdims=True)
  y_val = np.sum(y_val_orig,axis=1,keepdims=True)
  y_test = np.sum(y_test_orig,axis=1,keepdims=True)
else:
  x_train = np.load(base_dir+'x_train.npy')
  x_test = np.load(base_dir+'x_test.npy')
  x_val = np.load(base_dir+'x_val.npy')

  y_train = np.load(base_dir+'y_train.npy')
  y_test = np.load(base_dir+'y_test.npy')
  y_val = np.load(base_dir+'y_val.npy')

[1mDownloading and preparing dataset 11.24 MiB (download: 11.24 MiB, generated: 35.79 MiB, total: 47.03 MiB) to ~/tensorflow_datasets/ag_news_subset/1.0.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/2 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/120000 [00:00<?, ? examples/s]

Shuffling ~/tensorflow_datasets/ag_news_subset/1.0.0.incompleteO62OX8/ag_news_subset-train.tfrecord*...:   0%|…

Generating test examples...:   0%|          | 0/7600 [00:00<?, ? examples/s]

Shuffling ~/tensorflow_datasets/ag_news_subset/1.0.0.incompleteO62OX8/ag_news_subset-test.tfrecord*...:   0%| …

[1mDataset ag_news_subset downloaded and prepared to ~/tensorflow_datasets/ag_news_subset/1.0.0. Subsequent calls will reuse this data.[0m


'\ny_train_orig = np.array(y_train_orig)\ny_val_orig = np.array(y_val_orig)\ny_test_orig = np.array(y_test_orig)\n#y_test2_orig = np.array(y_test2_orig)\n\ny_train = np.sum(y_train_orig,axis=1,keepdims=True)\ny_val = np.sum(y_val_orig,axis=1,keepdims=True)\ny_test = np.sum(y_test_orig,axis=1,keepdims=True)\n#y_test2 = np.sum(y_test2_orig,axis=1,keepdims=True)\n\n\nprint(y_train[:10])\n\nprint(train_sentences_list[:10])\n'

### Train CNN Model

In [9]:
def create_model(verbose=False):
  filters = 50
  kernel_size = 5
  maxlen = 128
  inputs = tf.keras.layers.Input(shape=(maxlen,))
  emb1 = tf.keras.layers.Embedding(25000, 128, input_length=128)(inputs)
  conv1 = tf.keras.layers.Conv1D(
      filters, kernel_size, padding='valid', activation='relu')(
          emb1)
  conv2 = tf.keras.layers.Conv1D(
      filters, kernel_size, padding='valid', activation='relu')(
          conv1)
  conv3 = tf.keras.layers.Conv1D(
      filters, 1, padding='valid', activation='relu')(
          conv2)
  pool1 = tf.keras.layers.GlobalMaxPooling1D()(conv3)
  pred = tf.keras.layers.Dense(4, activation='softmax')(pool1)
  model = tf.keras.Model(inputs, pred)
  if verbose:
    model.summary()
  return model
  """
  filters = 50
  kernel_size=5
  maxlen = 128
  inputs = tf.keras.layers.Input(shape=(maxlen,))
  emb1 = tf.keras.layers.Embedding(30265,128,input_length=128)(inputs)
  conv1 = tf.keras.layers.Conv1D(filters,kernel_size,padding='valid',activation='relu')(emb1)
  conv2 = tf.keras.layers.Conv1D(filters,kernel_size,padding='valid',activation='relu')(conv1)
  conv3 = tf.keras.layers.Conv1D(filters,1,padding='valid',activation='relu')(conv2)
  pool1 = tf.keras.layers.GlobalMaxPooling1D()(conv3)
  pred = tf.keras.layers.Dense(4,activation='softmax')(pool1)
  model = tf.keras.Model(inputs,pred)
  if verbose:
    model.summary()
  return model
  """

In [10]:
batch_size = 128
epochs = 10
opt = tf.keras.optimizers.SGD(learning_rate=2e-2, momentum=0.9)
for i in range(10):
  trained = True
  model = create_model(True)
  loss = tf.keras.losses.SparseCategoricalCrossentropy()
  model.compile(loss=loss, optimizer= opt, metrics=['accuracy'])
  cp_callback = tf.keras.callbacks.ModelCheckpoint(base_dir + 'checkpoint_test_222_'+'{}'.format(i)+'_{epoch:02d}.hdf5', verbose=1, save_weights_only=False)
  if not trained:
    model.fit(x_train, y_train,
              batch_size=batch_size,
              epochs=10,
              validation_data=(x_val, y_val),
              callbacks=[cp_callback],
              shuffle=True)

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 128)]             0         
                                                                 
 embedding (Embedding)       (None, 128, 128)          3200000   
                                                                 
 conv1d (Conv1D)             (None, 124, 50)           32050     
                                                                 
 conv1d_1 (Conv1D)           (None, 120, 50)           12550     
                                                                 
 conv1d_2 (Conv1D)           (None, 120, 50)           2550      
                                                                 
 global_max_pooling1d (Globa  (None, 50)               0         
 lMaxPooling1D)                                                  
                                                             

###  Choose Test data list

In [11]:
print(model.summary())

Model: "model_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_10 (InputLayer)       [(None, 128)]             0         
                                                                 
 embedding_9 (Embedding)     (None, 128, 128)          3200000   
                                                                 
 conv1d_27 (Conv1D)          (None, 124, 50)           32050     
                                                                 
 conv1d_28 (Conv1D)          (None, 120, 50)           12550     
                                                                 
 conv1d_29 (Conv1D)          (None, 120, 50)           2550      
                                                                 
 global_max_pooling1d_9 (Glo  (None, 50)               0         
 balMaxPooling1D)                                                
                                                           

In [12]:
prediction_tests = np.zeros((10,7600,4))
model = create_model()
for i in range(10):
  model.load_weights(base_dir + 'checkpoint_test_222_{}_10.hdf5'.format(i))
  prediction_list = model.predict(x_test)
  print(prediction_list.shape)
  prediction_tests[i] = prediction_list

np.save(base_dir+'prediction_tests.npy', prediction_tests)

(7600, 4)
(7600, 4)
(7600, 4)
(7600, 4)
(7600, 4)
(7600, 4)
(7600, 4)
(7600, 4)
(7600, 4)
(7600, 4)


In [13]:
np.random.seed(0)
evaluate_test_list_highloss = np.random.permutation(7600)[:20]
#evaluate_test_list_highloss = evaluate_test_list_highloss[-10:] + evaluate_test_list_highloss[10:20]

print(x_test.shape)
print(evaluate_test_list_highloss)
x_eval = tf.convert_to_tensor(x_test[evaluate_test_list_highloss,:])
y_eval = y_test[evaluate_test_list_highloss]
eval_sentences_list = [test_sentences_list[i] for i in evaluate_test_list_highloss]

prediction_eval = np.mean(prediction_tests[:, evaluate_test_list_highloss],0)

(7600, 128)
[6529 4672 3996 2967 5133 6871 1014 4700 2164 5935  273 4494 3304 2453
 1824  152  613 4448 2232 4118]
[[2]
 [2]
 [3]
 [1]
 [0]
 [3]
 [0]
 [1]
 [1]
 [0]
 [1]
 [3]
 [0]
 [2]
 [0]
 [0]
 [2]
 [0]
 [3]
 [1]]
['Retail sales slid in August as people steered away from buying cars and shoppers kept a close eye on their spending after splurging in July.', 'The fallout from allegations of serious accounting problems at Fannie Mae has rattled investors and could even bump up mortgage rates down the road.', 'McData plans to introduce a new SAN router this week designed to connect the growing number of isolated SAN networks in corporations.&lt;p&gt;ADVERTISEMENT&lt;/p&gt;&lt;p&gt;&lt;img src="http://ad.doubleclick.net/ad/idg.us.ifw.general/sbcspotrssfeed;sz=1x1;ord=200301151450?" width="1" height="1" border="0"/&gt;&lt;a href="http://ad.doubleclick.net/clk;9228975;9651165;a?http://www.infoworld.com/spotlights/sbc/main.html?lpid0103035400730000idlp"&gt;SBC Case Study: Crate   Barrel&lt;/

## TracIn

### Load models

In [14]:
model = create_model()
print(model.summary())

cp_list = [3,8]
print(cp_list)
models = []
models_nosm = []
models_penultimate = []

for countm in range(1):
  for count,i in enumerate(cp_list):
    print(count)
    model = create_model()
    loss = tf.keras.losses.BinaryCrossentropy()
    #best_weights_file = os.path.join('/content/gdrive/My Drive/models/', 'model.ckpt-{epoch:04d}')
    model.load_weights(base_dir + 'checkpoint_test_222_{0:01d}_{1:02d}.hdf5'.format(countm,i+1))
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=[tf.keras.metrics.AUC(multi_label=False, curve="ROC"),
                                                        'accuracy'])
    models.append(model)

Model: "model_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_12 (InputLayer)       [(None, 128)]             0         
                                                                 
 embedding_11 (Embedding)    (None, 128, 128)          3200000   
                                                                 
 conv1d_33 (Conv1D)          (None, 124, 50)           32050     
                                                                 
 conv1d_34 (Conv1D)          (None, 120, 50)           12550     
                                                                 
 conv1d_35 (Conv1D)          (None, 120, 50)           2550      
                                                                 
 global_max_pooling1d_11 (Gl  (None, 50)               0         
 obalMaxPooling1D)                                               
                                                          

In [15]:
print(len(model.trainable_variables))
for count,layer in enumerate(model.trainable_variables):
  print(count, layer.shape)

9
0 (25000, 128)
1 (5, 128, 50)
2 (50,)
3 (5, 50, 50)
4 (50,)
5 (1, 50, 50)
6 (50,)
7 (50, 4)
8 (4,)


In [16]:
print(x_train)

[[  101  2572  2094 ...     0     0     0]
 [  101 26665  1011 ...     0     0     0]
 [  101  2343  5747 ...     0     0     0]
 ...
 [  101 18952  3619 ...     0     0     0]
 [  101  2149  2390 ...     0     0     0]
 [  101  4811  4484 ...     0     0     0]]


### tracin Util

In [51]:
tracin_evaluate = []
tracin_train = []

def gather_flat_grad(grads):
    views = []
    for p in grads:
      view = tf.reshape(p,[-1])
      views.append(view)
    if len(views)>0:
      return tf.concat(views, 0)
    else:
      return views

@tf.function
def run_withfirstgrad_pred1(X_train1, y_train1, layers='all'):
  if layers == 'all' or 0 in layers:
    we = True
  else:
    we = False
  ind = []
  val = []
  grads = []
  for i, ml in enumerate(models):
    with tf.GradientTape(watch_accessed_variables=False,persistent=False) as tape1:
      if layers == 'all':
        w = ml.trainable_variables
      else:
        w = [ml.trainable_variables[layer] for layer in layers]
      tape1.watch(w)
      prediction = ml(X_train1)
      loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)(y_train1[0], prediction)
      #print(prediction,y_train1[0])
      #print(loss)
    grad = tape1.gradient(loss,w)
    if we:
      grads.append(gather_flat_grad(grad[1:]))
      ind.append(grad[0].indices)
      val.append(grad[0].values)
    else:
      grads.append(gather_flat_grad(grad))
  return tf.concat(grads, 0), ind, val

def get_tracin_grad(x_full, y_full, layers='all', top=False):
  batchsize= 1
  grads = []
  inds = []
  vals = []
  inds_10 = []
  vals_10 = []
  for i in range(x_full.shape[0]):
    if i%1000 ==0:
      print(i)  
    x = x_full[i:i+1]
    y = y_full[i:i+1]
    grad, ind, val = run_withfirstgrad_pred1(x, y, layers)
    grad = grad.numpy()
    ind_reshape = np.array(ind)
    val_reshape = np.array(val)
    #print(ind_reshape.shape)
    #print(val_reshape.shape)
    #print(grad.shape)
    grads.append([grad])
    if ((layers == 'all') or (0 in layers)):
      model_size = len(models)
      ind_repeat = np.array([[i*128]*10 for i in range(model_size)])
      #ind_reshape = ind.reshape((-1,128))
      #print(ind_reshape.shape)
      #val_reshape = val.reshape((-1,128,128))
      val_norm = np.linalg.norm(val_reshape,axis=2)
      top_10_ind_loc = val_norm.argsort(axis=1)[:,-10:]
      #print('start')
      #print(top_10_ind_loc)
      inds.append(ind_reshape)
      vals.append(val_reshape)
      top_10_ind_loc += ind_repeat
      #top_10_ind_loc = top_10_ind_loc.reshape(-1)
      #print(ind)
      #print(ind[top_10_ind_loc])
      inds_10.append(ind_reshape.flatten()[top_10_ind_loc])
      #print(ind_reshape.flatten().shape)
      #print(top_10_ind_loc)
      #print(ind_reshape.flatten()[top_10_ind_loc])
      #print(val_reshape.flatten().shape)
      vals_10.append(val_reshape.reshape(-1,128)[top_10_ind_loc])

  print('grad shape', np.concatenate(grads).shape)
  if  top:
    return {
            'grads': np.concatenate(grads),
            'inds': np.array(0),
            'vals': np.array(0),
            'inds_10': np.array(inds_10),
            'vals_10': np.array(vals_10),
            } 
  elif (layers=='all') or (0 in layers):
    return {
            'grads': np.concatenate(grads),
            'inds': np.array(inds),
            'vals': np.array(vals),
            'inds_10': np.array(0),
            'vals_10': np.array(0),
            } 
  else:
    return {
            'grads': np.concatenate(grads),
            'inds': np.array(0),
            'vals': np.array(0),
            'inds_10': np.array(0),
            'vals_10': np.array(0),
            } 

def sum_indexvalue(ind1s, val1s, ind2s, val2s):
  #print(ind1s.shape)
  #print(val1s.shape)
  sum=0.
  max_list = [(-1,0), (-1,0), (-1,0), (-1,0), (-1,0)]
  #print(ind2)
  for m in range(ind1s.shape[0]):
    ind1 = ind1s[m]
    ind2 = ind2s[m]
    if m==0:
      indexset1 = set(ind1)
      indexset2 = set(ind2)
    else:
      indexset1 = indexset1.union(set(ind1))
      indexset2 = indexset2.union(set(ind2))
  intersect = indexset1.intersection(indexset2)
  for ind in intersect:
    add = 0
    for m in range(ind1s.shape[0]):
      ind1 = ind1s[m]
      ind2 = ind2s[m]
      val1 = val1s[m]
      val2 = val2s[m]
      #if (ind !=0 and ind!= 101 and ind!=102):
      #  sum+= np.sum(np.sum(tensor1[index1==ind],axis=0) * \
      add += np.sum(np.sum(val1[ind1==ind],axis=0) * \
          np.sum(val2[ind2==ind],axis=0))
    max_list.append((np.abs(add),ind))
    max_list.sort()
    max_list.reverse()
    if len(max_list)>5:
      max_list.pop(-1)
    sum+= add
  

  return sum, [item[1] for item in max_list]


def get_tracin_sum(grad2, ind2, val2, grad1, ind1, val1, we):
  if not we:
    return np.sum(grad2*grad1), 0
  elif grad2 and grad1:
    val, max_ind = sum_indexvalue(ind1, val1, ind2, val2)
    return np.sum(grad2*grad1)+val, max_ind
  else:
    val, max_ind = sum_indexvalue(ind1, val1, ind2, val2)
    return val, max_ind

def get_tracin_list(tracin_eval, tracin_train, layers='all', top = False):
  if layers == 'all' or 0 in layers:
    we = True
  else:
    we = False
  sim_array = np.zeros((tracin_eval['grads'].shape[0], tracin_train['grads'].shape[0]))
  ind_array = np.zeros((tracin_eval['grads'].shape[0], tracin_train['grads'].shape[0], 5))
  for i in range(tracin_eval['grads'].shape[0]):
    #print('i', i)
    for j in range(tracin_train['grads'].shape[0]):
      #if j%1000 ==0:
      #  #print('j', j)
      if we:
        if top:
          val, ind = get_tracin_sum(tracin_eval['grads'][i], tracin_eval['inds_10'][i], tracin_eval['vals_10'][i], tracin_train['grads'][j], tracin_train['inds_10'][j], tracin_train['vals_10'][j], we)
        else:
          val, ind = get_tracin_sum(tracin_eval['grads'][i], tracin_eval['inds'][i], tracin_eval['vals'][i], tracin_train['grads'][j], tracin_train['inds'][j], tracin_train['vals'][j], we)
      else:
          val, ind = get_tracin_sum(tracin_eval['grads'][i], 0,0,tracin_train['grads'][j],0,0, we)
      sim_array[i,j] = val
      ind_array[i,j,:] = ind

  return sim_array, ind_array

In [48]:
print(tracin_evaluate['inds_10'].shape)
print(tracin_evaluate['vals_10'].shape)

(20, 2, 10)


### TracIn firsttop


In [55]:
tracin_evaluate = []
tracin_train = []
method = 'tracin_first_top10'
tracin_evaluate = get_tracin_grad(x_eval, y_eval, [0], top=True)
start = time.time()
tracin_train = get_tracin_grad(x_train, y_train, [0], top=True)
print('preparation time on saving gradient: {}'.format(time.time()-start))
print(tracin_evaluate['grads'].shape)
print(tracin_evaluate['inds'].shape)
print(tracin_evaluate['vals'].shape)


start = time.time()
evaluate_train_score, evaluate_train_ind = get_tracin_list(tracin_evaluate, tracin_train, [0], top=True)
print('time on calculating TracIn for 10 test * 50000 training: {}'.format(time.time()-start))
np.save(base_dir + 'evaluate_train_score_testtest2_' + method + '.npy', evaluate_train_score)
np.save(base_dir + 'evaluate_train_ind_testtest2_' + method + '.npy', evaluate_train_ind)

0
grad shape (20, 0)
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
grad shape (50000, 0)
preparation time on saving gradient: 131.29706478118896
(20, 0)
()
()




time on calculating TracIn for 10 test * 50000 training: 60.36470437049866


In [58]:
for i in range(20):
  method = 'tracin_first_top10'
  evaluate_train_score = np.load(base_dir + 'evaluate_train_score_testtest2_' + method + '.npy')
  evaluate_train_ind = np.load(base_dir + 'evaluate_train_ind_testtest2_' + method + '.npy')


  print("="*50) 
  print('Evaluate Sentence: ')
  print('label: {}, prediction: {}'.format(
     y_eval[i][0], prediction_tests[0][i]))
  print(eval_sentences_list[i])
  print("="*50)  
  print('Proponents: ')
  #print(x_eval[i])
  rm_inds = np.argsort(evaluate_train_score[i])[:10]
  for rm_ind in rm_inds:
    print('label: {},predicted_scores: {}, influence: {}'.format(y_train[rm_ind], y_train[rm_ind], evaluate_train_score[i,rm_ind]))
    print(train_sentences_list[rm_ind])
    
    print('top key words')
    print([tokenizer.convert_ids_to_tokens([evaluate_train_ind[i, rm_ind,j]]) for j in range(2)])
    print('-'*50)
  print("="*50)  
  print('Opponents: ')
  #print(x_eval[i])
  rm_inds = np.argsort(evaluate_train_score[i])[::-1][:10]
  for rm_ind in rm_inds:
    print('label: {},predicted_scores: {}, influence: {}'.format(y_train[rm_ind], y_train[rm_ind], evaluate_train_score[i,rm_ind]))
    print(train_sentences_list[rm_ind])
    
    print('top key words')
    print([tokenizer.convert_ids_to_tokens([evaluate_train_ind[i, rm_ind,j]]) for j in range(2)])
    print('-'*50)
  print('\n\n')

Evaluate Sentence: 
label: 2, prediction: [8.57603663e-06 9.99986172e-01 1.31222350e-06 3.96825226e-06]
Retail sales slid in August as people steered away from buying cars and shoppers kept a close eye on their spending after splurging in July.
Proponents: 
label: [0],predicted_scores: [0], influence: -17.930963918566704
The Passion of the Christ sells 2.4 million copies on DVD in its first few hours on sale in the US.
top key words
[['in'], ['on']]
--------------------------------------------------
label: [1],predicted_scores: [1], influence: -16.542261362075806
Ducks and geese coated in crude oil were carried to a national wildlife refuge yesterday by volunteers trying to save them from the largest oil spill on the Delaware River in nearly a decade.
top key words
[['in'], ['a']]
--------------------------------------------------
label: [0],predicted_scores: [0], influence: -14.66664457321167
A businessman is sentenced to one year in jail in Japan for making a pirated film available o

### TracIn first

In [None]:
tracin_evaluate = []
tracin_train = []
method = 'tracin_first'
tracin_evaluate = get_tracin_grad(x_eval, y_eval, [0], top=False)
start = time.time()
tracin_train = get_tracin_grad(x_train, y_train, [0], top=False)
print('preparation time on saving gradient: {}'.format(time.time()-start))
print(tracin_evaluate['grads'].shape)
print(tracin_evaluate['inds'].shape)
print(tracin_evaluate['vals'].shape)


start = time.time()
evaluate_train_score, evaluate_train_ind = get_tracin_list(tracin_evaluate, tracin_train, [0], top=False)
print('time on calculating TracIn for 10 test * 50000 training: {}'.format(time.time()-start))
np.save(base_dir + 'evaluate_train_score_testtest2_' + method + '.npy', evaluate_train_score)
np.save(base_dir + 'evaluate_train_ind_testtest2_' + method + '.npy', evaluate_train_ind)

0
grad shape (20, 0)
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
grad shape (50000, 0)
preparation time on saving gradient: 129.33664202690125
(20, 0)
(20, 2, 128)
(20, 2, 128, 128)




In [None]:
for i in range(10):
  method = 'tracin_first'
  evaluate_train_score = np.load(base_dir + 'evaluate_train_score_testtest2_' + method + '.npy')
  evaluate_train_ind = np.load(base_dir + 'evaluate_train_ind_testtest2_' + method + '.npy')


  print("="*50) 
  print('Evaluate Sentence: ')
  print('label: {}, prediction: {}'.format(
     y_eval[i][0], prediction_tests[0][i]))
  print(eval_sentences_list[i])
  print("="*50)  
  print('Proponents: ')
  #print(x_eval[i])
  rm_inds = np.argsort(evaluate_train_score[i])[:10]
  for rm_ind in rm_inds:
    print('label: {},predicted_scores: {}, influence: {}'.format(y_train[rm_ind], y_train[rm_ind], evaluate_train_score[i,rm_ind]))
    print(train_sentences_list[rm_ind])
    
    print('top key words')
    print([tokenizer.convert_ids_to_tokens([evaluate_train_ind[i, rm_ind,j]]) for j in range(2)])
    print('-'*50)
  print("="*50)  
  print('Opponents: ')
  #print(x_eval[i])
  rm_inds = np.argsort(evaluate_train_score[i])[::-1][:10]
  for rm_ind in rm_inds:
    print('label: {},predicted_scores: {}, influence: {}'.format(y_train[rm_ind], y_train[rm_ind], evaluate_train_score[i,rm_ind]))
    print(train_sentences_list[rm_ind])
    
    print('top key words')
    print([tokenizer.convert_ids_to_tokens([evaluate_train_ind[i, rm_ind,j]]) for j in range(2)])
    print('-'*50)
  print('\n\n')