In [1]:
import json
import urllib.request

# Download each SST split as JSON Lines and parse it into a list of dicts,
# stored under the split name in `sst_dataset`.
sst_dataset = {}
for split in ["train", "dev", "test"]:
    URL = f"https://raw.githubusercontent.com/successar/instance_attributions_NLP/master/Datasets/SST/data/{split}.jsonl"
    with urllib.request.urlopen(URL) as response:
        data = response.read().decode()
    # One JSON object per line. Split on "\n" only (JSON strings may contain
    # other Unicode line separators) and skip blank lines, which would
    # otherwise make json.loads raise.
    data = [json.loads(line) for line in data.split("\n") if line.strip()]
    sst_dataset[split] = data

In [2]:
import numpy as np
from tqdm import tqdm

In [3]:
#!pip install -U spacy
#!python -m spacy download en_core_web_md

In [4]:
import spacy

# spaCy supplies the feature vectors for the input text; `nlp` is the loaded
# "en_core_web_md" pipeline.
nlp = spacy.load("en_core_web_md")

In [5]:
import numpy as np
from tqdm import tqdm
# Per split: X holds one spaCy document vector per example, y the matching labels.
X, y = {}, {}
for split in ("train", "dev"):
    examples = sst_dataset[split]
    # tqdm wraps the iteration so the (slow) spaCy pass shows a progress bar.
    doc_vectors = [nlp(example["document"]).vector for example in tqdm(examples)]
    X[split] = np.array(doc_vectors)
    y[split] = np.array([example["label"] for example in examples])

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6920/6920 [00:45<00:00, 152.28it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 872/872 [00:05<00:00, 156.73it/s]


In [6]:
from sklearn.linear_model import LogisticRegression
# L2-penalised logistic regression (C=1) fitted on the train vectors.
# The cell's displayed value is the model's score on the dev split.
model = LogisticRegression(C=1, penalty="l2").fit(X["train"], y["train"])
model.score(X["dev"], y["dev"])

0.7924311926605505

\begin{equation}
\delta := f_{\hat{\theta}_{Z_k}}(x_t) - f_{\hat{\theta}}(x_t) \approx \nabla_{\theta} f_{\hat{\theta}}(x_t)^T \Delta_k
\end{equation}

\begin{equation}
\begin{gathered}
\Delta_k = -\varepsilon \nabla_{\theta}^2 R(\hat{\theta})^{-1} \sum_{i=1}^k \nabla_{\theta} L(x_i, \hat{\theta})\\
\Delta_k = - \varepsilon H_{\hat{\theta}}^{-1} \sum_{i=1}^k \nabla_{\theta} L(x_i, \hat{\theta})
\end{gathered}
\end{equation}

In [7]:
from sklearn.preprocessing import normalize
# Number of examples per split, used for the per-split scaling below.
n_train = X["train"].shape[0]
n_dev = X["dev"].shape[0]

# All parameters as a single row: the coefficients followed by the intercept.
w = np.concatenate([model.coef_, model.intercept_[np.newaxis, :]], axis=1)

# A trailing column of ones lets the intercept be handled as one more weight
# when taking gradients.
ones_train = np.ones((n_train, 1))
ones_dev = np.ones((n_dev, 1))
F_train = np.concatenate((X["train"], ones_train), axis=1)
F_dev = np.concatenate((X["dev"], ones_dev), axis=1)

# Residuals: column 1 of predict_proba minus the label.
error_train = model.predict_proba(X["train"])[:, 1] - y["train"]
error_dev = model.predict_proba(X["dev"])[:, 1] - y["dev"]

# One gradient row per example: residual-scaled features plus w divided by the
# split size.
# NOTE(review): the w / n term also covers the intercept entry, and the dev
# gradients divide by the dev size rather than the train size -- confirm both
# are intended.
gradient_train = error_train[:, None] * F_train + w / n_train
gradient_dev = error_dev[:, None] * F_dev + w / n_dev
gradient_train.shape, gradient_dev.shape

((6920, 301), (872, 301))

In [8]:
# Shape check: one residual per training example and one row of F_train per example.
error_train.shape, F_train.shape

((6920,), (6920, 301))

In [9]:
from scipy import sparse
# Hessian of the training objective used by the attribution scores:
#   F^T diag(p * (1 - p)) F / n  +  I / n
probs = model.predict_proba(X["train"])[:, 1]
n_train = X["train"].shape[0]

# Per-example curvature weights p * (1 - p).
curvature = probs * (1 - probs)

# Scale the columns of F^T by broadcasting instead of multiplying by
# np.diag(curvature): the diagonal matrix would be a dense n x n array
# that is almost entirely zeros.
hessian = (F_train.T * curvature) @ F_train / n_train + 1 * np.eye(F_train.shape[1]) / n_train
inverse_hessian = np.linalg.inv(hessian)

In [11]:
# Cholesky factor of the Hessian. NumPy raises LinAlgError when the
# factorisation fails, so this presumably doubles as a positive-definiteness
# check -- confirm intent.
np.linalg.cholesky(hessian)

array([[ 3.44652068e-02,  0.00000000e+00,  0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [-1.59673178e-02,  5.25006197e-02,  0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 1.34287804e-02, -3.01205399e-02,  3.87645492e-02, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       ...,
       [-1.70975063e-04,  2.77083632e-03, -1.01172086e-02, ...,
         2.03344176e-02,  0.00000000e+00,  0.00000000e+00],
       [-7.82649353e-03,  1.97866349e-02, -1.05031441e-02, ...,
         2.49106034e-04,  2.14304553e-02,  0.00000000e+00],
       [-1.23917368e-01,  2.75314673e-01, -1.20560337e-01, ...,
         5.12357706e-05,  6.70301039e-04,  1.98460023e-02]])

In [12]:
# Weight of a single training example: one over the training-set size.
eps = 1 / X["train"].shape[0]

# Column i is the parameter shift attributed to training example i:
# -eps * H^{-1} * gradient_i.
delta_k = (-eps * inverse_hessian) @ gradient_train.T

# Dev probabilities as a single column so they broadcast across the feature columns.
pred = model.predict_proba(X["dev"])[:, 1].reshape(-1, 1)

# Gradient of the predicted probability with respect to the parameters:
# p * (1 - p) times the bias-augmented features.
grad_f = (pred * (1 - pred)) * F_dev

# Entry (t, i): first-order change in dev example t's probability from training example i.
delta_pred = grad_f @ delta_k
delta_pred.shape

(872, 6920)

In [13]:
# Row-normalised dev gradients, shared by the two cosine-style scores below.
unit_gradient_dev = normalize(gradient_dev)

# Each score matrix has one row per dev example and one column per training example.

# Plain dot product of loss gradients.
gradient_dot_product = gradient_dev @ gradient_train.T

# Dot product of row-normalised gradients.
gradient_cosine_product = unit_gradient_dev @ normalize(gradient_train).T

# Dev gradient, inverse Hessian, train gradient.
influence = gradient_dev @ inverse_hessian.T @ gradient_train.T

# Same as above but with both sides row-normalised.
relative_influence = unit_gradient_dev @ normalize(gradient_train @ inverse_hessian).T

In [25]:
# Shape check: gradient_train has one row per training example; hessian is square.
gradient_train.shape, hessian.shape

((6920, 301), (301, 301))

In [None]:
# Recomputes gradient_dev with the same expression used when the gradients
# were first built; this cell has no execution count.
gradient_dev = F_dev * error_dev[:, None] +  w / X["dev"].shape[0]

### Remove the top $k$ training examples

In [49]:
# Predicted probability for the first dev example (a length-1 array, since pred is a single column).
pred[0]

array([0.98660862])

In [50]:
def Remove(k, scores, test_idx):
    """Drop k training examples chosen by their attribution score for one dev example.

    Reads the notebook-level globals `pred` (current dev probabilities) and
    `X["train"]` / `y["train"]`.

    If the current probability for `test_idx` is above 0.5, the k
    highest-scoring training examples are removed (the "decrease" case);
    otherwise the k lowest-scoring ones are removed (the "increase" case).

    Args:
        k: Number of training examples to remove; must be >= 0. Values larger
            than the training set remove every example.
        scores: 2-D array indexed as scores[test_idx][train_idx].
        test_idx: Row of `scores` / index into `pred` for the dev example.

    Returns:
        (X_k, y_k, prediction): the training features and labels with the
        selected rows deleted, and the negated sum of the removed examples'
        scores.

    Raises:
        ValueError: if k is negative.
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")

    row_scores = scores[test_idx]
    ascending = row_scores.argsort()

    if pred[test_idx] > 0.5:
        # Take the k largest scores. Slicing from an explicit start index
        # (rather than [-k:]) makes k == 0 select nothing instead of everything.
        top_k_index = ascending[max(ascending.size - k, 0):]
    else:
        # Take the k smallest scores.
        top_k_index = ascending[:k]

    X_k = np.delete(X["train"], top_k_index, axis=0)
    y_k = np.delete(y["train"], top_k_index, axis=0)

    prediction = -np.sum(row_scores[top_k_index])

    return X_k, y_k, prediction

### Retrain the model on the reduced training set

In [57]:
def new_train(k, dev_index, scores):
    """Retrain after removing k training examples and measure the effect on one dev example.

    Uses `Remove` to build the reduced training set, fits a fresh
    LogisticRegression(penalty='l2', C=1) on it, and compares its output on
    dev example `dev_index` with the notebook-level `model`.

    Args:
        k: Number of training examples to remove.
        dev_index: Index of the dev example in X["dev"].
        scores: Attribution score matrix passed through to `Remove`.

    Returns:
        (abs_change, flip, prediction), where
        - abs_change is the absolute difference in the column-1 probability
          between the retrained and the original model,
        - flip is a length-1 boolean array that is True when the retrained
          model's predicted label differs from the original model's,
        - prediction is the value returned by `Remove`.
        Returns (None, None, None) when the reduced training labels are all 0
        or all 1, in which case no model is fitted.
    """
    X_k, y_k, prediction = Remove(k, scores, dev_index)

    # Skip fitting when only one label value is left (labels assumed to be 0/1).
    positives = np.sum(y_k)
    if positives == 0 or positives == y_k.shape[0]:
        return None, None, None

    # Fit the model again on the reduced training set.
    model_k = LogisticRegression(penalty='l2', C=1)
    model_k.fit(X_k, y_k)

    # predict_proba / predict expect a 2-D input, so reshape the single vector to one row.
    test_point = np.reshape(X["dev"][dev_index], (1, -1))

    change = model_k.predict_proba(test_point)[0][1] - model.predict_proba(test_point)[0][1]

    # A flip means the predicted label changed, so compare with != (using ==
    # would report True for predictions that stayed the same).
    flip = (model.predict(test_point) != model_k.predict(test_point))

    return np.abs(change), flip, prediction

In [58]:
# Disabled experiment, kept as a string literal so it does not execute:
# it collects new_train's change and prediction for k=1 on dev examples 1..99.
"""
changes=[]
predictions = []
for i in range(1, 100):
    change, _, prediction = new_train(1, i, delta_pred)
    changes.append(change)
    predictions.append(prediction)
"""

'\nchanges=[]\npredictions = []\nfor i in range(1, 100):\n    change, _, prediction = new_train(1, i, delta_pred)\n    changes.append(change)\n    predictions.append(prediction)\n'

In [59]:
# direction
# Disabled follow-up, kept as a string literal so it does not execute: it
# computes the relative error between `predictions` and `changes` and counts
# how many pairs share a sign.
# NOTE(review): new_train returns np.abs(change), so `changes` is never
# negative -- the sign comparison may not mean what it appears to.
"""
changes= np.array(changes)
predictions = np.array(predictions)
error = (predictions - changes) / changes
np.sum(np.abs(error) > 1)

same_direction=0
for i in range(changes.shape[0]):
    if changes[i] * predictions[i] > 0:
        same_direction+=1
same_direction, np.sum(np.abs(error) > 0.3)
"""

'\nchanges= np.array(changes)\npredictions = np.array(predictions)\nerror = (predictions - changes) / changes\nnp.sum(np.abs(error) > 1)\n\nsame_direction=0\nfor i in range(changes.shape[0]):\n    if changes[i] * predictions[i] > 0:\n        same_direction+=1\nsame_direction, np.sum(np.abs(error) > 0.3)\n'

### Show how much the prediction for each test point changes

In [60]:
def change_trend(score, interval):
    """Run the remove-and-retrain experiment over a grid of k values and all dev examples.

    For k = 1, 1 + interval, 1 + 2 * interval, ... (below the training-set
    size) and for every dev example j, calls new_train(k, j, score) and
    records the results. Reads the notebook-level globals `X` and `y`.

    Args:
        score: Attribution score matrix passed through to `new_train`.
        interval: Step between successive k values.

    Returns:
        (change_matrix, flip_matrix): float arrays of shape
        (number of k values, number of dev examples). Row i corresponds to the
        i-th k value. Cells where `new_train` returned None (no model could be
        fitted) hold NaN in both matrices.
    """
    k_values = range(1, y["train"].shape[0], interval)
    col_num = int(X["dev"].shape[0])

    change_matrix = np.zeros((len(k_values), col_num))
    flip_matrix = np.zeros((len(k_values), col_num))

    for i, k in enumerate(k_values):
        print(k)
        for j in range(col_num):
            change, flip, _ = new_train(k, j, score)
            if change is None:
                # new_train could not fit a model for this (k, j); mark the
                # cell explicitly instead of relying on None -> NaN coercion.
                change_matrix[i, j] = np.nan
                flip_matrix[i, j] = np.nan
                continue
            change_matrix[i, j] = change
            # flip may arrive as a size-1 array; extract a plain scalar
            # before storing it in a single matrix cell.
            flip_matrix[i, j] = np.asarray(flip).item()

    return change_matrix, flip_matrix

In [61]:
import warnings

# Silence all warnings for the long retraining loops below.
warnings.filterwarnings("ignore")

# Step between successive k values passed to change_trend.
interval = 30

# Remove-and-retrain trend for the delta_pred scores, saved to disk as .npy files.
IP_change, IP_flip = change_trend(delta_pred, interval)
for log_path, matrix in (("IP_change_log.npy", IP_change), ("IP_flip_log.npy", IP_flip)):
    np.save(log_path, matrix)

1
3001


KeyboardInterrupt: 

In [None]:
import warnings

# Silence all warnings for the long retraining loops below.
warnings.filterwarnings("ignore")


def _trend_and_save(tag, score, change_path, flip_path):
    """Print `tag`, run change_trend on `score`, save both matrices, and return them."""
    print(tag)
    change, flip = change_trend(score, interval)
    np.save(change_path, change)
    np.save(flip_path, flip)
    return change, flip


# One run per attribution score; each writes its change/flip matrices to disk.
IF_change, If_flip = _trend_and_save("IF", influence, "IF_change_log.npy", "IF_flip_log.npy")
RIF_change, RIF_flip = _trend_and_save("RIF", relative_influence, "RIF_change.npy", "RIF_flip.npy")
GD_change, GD_flip = _trend_and_save("GD", gradient_dot_product, "GD_change.npy", "GD_flip.npy")
GC_change, GC_flip = _trend_and_save("GC", gradient_cosine_product, "GC_change.npy", "GC_flip.npy")