# MLP for token classification (seg)

In [3]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [43]:
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification
from utils import deserialize, serialize


In [44]:
# Locations of the two serialized training inputs.
sos_path = "/dfs/scratch1/gmachi/datasets/wikisection_processed/train-sos.obj"
embeds_path = "/dfs/scratch1/gmachi/datasets/wikisection_processed/embed_dict.obj"

# Load both objects. The cells below iterate sos_dict as key -> label and look
# up each key in embeds_dict to get that key's embedding.
# NOTE(review): if deserialize() unpickles, only point it at trusted files.
sos_dict = deserialize(sos_path)
embeds_dict = deserialize(embeds_path)

In [45]:
# One embedding and one integer label per key of sos_dict, in the dict's
# iteration order so the two lists stay aligned.
Xs = [embeds_dict[key] for key in sos_dict]
ys = [int(label) for label in sos_dict.values()]

In [46]:
import numpy as np
# Stack the collected embeddings row-wise into the feature matrix and turn the
# labels into an array; print both shapes as a quick alignment check.
X, y = np.vstack(Xs), np.array(ys)
print(X.shape, y.shape)

(5000, 384) (5000,)


In [47]:
# Hyperparameters for the MLP baseline: fixed seed for repeatable training and
# an iteration cap of 1000.
mlp_params = {"random_state": 1, "max_iter": 1000}
# fit() returns the fitted estimator, which is what `clf` is bound to.
clf = MLPClassifier(**mlp_params).fit(X, y)

In [48]:
# Show the class labels known to the fitted classifier; the columns of
# predict_proba follow this order.
clf.classes_

array([0, 1])

In [51]:
import os
import networkx as nx
import numpy as np  # imported here so the cell does not rely on an earlier cell's import

# Score every node of every test graph with the fitted classifier `clf` and
# write the scored copy to save_path under the same file name.
save_path = "/dfs/scratch1/gmachi/datasets/wikisection_processed/mlp_Gs"
Gs_path = "/dfs/scratch1/gmachi/datasets/wikisection_processed/test_Gs"

# Make sure the output directory exists, in case serialize() does not create it.
os.makedirs(save_path, exist_ok=True)

# os.listdir returns entries in arbitrary order; sorting makes the run, and the
# graph left in `Gm` after the loop, reproducible.
for G_file in sorted(os.listdir(Gs_path)):
    # load the test graph
    G_path = os.path.join(Gs_path, G_file)
    G = deserialize(G_path)
    # score a copy rather than the loaded graph
    Gm = G.copy()
    for node in Gm.nodes:
        # predict_proba expects a 2-D (n_samples, n_features) input:
        # atleast_2d leaves a 2-D embedding untouched and turns a 1-D
        # vector into a single row.
        X_test = np.atleast_2d(Gm.nodes[node]["emb"])
        # first row, second column of the predicted probabilities
        p = clf.predict_proba(X_test)[0][1]
        # the "emb" attribute now holds the scalar score, not the embedding
        Gm.nodes[node]["emb"] = p
    save_at = os.path.join(save_path, G_file)
    serialize(Gm, save_at)



In [52]:
# Display the node -> "emb" mapping of `Gm`, i.e. the graph most recently
# bound by the scoring loop.
nx.get_node_attributes(Gm, "emb")

{0: 4.3579316477969615e-05,
 1: 2.1627906587401834e-05,
 2: 5.197795428464583e-06,
 3: 4.710880458829425e-06,
 4: 5.976118094657616e-08,
 5: 1.5863832768164285e-08,
 6: 0.00038726140563424876,
 7: 0.00014439482125605513,
 8: 0.0002645863480532541,
 9: 1.0608585924395681e-07,
 10: 4.6108883827857445e-05,
 11: 2.8188574489046775e-07,
 12: 1.8667581026446123e-07,
 13: 0.00020934097858862723,
 14: 1.7182038113557157e-05,
 15: 9.980032472385816e-06,
 16: 8.02604029508696e-07,
 17: 2.400235488103014e-05,
 18: 7.348434294374832e-08,
 19: 2.6007748014703272e-05,
 20: 1.989148615657685e-08,
 21: 1.5408390390154372e-06,
 22: 1.1454256711972896e-05,
 23: 1.5177103392334304e-06,
 24: 1.3083109051706364e-07,
 25: 2.198075224770422e-06,
 26: 0.00026966944509450487,
 27: 5.681661671319134e-07,
 28: 0.0003737505483266776,
 29: 1.4771456330072142e-05,
 30: 0.013014728405894311,
 31: 1.8081232823902247e-08,
 32: 1.853630771765585e-07,
 33: 1.0439927112820328e-05,
 34: 0.0004980537132595116,
 35: 0.00021

# One-class SVM for novelty detection (the "svd" baseline)

In [53]:
from sklearn.svm import OneClassSVM

In [54]:
# Keep only the examples whose label equals 0: their embeddings go into Xs and
# their integer labels into ys, in the iteration order of sos_dict.
Xs = [embeds_dict[key] for key, label in sos_dict.items() if label == 0]
ys = [int(label) for label in sos_dict.values() if label == 0]

In [55]:
import numpy as np
# Stack the filtered embeddings row-wise and convert the labels to an array;
# print both shapes as a quick check of how many examples were kept.
X, y = np.vstack(Xs), np.array(ys)
print(X.shape, y.shape)

(4809, 384) (4809,)


In [56]:
# Fit a one-class SVM on the feature matrix only (no labels are passed).
# fit() returns the fitted estimator; binding it to `clf` replaces whatever
# model that name held before.
one_class_svm = OneClassSVM(gamma='auto')
clf = one_class_svm.fit(X)

In [60]:
import os
import networkx as nx
import numpy as np  # imported here so the cell does not rely on an earlier cell's import

# Score every node of every test graph with the fitted one-class model `clf`
# and write the scored copy to save_path under the same file name.
save_path = "/dfs/scratch1/gmachi/datasets/wikisection_processed/svd_Gs"
Gs_path = "/dfs/scratch1/gmachi/datasets/wikisection_processed/test_Gs"

# Make sure the output directory exists, in case serialize() does not create it.
os.makedirs(save_path, exist_ok=True)

# os.listdir returns entries in arbitrary order; sorting makes the run, and the
# graph left in `Gm` after the loop, reproducible.
for G_file in sorted(os.listdir(Gs_path)):
    # load the test graph
    G_path = os.path.join(Gs_path, G_file)
    G = deserialize(G_path)
    # score a copy rather than the loaded graph
    Gm = G.copy()
    for node in Gm.nodes:
        # score_samples expects a 2-D (n_samples, n_features) input:
        # atleast_2d leaves a 2-D embedding untouched and turns a 1-D
        # vector into a single row.
        X_test = np.atleast_2d(Gm.nodes[node]["emb"])
        # raw score of the first row
        p = clf.score_samples(X_test)[0]
        # the "emb" attribute now holds the scalar score, not the embedding
        Gm.nodes[node]["emb"] = p
    save_at = os.path.join(save_path, G_file)
    serialize(Gm, save_at)

In [61]:
# Display the node -> "emb" mapping of `Gm`, i.e. the graph most recently
# bound by the scoring loop.
nx.get_node_attributes(Gm, "emb")

{0: 2392.8088164794785,
 1: 2393.0229840336083,
 2: 2392.899406303149,
 3: 2392.709794464928,
 4: 2392.6750296645405,
 5: 2393.022430167734,
 6: 2393.196943129346,
 7: 2392.8721366692816,
 8: 2392.5967240465443,
 9: 2392.7871969382923,
 10: 2392.8458967946276,
 11: 2392.6163955325114,
 12: 2392.77759702016,
 13: 2392.7904007587035,
 14: 2392.5610822509475,
 15: 2393.0591095492437,
 16: 2392.7371238117958,
 17: 2392.846548032603,
 18: 2392.661744476926,
 19: 2392.849515871642,
 20: 2392.7865005675976,
 21: 2393.2253377116776,
 22: 2393.1013787173124,
 23: 2392.7987073028894,
 24: 2392.80896542733,
 25: 2393.0060536800074,
 26: 2393.2757434781174,
 27: 2393.0479462335893,
 28: 2392.8652299915616,
 29: 2392.814230089395,
 30: 2393.5157187637014,
 31: 2393.0058816013734,
 32: 2392.8115090239567,
 33: 2392.8965308581915,
 34: 2392.7919787495775,
 35: 2393.0180767931874,
 36: 2393.310341036503,
 37: 2393.2162601659434,
 38: 2392.881532910292,
 39: 2393.153187204399,
 40: 2393.428787869532,
 

# generate baseline results

In [63]:
from evaluation import eval_baseline_explanations
import utils 

# Run eval_baseline_explanations on the MLP-scored graphs and serialize the
# first of its two return values.
baseline = "mlp"  # label for this run; not read again inside this cell

# Inputs: the reference graph directory (presumably ground truth, going by the
# name), the scored graph directory, and the test label dictionary.
gt_graph_dir = "/dfs/scratch1/gmachi/datasets/wikisection_processed/test_gts"
G_dir = "/dfs/scratch1/gmachi/datasets/wikisection_processed/mlp_Gs"
refined_test_dict = utils.deserialize("/dfs/scratch1/gmachi/datasets/wikisection_processed/test-doc-labs.obj")

# Despite its name, cache_dir is the output file handed to utils.serialize.
cache_dir = "/dfs/scratch1/gmachi/k2/K2/src/outputs/baselines/mlp-full.obj"

model_results_dict, data_linearized_dict = eval_baseline_explanations(
    G_dir,
    gt_graph_dir,
    modality="text",
    label_dict=refined_test_dict,
)
utils.serialize(model_results_dict, cache_dir)

skipping class-1 example with all salient
skipping class-1 example with all salient
skipping class-1 example with all salient
skipping class-1 example with all salient
skipping class-1 example with all salient
skipping class-1 example with all salient


In [64]:
from evaluation import eval_baseline_explanations
import utils 

# Run eval_baseline_explanations on the graphs scored by the "svd" baseline and
# serialize the first of its two return values.
baseline = "svd"  # label for this run; not read again inside this cell

# Inputs: the reference graph directory (presumably ground truth, going by the
# name), the scored graph directory, and the test label dictionary.
gt_graph_dir = "/dfs/scratch1/gmachi/datasets/wikisection_processed/test_gts"
G_dir = "/dfs/scratch1/gmachi/datasets/wikisection_processed/svd_Gs"
refined_test_dict = utils.deserialize("/dfs/scratch1/gmachi/datasets/wikisection_processed/test-doc-labs.obj")

# Despite its name, cache_dir is the output file handed to utils.serialize.
cache_dir = "/dfs/scratch1/gmachi/k2/K2/src/outputs/baselines/svd-full.obj"

model_results_dict, data_linearized_dict = eval_baseline_explanations(
    G_dir,
    gt_graph_dir,
    modality="text",
    label_dict=refined_test_dict,
)
utils.serialize(model_results_dict, cache_dir)

skipping class-1 example with all salient
skipping class-1 example with all salient
skipping class-1 example with all salient
skipping class-1 example with all salient
skipping class-1 example with all salient
skipping class-1 example with all salient


Repeat the same evaluation for the additional baselines:

attention, entailment, and probability (`attn_Gs`, `entail_Gs`, `prob_Gs`)

In [9]:
from evaluation import eval_baseline_explanations
import utils 

# Run eval_baseline_explanations on the graphs in attn_Gs and serialize the
# first of its two return values.

# Inputs: the reference graph directory (presumably ground truth, going by the
# name), the scored graph directory, and the test label dictionary.
gt_graph_dir = "/dfs/scratch1/gmachi/datasets/wikisection_processed/test_gts"
G_dir = "/dfs/scratch1/gmachi/datasets/wikisection_processed/attn_Gs"
refined_test_dict = utils.deserialize("/dfs/scratch1/gmachi/datasets/wikisection_processed/test-doc-labs.obj")

# Despite its name, cache_dir is the output file handed to utils.serialize.
cache_dir = "/dfs/scratch1/gmachi/k2/K2/src/outputs/baselines/deberta-attn-full.obj"

model_results_dict, data_linearized_dict = eval_baseline_explanations(
    G_dir,
    gt_graph_dir,
    modality="text",
    label_dict=refined_test_dict,
)
utils.serialize(model_results_dict, cache_dir)

skipping class-1 example with all salient
skipping class-1 example with all salient
skipping class-1 example with all salient
skipping class-1 example with all salient
skipping class-1 example with all salient
skipping class-1 example with all salient


In [10]:
from evaluation import eval_baseline_explanations
import utils 

# Run eval_baseline_explanations on the graphs in entail_Gs and serialize the
# first of its two return values.

# Inputs: the reference graph directory (presumably ground truth, going by the
# name), the scored graph directory, and the test label dictionary.
gt_graph_dir = "/dfs/scratch1/gmachi/datasets/wikisection_processed/test_gts"
G_dir = "/dfs/scratch1/gmachi/datasets/wikisection_processed/entail_Gs"
refined_test_dict = utils.deserialize("/dfs/scratch1/gmachi/datasets/wikisection_processed/test-doc-labs.obj")

# Despite its name, cache_dir is the output file handed to utils.serialize.
cache_dir = "/dfs/scratch1/gmachi/k2/K2/src/outputs/baselines/deberta-entail-full.obj"

model_results_dict, data_linearized_dict = eval_baseline_explanations(
    G_dir,
    gt_graph_dir,
    modality="text",
    label_dict=refined_test_dict,
)
utils.serialize(model_results_dict, cache_dir)

skipping class-1 example with all salient
skipping class-1 example with all salient
skipping class-1 example with all salient
skipping class-1 example with all salient
skipping class-1 example with all salient
skipping class-1 example with all salient


In [11]:
from evaluation import eval_baseline_explanations
import utils 

# Run eval_baseline_explanations on the graphs in prob_Gs and serialize the
# first of its two return values.

# Inputs: the reference graph directory (presumably ground truth, going by the
# name), the scored graph directory, and the test label dictionary.
gt_graph_dir = "/dfs/scratch1/gmachi/datasets/wikisection_processed/test_gts"
G_dir = "/dfs/scratch1/gmachi/datasets/wikisection_processed/prob_Gs"
refined_test_dict = utils.deserialize("/dfs/scratch1/gmachi/datasets/wikisection_processed/test-doc-labs.obj")

# Despite its name, cache_dir is the output file handed to utils.serialize.
cache_dir = "/dfs/scratch1/gmachi/k2/K2/src/outputs/baselines/deberta-prob-full.obj"

model_results_dict, data_linearized_dict = eval_baseline_explanations(
    G_dir,
    gt_graph_dir,
    modality="text",
    label_dict=refined_test_dict,
)
utils.serialize(model_results_dict, cache_dir)

skipping class-1 example with all salient
skipping class-1 example with all salient
skipping class-1 example with all salient
skipping class-1 example with all salient
skipping class-1 example with all salient
skipping class-1 example with all salient
