In [None]:
from torch_geometric.datasets import Planetoid
# from hivegraph.contrastive.grace import GRACE
from GRACE_new import GRACENew
from sklearn.linear_model import LogisticRegression
from util import * 
from torch_geometric.utils import to_dense_adj
import torch_geometric.transforms as T 
from augmentation import *
from query_strategies import *
import matplotlib.pyplot as plt
from model import *
from model_wrapper import *
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelBinarizer
import wandb
from trainers import *
from torch_geometric import seed_everything

from GRACE_new2 import GRACENew2



In [None]:
# Log in to Weights & Biases before any run is started by later cells.
wandb.login()

In [None]:
# Target passed to `.to(device)` when models are created below.
# Hard-coded to CUDA, so the notebook expects a GPU to be present.
device = 'cuda'

In [None]:
# Experiment-wide constants used by the cells below.
DROPOUT= 0.3  # rate handed to both DropEdge and MaskFeature
NUM_PASSES = 10  # augmented forward passes per active-learning step
BUDGET = 150  # number of acquisition steps in the active-learning loops
EPOCHS = 100  # NOTE(review): not referenced by any visible cell (training uses a literal 200) — confirm
SIGNIFICANCE_ITERATIONS = 10  # number of split/seed repetitions

NOISE_PROB = 0.4  # second argument of the "col"/"row" NoiseFeature variants
NOISE_LEVEL = 0.5  # first argument of NoiseFeature and NoiseLatent

# Augmentations
# Single-step augmentations (project classes; exact semantics live in `augmentation`).
drop_edge = DropEdge(DROPOUT)
noise_feature_all = NoiseFeature(NOISE_LEVEL, 1)
noise_feature_col = NoiseFeature(NOISE_LEVEL, NOISE_PROB, "col")
noise_feature_row = NoiseFeature(NOISE_LEVEL, NOISE_PROB, "row")
noise_latent = NoiseLatent(NOISE_LEVEL)
mask_feature = MaskFeature(DROPOUT)

# Two-step pipelines: edge dropping followed by one feature-level augmentation.
drop_edge_noise_all = T.Compose([drop_edge, noise_feature_all])
drop_edge_noise_col = T.Compose([drop_edge, noise_feature_col])
drop_edge_noise_row = T.Compose([drop_edge, noise_feature_row])
drop_edge_mask_feature = T.Compose([drop_edge, mask_feature])

# Three-step pipeline: edge dropping, feature noise, then feature masking.
drop_mask_noise = T.Compose([drop_edge, noise_feature_all, mask_feature])

In [None]:
# Load the ten pre-generated split files data_splits/split_0.pt ... split_9.pt.
# NOTE(review): torch.load unpickles the file contents — only load split files
# that come from a trusted source.
data_splits = [torch.load(f"data_splits/split_{i}.pt") for i in range(10)]


In [249]:
# Self-supervised pre-training: one GRACENew2 encoder per split / seed.
#
# Every trained encoder is kept in `trained_models` (index == split index).
# `model` and `dataset_o` are still left bound to the objects of the final
# iteration, so cells that read those names behave as before.
SIGNIFICANCE_ITERATIONS = 10
PRETRAIN_EPOCHS = 200
trained_models = []
for i in range(SIGNIFICANCE_ITERATIONS):
    print(i)
    seed_everything(i)

    # Move the split to the same device as the model (was a hard-coded 'cuda').
    dataset_o = data_splits[i].to(device)
    # The contrastive objective trains on everything outside the validation mask.
    dataset_o.contrastive_validation_mask = dataset_o.val_mask
    dataset_o.contrastive_train_mask = ~dataset_o.val_mask

    # Both views of the contrastive pair use the same augmentation pipeline.
    train_augmentor = drop_edge_noise_row
    model = GRACENew2(num_features=dataset_o.num_features, hidden=128, num_layers=2,
                      augmentor1=train_augmentor, augmentor2=train_augmentor).to(device)
    opt = torch.optim.Adam(model.parameters(), lr=0.001)
    for epoch in range(PRETRAIN_EPOCHS):
        opt.zero_grad()
        loss = model.train_step(dataset_o)
        loss.backward()
        opt.step()

    trained_models.append(model)

0
1
2
3
4
5
6
7
8
9


In [250]:
# Per-split evaluation: embed the graph, cluster the embeddings with k-means,
# then fit a logistic-regression probe on [embedding | one-hot cluster id |
# distances to the cluster centres] and record its accuracy on the test mask.
#
# NOTE(review): `model` is not looked up per split — every iteration embeds with
# whichever encoder the name `model` is currently bound to. Confirm that scoring
# all splits with one encoder is intended.
accs = []

for i in range(0,SIGNIFICANCE_ITERATIONS):
    dataset_o = data_splits[i].to('cuda')
    # Embeddings as a NumPy array, detached from the autograd graph.
    out = model(dataset_o.x,dataset_o.edge_index).detach().cpu().numpy()
    train_mask = dataset_o.train_mask.cpu().numpy()
    # Initial centres come from the project helper, given the labelled training nodes.
    cc = init_kmeans(dataset_o,dataset_o.train_mask, out)
    kmeans = KMeans(n_clusters=7,init=cc,max_iter=500,tol=1e-4,random_state=0)
    # kmeans = KMeans(n_clusters=7,init="k-means++",n_init=10,max_iter=500,tol=1e-4,random_state=0)
    kmeans.fit(out)
    labels = kmeans.labels_
    lr = LogisticRegression(max_iter=1000)

    # One-hot encode the cluster assignment and compute distances to every centre.
    label_bin = LabelBinarizer()
    label_bin.fit(labels)
    labels_bin = label_bin.transform(labels)
    distances = kmeans.transform(out)
    

    # NOTE(review): `val_mask` (first 40 validation indices) is not used in this cell.
    val_mask = get_mask_indices( dataset_o.val_mask).cpu().numpy()[:40]
    
    test_mask = dataset_o.test_mask.cpu().numpy()
    features = np.hstack([out,labels_bin,distances])
    # Fit on the training nodes only; score on the test nodes only.
    lr.fit(features[train_mask],dataset_o.y[train_mask].cpu().numpy())
    acc = (lr.predict(features)[test_mask] == dataset_o.y.cpu().numpy()[test_mask]).mean()

    # mapping = map_labels(dataset_o.y.cpu().numpy()[train_mask],labels[train_mask])[0]
    # mapping = map_labels(dataset_o.y.cpu().numpy(),labels)[0]
    # mapped_labels = np.vectorize(mapping.get)(labels)
    # acc = (mapped_labels == dataset_o.y.cpu().numpy())[test_mask].mean()
    
    accs.append(acc)

In [253]:
# Accuracy of the most recently fitted probe over ALL nodes of the last split
# (training nodes included), not only the test mask.
(lr.predict(features) == dataset_o.y.cpu().numpy()).mean()

0.31499261447562776

In [251]:
# Per-split test accuracies collected by the evaluation loop.
accs

[0.676, 0.642, 0.599, 0.34, 0.472, 0.631, 0.645, 0.424, 0.673, 0.302]

In [252]:
# Summary statistics over the splits: mean, standard deviation, min and max.
# Note: rebinds `accs` from a list to a NumPy array.
accs = np.array(accs)
accs.mean(), accs.std(), accs.min(), accs.max()


(0.5404, 0.13568286553577796, 0.302, 0.676)

In [None]:
# Re-embed the current `dataset_o`, cluster the embeddings, and align cluster ids
# with class labels via the project helper `map_labels`.
out = model(dataset_o.x,dataset_o.edge_index).detach().cpu().numpy()
cc = init_kmeans(dataset_o,dataset_o.train_mask, out)
kmeans = KMeans(n_clusters=7,init=cc,max_iter=500,tol=1e-4,random_state=0)
# kmeans = KMeans(n_clusters=7,init="k-means++",n_init=10,max_iter=500,tol=1e-4,random_state=0)
kmeans.fit(out)

# `map_labels` is called with ALL true labels here; it returns a mapping and an accuracy.
mapping, acc = map_labels(dataset_o.y.cpu().numpy(),kmeans.labels_)
acc

In [None]:
# Append the 7 k-means centres as extra rows so they are projected by the same
# t-SNE fit as the nodes; -3 is a placeholder label for those 7 rows.
out_tmp = np.vstack([out,kmeans.cluster_centers_])
y_tmp = np.hstack([dataset_o.y.cpu().numpy(),np.repeat(-3,7)])
tsne = TSNE(n_components=2,random_state=0)
tsne_out_c = tsne.fit_transform(out_tmp)

In [None]:
# Training mask padded with 7 False entries so it lines up with the rows of
# `tsne_out_c` / `y_tmp` (nodes followed by the 7 cluster centres).
train_mask_tmp = np.hstack([train_mask,np.repeat(False,7)])

In [None]:
# t-SNE view coloured by the true class, with the 7 k-means centres drawn in
# green and annotated with the label that `mapping` assigns to each cluster id.
fig, ax = plt.subplots()
node_xy = tsne_out_c[:-7]
center_xy = tsne_out_c[-7:]

scatter1 = ax.scatter(node_xy[:, 0], node_xy[:, 1], c=y_tmp[:-7], cmap="tab10")
ax.scatter(center_xy[:, 0], center_xy[:, 1], c='green')

for i, p in enumerate(center_xy):
    ax.annotate(mapping.get(i), (p[0], p[1]), fontsize=20)

legend_handles, legend_names = scatter1.legend_elements()
ax.legend(legend_handles, legend_names, title="Classes",
          bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()


In [None]:
# t-SNE view coloured by the logistic-regression prediction; the labelled
# training nodes are overlaid in black and annotated with their true class.
lr_pred = lr.predict(features)

fig, ax = plt.subplots()
scatter1 = ax.scatter(tsne_out_c[:-7, 0], tsne_out_c[:-7, 1], c=lr_pred, cmap="tab10")

train_samples = tsne_out_c[train_mask_tmp]
ax.scatter(train_samples[:, 0], train_samples[:, 1], c='black')
for i, txt in enumerate(y_tmp[train_mask_tmp]):
    anchor = (train_samples[i, 0], train_samples[i, 1])
    ax.annotate(txt, anchor, fontsize=20)

legend_handles, legend_names = scatter1.legend_elements()
ax.legend(legend_handles, legend_names, title="Classes",
          bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

In [None]:
# t-SNE view coloured by the k-means assignment after translating each cluster
# id through `mapping`; centres in green, training nodes overlaid in black.
y_preds = [mapping.get(cluster_id) for cluster_id in kmeans.labels_]

fig, ax = plt.subplots()
scatter1 = ax.scatter(tsne_out_c[:-7, 0], tsne_out_c[:-7, 1], c=y_preds, cmap="tab10")
ax.scatter(tsne_out_c[-7:, 0], tsne_out_c[-7:, 1], c='green')

for i, p in enumerate(tsne_out_c[-7:]):
    ax.annotate(mapping.get(i), (p[0], p[1]), fontsize=20)

train_samples = tsne_out_c[train_mask_tmp]
ax.scatter(train_samples[:, 0], train_samples[:, 1], c='black')

legend_handles, legend_names = scatter1.legend_elements()
ax.legend(legend_handles, legend_names, title="Classes",
          bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

In [None]:
# t-SNE view coloured by whether the mapped k-means label equals the true label;
# centres in green (annotated through `mapping`), training nodes in black.
fig, ax = plt.subplots()
scatter1 = ax.scatter(tsne_out_c[:-7, 0], tsne_out_c[:-7, 1], c=y_preds == y_tmp[:-7])
ax.scatter(tsne_out_c[-7:, 0], tsne_out_c[-7:, 1], c='green')

for i, p in enumerate(tsne_out_c[-7:]):
    ax.annotate(mapping.get(i), (p[0], p[1]), fontsize=20)

train_samples = tsne_out_c[train_mask_tmp]
ax.scatter(train_samples[:, 0], train_samples[:, 1], c='black')

legend_handles, legend_names = scatter1.legend_elements()
ax.legend(legend_handles, legend_names, title="Classes",
          bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

In [None]:
# 2-D t-SNE projection of the node embeddings only (no cluster centres appended).
tsne_out = tsne.fit_transform(out)

In [None]:
# Training mask of the current `dataset_o` as a NumPy array, for indexing NumPy data.
train_mask = dataset_o.train_mask.detach().cpu().numpy()    

In [None]:
# 2x2 overview of the t-SNE projection: true labels, mapped k-means labels,
# agreement between the two, and the labelled training nodes only.
# labels = kmeans.labels_
# Translate every cluster id to a class label through `mapping`.
labels = [mapping[x] for x in kmeans.labels_]
fig, axs = plt.subplots(2, 2, figsize=(15, 10))
axs = axs.flatten()
# First plot: current plot
scatter1 = axs[0].scatter(tsne_out[:,0], tsne_out[:,1], c=dataset_o.y.cpu().numpy(), cmap = "tab10")
axs[0].set_title("True Labels")
axs[0].legend(*scatter1.legend_elements(), title="Classes")

# Second plot: colour by the mapped k-means labels computed above
scatter2 = axs[1].scatter(tsne_out[:,0], tsne_out[:,1], c=labels, cmap = "tab10")
axs[1].set_title("KMeans Labels")
axs[1].legend(*scatter2.legend_elements(), title="Clusters")

# Third plot: colour by whether the mapped k-means label equals the true label
# (`labels` is a list; the comparison with the NumPy array is element-wise).
correct_labels = labels == dataset_o.y.cpu().numpy()
scatter3 = axs[2].scatter(tsne_out[:,0], tsne_out[:,1], c=correct_labels)
axs[2].set_title("KMeans Correct Labels")
axs[2].legend(*scatter3.legend_elements(), title="Correct")

# Fourth plot: only the labelled training nodes, coloured by their true class
scatter4 = axs[3].scatter(tsne_out[:,0][train_mask], tsne_out[:,1][train_mask], c=dataset_o.y.cpu().numpy()[train_mask], cmap = "tab10")
axs[3].set_title("Train nodes")
axs[3].legend(*scatter4.legend_elements(), title="Train nodes")

plt.show()

In [254]:
# Work on the last split from here on.
# The original assigned to the misspelt name `datasaet_o`, so `dataset_o` was
# never rebound here and the next two lines silently read whatever split an
# earlier cell had left behind.
dataset_o = data_splits[-1].to(device)
true_labels = dataset_o.y.cpu().numpy()
val_mask = dataset_o.val_mask.cpu().numpy()

In [255]:
# Train a GRACENew encoder through the clustering wrapper on `dataset_o`.
# Note: this rebinds `model` — from here on `model` is the GRACENew instance,
# not the GRACENew2 encoder trained earlier.
lambda_ = 1.0
model = GRACENew(num_features=data_splits[0].num_features,hidden=128, num_layers=2, drop_edge_rate_1=0.3,drop_edge_rate_2=0.3,drop_feature_rate_1=0.3,drop_feature_rate_2=0.3, lambda_=lambda_).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-4)
wrapped_model = GRACEModelWrapperCluster(model,optimizer)

trainer = Trainer()
# 200 is passed as the third positional argument — presumably the epoch count; verify against Trainer.
trainer.train(wrapped_model, dataset_o, 200)
wrapped_model.test_step(dataset_o)
# Fraction of nodes whose arg-max cluster column equals the true class id.
# NOTE(review): cluster indices are compared with class ids directly, without a
# label mapping — confirm the wrapper aligns them.
(wrapped_model.cluster_labels_.argmax(axis=1) == dataset_o.y.cpu().numpy()).sum() / dataset_o.y.size(0)

0.3567208271787297

In [256]:
# Output of the wrapped model on `dataset_o` and its 2-D t-SNE projection.
# Note: rebinds `tsne` to a fresh TSNE instance.
out_w = wrapped_model(dataset_o).detach().cpu().numpy()
tsne = TSNE(n_components=2,random_state=0)
tsne_wrapped = tsne.fit_transform(out_w)

In [None]:
# Accuracy histories for the active-learning run in the next cell.
# NOTE(review): only `acc_kmeans_lr_wrap` is appended to in the visible cells.
acc_kmeans_lr = []
acc_kmeans_lr_wrap = []

In [None]:
# Active-learning loop driven by the wrapped GRACE model.
# Each of the BUDGET steps: (1) refit k-means on `out` and print the best
# cluster-to-label mapping, (2) take the wrapper's predictions, (3) add the
# entropy of NUM_PASSES augmented views, (4) move the highest-entropy pool node
# into the training mask and reset the wrapper's predictor.
#
# NOTE(review): `out` is not recomputed in this cell; it is whatever array an
# earlier cell left behind, so the k-means diagnostics do not follow
# `wrapped_model`. Confirm that is intended.
dataset_tmp = dataset_o.clone()
wrapped_model.eval()
# init_wandb("LATENT_KMEANS", "", "CORA")
for b in range(1,BUDGET+1):
    # KMEANS
    train_mask = dataset_tmp.train_mask.cpu().numpy()
    cc = init_kmeans(dataset_tmp,dataset_tmp.train_mask, out)
    kmeans = KMeans(n_clusters=7,init=cc,max_iter=500,tol=1e-4,random_state=0)
    # kmeans = KMeans(n_clusters=7,init="k-means++",n_init=10,max_iter=500,tol=1e-4,random_state=0)
    kmeans.fit(out)
    # NOTE(review): the bare expression below has no effect.
    kmeans.labels_
    pred_labels = kmeans.labels_

    # Diagnostics only: the mapping/accuracy are printed, not used for selection.
    best_mapping, best_accuracy = map_labels(true_labels, pred_labels)
    print(f"Best Mapping: {best_mapping}")
    print(f"Best Accuracy: {best_accuracy}")

    # LOGISTIC REGRESSION
    # lb = LabelBinarizer()
    # lb.fit(kmeans.labels_)
    # cluster_labels =  lb.transform(kmeans.labels_)
    # distances = kmeans.transform(out)

    # # features = np.hstack([out,distances])
    # # features = out
    # features = np.hstack([out,distances, cluster_labels])
    # lr = LogisticRegression(max_iter=1000, random_state=0)
    # lr.fit(features[train_mask],true_labels[train_mask])
    # acc = lr.score(features,true_labels )
    
    # pred_log_probas = lr.predict_log_proba(features)
    pred_log_probas = wrapped_model.test_step(dataset_tmp).detach().cpu().numpy()
    # acc = pred_log_probas.argmax(dim=1).eq(dataset_tmp.y).sum().item() / dataset_tmp.y.size(0)
    # Accuracy over ALL nodes (the denominator is the total node count), not the test mask.
    acc = torch.argmax(torch.tensor(pred_log_probas), dim=1).eq(dataset_tmp.y.cpu()).sum().item() / dataset_tmp.y.cpu().size(0)
    entropies = calculate_entropy(torch.tensor(pred_log_probas))
    
    # # TTA
    # The un-augmented entropy above enters at full weight; each augmented pass
    # contributes 1/NUM_PASSES of its entropy.
    for i in range(NUM_PASSES):
        data_tmp = dataset_tmp.clone()
        data_tmp = drop_edge_noise_all(data_tmp)
        # out_c = model(data_tmp.x, data_tmp.edge_index).detach().cpu().numpy()
        # # distances = kmeans.transform(out_c)
        # # cluster_labels =  lb.transform(kmeans.predict(out_c))
        # features = np.hstack([out_c,distances,cluster_labels])
        # pred_log_probas = lr.predict_log_proba(features)
        pred_log_probas = wrapped_model.test_step(data_tmp).detach().cpu().numpy()
        entropies = entropies + calculate_entropy(torch.tensor(pred_log_probas))/NUM_PASSES
    
    # Select the pool node with the largest accumulated entropy and label it.
    pool_indices = get_mask_indices(dataset_tmp.train_pool).cpu()
    chosen_node_ix = torch.argmax(entropies[pool_indices])
    chosen_node = pool_indices[chosen_node_ix]
    dataset_tmp.train_pool[chosen_node] = False
    dataset_tmp.train_mask[chosen_node] = True
    # NOTE(review): this `predictor` is created but never used inside this cell.
    predictor = LogisticRegression()
    acc_kmeans_lr_wrap.append(acc)
    print(f"Budget {b} - Accuracy: {acc}")
    # wandb.log({"accuracy_mean": acc, "step": b})
    wrapped_model.reset_predictor()
# NOTE(review): called although the matching init_wandb(...) above is commented out.
wandb.finish()
    

In [None]:
# Accuracy of the wrapped GRACE model after each acquisition step.
fig, ax = plt.subplots()
ax.plot(acc_kmeans_lr_wrap, label="GRACE")
ax.legend()
plt.show()

In [None]:
# Scatter only the nodes whose mapped k-means label disagrees with the true
# label, coloured by the (wrong) mapped label.
#
# `out_2` was never assigned anywhere in the notebook, so this cell failed on a
# fresh kernel. Following the `out_c` -> `out_c2` naming used elsewhere, it is
# taken to be the 2-D t-SNE projection of `out`, which already exists as
# `tsne_out` and has one row per node in the same order as `pred_labels`.
out_2 = tsne_out

mapped_labels = np.array([best_mapping[i] for i in pred_labels])
label_color = dataset_o.y.cpu().numpy() == mapped_labels  # True where clustering is correct
to_plot = out_2[~label_color]

scatter = plt.scatter(to_plot[:,0], to_plot[:,1], c=mapped_labels[~label_color])
plt.legend(*scatter.legend_elements(), title="Classes")
plt.show()

In [None]:
# Embed one augmented copy of the graph (edge dropping + feature masking) and
# show its t-SNE projection coloured by the true class.
data_tmp = dataset_o.clone()
# data_tmp.edge_index = data_tmp.edge_index[:,3:]
data_tmp = drop_edge_mask_feature(data_tmp)
# Shape of the edge index after augmentation, for a quick sanity check.
print(data_tmp.edge_index.shape)

out_c = model(data_tmp.x, data_tmp.edge_index).detach().cpu().numpy()

# Note: rebinds `tsne` to a fresh TSNE instance.
tsne = TSNE(n_components=2,random_state=0)
out_c2 = tsne.fit_transform(out_c)

plt.scatter(out_c2[:,0], out_c2[:,1], c=dataset_o.y.cpu().numpy())

In [None]:
# Linear probe: fit logistic regression on the training-node rows of `out`
# and print its accuracy on the test-node rows.
y = dataset_o.y.cpu().numpy()
train_mask = dataset_o.train_mask.cpu().numpy()
test_mask = dataset_o.test_mask.cpu().numpy()

predictor = LogisticRegression()
predictor.fit(out[train_mask], y[train_mask])
test_accuracy = predictor.score(out[test_mask], y[test_mask])
print(test_accuracy)

In [None]:
# Per-node correctness of the linear probe over all nodes.
y_pred = predictor.predict(out)
y_correct = y_pred == y

In [None]:
# Scatter of the 2-D points in `out_2`, coloured by probe correctness.
# NOTE(review): `out_2` is not assigned in this cell and must already exist in
# the kernel — presumably a 2-D projection of `out`; confirm.
plt.scatter(out_2[:,0], out_2[:,1], c=y_correct)

In [None]:
# AUGMENTED ENTROPY
# Active-learning loop. At every step a logistic-regression probe is fitted on
# the currently labelled nodes; each node is then scored by the entropy of the
# probe's predictions averaged over NUM_PASSES augmented views plus the
# un-augmented graph, and the highest-scoring pool node is moved into the
# training mask. `acc_aug[b]` is the probe's test accuracy BEFORE the b-th
# acquisition.
# (A periodic re-training of the encoder every 10 steps was tried here and is
# currently disabled.)
dataset = dataset_o.clone()
acc_aug = []
for b in range(BUDGET):
    pool_indices = get_mask_indices(dataset.train_pool).cpu()
    out = model(dataset.x,dataset.edge_index)
    predictor = LogisticRegression()
    predictor.fit(out[dataset.train_mask].detach().cpu().numpy(), dataset.y[dataset.train_mask].detach().cpu().numpy())
    acc = predictor.score(out[dataset.test_mask].detach().cpu().numpy(), dataset.y[dataset.test_mask].detach().cpu().numpy())

    entropy_sum = torch.zeros(dataset.num_nodes)

    # Entropy under NUM_PASSES randomly augmented copies of the graph.
    for _ in range(NUM_PASSES):
        data_tmp = dataset.clone()
        data_tmp = drop_edge_noise_all(data_tmp)
        out_c = model(data_tmp.x, data_tmp.edge_index)
        pred_log_probas = predictor.predict_log_proba(out_c.detach().cpu().numpy())
        entropies = calculate_entropy(torch.tensor(pred_log_probas))
        entropy_sum += entropies

    # Entropy on the un-augmented graph.
    pred_log_probas = predictor.predict_log_proba(out.detach().cpu().numpy())
    entropy_o = calculate_entropy(torch.tensor(pred_log_probas))
    entropy_sum += entropy_o
    # NUM_PASSES augmented terms + 1 clean term were accumulated, so the mean
    # divides by NUM_PASSES + 1 (it used to divide by NUM_PASSES). This only
    # rescales the scores; the arg-max selection below is unaffected.
    entropy_sum /= NUM_PASSES + 1

    # Label the most uncertain node that is still in the pool.
    chosen_node_ix = torch.argmax(entropy_sum[pool_indices])
    chosen_node = pool_indices[chosen_node_ix]
    dataset.train_pool[chosen_node] = False
    dataset.train_mask[chosen_node] = True

    acc_aug.append(acc)
    print(f"Budget {b} - Accuracy: {acc}")

In [None]:
# Shape of the embedding left in `out` by the previous loop.
out.shape

In [None]:
# Detached view of `out` (no autograd history).
# NOTE(review): `out_n` is not read by any visible cell.
out_n = out.detach()

In [None]:
def predict_log_proba_sklearn_model(model, x):
    """Compute class log-probabilities from a fitted linear model's parameters.

    Evaluates ``x @ model.coef_.T + model.intercept_`` and applies a
    log-softmax over the class dimension.

    Args:
        model: object exposing ``coef_`` with shape (n_rows, n_features) and
            ``intercept_`` with shape (n_rows,), e.g. a fitted scikit-learn
            ``LogisticRegression``.
        x: NumPy array of shape (n_samples, n_features).

    Returns:
        torch.Tensor of shape (n_samples, n_classes) holding log-probabilities.

    Binary models store a single row in ``coef_``. A log-softmax over that one
    column would be identically zero, so the logit ``z`` is expanded to the
    pair ``[0, z]``, which yields ``[log(1 - sigmoid(z)), log(sigmoid(z))]``.
    """
    logits = torch.tensor(x @ model.coef_.T + model.intercept_)
    if logits.dim() == 2 and logits.shape[1] == 1:
        # Binary case: implicit zero logit for the negative class.
        logits = torch.cat([torch.zeros_like(logits), logits], dim=1)
    return torch.nn.functional.log_softmax(logits, dim=1)

In [None]:
class PredictorWrapper():
    """Torch re-implementation of a fitted linear classifier's log-probabilities.

    Copies ``coef_`` / ``intercept_`` of a fitted model (e.g. scikit-learn
    ``LogisticRegression``) into float32 tensors so that predictions can be
    computed on torch tensors without a round trip through NumPy.
    """

    def __init__(self, model, device=None):
        """Copy the model parameters onto ``device``.

        Args:
            model: object exposing ``coef_`` (n_rows, n_features) and
                ``intercept_`` (n_rows,).
            device: target device for the parameters. ``None`` (the default)
                selects CUDA when it is available — matching the previous
                hard-coded ``.cuda()`` — and falls back to the CPU otherwise.
        """
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"

        # Tensors built with torch.tensor never require grad, so no explicit
        # requires_grad reset is needed.
        weights = torch.tensor(model.coef_.T, dtype=torch.float32)
        bias = torch.tensor(model.intercept_, dtype=torch.float32)

        if weights.shape[1] == 1:
            # Binary models store one row in coef_. A log-softmax over a single
            # column is identically zero, so add an implicit zero logit for the
            # negative class: log_softmax([0, z]) == [log(1-sigmoid(z)), log(sigmoid(z))].
            weights = torch.cat([torch.zeros_like(weights), weights], dim=1)
            bias = torch.cat([torch.zeros_like(bias), bias])

        self.weights = weights.to(device)
        self.bias = bias.to(device)

    def predict_log_proba(self, x):
        """Return log-probabilities of shape (n_samples, n_classes).

        Args:
            x: float tensor of shape (n_samples, n_features) located on the
                same device as the wrapper's parameters.
        """
        out = x @ self.weights + self.bias
        return torch.nn.functional.log_softmax(out, dim=1)

In [None]:
# LATENT NOISE
# Active-learning loop that perturbs the fixed embeddings directly instead of
# re-encoding augmented graphs. Each step fits a logistic-regression probe on
# the labelled nodes, averages the prediction entropy over NUM_PASSES noisy
# copies of the embeddings, and labels the highest-entropy pool node.

dataset = dataset_o.clone()
acc_laten_noise_o = []
# Snapshot of the embeddings that the noise is applied to.
out_a = out.clone()
for b in range(BUDGET):
    pool_indices = get_mask_indices(dataset.train_pool).cpu()

    # The probe is fitted and scored on the unperturbed embeddings `out`.
    predictor = LogisticRegression()
    predictor.fit(out[dataset.train_mask].detach().cpu().numpy(), dataset.y[dataset.train_mask].detach().cpu().numpy())
    acc = predictor.score(out[dataset.test_mask].detach().cpu().numpy(), dataset.y[dataset.test_mask].detach().cpu().numpy())

    entropy_sum = torch.zeros(dataset.num_nodes)
    
    for _ in range(NUM_PASSES):
        # NOTE(review): assumes noise_latent(...) returns only the noise term;
        # if it returns the already-perturbed embedding this adds `out_a` twice — confirm.
        out_c = out_a + noise_latent(out_a)
        pred_log_probas = predictor.predict_log_proba(out_c.detach().cpu().numpy())
        entropies = calculate_entropy(torch.tensor(pred_log_probas))
        entropy_sum += entropies
    entropy_sum /= NUM_PASSES
    
    # Label the pool node with the largest mean entropy.
    chosen_node_ix = torch.argmax(entropy_sum[pool_indices])
    chosen_node = pool_indices[chosen_node_ix]
    dataset.train_pool[chosen_node] = False
    dataset.train_mask[chosen_node] = True
    
    acc_laten_noise_o.append(acc)
    print(f"Budget {b} - Accuracy: {acc}")

In [None]:
# ENTROPY
# Baseline active-learning loop: fit a logistic-regression probe on the
# labelled rows of `out`, score every node by the entropy of the probe's
# prediction, and move the highest-entropy pool node into the training mask.
dataset = dataset_o.clone()
acc_entropy = []
for b in range(BUDGET):
    labelled = dataset.train_mask
    held_out = dataset.test_mask

    predictor = LogisticRegression()
    predictor.fit(
        out[labelled].detach().cpu().numpy(),
        dataset.y[labelled].detach().cpu().numpy(),
    )
    acc = predictor.score(
        out[held_out].detach().cpu().numpy(),
        dataset.y[held_out].detach().cpu().numpy(),
    )

    # Entropy of the probe's prediction for every node.
    pred_log_probas = predictor.predict_log_proba(out.detach().cpu().numpy())
    entropies = calculate_entropy(torch.tensor(pred_log_probas))

    # Pick the most uncertain node among those still in the pool.
    pool_indices = get_mask_indices(dataset.train_pool).cpu()
    chosen_node_ix = torch.argmax(entropies[pool_indices])
    chosen_node = pool_indices[chosen_node_ix]
    dataset.train_pool[chosen_node] = False
    dataset.train_mask[chosen_node] = True

    predictor = LogisticRegression()
    acc_entropy.append(acc)
    print(f"Budget {b} - Accuracy: {acc}")
    

In [None]:
# LATENT DISTANCE
# Active-learning loop that selects, among the nodes still in the pool, the one
# whose mean embedding distance to its pool neighbours is smallest.
BUDGET = 150
dataset = dataset_o.clone()
acc_latent = []

# The graph structure never changes inside the loop, so densify it once.
# max_num_nodes pins the matrix to (num_nodes, num_nodes); without it the size
# is inferred from the largest node index in edge_index and can be smaller than
# the boolean node masks used for indexing below.
dense_adj = to_dense_adj(dataset.edge_index, max_num_nodes=dataset.num_nodes)[0].to(out.device)

for b in range(BUDGET):

    predictor = LogisticRegression()
    predictor.fit(out[dataset.train_mask].detach().cpu().numpy(), dataset.y[dataset.train_mask].detach().cpu().numpy())
    acc = predictor.score(out[dataset.test_mask].detach().cpu().numpy(), dataset.y[dataset.test_mask].detach().cpu().numpy())

    pool_indices = get_mask_indices(dataset.train_pool).cpu()

    # Pairwise embedding distances between pool nodes, kept only where an edge exists.
    pool_embeddings = out[dataset.train_pool]
    dist_matrix = torch.cdist(pool_embeddings, pool_embeddings)
    adj = dense_adj[dataset.train_pool][:, dataset.train_pool]
    dist_matrix = dist_matrix * adj

    # Row mean over ALL pool nodes (non-neighbours contribute 0).
    # NOTE(review): a node without any neighbour in the pool scores exactly 0
    # and therefore always wins the arg-min — confirm this is intended.
    mean_neighbor_dist = dist_matrix.mean(dim=1)
    chosen_node = pool_indices[torch.argmin(mean_neighbor_dist)]

    dataset.train_pool[chosen_node] = False
    dataset.train_mask[chosen_node] = True
    acc_latent.append(acc)
    print(f"Budget {b} - Accuracy: {acc}")

In [None]:

# Accuracy curves of the acquisition strategies, one line per strategy.
# Other recorded histories that can be added to `curves`:
#   (acc_latent, "Latent Distance"), (acc_laten_noise_o, "Latent Noise")
curves = (
    (acc_aug, "Augmented Entropy"),
    (acc_entropy, "Entropy"),
)
fig, ax = plt.subplots()
for history, strategy_name in curves:
    ax.plot(history, label=strategy_name)
ax.legend()
plt.show()

In [None]:
# Accuracies of the latent-noise strategy for acquisition steps 15 through 40.
acc_laten_noise_o[15:41]