In [None]:
import numpy as np
from collections import Counter

In [None]:
# Dataset and experiment version whose saved embeddings are analysed below.
dataset_name = 'CIFAR-10'
version = 'd256_K32_N32_A_v1'

# Model name -> short display label. The keys are presumably the names used as
# prefixes of the saved .npy files (TODO confirm against the files on disk).
models = {
    'PCA':'PCA',
    'UMAP':'UMAP',
    'Basic AutoEncoder':'AE',
    'Topological AutoEncoder':'TopoAE (Moor et.al.)',
    'RTD AutoEncoder H1':'RTD-AE',
    'GNSA AutoEncoder':'GNSA-AE',
    'LNSA AutoEncoder':'LNSA-AE',
    'NSA AutoEncoder':'NSA-AE',
}

# Model whose embeddings are loaded and plotted by the cells below. It has to be
# assigned here: the loading/plotting cells read `model`, and without this line a
# fresh "Restart & Run All" fails with a NameError. Change it to any key of `models`.
model = 'NSA AutoEncoder'
assert model in models, f"Unknown model: {model}"

In [None]:
# Datasets whose name contains "COIL" are read from data.npy; every other
# dataset is read from train_data.npy.
prepared_path = (
    f'data/{dataset_name}/prepared/data.npy'
    if "COIL" in dataset_name
    else f'data/{dataset_name}/prepared/train_data.npy'
)
data = np.load(prepared_path)
# Flatten everything after the first axis so each sample becomes one row.
data = data.reshape(len(data), -1)

In [None]:
# Train-split latent embeddings and their labels, as saved for the selected model/version.
# `model` must hold the model name used in the saved file names when this cell runs
# (presumably one of the keys of `models` — TODO confirm).
latent_train_data = np.load(f'data/{dataset_name}/{model}_latent_output_{version}.npy')
#latent_train_labels = np.load(f'data/{dataset_name}/prepared/train_labels.npy')
#latent_train_labels = np.load(f'data/{dataset_name}/prepared/labels.npy')
latent_train_labels = np.load(f'data/{dataset_name}/{model}_final_labels_{version}.npy')

# Test-split counterparts: same naming scheme with a `_test` suffix.
latent_test_data = np.load(f'data/{dataset_name}/{model}_latent_output_{version}_test.npy')
#latent_test_labels = np.load(f'data/{dataset_name}/prepared/test_labels.npy')
latent_test_labels = np.load(f'data/{dataset_name}/{model}_final_labels_{version}_test.npy')

In [None]:
# Every embedding row needs exactly one label. Fail fast on a mismatch instead of
# printing a boolean that is easy to overlook and does not stop later cells.
assert latent_train_data.shape[0] == latent_train_labels.shape[0], (
    f"train embeddings/labels mismatch: "
    f"{latent_train_data.shape[0]} vs {latent_train_labels.shape[0]}"
)
assert latent_test_data.shape[0] == latent_test_labels.shape[0], (
    f"test embeddings/labels mismatch: "
    f"{latent_test_data.shape[0]} vs {latent_test_labels.shape[0]}"
)

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# Embed the train latent vectors into 3 dimensions with t-SNE. random_state is
# fixed so repeated runs start from the same seed.
tsne = TSNE(n_components=3, random_state=42)
tsne_results = tsne.fit_transform(latent_train_data)

# fig = plt.figure(figsize=(10, 10))
# ax = fig.add_subplot(111, projection='3d')
# colormap = plt.cm.get_cmap('tab20', 20)
# scatter = ax.scatter(tsne_results[:, 0], tsne_results[:, 1], tsne_results[:, 2], c=latent_train_labels)
# legend = ax.legend(*scatter.legend_elements(), title="Classes")
# ax.add_artist(legend)
# plt.title(f"3D t-SNE Visualization for {dataset_name} with {model}", fontsize = 20)
# #plt.savefig(f"{model}_{dataset_name}_tSNE.jpg", bbox_inches='tight')
# plt.show()

In [None]:
fig = plt.figure(figsize = (20,20))
ax = fig.add_subplot(111, projection='3d')
# `plt.cm.get_cmap` was deprecated in Matplotlib 3.7 and removed in 3.9;
# `plt.get_cmap` takes the same (name, lut) arguments on old and new versions.
colormap = plt.get_cmap('tab20', 20)
# Plot each class in its own color. Labels 0..20 are scanned; labels that do not
# occur in the data are skipped so no empty artists are added to the axes.
for class_label in range(21):
    mask = latent_train_labels == class_label
    if not mask.any():
        continue
    ax.scatter(tsne_results[mask, 0], tsne_results[mask, 1], tsne_results[mask, 2], label=f'Class {class_label}', c=[colormap(class_label)])

ax.set_xlabel('X-axis')
ax.set_ylabel('Y-axis')
ax.set_zlabel('Z-axis')
#ax.legend()

#plt.title('3D Scatter Plot with More than 10 Classes')
# The figure is written next to the notebook, named after the model and dataset.
plt.savefig(f"{model}_{dataset_name}_tSNE.jpg", bbox_inches='tight')
plt.show()

In [None]:
#To plot coil dataset

fig = plt.figure(figsize = (20,20))
ax = fig.add_subplot(111, projection='3d')
# `plt.cm.get_cmap` was deprecated in Matplotlib 3.7 and removed in 3.9;
# `plt.get_cmap` takes the same (name, lut) arguments on old and new versions.
colormap = plt.get_cmap('tab20', 20)
# Plot each class in its own color. Labels 0..20 are scanned; labels that do not
# occur in the data are skipped so the legend lists only classes that are present.
for class_label in range(21):
    mask = latent_train_labels == class_label
    if not mask.any():
        continue
    ax.scatter(tsne_results[mask, 0], tsne_results[mask, 1], tsne_results[mask, 2], label=f'Class {class_label}', c=[colormap(class_label)])

ax.set_xlabel('X-axis')
ax.set_ylabel('Y-axis')
ax.set_zlabel('Z-axis')
ax.legend()

plt.title('3D Scatter Plot with More than 10 Classes')
plt.show()

In [None]:
# from sklearn.manifold import TSNE
# import matplotlib.pyplot as plt
# from mpl_toolkits.mplot3d import Axes3D

# tsne = TSNE(n_components=3, random_state=42)
# tsne_results = tsne.fit_transform(latent_train_data)

# fig = plt.figure(figsize=(8, 8))
# ax = fig.add_subplot(111, projection='3d')
# scatter = ax.scatter(tsne_results[:, 0], tsne_results[:, 1], tsne_results[:, 2], c=latent_train_labels, cmap='tab10')
# legend = ax.legend(*scatter.legend_elements(), title="Classes")
# ax.add_artist(legend)
# plt.title("3D t-SNE Visualization of Latent Embeddings")
# plt.show()

## Sentence Similarity with Word2Vec

In [None]:
# NOTE(review): no cell in the visible notebook reads this archive — the vectors are
# loaded through gensim.downloader in the next cell, and the only
# `load_word2vec_format` call is commented out. Confirm this download is still needed.
!wget "https://github.com/mmihaltz/word2vec-GoogleNews-vectors/raw/master/GoogleNews-vectors-negative300.bin.gz"

In [None]:
import gensim.downloader as api

# Load the 'word2vec-google-news-300' vectors through gensim's downloader.
# NOTE: this rebinds `model`, which the image-embedding cells above use as a
# model-name string inside file paths.
model = api.load('word2vec-google-news-300')

# Look up the vector of a single word as a quick check that the model loaded.
vec_king = model['king']

In [None]:
vec_king.shape

In [None]:
from datasets import load_dataset

# Load the STS benchmark dataset
sts_dataset = load_dataset("stsb_multi_mt",'en')

In [None]:
sts_dataset['train']

In [None]:
from gensim.models import KeyedVectors

# Load your Word2Vec model
#model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

# Collect the vocabulary and the vector of every word, in the iteration order
# of the model's key_to_index mapping
words = list(model.key_to_index)
word_vectors = [model[token] for token in words]
word_to_vec_map = dict(zip(words, word_vectors))

# Stack the per-word vectors into one array (row i belongs to words[i])
import numpy as np
word_vectors_array = np.array(word_vectors)

# Lookup tables between row position and word:
#   word_dict:  position -> word
#   index_dict: word     -> position
word_dict = dict(enumerate(words))
index_dict = {token: position for position, token in word_dict.items()}

In [None]:
import numpy as np

def get_sentence_embedding(sentence, model):
    """Average the vectors of the tokens of `sentence` that `model` contains.

    The sentence is split on whitespace only. Tokens for which `token in model`
    is false are ignored. If no token is known, a zero vector of length
    `model.vector_size` is returned.
    """
    known_vectors = []
    for token in sentence.split():
        if token in model:
            known_vectors.append(model[token])
    if len(known_vectors) == 0:
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)

# Example usage
sentence_embedding = get_sentence_embedding("This is a test sentence.", model)

In [None]:
sentence_embedding.shape

In [None]:
from scipy.spatial.distance import cosine

def compute_similarity_pairs(dataset, model,space=False, index_dict=index_dict):
    """Score every sentence pair of `dataset` by cosine similarity of its embeddings.

    Args:
        dataset: iterable of items with 'sentence1', 'sentence2' and
            'similarity_score' entries.
        model: what the sentence embeddings are built from. With `space` false it
            is passed to `get_sentence_embedding`; with `space` true it is passed
            to `get_sentence_embedding_from_space` as the embedding array.
        space: if true, look words up in an embedding array through `index_dict`.
        index_dict: word -> row index mapping, only used when `space` is true.
            The default is bound when this function is defined.

    Returns:
        (gt_similarities, similarities): the dataset's 'similarity_score' values
        and the computed cosine similarities, in dataset order.
    """
    similarities = []
    gt_similarities = []
    for item in dataset:
        if space:
            emb1 = get_sentence_embedding_from_space(item['sentence1'], model, index_dict)
            emb2 = get_sentence_embedding_from_space(item['sentence2'], model, index_dict)
        else:
            emb1 = get_sentence_embedding(item['sentence1'], model)
            emb2 = get_sentence_embedding(item['sentence2'], model)
        if np.any(emb1) and np.any(emb2):
            sim = 1 - cosine(emb1, emb2)  # Cosine similarity
        else:
            # A sentence with no known word yields an all-zero embedding, for which
            # cosine similarity is undefined (NaN). One NaN would turn any
            # correlation computed over these scores into NaN, so score such
            # pairs as 0.0 ("no measurable similarity") instead.
            sim = 0.0
        similarities.append(sim)
        gt_similarities.append(item['similarity_score'])
    return gt_similarities,similarities

gt_similarities,original_similarities = compute_similarity_pairs(sts_dataset['train'], model)

In [None]:
len(original_similarities)

In [None]:
# Short model label -> model name used as the prefix of the saved .npy files.
# NOTE: this rebinds `models` and `version` from the values set at the top of the
# notebook; the key/value orientation is the reverse of that first `models` dict.
# NOTE(review): the next cell repeats these assignments and loads `latent_data` again.
models = {
    "AE":"Basic AutoEncoder",
    "NSA-AE":"NSA AutoEncoder"
}
model_choice = "AE"
version = 'd32'
# Latent representation of the word2vec vectors saved for the chosen model/version.
latent_data = np.load(f'data/word2vec/{models[model_choice]}_latent_output_{version}.npy')

In [None]:
latent_data.shape

In [None]:
# Short model label -> model name used as the prefix of the saved .npy files.
models = {
    "AE":"Basic AutoEncoder",
    "NSA-AE":"NSA AutoEncoder"
}
model_choice = "AE"
version = 'd32'
# Latent outputs and their labels saved for the chosen model/version.
latent_data = np.load(f'data/word2vec/{models[model_choice]}_latent_output_{version}.npy')
latent_labels = np.load(f'data/word2vec/{models[model_choice]}_latent_labels_{version}.npy')

# "final" outputs and labels saved under the same naming scheme
# (presumably the autoencoder's reconstruction — TODO confirm).
final_data = np.load(f'data/word2vec/{models[model_choice]}_final_output_{version}.npy')
final_labels = np.load(f'data/word2vec/{models[model_choice]}_final_labels_{version}.npy')

In [None]:
import numpy as np

def get_sentence_embedding_from_space(sentence, embedding_space,index_dict):
    """Average the rows of `embedding_space` that belong to the tokens of `sentence`.

    The sentence is split on whitespace only. A token contributes the row
    `embedding_space[index_dict[token]]` if it is a key of `index_dict`; other
    tokens are ignored. If no token is known, a zero vector with the length of
    the first row of `embedding_space` is returned.
    """
    rows = []
    for token in sentence.split():
        if token in index_dict:
            rows.append(embedding_space[index_dict[token]])
    if len(rows) == 0:
        return np.zeros(embedding_space[0].shape[0])
    return np.mean(rows, axis=0)

# Example usage
sentence_embedding = get_sentence_embedding_from_space("This is a test sentence.", latent_data, index_dict)

In [None]:
sentence_embedding.shape

In [None]:
gt_similarities,reconstructed_similarities = compute_similarity_pairs(sts_dataset['train'], latent_data,space=True)

In [None]:
len(reconstructed_similarities)

In [None]:
from scipy.stats import pearsonr

# Ground truth scores
# (the same 'similarity_score' values that compute_similarity_pairs collects into
# gt_similarities when run over sts_dataset['train'])
ground_truth_scores = [item['similarity_score'] for item in sts_dataset['train']]

# Pearson correlation for original embeddings
# The whole object returned by pearsonr is printed, not just the coefficient.
pearson_corr_original = pearsonr(original_similarities, ground_truth_scores)
print(f"Pearson Correlation (Original): {pearson_corr_original}")

# Pearson correlation for reduced embeddings
# (similarities computed from the latent_data embedding space)
pearson_corr_reduced = pearsonr(reconstructed_similarities, ground_truth_scores)
print(f"Pearson Correlation (Reduced): {pearson_corr_reduced}")

In [None]:
pearsonr(original_similarities, reconstructed_similarities)

## GNN Tests

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import numpy as np

import torch
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.transforms import Compose
from torch_geometric.datasets import Amazon
from torch_geometric.transforms.random_node_split import RandomNodeSplit
from torch_geometric.datasets import Planetoid
from torch_geometric.transforms import NormalizeFeatures
from torch_geometric.nn import GCNConv
from torch_geometric.nn import GATConv
from torch_geometric.loader import NeighborLoader
from torch_geometric.nn import SAGEConv
from sklearn.metrics import roc_auc_score

from torch_geometric.utils import negative_sampling
from torch_geometric.utils import train_test_split_edges

from copy import deepcopy
import torch.nn as nn
from IPython.display import Javascript  # Restrict height of output cell.

In [None]:
from torch_geometric.datasets import Planetoid, Flickr, Amazon
from torch_geometric.transforms import NormalizeFeatures

# Graph dataset to load. NOTE: this rebinds `dataset_name`, which the
# image-embedding cells at the top of the notebook set to 'CIFAR-10'.
dataset_name="Amazon"

# NOTE(review): in the Flickr and Amazon branches the random node split is passed
# as `transform`, so it is presumably re-drawn whenever a graph is fetched from
# the dataset — confirm whether a fixed split is intended.
if dataset_name=='Flickr':
    transform = Compose([
        #NormalizeFeatures(),
        RandomNodeSplit('train_rest',num_val = 2000, num_test = 10000)
    ])
    dataset = Flickr(root='data/Flickr', \
                     transform =transform)
elif dataset_name=='Amazon':
    # Only the 'Computers' subset of the Amazon dataset is loaded.
    transform = Compose([
        #NormalizeFeatures(),
        RandomNodeSplit('train_rest',num_val = 1000, num_test = 3000)
    ])
    dataset = Amazon(root='data/Amazon', name='Computers', \
                     transform =transform)

elif dataset_name in ['Cora', 'Citeseer', 'Pubmed']:
    # For Planetoid datasets, the standard split is already defined
    dataset = Planetoid(root=f'data/{dataset_name}', name=dataset_name)

else:
    # Any other name is rejected rather than silently leaving `dataset` undefined.
    raise ValueError(f"Unknown dataset: {dataset_name}")

print()
print(f'Dataset: {dataset}:')
print('======================')
print(f'Number of graphs: {len(dataset)}')
print(f'Number of features: {dataset.num_features}')
print(f'Number of classes: {dataset.num_classes}')

data = dataset[0]  # Get the first graph object.

print()
print(data)
print('===========================================================================================================')

# Gather some statistics about the graph.
print(f'Number of nodes: {data.num_nodes}')
print(f'Number of edges: {data.num_edges}')
print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')
print(f'Number of training nodes: {data.train_mask.sum()}')
print(f'Training node label rate: {int(data.train_mask.sum()) / data.num_nodes:.2f}')
print(f'Has isolated nodes: {data.has_isolated_nodes()}')
print(f'Has self-loops: {data.has_self_loops()}')
print(f'Is undirected: {data.is_undirected()}')

In [None]:
pos_edge_index = data.edge_index

In [None]:
from torch_geometric.utils import negative_sampling

In [None]:
# Sample random non-edges to serve as negatives for link prediction.
# The RNGs are seeded first so the sampled negatives, and therefore the AUC values
# computed from them, are the same on every run. `num_nodes` is passed explicitly
# so the node count is not inferred from the largest index in the edge list.
from torch_geometric import seed_everything

seed_everything(42)
neg_edge_index = negative_sampling(data.edge_index, num_nodes=data.num_nodes)

In [None]:
#Run this cell if you already have embeddings from GNN_Analysis
#!mkdir data/LinkPrediction/Amazon
#!cp ../GNN_analysis/model_data/Amazon/GCN/LP_3_200.npz data/LinkPrediction/Amazon

In [None]:
features = np.load(f'data/LinkPrediction/{dataset_name}/LP_3_200.npz')

In [None]:
features = dict(features)
features['conv1'].shape

In [None]:
@torch.no_grad()
def test():
    """Return [val_auc, test_auc] of the global `model` for link prediction on `data`.

    Reads the notebook-level `model` (must provide eval/encode/decode) and `data`
    (must provide `x`, `train_pos_edge_index` and the `{val,test}_{pos,neg}_edge_index`
    entries).
    NOTE(review): no visible cell creates such a model or these edge splits, and
    this function is not called in the visible notebook — confirm it is still needed.
    """
    model.eval()
    # The node embeddings depend only on the training graph, so encode once
    # instead of once per split.
    z = model.encode(data.x, data.train_pos_edge_index) # encode train
    perfs = []
    for prefix in ["val", "test"]:
        pos_edge_index = data[f'{prefix}_pos_edge_index']
        neg_edge_index = data[f'{prefix}_neg_edge_index']

        link_logits = model.decode(z, pos_edge_index, neg_edge_index) # decode test or val
        link_probs = link_logits.sigmoid() # apply sigmoid

        link_labels = get_link_labels(pos_edge_index, neg_edge_index) # get link

        perfs.append(roc_auc_score(link_labels.cpu(), link_probs.cpu())) #compute roc_auc score
    return perfs

In [None]:
original_data = features['conv1']

In [None]:
original_data.shape

In [None]:
# Short model label -> model name used as the prefix of the saved .npy files.
# NOTE: this rebinds `models`, `model_choice`, `version` and the latent/final
# arrays that the word2vec section above also assigns.
models = {
    # "AE":"Basic AutoEncoder",
    # "NSA-AE":"NSA AutoEncoder",
    # "RTD-AE":"RTD AutoEncoder H1",
    'GNSA-AE':'GNSA AutoEncoder',
    'LNSA-AE':'LNSA AutoEncoder',
    'NSA-AE':'NSA AutoEncoder',

}
model_choice = "GNSA-AE"
version = 'd64_2'
# Latent outputs and labels saved for the chosen model/version on this dataset.
latent_data = np.load(f'data/LinkPrediction/{dataset_name}/{models[model_choice]}_latent_output_{version}.npy')
latent_labels = np.load(f'data/LinkPrediction/{dataset_name}/{models[model_choice]}_latent_labels_{version}.npy')

# "final" outputs and labels saved under the same naming scheme
# (presumably the autoencoder's reconstruction — TODO confirm).
final_data = np.load(f'data/LinkPrediction/{dataset_name}/{models[model_choice]}_final_output_{version}.npy')
final_labels = np.load(f'data/LinkPrediction/{dataset_name}/{models[model_choice]}_final_labels_{version}.npy')

In [None]:
print(final_data.shape)
print(latent_data.shape)

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
def get_link_labels(pos_edge_index, neg_edge_index):
    """Build binary targets for positive edges followed by negative edges.

    Args:
        pos_edge_index: tensor whose second dimension counts the positive edges.
        neg_edge_index: tensor whose second dimension counts the negative edges.

    Returns:
        A 1-D float tensor [1, 1, ..., 0, 0, ...] with one 1 per positive edge
        followed by one 0 per negative edge, placed on the device of
        `pos_edge_index`.
    """
    num_pos = pos_edge_index.size(1)
    num_neg = neg_edge_index.size(1)
    # Allocate next to the edges instead of on a notebook-global `device`: the
    # labels then always live where the inputs live (no CPU/GPU mismatch when the
    # graph stays on the CPU) and the function no longer needs a global variable.
    link_labels = torch.zeros(num_pos + num_neg, dtype=torch.float, device=pos_edge_index.device)
    link_labels[:num_pos] = 1.
    return link_labels

def generate_prediction_score(embeddings, pos_edge_index, neg_edge_index, verbose=True):
    """ROC AUC of dot-product link prediction on the given node embeddings.

    Each candidate edge (u, v) is scored by the dot product of the embeddings of
    u and v; positives are labelled 1 and negatives 0.

    Args:
        embeddings: tensor indexed by node id along its first dimension.
        pos_edge_index: positive edges, one column per edge.
        neg_edge_index: negative edges, one column per edge.
        verbose: if true (default), print the shapes of the intermediate tensors
            and the label counts, as the function always did before.

    Returns:
        The ROC AUC score as computed by `roc_auc_score`.
    """
    edge_index = torch.cat([pos_edge_index, neg_edge_index], dim=-1) # concatenate pos and neg edges
    logits = (embeddings[edge_index[0]] * embeddings[edge_index[1]]).sum(dim=-1)  # dot product
    link_labels = get_link_labels(pos_edge_index, neg_edge_index) # get link
    if verbose:
        print(edge_index.shape)
        print(logits.shape)
        print(link_labels.shape)
        print(Counter(np.array(link_labels.cpu())))
    # ROC AUC depends only on the ranking of the scores, so the raw logits are
    # scored directly. Passing them through a floating-point sigmoid first would
    # saturate large dot products to exactly 1.0 (or 0.0) and introduce artificial
    # ties that can distort the AUC.
    return roc_auc_score(link_labels.cpu(), logits.cpu())

In [None]:
generate_prediction_score(torch.tensor(latent_data), pos_edge_index, neg_edge_index)

In [None]:
generate_prediction_score(torch.tensor(final_data), pos_edge_index, neg_edge_index)

In [None]:
generate_prediction_score(torch.tensor(original_data), pos_edge_index, neg_edge_index)