# Load the graphs

In [3]:
#@title  { form-width: "30%" }
import tqdm as tq
import igraph as ig 
import os

# Dataset location and on-disk format (Colab form fields).
# NOTE(review): `datapath` is a machine-specific absolute path; point it at
# your own copy of the data before running.
datapath = "/Users/maurizio/Downloads/" #@param {type:"string"}
dataset = "LUNG" #@param ["Mutag", "LFR", "MREG", "Kidney_9.2", "COBREpos"] {allow-input: true}
# NOTE: `format` shadows the built-in of the same name; the name is kept
# because other cells may read it.
format = "graphml" #@param ["graphml", "edgelist"] {allow-input: true}

# Graph files live in <datapath>/<dataset>/<format>/.
path = os.path.join(datapath, dataset, format)

# os.listdir returns entries in arbitrary order. Sorting makes the order of
# `graphs` deterministic, so any later seeded split of this list is
# reproducible across machines and runs.
filenames = sorted(os.listdir(path))

# Load every file whose name ends with the chosen format and tag each graph
# with the part of its file name before the first dot.
# NOTE(review): a file name containing extra dots is truncated at the first
# one - confirm that this matches the sample names used in the label file.
graphs = []
for f in tq.tqdm(filenames):
  if f.endswith(format):
    g = ig.load(os.path.join(path,f))
    g["name"] = f.split('.')[0]
    graphs.append(g)

100%|██████████| 1136/1136 [00:58<00:00, 19.50it/s]


# Read the labels

In [4]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn import preprocessing

# Label table: tab-separated file <datapath>/<dataset>/<dataset>.txt.
dfl = pd.read_csv(f'{datapath}/{dataset}/{dataset}.txt', sep='\t')

# Keep only the first and the last column. The lookup below needs a column
# named 'Samples' among them and reads the class label from the last column.
last_column = dfl.iloc[:,[0] + [-1]]

# Build the sample-name -> label lookup once instead of scanning the whole
# table for every graph. setdefault keeps the FIRST row for a repeated
# sample name, which is the row the previous per-graph scan selected.
label_by_sample = {}
for sample, label in zip(last_column['Samples'], last_column.iloc[:, -1]):
    label_by_sample.setdefault(sample, label)

# Fail early, with the offending names, if any loaded graph has no label
# (previously this surfaced as a bare IndexError from `.values[0]`).
missing = [g["name"] for g in graphs if g["name"] not in label_by_sample]
if missing:
    raise LookupError(
        f'{len(missing)} graph(s) have no entry in the label file, '
        f'e.g. {missing[:5]}'
    )

# Labels in the same order as `graphs`, then encoded as integers 0..n_classes-1.
y = [label_by_sample[g["name"]] for g in graphs]
le = preprocessing.LabelEncoder()
y = le.fit_transform(np.ravel(y))
assert len(y) == len(graphs)

# Class balance (displayed as the cell output).
Counter(y)

Counter({0: 585, 1: 550})

# Validation of inductive embeddings



In [7]:
#@title  { form-width: "30%" }
import numpy as np
from sklearn.model_selection import RepeatedKFold
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix,matthews_corrcoef,accuracy_score,precision_score,f1_score, recall_score
from sklearn.model_selection import train_test_split
from netpro2vec.Netpro2vec import Netpro2vec
# Netpro2vec hyper-parameters, passed unchanged to the constructor below.
params = dict(
    agg_by=[1],
    cut_off=[0.1],
    dimensions=512,
    encodew=True,
    epochs=400,
    extractor=[1],
    min_count=2,
    prob_type=["ndd"],
    save_vocab=False,
    seed=1,
    verbose=True,
    vertex_attribute=None,
    workers=4,
)

# Array view of the graph list (not used in this cell; kept for other cells).
G = np.array(graphs)
print("INDUCTIVE EMBEDDING:")

# Hold out 10% of the graphs; the seed fixes the split for a given graph order.
G_train, G_test, y_train, y_test = train_test_split(
    graphs, y, test_size=0.10, random_state=42
)

# Fit the embedding model on the training graphs only, then infer vectors
# for the held-out graphs from that fitted model.
model = Netpro2vec(**params)
model.fit(G_train)
X_train = model.get_embedding()
# NOTE(review): epochs=0 is passed to infer_vector as-is; confirm in the
# Netpro2vec documentation what zero inference epochs means.
X_test = np.array(model.infer_vector(G_test,epochs=0))

# Linear SVM trained on the training embeddings, evaluated on the inferred ones.
clf = SVC(kernel='linear')
clf.fit(X_train, y_train)
preds = clf.predict(X_test)

conf_mat = confusion_matrix(y_test, preds)
acc = accuracy_score(y_test, preds)
print(conf_mat,f'Acc. {acc}')

INDUCTIVE EMBEDDING:
Calculating Node Distance Distribution...


100%|██████████| 1021/1021 [1:03:46<00:00,  3.75s/it]


Building vocabulary for ndd...


100%|██████████| 1021/1021 [00:56<00:00, 17.92it/s]


Doc2Vec embedding in progress...Done!
Calculating Node Distance Distribution...


100%|██████████| 114/114 [07:09<00:00,  3.77s/it]


Building vocabulary for ndd...


100%|██████████| 114/114 [00:06<00:00, 18.47it/s]


Doc2Vec inferring (steps=0, alpha0.025000) in progress...