# New requirement - (update when new version is ready)

Install the NEExT package and dependencies as follows:

```
pip install pydantic==2.11.7
pip install imbalanced_learn==0.13.0
pip install --no-deps neext==0.2.9
```

In [None]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import time
import pandas as pd
import umap
from NEExT import NEExT
import igraph as ig
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier as rfc 
import sklearn.metrics as metrics
from sklearn.cluster import KMeans
import seaborn as sns
from collections import Counter 


## Datasets

Set the directory below as required.

In the next cell, we load one of the datasets ```nci1``` or ```nci109```.

There are 4 csv files:
* edges: the edgelist for every graph
* graph_indicator: maps each node to its graph
* graph_labels: binary labels (0/1)
* node_labels: numerical label for each node


In [None]:
## folder prefix of the NCI csv files; later cells append the file names to it
datapath = "../Datasets/NCI/"

In [None]:
## nci1 or nci109
dataset = 'nci1'

## csv data files
## os.path.join keeps the paths valid whether or not `datapath` ends with a separator
import os
edges = os.path.join(datapath, dataset+'_edges.csv')
node_graph_mapping = os.path.join(datapath, dataset+'_graph_indicator.csv')
graph_labels = os.path.join(datapath, dataset+'_graph_labels.csv')
node_labels = os.path.join(datapath, dataset+'_node_labels.csv')

# Initialize NEExT and set logging level
nxt = NEExT(log_level="ERROR")

# Load data with node reindexing; largest-component filtering is switched off
# (filter_largest_component=False) and no node sampling is applied (rate 1.0)
# Load as networkx for now
print("\nLoading data...")
graph_collection = nxt.read_from_csv(
    edges_path=edges,
    node_graph_mapping_path=node_graph_mapping,
    graph_label_path=graph_labels,
    node_features_path=node_labels,
    reindex_nodes=True,
    filter_largest_component=False,
    graph_type="networkx",
    node_sample_rate=1.0
)


# EDA

We explore some simple descriptive statistics: number of graphs, number of nodes and edges, etc.

We see that graphs with label = 1 have slightly more nodes and edges on average; we will come back to this when we explore supervised learning.



In [None]:
## number of graphs
n_graphs = graph_collection.describe()['num_graphs']
print('number of graphs:', n_graphs)

## store graph labels for supervised learning
g_labels = graph_collection.get_labels()['label']
print('graphs with label = 1:',np.sum(g_labels))

## map to igraph objects and count the number of vertices and edges
## g is a plain list, so g[i] is the igraph version of graph i
## (an empty ig.Graph was previously used as the container, which stored
##  every graph as a graph attribute of that dummy graph under an integer key)
g = []
M = 0
for i in range(n_graphs):
    gi = ig.Graph.from_networkx(graph_collection.graphs[i].G, vertex_attr_hashable='name').simplify()
    gi.vs['node_label'] = [int(lab) for lab in gi.vs['node_label']] ## map node labels to integers
    M = max(M, np.max(gi.vs['node_label']))
    g.append(gi)

## per-graph node and edge counts, in the same order as g
vc = [gi.vcount() for gi in g]
ec = [gi.ecount() for gi in g]
print('mean number of nodes:', np.mean(vc))
print('mean number of edges:', np.mean(ec))
print('max node label (min=0):',M)


In [None]:
## Box plots of the per-graph node and edge counts, split by graph label (0 vs 1)

nodes_0 = [n for i, n in enumerate(vc) if g_labels[i]==0]
nodes_1 = [n for i, n in enumerate(vc) if g_labels[i]==1]
edges_0 = [m for i, m in enumerate(ec) if g_labels[i]==0]
edges_1 = [m for i, m in enumerate(ec) if g_labels[i]==1]

def _label_boxplot(groups):
    ## one box per label, small black dots for outliers, black median line
    plt.boxplot(groups, labels=['0','1'], widths=.6,
                flierprops=dict(marker='.', markerfacecolor='black', markersize=3, linestyle='none'),
                medianprops=dict(linestyle='-', linewidth=1.5, color='black'))

plt.subplots(1, 2, figsize=(9,4))

## left panel: nodes
plt.subplot(121)
_label_boxplot([nodes_0, nodes_1])
plt.ylabel('Count per graph', fontsize=14)
plt.xlabel('Label', fontsize=14)
plt.grid()
plt.title('Number of nodes', fontsize=14)

## right panel: edges (shares the meaning of the y axis with the left panel)
plt.subplot(122)
_label_boxplot([edges_0, edges_1])
plt.xlabel('Label', fontsize=14)
plt.grid()
plt.title('Number of edges', fontsize=14)
#plt.savefig(dataset+'_counts.eps')
plt.show()


In [None]:
## example of a graph with label 0
## (gid rather than id, so the builtin id() is not shadowed)
gid = 799
print('graph label:',g_labels[gid])
ig.plot(g[gid], bbox=(0,0,300,300), vertex_size=10, vertex_color='grey',
        #target=dataset+'_0.eps',
        vertex_label=g[gid].vs['node_label'], vertex_label_size=0)


In [None]:
## example of a graph with label 1
## (gid rather than id, so the builtin id() is not shadowed)
gid = 2299
print('label:',g_labels[gid])
ig.plot(g[gid], bbox=(0,0,300,300), vertex_size=10, vertex_color='grey',
        #target=dataset+'_1.eps',
        vertex_label=g[gid].vs['node_label'], vertex_label_size=0)


# Supervised learning

We build supervised learning (classification) models to predict the graph labels. 

In what follows, we use **random forest** classifiers, train each model on 80% of the graphs, and apply it to the remaining 20%. We use accuracy as well as the area under the ROC curve (AUC) as measures of performance.

We will explore three ways to obtain features (vector representations) for each graph:
* overall **graph features**, such as degree and coreness distribution, number of nodes and edges, etc.
* **graph2vec** embedding, a neural method that takes advantage of the node labels, and
* **NEExT**, a tool to build graph embeddings based on vectors of node features for each graph.

We look at each set of features separately, and also using all features at once.


In [None]:
## The results in the book were obtained on macOS 14.6.1 with an M1 chip.
## A fixed seed is used for reproducibility;
## some results may still vary slightly on different architectures.

RS = 321  ## shared seed, also passed as random_state in later cells
np.random.seed(seed=RS)


## (1) Overall graph features

* number of nodes and edges, and the graph density
* degree distribution
* coreness distribution
* graph assortativity


In [None]:
### Build graph-based features for each compound (graph)
L = []

for i in range(n_graphs):
    sg = g[i]

    ## node and edge counts, and the nodes-per-edge ratio
    ## NOTE: the third value is stored under the column name 'density' but it is
    ## nodes/edges, not the usual graph density; it is kept as is so that results
    ## stay comparable. The division assumes every graph has at least one edge.
    ## n_nodes/n_edges are used (rather than vc/ec) so that the per-graph count
    ## lists vc and ec built in the EDA cell are not overwritten by scalars.
    n_nodes = sg.vcount()
    n_edges = sg.ecount()
    x = [n_nodes, n_edges, n_nodes/n_edges]

    ## assortativity (NaN is replaced by 0)
    assort = sg.assortativity_degree()
    if np.isnan(assort):
        assort = 0
    x.append(assort)

    ## degree distribution (1, 2 and everything else, reported as '3+')
    deg_count = Counter(sg.degree())
    x.extend([deg_count[1]/n_nodes, deg_count[2]/n_nodes,
              (n_nodes-deg_count[1]-deg_count[2])/n_nodes])

    ## coreness distribution (1, 2 and everything else, reported as '3+')
    core_count = Counter(sg.coreness())
    x.extend([core_count[1]/n_nodes, core_count[2]/n_nodes,
              (n_nodes-core_count[1]-core_count[2])/n_nodes])

    L.append(x)

## store all features in a dataframe
col = ['nodes','edges','density','assort','deg1','deg2','deg3+','core1','core2','core3+']
OGF = pd.DataFrame(L,columns=col)
OGF.head()


In [None]:
## Random forest on the overall graph features only:
## accuracy, ROC curve and AUC on a 20% hold-out set

## train/test split
X_train, X_test, y_train, y_test = train_test_split(
    OGF, g_labels, test_size=0.2, random_state=RS
)

## random forest classifier -- accuracy
rfc_mdl = rfc(n_estimators=100, criterion='entropy', random_state=RS).fit(X_train, y_train)
y_pred = rfc_mdl.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

## ROC curve (scores are the predicted probabilities in column 1)
y_probs = rfc_mdl.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_probs)
auc = metrics.roc_auc_score(y_test, y_probs)
plt.plot(fpr, tpr, color='black', label=f"ROC, auc={auc:.3f}")
plt.plot([0, 1], [0, 1], '--', color='black', label='Random')
plt.legend(loc='lower right', fontsize=14)
plt.title('Graph features', fontsize=16)
plt.xlabel('False Positive Rate', fontsize=14)
plt.ylabel('True Positive Rate', fontsize=14)
#plt.savefig('nci_64d_roc.eps')
plt.show()


## (2) graph2vec features

Pre-computed graph embeddings using graph2vec. We tested three versions:
* 1024-dimensional embeddings
* 64-dimensional embeddings
* 1024-dimensional embedding followed by reduction to 64 dimensions (using UMAP)

The last method gave the best results and is the one we use below.


In [None]:
## read the embeddings from csv file
## os.path.join keeps the path valid whether or not `datapath` ends with a separator
import os
g2v = os.path.join(datapath, dataset+'_g2v.csv')
G2VF = pd.read_csv(g2v, header=None)
## drop column 0 and keep the remaining columns as the feature matrix
## (column 0 is presumably a graph identifier -- confirm against the csv file)
G2VF = np.array(G2VF.drop(columns=[0]))


In [None]:
## Random forest on the graph2vec features only:
## accuracy, ROC curve and AUC on a 20% hold-out set

## train/test split
X_train, X_test, y_train, y_test = train_test_split(
    G2VF, g_labels, test_size=0.2, random_state=RS
)

## random forest classifier -- accuracy
rfc_mdl = rfc(n_estimators=100, criterion='entropy', random_state=RS).fit(X_train, y_train)
y_pred = rfc_mdl.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

## ROC curve (scores are the predicted probabilities in column 1)
y_probs = rfc_mdl.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_probs)
auc = metrics.roc_auc_score(y_test, y_probs)
plt.plot(fpr, tpr, color='black', label=f"ROC, AUC={auc:.3f}")
plt.plot([0, 1], [0, 1], '--', color='black', label='Random')
plt.legend(loc='lower right', fontsize=14)
plt.xlabel('False Positive Rate', fontsize=14)
plt.ylabel('True Positive Rate', fontsize=14)
#plt.savefig('nci_64d_roc.eps')
plt.show()


## (3) NEExT: embedding using node-level features

This is a 2-step process; we use the functions provided by the NEExT package.

* first, we compute several node features for each graph and normalize; those are the "bag of vectors" that represent each graph;
* then, we embed the graphs via the earth mover (Wasserstein or similar) distance.


In [None]:
%%time
nxt.set_log_level("ERROR") ## minimize log output

# Compute node features
features = nxt.compute_node_features(
    graph_collection=graph_collection,
    feature_list=["page_rank","degree_centrality","closeness_centrality","betweenness_centrality","eigenvector_centrality",
                  "clustering_coefficient","local_efficiency","lsme","load_centrality","basic_expansion"],
    feature_vector_length=5,
    show_progress=False
)

# normalize
features.normalize()
features.features_df.head()


In [None]:
# Compute graph embeddings from the node features;
# the embedding dimension is set to the number of node-feature columns
n_dims = len(features.feature_columns)
embeddings = nxt.compute_graph_embeddings(
    graph_collection=graph_collection,
    features=features,
    embedding_algorithm="approx_wasserstein",
    embedding_dimension=n_dims,
    random_state=RS,
)

## node feature based embeddings, as a plain array without the graph_id column
NEF = np.array(embeddings.embeddings_df.drop(columns=['graph_id']))


In [None]:
## Random forest on NEExT's node-based feature embeddings only:
## accuracy, ROC curve and AUC on a 20% hold-out set

## train/test split
X_train, X_test, y_train, y_test = train_test_split(
    NEF, g_labels, test_size=0.2, random_state=RS
)

## random forest classifier -- accuracy
rfc_mdl = rfc(n_estimators=100, criterion='entropy', random_state=RS).fit(X_train, y_train)
y_pred = rfc_mdl.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

## ROC curve (scores are the predicted probabilities in column 1)
y_probs = rfc_mdl.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_probs)
auc = metrics.roc_auc_score(y_test, y_probs)
plt.plot(fpr, tpr, color='black', label=f"ROC, AUC={auc:.3f}")
plt.plot([0, 1], [0, 1], '--', color='black', label='Random')
plt.legend(loc='lower right', fontsize=14)
plt.xlabel('False Positive Rate', fontsize=14)
plt.ylabel('True Positive Rate', fontsize=14)
#plt.savefig('nci_64d_roc.eps')
plt.show()


## (4) Using all features

We merge all the features we computed above.

A 2-dim projection shows some small clusters made up (mostly) of graphs with the same label, but for most graphs, there is no clear separation.


In [None]:
## merge features
## the clustering section further down adds a 'cluster' column to OGF; it is dropped
## here (when present) so that re-running this cell never leaks cluster ids into AF
AF = np.concatenate((OGF.drop(columns=['cluster'], errors='ignore'), G2VF, NEF), axis=1)
AF.shape


In [None]:
# Visualisation - reduce the merged features to 2D with UMAP
umap_params = dict(n_neighbors=15, min_dist=0.1, n_components=2, random_state=RS, n_jobs=1)
umap_model = umap.UMAP(**umap_params)
embedding_2d = umap_model.fit_transform(AF)

# One row per graph: the two UMAP coordinates plus the class label
viz_df = pd.DataFrame(embedding_2d[:, :2], columns=['UMAP1', 'UMAP2'])
viz_df['label'] = g_labels

## scatter plot, coloured by label
plt.scatter(viz_df['UMAP1'], viz_df['UMAP2'], s=5, c=viz_df['label'], cmap='Set1')
plt.show()


In [None]:
## Random forest on all features merged together:
## accuracy, ROC curve and AUC on a 20% hold-out set

## train/test split
X_train, X_test, y_train, y_test = train_test_split(
    AF, g_labels, test_size=0.2, random_state=RS
)

## random forest classifier -- accuracy
rfc_mdl = rfc(n_estimators=100, criterion='entropy', random_state=RS).fit(X_train, y_train)
y_pred = rfc_mdl.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

## ROC curve (scores are the predicted probabilities in column 1)
y_probs = rfc_mdl.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_probs)
auc = metrics.roc_auc_score(y_test, y_probs)
plt.plot(fpr, tpr, color='black', label=f"ROC, auc={auc:.3f}")
plt.plot([0, 1], [0, 1], '--', color='black', label='Random')
plt.legend(loc='lower right', fontsize=14)
plt.grid(color='lightgrey')
plt.xlabel('False Positive Rate', fontsize=14)
plt.ylabel('True Positive Rate', fontsize=14)
#plt.savefig(dataset+'_roc.eps')
plt.show()


In [None]:
## bootstrap confidence interval for the AUC above
def bootstrap(y_test, y_probs, n_boot=1000, seed=None, score_fn=None):
    """Return a 95% percentile-bootstrap confidence interval for a score.

    The (label, score) pairs are resampled with replacement ``n_boot`` times,
    the metric is evaluated on every resample, and the 2.5% and 97.5% order
    statistics of the sorted values are returned.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_probs : array-like
        Predicted scores, same length as ``y_test``.
    n_boot : int
        Number of bootstrap resamples.
    seed : int or None
        Seed of the resampling generator; ``None`` uses the notebook-wide ``RS``.
    score_fn : callable or None
        ``score_fn(y_true, y_score) -> float``; ``None`` uses
        ``metrics.roc_auc_score``.

    Returns
    -------
    tuple
        ``(conf_lo, conf_up)``.

    Raises
    ------
    ValueError
        If the inputs are empty or of different lengths, or if no resample
        could be scored.
    """
    auc_mode = score_fn is None
    if auc_mode:
        score_fn = metrics.roc_auc_score
    if seed is None:
        seed = RS

    y_score = np.asarray(y_probs)
    y_true = np.asarray(y_test)
    n = len(y_score)
    if n == 0 or len(y_true) != n:
        raise ValueError("y_test and y_probs must be non-empty and of equal length")

    rng = np.random.RandomState(seed)
    scores = []
    for _ in range(n_boot):
        indices = rng.randint(0, n, n)
        ## the AUC is undefined when a resample contains a single class;
        ## skip such resamples instead of aborting the whole loop
        if auc_mode and np.unique(y_true[indices]).size < 2:
            continue
        scores.append(score_fn(y_true[indices], y_score[indices]))
    if not scores:
        raise ValueError("no bootstrap resample could be scored")

    sorted_scores = np.sort(np.array(scores))
    conf_lo = sorted_scores[int(0.025 * len(sorted_scores))]
    conf_up = sorted_scores[int(0.975 * len(sorted_scores))]
    return (conf_lo, conf_up)
## interval for the model fitted on all features (uses that cell's y_test / y_probs)
bootstrap(y_test, y_probs, n_boot=1000)


## Top features

From the previous experiment using all features, we look at the top ones in terms of importance.

With random forest classifiers, we already have such feature importance measures available.

The NEExT package also provides methods for feature importance which can be used in an unsupervised context.

Recall that we have:

* 10 overall graph features,
* 64 features from the graph2vec embeddings, and
* 50 features from the NEExT feature-based embeddings.


In [None]:
## rank the features of the fitted random forest, most important first
top_features = np.argsort(rfc_mdl.feature_importances_)[::-1]

## feature names in the order the merged matrix was assembled: overall graph
## features, then graph2vec, then NEExT. The counts are taken from the arrays
## instead of being hard-coded (64 and 50), and the number of OGF names is what
## remains of the model's feature count, so the names stay aligned even after a
## 'cluster' column has been appended to OGF.
## NOTE: assumes rfc_mdl is the model trained on all features (OGF, G2VF, NEF).
n_g2v, n_neext = G2VF.shape[1], NEF.shape[1]
n_ogf = len(rfc_mdl.feature_importances_) - n_g2v - n_neext
f = list(OGF.columns[:n_ogf]) + ['g2v_'+str(i) for i in range(n_g2v)] + ['neext_'+str(i) for i in range(n_neext)]
n_top = 10
print('top',n_top,'features:')
print([f[i] for i in top_features[:n_top]])


In [None]:
n_top = 10

## z-score every feature column; a constant column has zero standard deviation,
## so its divisor is set to 1 (the column then stays at 0) instead of dividing
## by zero and putting NaN into AF_norm
AF_std = AF.std(axis=0)
AF_std[AF_std == 0] = 1
AF_norm = (AF - AF.mean(axis=0)) / AF_std

## long-format table: one row per (top feature, graph) pair
data = {
    'feature' : np.concatenate([np.repeat(str(t+1),n_graphs) for t in top_features[:n_top]]),
    'value' : np.concatenate([np.array(AF_norm[:,t]) for t in top_features[:n_top]]),
    'label' : list(g_labels)*n_top
}
_df = pd.DataFrame(data)

## box plot per top feature, split by graph label; ticks are relabelled with feature names
sns.boxplot(x='feature', y='value', data=_df, hue='label', palette='grey', showfliers=False, width=.5, gap=.1);
plt.xlabel('top features', fontsize=14)
plt.ylabel('normalized value', fontsize=14)
plt.grid(axis='y')
plt.xticks(np.arange(n_top), [f[i] for i in top_features[:n_top]], rotation=45)
plt.subplots_adjust(bottom=0.2)
#plt.savefig(dataset+'_features.eps')
plt.show()


# Unsupervised learning

We perform simple **k-means** clustering (fixing k to 10) and explore the content of the different clusters.

With unsupervised learning, we do not use the graph labels to identify the clusters.
However, we use the labels a posteriori for diagnostics.

We plot the proportion of graphs with label == 1 vs the size of the cluster, and we observe several small clusters with a large proportion of graphs with label == 1.

We then explore the clusters by looking at two of the top overall graph features: the number of edges, and the proportion of nodes of degree 3 or more. 
We see that a few small clusters can easily be identified only with those simple features, and those clusters are (for the most part) mainly made up of graphs with label 1.

However, a large proportion of the graphs end up in two larger clusters where graphs with both label values are found in large numbers, and where those two overall graph features are close to the average values over all graphs.


In [None]:
## k-means on the normalized features; km holds one cluster id per graph
NCL = 10 ## number of clusters
kmeans_model = KMeans(n_clusters=NCL, n_init=100, random_state=RS)
km = kmeans_model.fit(AF_norm).labels_

## per-graph table: graph label, cluster id, and a constant 1 used for counting
ones = np.repeat(1, len(g_labels))
K = pd.DataFrame(np.column_stack((g_labels, km, ones)),
                 columns=['label=1','cluster','total'])

## per-cluster totals and share of label-1 graphs
C = K.groupby('cluster').sum()
C['ratio'] = C['label=1'].div(C['total'])


In [None]:
## per-cluster means of the overall graph features, together with the cluster
## size and the share of label-1 graphs; clusters with the largest share come first
OGF['cluster'] = km
_df = OGF.groupby('cluster').mean().assign(**{'size': C['total'], 'label=1': C['ratio']})
_df.sort_values(by='label=1', ascending=False)


In [None]:
## overall averages of the graph features
## (the 'cluster' column is an id, not a feature, so it is left out of the averages)
OGF.drop(columns=['cluster'], errors='ignore').mean()


In [None]:
## Clusters shown w.r.t. two overall graph features:
##  - dashed lines mark the averages of those features over all graphs
##  - greyscale encodes the proportion of label 1 graphs (black = 100%)
##  - marker sizes are proportional to the number of graphs in each cluster

OGF['cluster'] = km
_df = OGF.groupby('cluster').mean().assign(sizes=C['total'], ratio=1 - C['ratio'])

plt.scatter(_df['edges'], _df['deg3+'], s=_df['sizes'], c=_df['ratio'],
            cmap='grey', vmin=0, vmax=1)
plt.grid(linestyle=':')
plt.vlines(x=OGF['edges'].mean(), ymin=.18, ymax=.44, linestyles='--', color='grey')
plt.hlines(y=OGF['deg3+'].mean(), xmin=10, xmax=100, linestyles='--', color='grey')
plt.xlabel('number of edges', fontsize=14)
plt.ylabel('proportion of degree-3+ nodes', fontsize=14)
#plt.savefig(dataset+'_kmeans_2.eps')
plt.show()


In [None]:
## plot example with large 2-core
## (gid rather than id, so the builtin id() is not shadowed)
gid = 2077
print('cluster:',km[gid])
print('label:',g_labels[gid])
print(OGF[gid:(gid+1)]['core2'])
ig.plot(g[gid], bbox=(0,0,300,300), vertex_size=8, vertex_color='grey',
        #target=dataset+'_2core_1.eps',
        vertex_label=g[gid].vs['node_label'], vertex_label_size=0)


In [None]:
## variance of the 2-core proportion among the graphs assigned to cluster 2
np.var(OGF.loc[OGF['cluster'] == 2, 'core2'])

In [None]:
## plot example with small 2-core
## (gid rather than id, so the builtin id() is not shadowed)
gid = 0
print('cluster:',km[gid])
print('label:',g_labels[gid])
ig.plot(g[gid], bbox=(0,0,300,300), vertex_size=8, vertex_color='grey',
        #target=dataset+'_2core_2.eps',
        vertex_label=g[gid].vs['node_label'], vertex_label_size=0)


In [None]:
## variance of the 2-core proportion among the graphs assigned to cluster 9
np.var(OGF.loc[OGF['cluster'] == 9, 'core2'])