# New requirement

Install the NEExT package and dependencies as follows:

```
pip install --no-deps pydantic pydantic_core typing_inspection annotated_types imbalanced_learn neext
```

In [None]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import time
import pandas as pd
import umap
from NEExT import NEExT
import igraph as ig
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier as rfc 
import sklearn.metrics as metrics
from sklearn.cluster import KMeans
import seaborn as sns
from collections import Counter 


In [None]:
## folder holding the NCI csv files (relative path)
datapath = '../Datasets/NCI/'

In [None]:
## dataset to analyse: 'nci1' or 'nci109'
dataset = 'nci1'

## csv inputs; every file name is '<datapath><dataset>_<suffix>.csv'
edges = f'{datapath}{dataset}_edges.csv'
node_graph_mapping = f'{datapath}{dataset}_graph_indicator.csv'
graph_labels = f'{datapath}{dataset}_graph_labels.csv'
node_labels = f'{datapath}{dataset}_node_labels.csv'

## NEExT handle, logging restricted to errors
nxt = NEExT(log_level="ERROR")

## Read the whole collection as networkx graphs, with node reindexing and
## largest-component filtering switched on and no node sub-sampling
## (exact semantics of these flags: see the NEExT documentation)
print("\nLoading data...")
graph_collection = nxt.read_from_csv(
    edges_path=edges,
    node_graph_mapping_path=node_graph_mapping,
    graph_label_path=graph_labels,
    node_features_path=node_labels,
    graph_type="networkx",
    node_sample_rate=1.0,
    reindex_nodes=True,
    filter_largest_component=True,
)


# EDA

In [None]:
## number of graphs in the collection, read from its summary under 'num_graphs'
n_graphs = graph_collection.describe()['num_graphs']
n_graphs

In [None]:
## graph labels: the 'label' column of the collection's label table
g_labels = graph_collection.get_labels()['label']

## Convert every graph of the collection to a simple igraph graph.
## g is a plain Python list, so g[i] is ordinary positional indexing and
## returns the i-th converted graph.
g = []
for i in range(n_graphs):
    sg = ig.Graph.from_networkx(graph_collection.graphs[i].G, vertex_attr_hashable='name').simplify()
    sg.vs['label'] = [int(l) for l in sg.vs['node_label']] ## map node labels to integers
    g.append(sg)

## number of vertices and edges of each graph, in the same order as g
vc = [sg.vcount() for sg in g]
ec = [sg.ecount() for sg in g]


In [None]:
## Side-by-side boxplots of graph sizes, split by graph label (0 vs 1):
## left panel = number of nodes, right panel = number of edges
panels = {'Number of nodes': vc, 'Number of edges': ec}
flier_style = dict(marker='.', markerfacecolor='black', markersize=3,linestyle='none')
median_style = dict(linestyle='-', linewidth=1.5, color='black')

fig, axes = plt.subplots(1,2,figsize=(9,4))
for ax, (panel_title, per_graph) in zip(axes, panels.items()):
    ## one list of counts per label value
    by_label = [[per_graph[i] for i in range(len(per_graph)) if g_labels[i]==lab] for lab in (0, 1)]
    ax.boxplot(by_label,labels=['0','1'],widths=.6,
               flierprops = flier_style,
               medianprops = median_style)
    ax.set_xlabel('Label',fontsize=14)
    ax.grid()
    ax.set_title(panel_title,fontsize=14)
## the y-label is only shown on the left panel
axes[0].set_ylabel('Count per graph',fontsize=14);
#plt.savefig('nci_counts.eps');


In [None]:
## example of a graph with label 0
gid = 799 ## position of the graph in g; not called 'id' so the builtin id() stays usable
print('graph label:',g_labels[gid])
## vertex labels are passed but drawn with size 0
ig.plot(g[gid], bbox=(0,0,300,300), vertex_size=10, vertex_color='grey', 
        vertex_label=g[gid].vs['node_label'],vertex_label_size=0)
#ig.plot(g[gid],target='nci_0.eps',bbox=(0,0,300,300))


In [None]:
## example of a graph with label 1
gid = 2299 ## position of the graph in g; not called 'id' so the builtin id() stays usable
print('label:',g_labels[gid])
## vertex labels are passed but drawn with size 0
ig.plot(g[gid], bbox=(0,0,300,300), vertex_size=10, vertex_color='grey', 
        vertex_label=g[gid].vs['node_label'],vertex_label_size=0)
#ig.plot(g[gid],target='nci_1.eps',bbox=(0,0,300,300))


## overall graph features


In [None]:
## random seed: seeds numpy's global generator here and is passed as random_state
## to the train/test splits, random forests and graph embedding further down
RS = 321
np.random.seed(RS)


In [None]:
### Build graph-based features for each compound (graph)
def degrees(sg,md):
    """Degree distribution of a graph, as proportions of its nodes.

    sg must provide degree() (one degree per node) and vcount() (node count).
    Returns a list of length md whose k-th entry (k = 1..md) is the fraction
    of nodes with degree exactly k; degree-0 nodes and degrees above md are
    not reported.
    """
    n_nodes = sg.vcount()
    deg_counts = Counter(sg.degree())
    shares = []
    for k in range(1, md+1):
        shares.append(deg_counts[k]/n_nodes)
    return shares
def core(sg,mc):
    """Coreness histogram of a graph, as raw node counts.

    sg must provide coreness() (one core number per node).
    Returns a list of length mc whose k-th entry (k = 1..mc) is the number of
    nodes with core number exactly k. Unlike degrees(), the counts are NOT
    divided by the number of nodes.
    """
    core_counts = Counter(sg.coreness())
    histogram = []
    for k in range(1, mc+1):
        histogram.append(core_counts[k])
    return histogram

## Largest degree and largest core number over all graphs; they fix the
## length of the degree / coreness parts of every feature vector
md = np.max([np.max(g[k].degree()) for k in range(n_graphs)])
mc = np.max([np.max(g[k].coreness()) for k in range(n_graphs)])

## One feature row per graph:
## [nodes, edges, edges per node, degree assortativity, deg_1..deg_md, core_1..core_mc]
L = []
for k in range(n_graphs):
    mol = g[k]
    n_nodes, n_edges = mol.vcount(), mol.ecount()
    ## degree assortativity can be NaN; it is replaced by 0 in that case
    assort = mol.assortativity_degree()
    if np.isnan(assort):
        assort = 0
    ## note: the third entry (later named 'density') is edges divided by nodes
    row = [n_nodes, n_edges, n_edges/n_nodes, assort]
    ## proportion of nodes per degree value
    row += degrees(mol,md)
    ## proportion of nodes per core number (core() returns raw counts)
    row += [cnt/n_nodes for cnt in core(mol,mc)]
    L.append(row)

In [None]:
## Overall graph features (OGF): one row per graph, one named column per feature
col = (
    ['nodes','edges','density','assort']
    + [f'deg{k}' for k in range(1, md+1)]
    + [f'core{k}' for k in range(1, mc+1)]
)
OGF = pd.DataFrame(L,columns=col)
OGF.head()

In [None]:
## Random forest on the overall graph features only, 80/20 train/test split
X_train, X_test, y_train, y_test = train_test_split(
    OGF, g_labels, test_size=0.2, random_state=RS
)
rfc_mdl = rfc(n_estimators=100, criterion='entropy', random_state=RS)
rfc_mdl.fit(X_train,y_train)

## accuracy on the held-out part
y_pred = rfc_mdl.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

## ROC curve and AUC, scored with column 1 of the predicted class probabilities
y_probs = rfc_mdl.predict_proba(X_test)[:, 1]
auc = metrics.roc_auc_score(y_test, y_probs)
fpr, tpr, _ = metrics.roc_curve(y_test, y_probs)
plt.plot(fpr, tpr, color='black', label=f"ROC, auc={auc:.3f}")
plt.plot([0,1], [0,1], '--', color='black', label='Random')
plt.xlabel('False Positive Rate',fontsize=14)
plt.ylabel('True Positive Rate',fontsize=14)
plt.legend(loc=4,fontsize=14);
#plt.savefig('nci_64d_roc.eps');


## graph2vec features


In [None]:
## graph2vec embedding, read from a header-less csv;
## column 0 is dropped and the remaining columns are kept as a numpy array
g2v = f'{datapath}{dataset}_g2v.csv'
G2VF = np.array(pd.read_csv(g2v, header=None).drop(columns=[0]))

In [None]:
## Random forest on the graph2vec features only, 80/20 train/test split
X_train, X_test, y_train, y_test = train_test_split(
    G2VF, g_labels, test_size=0.2, random_state=RS
)
rfc_mdl = rfc(n_estimators=100, criterion='entropy', random_state=RS)
rfc_mdl.fit(X_train,y_train)

## accuracy on the held-out part
y_pred = rfc_mdl.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

## ROC curve and AUC, scored with column 1 of the predicted class probabilities
y_probs = rfc_mdl.predict_proba(X_test)[:, 1]
auc = metrics.roc_auc_score(y_test, y_probs)
fpr, tpr, _ = metrics.roc_curve(y_test, y_probs)
plt.plot(fpr, tpr, color='black', label=f"ROC, AUC={auc:.3f}")
plt.plot([0,1], [0,1], '--', color='black', label='Random')
plt.xlabel('False Positive Rate',fontsize=14)
plt.ylabel('True Positive Rate',fontsize=14)
plt.legend(loc=4,fontsize=14);
#plt.savefig('nci_64d_roc.eps');

## NEExT: embedding using node-level features


In [None]:
%%time
nxt.set_log_level("ERROR")

## node-level features requested from NEExT
node_feature_names = [
    "page_rank","degree_centrality","closeness_centrality","betweenness_centrality",
    "eigenvector_centrality","clustering_coefficient","local_efficiency","lsme",
    "load_centrality","basic_expansion",
]

## compute them for every node of every graph, with feature_vector_length=5
features = nxt.compute_node_features(
    graph_collection=graph_collection,
    feature_list=node_feature_names,
    feature_vector_length=5,
    show_progress=True,
)

## normalize the computed features, then preview the resulting table
features.normalize()
features.features_df.head()

In [None]:
%%time
## Graph-level embedding built from the node features with the
## "approx_wasserstein" algorithm; the embedding gets as many dimensions
## as there are node feature columns
emb_dim = len(features.feature_columns)
embeddings = nxt.compute_graph_embeddings(
    graph_collection=graph_collection,
    features=features,
    embedding_algorithm="approx_wasserstein",
    embedding_dimension=emb_dim,
    random_state=RS,
)


In [None]:
## Random forest on the NEExT node-feature-based embedding only.
## NEF = embedding table without its 'graph_id' column, as a numpy array
NEF = np.array(embeddings.embeddings_df.drop(columns=['graph_id']))
X_train, X_test, y_train, y_test = train_test_split(
    NEF, g_labels, test_size=0.2, random_state=RS
)
rfc_mdl = rfc(n_estimators=100, criterion='entropy', random_state=RS)
rfc_mdl.fit(X_train,y_train)

## accuracy on the held-out part
y_pred = rfc_mdl.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

## ROC curve and AUC, scored with column 1 of the predicted class probabilities
y_probs = rfc_mdl.predict_proba(X_test)[:, 1]
auc = metrics.roc_auc_score(y_test, y_probs)
fpr, tpr, _ = metrics.roc_curve(y_test, y_probs)
plt.plot(fpr, tpr, color='black', label=f"ROC, AUC={auc:.3f}")
plt.plot([0,1], [0,1], '--', color='black', label='Random')
plt.xlabel('False Positive Rate',fontsize=14)
plt.ylabel('True Positive Rate',fontsize=14)
plt.legend(loc=4,fontsize=14);
#plt.savefig('nci_64d_roc.eps');


# using all features

In [None]:
## all features side by side; column order is: NEExT embedding (NEF),
## graph2vec embedding (G2VF), overall graph features (OGF)
AF = np.concatenate((NEF,G2VF,OGF), axis=1)
AF.shape


In [None]:
## 2D UMAP projection of the full feature matrix AF, for visualization
print("\nReducing embeddings to 2D using UMAP...")

## fixed random_state and a single job are requested for the projection
umap_model = umap.UMAP(
    n_components=2,
    n_neighbors=15,
    min_dist=0.1,
    random_state=42,
    n_jobs=1,
)
embedding_2d = umap_model.fit_transform(AF)

## one row per graph: the two UMAP coordinates plus the graph label
viz_df = pd.DataFrame(embedding_2d[:, :2], columns=['UMAP1', 'UMAP2']).assign(label=g_labels)

In [None]:
## scatter of the 2D UMAP coordinates stored in viz_df, one point per row, coloured by label
plt.scatter(viz_df.UMAP1, viz_df.UMAP2, c=viz_df.label, s=5, cmap='Set1');


In [None]:
## 2D UMAP projection of the NEExT embedding alone (AF is not used here)
print("\nReducing embeddings to 2D using UMAP...")

## embedding columns as a plain numpy array
embedding_data = embeddings.embeddings_df[embeddings.embedding_columns].values

## same UMAP settings as for the projection of all features
umap_model = umap.UMAP(
    n_components=2,
    n_neighbors=15,
    min_dist=0.1,
    random_state=42,
    n_jobs=1,
)
embedding_2d = umap_model.fit_transform(embedding_data)

## graph_id -> graph label, taken from the graph collection
graph_labels_dict = dict((gr.graph_id, gr.graph_label) for gr in graph_collection.graphs)

## one row per graph: UMAP coordinates, graph id and the label looked up by id
graph_ids = embeddings.embeddings_df['graph_id']
viz_df = pd.DataFrame({
    'UMAP1': embedding_2d[:, 0],
    'UMAP2': embedding_2d[:, 1],
    'graph_id': graph_ids,
    'label': graph_ids.map(graph_labels_dict),
})


In [None]:
## Random forest on the three feature sets combined (AF), 80/20 train/test split
X_train, X_test, y_train, y_test = train_test_split(
    AF, g_labels, test_size=0.2, random_state=RS
)
rfc_mdl = rfc(n_estimators=100, criterion='entropy', random_state=RS)
rfc_mdl.fit(X_train,y_train)

## accuracy on the held-out part
y_pred = rfc_mdl.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

## ROC curve and AUC, scored with column 1 of the predicted class probabilities
y_probs = rfc_mdl.predict_proba(X_test)[:, 1]
auc = metrics.roc_auc_score(y_test, y_probs)
fpr, tpr, _ = metrics.roc_curve(y_test, y_probs)
plt.plot(fpr, tpr, color='black', label=f"ROC, auc={auc:.3f}")
plt.plot([0,1], [0,1], '--', color='black', label='Random')
plt.xlabel('False Positive Rate',fontsize=14)
plt.ylabel('True Positive Rate',fontsize=14)
plt.legend(loc=4,fontsize=14);
#plt.savefig('nci_64d_roc.eps');

In [None]:
## bootstrap confidence interval for the AUC
def bootstrap(y_test,y_probs,n_boot=1000):
    """Percentile-bootstrap interval for the ROC AUC.

    The (label, score) pairs are resampled with replacement n_boot times,
    the AUC is computed on every resample, and the empirical 5th and 95th
    percentiles of those AUCs are returned, i.e. a 90% interval.

    Parameters
    ----------
    y_test : array-like
        True binary labels.
    y_probs : array-like
        Predicted scores for the positive class, aligned with y_test.
    n_boot : int, default 1000
        Number of bootstrap resamples drawn.

    Returns
    -------
    (conf_lo, conf_up) : tuple of float
        Lower and upper bound of the interval.

    Raises
    ------
    ValueError
        If no resample contains both classes, so that no AUC can be computed.

    Notes
    -----
    The generator is seeded with the notebook-level constant RS, so repeated
    calls with the same inputs give the same interval.
    """
    y_score = np.array(y_probs)
    y_true = np.array(y_test)
    rng = np.random.RandomState(RS)
    scores = []
    for _ in range(n_boot):
        indices = rng.randint(0, len(y_score), len(y_score))
        ## the AUC is undefined when a resample holds a single class: skip it
        if len(np.unique(y_true[indices])) < 2:
            continue
        scores.append(metrics.roc_auc_score(y_true[indices], y_score[indices]))
    if not scores:
        raise ValueError("no bootstrap resample contained both classes")
    sorted_scores = np.sort(scores)
    conf_lo = sorted_scores[int(0.05 * len(sorted_scores))]
    conf_up = sorted_scores[int(0.95 * len(sorted_scores))]
    return (conf_lo, conf_up)
## interval for the AUC computed from the current y_test / y_probs
## (i.e. those of the most recently run classification cell)
bootstrap(y_test,y_probs)

## top features

In [None]:
## Rank the columns of AF by importance in the current random forest (rfc_mdl)
## and count how many of the n_top best come from each feature family.
## AF columns are ordered as: NEExT feature-based embedding, graph2vec
## embedding, overall graph features; the block sizes are read from the
## arrays instead of being hard-coded (50 / 64 / 11 with the default settings).
top_features = np.argsort(rfc_mdl.feature_importances_)[::-1]
n_emb = NEF.shape[1]                  ## columns from the NEExT embedding
n_g2v = G2VF.shape[1]                 ## columns from graph2vec
n_ogf = AF.shape[1] - n_emb - n_g2v   ## remaining columns: graph features
g2v_end = n_emb + n_g2v               ## index of the first graph-feature column

## feature names, aligned with the columns of AF
f = (['emb_'+str(i) for i in range(n_emb)]
     + ['g2v_'+str(i) for i in range(n_g2v)]
     + list(OGF.columns[:n_ogf]))
n_top = 60
print('top',n_top,'features:')
print('feature-based embedding:',len([f[i] for i in top_features[:n_top] if i<n_emb]))
print('graph2vec:',len([f[i] for i in top_features[:n_top] if i>=n_emb and i<g2v_end]))
print('graph features:',[f[i] for i in top_features[:n_top] if i>=g2v_end])


In [None]:
## Boxplots of the n_top most important features, split by graph label
n_top = 10
## z-score every column of AF (AF_norm is reused by the clustering below)
AF_norm = (AF - AF.mean(axis=0)) / (AF.std(axis=0))

## long format: one row per (top feature, graph), features in decreasing importance
top_idx = top_features[:n_top]
data = {
    'feature' : np.repeat([str(t+1) for t in top_idx], n_graphs),
    'value' : AF_norm[:, top_idx].T.ravel(),
    'label' : list(g_labels)*n_top
}
_df = pd.DataFrame(data)
sns.boxplot(x='feature', y='value', data=_df, hue='label', palette='grey', showfliers=False, width=.5, gap=.1);
plt.grid(axis='y')
plt.xlabel('top features', fontsize=14)
plt.ylabel('normalized value', fontsize=14)
## x ticks show the importance rank (1..n_top) rather than the column number
plt.xticks(np.arange(n_top), [str(rank+1) for rank in range(n_top)]);

# Unsupervised

We perform simple k-means clustering (fixing k to 10) and explore the content of the different clusters.

We plot the proportion of graphs with label == 1 vs the size of the cluster.

We see several small clusters with a large proportion of graphs with label == 1.



In [None]:
NCL = 10 ## number of clusters
KMEANS_RS = 123 ## seed used for k-means only, so the notebook-wide seed RS keeps its value
## cluster label of every graph, from k-means on the standardized features
cl = KMeans(n_clusters=NCL, n_init=10, random_state=KMEANS_RS).fit(AF_norm).labels_

## one row per graph: its label, its cluster and a constant 1 used for counting
K = pd.DataFrame(np.array([g_labels,cl,np.repeat(1,len(g_labels))]).transpose(),columns=['label=1','cluster','total'])
## per cluster: sum of the labels, number of graphs, and their ratio
C = K.groupby(by='cluster').sum()
C['ratio'] = C['label=1']/C['total']

## proportion of label 1 against cluster size, one dot per cluster
plt.figure(figsize=(6,4))
plt.plot(C['total'],C['ratio'],'o',color='black')
plt.xlabel('cluster size', fontsize=14)
plt.ylabel('proportion with label 1', fontsize=14);
#plt.savefig('kmeans_1.eps');


In [None]:
## same data in a table: clusters sorted by decreasing 'ratio', values rounded to 3 decimals
_df = C.sort_values(by='ratio',ascending=False).round(decimals=3)
_df


In [None]:
## tag every graph with its k-means cluster
## (note: this adds a 'cluster' column to OGF in place)
OGF['cluster'] = cl

## per-cluster feature means, plus cluster size and 1 - C['ratio']
_df = OGF.groupby(by='cluster').mean().assign(
    sizes=C['total'],
    ratio=1-C['ratio'],
)

## one marker per cluster: x = mean edge count, y = mean deg3 value,
## marker size from the cluster size, grey level from 1 - C['ratio']
plt.scatter(_df.edges, _df.deg3, s=_df.sizes, c=_df.ratio, cmap='grey', vmin=0, vmax=1)
plt.grid()
## dashed lines mark the means over all graphs; their extents are hard-coded
plt.hlines(OGF.deg3.mean(), 10, 100,linestyles='--',color='grey')
plt.vlines(OGF.edges.mean(), .225,.400,linestyles='--',color='grey')
plt.xlabel('number of edges')
plt.ylabel('proportion of degree-3 nodes');


In [None]:
## per-cluster means of four selected graph features
_df = OGF.groupby(by='cluster')[['edges','assort','deg3','core2']].mean()
_df

In [None]:
## add the 'total' and 'ratio' columns of C as 'size' and 'label=1',
## then show the rows sorted by increasing size
_df['size'] = C['total']
_df['label=1'] = C['ratio']
_df.sort_values(by='size')

In [None]:
## means of the same four features over all graphs
OGF.mean()[['edges','assort','deg3','core2']]