In [None]:
from collections import Counter

import graphistry
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F

from ml.dgl_utils import *
from ml.utils import *

In [None]:
import logging
# Install the default root handler so that log records are actually displayed.
logging.basicConfig()
# Notebook-level logger named 'demo'; emit everything from DEBUG upwards.
logger = logging.getLogger('demo')
logger.setLevel(logging.DEBUG)

In [None]:
def scatterplot(ux, color_labels=None):
    """Scatter the first two rows of ``ux.T`` against each other on a new figure.

    ux: array-like whose transpose holds the x values at index 0 and the
        y values at index 1 (e.g. an ``(n, 2)`` embedding).
    color_labels: optional per-point values handed to matplotlib as ``c``.
    """
    # imported lazily so this small helper is self-contained
    import matplotlib.pyplot as plt

    coords = ux.T
    xs, ys = coords[0], coords[1]

    plt.figure(figsize=(10, 8))
    plt.scatter(xs, ys, c=color_labels, s=100, alpha=0.4)

In [None]:
#graphistry.register(api=3, protocol="https", server="hub.graphistry.com", username="..", password="..") 

# We import a subgraph from the LittleSis dataset centered around BlackRock, Inc

In [None]:
# Load the BlackRock edge and node tables; the first CSV column is used as the index.
edf = pd.read_csv('data/edges_blackrock.csv', index_col=0)
ndf = pd.read_csv('data/nodes_blackrock.csv', index_col=0)

In [None]:
edf

# Explore subgraphs
`get_graphistry_from_search` is a useful way to do fuzzy search over the dataframes to retrieve useful information

In [None]:
%%timeit
search_to_df('Bank', 'to_node', edf)

In [None]:
# g = get_graphistry_from_search('Bank', 'to_node', 'from_node', 'Node', edf, ndf)
# g.plot()

In [None]:
# g = get_graphistry_from_search('climate', 'to_node', 'from_node', 'Node', edf, ndf)
# g.plot()

# Explore Milieu
`get_graphistry_from_milieu_search` is a useful way to do fuzzy search over the dataframes to retrieve useful information over 1 and 2 connections from `search_term`

In [None]:
# this works much better on full LittleSis data, than just the small BlackRock sample above...
# g = get_graphistry_from_milieu_search('meta', 'to_node', 'from_node', 'Node', edf, ndf, both=True)
# g.plot()

# Let's encode the graph as a DGL graph for use in Machine Learning

In [None]:
# Explicit node-level target (simplified to two classes): keep only the first
# entry of each comma-separated `Types` string, as a one-column DataFrame.
first_type = ndf.Types.apply(lambda types: types.split(',')[0])
node_target = pd.DataFrame(
    {'Types': first_type.values},
    index=first_type.index,
)

In [None]:
Counter(node_target.Types)  # we have a simple target defined here

In [None]:
ndf.columns # not all of these are useful for building a model

In [None]:
ndf.info() # and we can see that few are present past the 5th entry below

In [None]:
# Column names that describe the graph inside edf / ndf.
src, dst = 'from_node', 'to_node' # direction is backwards due to the way we scraped the data
node_column = 'Node'
# NOTE: the next line rebinds `node_target` to a column label, replacing the
# DataFrame of the same name built in an earlier cell.
node_target = 'Types' # column label; presumably drop this line to use the node_target DataFrame instead -- TODO confirm
edge_target = 'relationship_type' # a target can be a column label (as here) or the DataFrame itself

# NOTE(review): the original comment mentions `nndf`, which is not defined in the
# visible cells; according to the author, using ndf doesn't really change the analysis.
graph = BaseDGLGraphFromPandas(ndf, edf, src, dst, node_column, node_target=node_target, edge_target=edge_target)


In [None]:
# graph.build_simple_graph()

In [None]:
graph.embeddings()

In [None]:
# now we have a DGL graph with ndata and edata built via dirty_cat
graph.graph

In [None]:
graph.weighted_edges_df

In [None]:
# `graph` now holds two adjacency matrices: the weighted one derived from UMAP
# (top panel) and the one built from the standard edge list (bottom panel).
# Each panel is titled so the figure can be read on its own.
fig, ax = plt.subplots(2, 1, figsize=(15, 15))

ax[0].imshow(graph.weighted_adjacency.toarray(), aspect='auto')
ax[0].set_title('Weighted adjacency (UMAP)')
ax[1].imshow(graph.adjacency.toarray(), aspect='auto')
ax[1].set_title('Adjacency (edge list)');  # trailing ';' hides the repr in the notebook

In [None]:
# Prepare the graphistry plottable (umap coords + umap edge list): give every
# node the integer index that `graph` assigned to it, stored in column 'n'.
# Presumably this matches the ids in the '_src' / '_dst' edge columns -- TODO confirm.
e2i = graph.entity_to_index
ndf['n'] = ndf.Node.map(lambda node: e2i[node])

In [None]:
# Assemble the graphistry plotter one binding at a time.
# NOTE(review): in the visible cells the 'x' / 'y' columns bound below are only
# assigned to ndf in a later cell, and 'count' is assumed to be an ndf column --
# confirm both before plotting.
g = graphistry.nodes(ndf, 'n')
g = g.edges(graph.weighted_edges_df, '_src', '_dst')
g = g.bind(point_x='x', point_y='y', edge_weight='_weight')
g = g.settings(url_params={'play': 0, 'edgeInfluence': 5})
g = g.encode_edge_color('_weight', ['maroon', 'pink', 'white'], as_continuous=True)
g = g.encode_point_size('count')

In [None]:
# still doesn't work....
# g.plot()

# Now, with this in hand, we can train a model

In [None]:
from ml.networks import GCN  # NOTE: as used here, GCN only works with node data (ndata)
# The forward call `logits = model(g, features)` breaks if the training call is switched to edge data (edata).
# TODO: understand why GCN breaks on edata

In [None]:
# get the DGL graph object
g = graph.graph

In [None]:
# Build mutually exclusive train / val / test node masks (~80% / 10% / 10%).
# A single uniform draw per node decides its split, so no node can leak from
# the training set into validation or test. A dedicated seeded generator keeps
# the split reproducible without touching torch's global RNG state.
n_nodes = g.ndata['feature'].shape[0]
split_draw = torch.rand(n_nodes, generator=torch.Generator().manual_seed(0))
g.ndata['train_mask'] = split_draw < 0.8
g.ndata['val_mask'] = (split_draw >= 0.8) & (split_draw < 0.9)
g.ndata['test_mask'] = split_draw >= 0.9

In [None]:
g.ndata['train_mask'][:10]

In [None]:
def train_node_model(g, model, n_epochs=100, lr=0.01):
    """Train ``model`` for node classification on ``g`` using Adam.

    Reads ``feature``, ``target``, ``train_mask``, ``val_mask`` and
    ``test_mask`` from ``g.ndata``. ``target`` is treated as a 2-D tensor with
    one column per class; the class index of a node is its row-wise argmax.
    The model is updated in place, progress is printed every 20 epochs and
    nothing is returned.

    Args:
        g: graph object exposing the ``ndata`` entries listed above.
        model: module called as ``model(g, features)`` that returns per-node logits.
        n_epochs: number of full-graph optimisation steps.
        lr: learning rate passed to Adam.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    best_val_acc = 0
    best_test_acc = 0

    # The cast to float32 is loop-invariant, so do it once up front.
    features = g.ndata['feature'].float()
    # Class indices recovered from the column-per-class target.
    targets = g.ndata['target'].argmax(1)
    train_mask = g.ndata['train_mask']
    val_mask = g.ndata['val_mask']
    test_mask = g.ndata['test_mask']

    for e in range(n_epochs):
        # Forward
        logits = model(g, features)

        # Compute prediction
        pred = logits.argmax(1)

        # Compute loss on the training nodes only. Class indices are used as the
        # target: this works whatever the dtype of the stored `target` tensor is,
        # whereas passing the 2-D tensor itself fails for integer targets.
        loss = F.cross_entropy(logits[train_mask], targets[train_mask])

        # Compute accuracy on training/validation/test
        train_acc = (pred[train_mask] == targets[train_mask]).float().mean()
        val_acc = (pred[val_mask] == targets[val_mask]).float().mean()
        test_acc = (pred[test_mask] == targets[test_mask]).float().mean()

        # Save the best validation accuracy and the corresponding test accuracy.
        if best_val_acc < val_acc:
            best_val_acc = val_acc
            best_test_acc = test_acc

        # Backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if e % 20 == 0:
            print('In epoch {}, loss: {:.3f}, val acc: {:.3f} (best {:.3f}), test acc: {:.3f} (best {:.3f})'.format(
                e, loss, val_acc, best_val_acc, test_acc, best_test_acc))



In [None]:
# Pull the pieces used by the training loop out of the graph so that each one
# can be inspected on its own.
features, labels = g.ndata['feature'], g.ndata['target']
train_mask, val_mask, test_mask = (
    g.ndata[key] for key in ('train_mask', 'val_mask', 'test_mask')
)
targets = labels.argmax(1)

In [None]:
features.shape, labels.shape, targets.shape

# Define the Model 

In [None]:
# Layer widths: the input width comes from the node features, the output
# width from the node target; the hidden width is a free choice.
latent_dim = 32
num_features = g.ndata['feature'].shape[1]
num_classes = g.ndata['target'].shape[1]

# Build the model; ending the cell on it displays its repr.
model = GCN(num_features, latent_dim, num_classes)
model

In [None]:
logits = model(g, features.float()) # have to call .float, or it gives a type(DOUBLE) error.
logits.shape

In [None]:
# Untrained comparison: fraction of all nodes whose predicted class matches.
# A vectorised mean replaces the builtin sum(), which walks the tensor one
# element at a time in Python.
pred = logits.argmax(1)
(pred == targets).float().mean()

## Train the Model

In [None]:
train_node_model(g, model, 621)

In [None]:
# Trained comparison. The accuracy is measured over every node (train, val and
# test together), so it is not a held-out estimate.
with torch.no_grad():  # inference only, no autograd graph needed
    logits = model(g, features.float())
pred = logits.argmax(1)

(pred == targets).float().mean()  # author's run: only 8% better than random

In [None]:
# Forward activations captured by hooks, keyed by a caller-chosen name.
activation = {}


def get_activation(name):
    """Return a forward hook that stores the module output under ``name``.

    The returned callable takes ``(module, inputs, output)`` positionally, as
    forward hooks do, and writes ``output.detach()`` into the module-level
    ``activation`` dict.
    """
    def _record(module, inputs, output):
        activation[name] = output.detach()

    return _record

In [None]:
# Attach the recording hooks only for the duration of one forward pass and
# remove them afterwards, so re-running this cell never stacks duplicate
# hooks on the model.
hook_handles = [
    getattr(model, layer).register_forward_hook(get_activation(layer))
    for layer in ('conv1', 'conv2')
]
try:
    with torch.no_grad():  # inference only
        logits = model(g, features.float())  # this fills the `activation` dict
finally:
    for handle in hook_handles:
        handle.remove()

print(activation['conv1'])
print(activation['conv2'])

In [None]:
# just a pretty graph: covariance of the binarised (> 0) conv1 activations
conv1_active = activation['conv1'] > 0
plt.figure()
plt.imshow(np.cov(conv1_active), aspect='auto', cmap=plt.get_cmap('plasma'))

# Let's UMAP it

In [None]:
graph.umap()

In [None]:
ux = graph.embedding_

In [None]:
# Store the first two embedding coordinates on the node table, scaled by 100.
emb_x, emb_y = ux.T[0], ux.T[1]
ndf['x'] = emb_x * 100
ndf['y'] = emb_y * 100

# Colour each point by the index of its largest conv2 activation.
scatterplot(ux, activation['conv2'].argmax(1))

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# get a clean target
# scikit-learn renamed `sparse` to `sparse_output` in 1.2 and later removed the
# old keyword, so try the new name first and fall back for older installations.
try:
    one_hot = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    one_hot = OneHotEncoder(handle_unknown='ignore', sparse=False)

In [None]:
T = one_hot.fit_transform(ndf.Types.values.reshape(-1,1))

In [None]:
adj = graph.adjacency

In [None]:
res = graph.fit_transform(adj) # let's reduce the adj matrix

In [None]:
scatterplot(res, T.argmax(1))

In [None]:
# umap coords seems to pull entities together according to Type from ndf
scatterplot(graph.embedding_, T.argmax(1))

In [None]:
# lets compare to weighted adj from UMAP (umaping the umap)
res = graph.fit_transform(graph.weighted_adjacency) # 
scatterplot(res, T.argmax(1))  # meh

In [None]:
from sknetwork.embedding import LouvainEmbedding
louvain = LouvainEmbedding()

In [None]:
embedding = louvain.fit_transform(adj.tocsr())
embedding.shape

In [None]:
emb = graph.fit_transform(embedding)

In [None]:
scatterplot(emb, T.argmax(1))

In [None]:
# hmmm not working...
# NOTE(review): edges are bound here as ('to_node', 'from_node'), the reverse of the
# src/dst order ('from_node', 'to_node') used when building the DGL graph -- confirm intent.
gr = graphistry.edges(edf, 'to_node', 'from_node').nodes(ndf, 'Node').bind(point_x='x', point_y='y')
# gr.plot()

In [None]:
#TODO add autoencoder as well as EDATA model