In [1]:
# needs to be updated for new SDR interface
import networkx as nx
import numpy as np
from scipy.io.matlab import loadmat
#import sktensor
import random
import pandas as pd
from scipy.sparse import lil_matrix
#from sktensor.rescal import als as rescal_als
from numpy import zeros, dot
from numpy.linalg import norm
from sklearn.metrics import precision_recall_curve, auc, accuracy_score, roc_auc_score, roc_curve
from sklearn.preprocessing import normalize
import os, json
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from collections import OrderedDict
from realML.matrix import SDR

## LinkPrediction model

In [None]:
def tensorCompletion(T, V=[]):
    """
    Complete the tensor by tensor factorization and recomposition (we use Rescal)
    """
    def __predict_rescal_als(T, V=[]):
        if V==[]:
            A, R, _, _, _ = rescal_als(T, 100, init='nvecs', conv=1e-3, lambda_A=10, lambda_R=10)
        else:
            A, R, _, _, _ = rescal_als(T, 100, attr=[V], init='nvecs', conv=1e-3, lambda_A=10, lambda_R=10)
        n = A.shape[0]
        P = zeros((n, n, len(R)))
        for k in range(len(R)):
            P[:, :, k] = dot(A, dot(R[k], A.T))
        return P
    def __normalize_predictions(P, e, k):
        for a in range(e):
            for b in range(e):
                nrm = norm(P[a, b, :k])
                if nrm != 0:
                    # round values for faster computation of AUC-PR
                    P[a, b, :k] = np.round_(P[a, b, :k] / nrm, decimals=3)
        return P

    e, k = T.shape[0], T.shape[2]

    # Convert T into list of sparse matrices as required by Rescal
    T = [lil_matrix(T[:, :, i]) for i in range(k)]
    Tc = [Ti.copy() for Ti in T]

    # call Rescal and normalize
    P = __predict_rescal_als(Tc, V)
    P = __normalize_predictions(P, e, k)
    return P

In [12]:
class tensorCompletion:
    def __init__(self, T):
        self.T = T
        self.P = np.zeros_like(T)
        for k in range(T.shape[2]):
            sdrObj = SDR(dim=10, numrandfeats=1000)
            sdrObj.set_training_data(inputs=[np.squeeze(T[:,:,k])])
            sdrObj.fit(iterations=10)
            U, V = sdrObj.produce()
            self.P[:, :, k] = np.exp(U.dot(V.transpose()))

In [13]:
class LinkPrediction():
    def __init__(self, G):
        """
        G is an instance of nx.MultiGraph
        """
        # convert the graph into adjacency tensor
        I = len(G.nodes())
        J = I
        K = len(set(nx.get_edge_attributes(G,'linkType').values()))
        shape = (I, J, K)
        print(shape)
        self.A = np.zeros(shape=shape)
        for i,j,data in G.edges(data=True):
            k = (data['linkType'])
            self.A[i][j][k] = 1.
        print(self.A.shape)
    
    def fit(self):
        # self.A_completed = tensorCompletion(self.A, attrDF.as_matrix())
        self.A_completed = tensorCompletion(self.A).P
        print(np.amin(self.A_completed))
        print(np.amax(self.A_completed))
        
    def predict(self, X):
        """
        X is a DataFrame with columns=[source_nodeID, target_nodeID, linkType]
        """
        def __predictLink(row, T):
            k = int(row.linkType)
            i = int(row.source_nodeID)
            j = int(row.target_nodeID)
            return int(round(T[i][j][k]))
        X['linkExists']=X.apply(__predictLink, T=self.A_completed, axis=1)
        return X

## Make pipeline

In [14]:
# initializations
dataDir = "./r_59/data"
rawDataDir = os.path.join(dataDir, "raw_data")
assert os.path.exists(dataDir)
assert os.path.exists(rawDataDir)

random.seed(50)

graph = '%s/graph.gml'%rawDataDir

In [15]:
# read the graph from gml file
print('read graph ...')
G = nx.read_gml(graph, label='id')
    
# set aside some edges (10%) validation of the model
print('setting aside 10% of edges for validation and remove them from graph ....')
edges_validation=pd.DataFrame(columns=['source_nodeID','target_nodeID','linkType'])
for i, (u,v,key,data) in enumerate(G.edges(data=True, keys=True)):
    if random.random() < 0.1:
        G.remove_edge(u,v,key=key)
        edges_validation.loc[len(edges_validation)] = [u,v,data['linkType']]
print('number of edge set aside for validation:',len(edges_validation))

read graph ...
setting aside 10% of edges for validation and remove them from graph ....
number of edge set aside for validation: 431


In [16]:
# initialize the model
print('initializing the linkPrediction model ...')
lp = LinkPrediction(G)

# fit the training graph
print('fitting the training graph ...')
lp.fit()

# make predictions on the validation data
print('making predictions on validation edges ...')
edges_prediction=lp.predict(edges_validation)

initializing the linkPrediction model ...
(135, 135, 49)
(135, 135, 49)
fitting the training graph ...
Constructing PPMI
0.0.Refining
0.1.2.3.4.5.6.7.8.9. 
Constructing PPMI
0.0.Refining
0.1.2.3.4.5.6.7.8.9. 
Constructing PPMI
0.0.Refining
0.1.2.3.4.5.6.7.8.9. 
Constructing PPMI
0.0.Refining
0.1.2.3.4.5.6.7.8.9. 
Constructing PPMI
0.0.Refining
0.1.2.3.4.5.6.7.8.9. 
Constructing PPMI
0.0.Refining
0.1.2.3.4.5.6.7.8.9. 
Constructing PPMI
0.0.Refining
0.1.2.3.4.5.6.7.8.9. 
Constructing PPMI
0.0.Refining
0.1.2.3.4.5.6.7.8.9. 
Constructing PPMI
0.0.Refining
0.1.2.3.4.5.6.7.8.9. 
Constructing PPMI
0.0.Refining
0.1.2.3.4.5.6.7.8.9. 
Constructing PPMI
0.0.Refining
0.1.2.3.4.5.6.7.8.9. 
Constructing PPMI
0.0.Refining
0.1.2.3.4.5.6.7.8.9. 
Constructing PPMI
0.0.Refining
0.1.2.3.4.5.6.7.8.9. 
Constructing PPMI
0.0.Refining
0.1.2.3.4.5.6.7.8.9. 
Constructing PPMI
0.0.Refining
0.1.2.3.4.5.6.7.8.9. 
Constructing PPMI
0.0.Refining
0.1.2.3.4.5.6.7.8.9. 
Constructing PPMI
0.0.Refining
0.1.2.3.4.5.6.7.8.

In [17]:
# compute accuracy on validation data
print('computing accuracy on validation data ...')
accuracy = len(edges_prediction[edges_prediction['linkExists']==1])/len(edges_prediction)
print('model accuracy:', accuracy)
train_performance = OrderedDict([
    ('train', OrderedDict([
        ('split', OrderedDict([
                ('type', 'custom'),
                ('n_splits', 1),
                ('shuffle', True),
                ('test_size', 0.1)])
        ),
        ('score', OrderedDict([
                ('metric', 'accuracy'),
                ('value', accuracy)])
        )
    ]))
])

computing accuracy on validation data ...
model accuracy: 0.14153132250580047


In [29]:
# now train the model on the whole graph
print('training the model on the whole graph ...')
# read the graph from gml file
G = nx.read_gml(graph, label='id')
# initialize the model
lp = LinkPrediction(G)
# fit the graph
lp.fit()

training the model on the whole graph ...
(135, 135, 49)
(135, 135, 49)
Constructing PPMI
0.0.Refining
 
Constructing PPMI
0.0.Refining
 
Constructing PPMI
0.0.Refining
 
Constructing PPMI
0.0.Refining
 
Constructing PPMI
0.0.Refining
 
Constructing PPMI
0.0.Refining
 
Constructing PPMI
0.0.Refining
 
Constructing PPMI
0.0.Refining
 
Constructing PPMI
0.0.Refining
 
Constructing PPMI
0.0.Refining
 
Constructing PPMI
0.0.Refining
 
Constructing PPMI
0.0.Refining
 
Constructing PPMI
0.0.Refining
 
Constructing PPMI
0.0.Refining
 
Constructing PPMI
0.0.Refining
 
Constructing PPMI
0.0.Refining
 
Constructing PPMI
0.0.Refining
 
Constructing PPMI
0.0.Refining
 
Constructing PPMI
0.0.Refining
 
Constructing PPMI
0.0.Refining
 
Constructing PPMI
0.0.Refining
 
Constructing PPMI
0.0.Refining
 
Constructing PPMI
0.0.Refining
 
Constructing PPMI
0.0.Refining
 
Constructing PPMI
0.0.Refining
 
Constructing PPMI
0.0.Refining
 
Constructing PPMI
0.0.Refining
 
Constructing PPMI
0.0.Refining
 
Cons

In [20]:
len(G.edges())

3977

## Submit predictions on test data

In [None]:
print('predictions on test data ...')

In [None]:
try:
    print('read the testData ...')
    testData = pd.read_csv('%s/testData.csv'%dataDir, index_col=0)
    print(testData['source_nodeID'].min())
    print(testData['source_nodeID'].max())
    print(testData['target_nodeID'].min())
    print(testData['target_nodeID'].max())
    
    print('make prediciton on testData and save as testTargets.csv')
    predictions = lp.predict(testData)
    y_predicted = pd.DataFrame(predictions['linkExists'])
    y_predicted.to_csv('testTargets.csv')
except:
    print('Looks like this is a redacted dataset. testData is unavailable. Cannot complete this step ...')

In [None]:
# make prediciton on testData and save the testTargets.cav
predictions = lp.predict(testData)
y_predicted = pd.DataFrame(predictions['linkExists'])
y_predicted.to_csv('testTargets.csv')

## Compute performance on test data

In [None]:
try:
    print('read predicted values ....')
    y_predicted = pd.read_csv('testTargets.csv')['linkExists']
    
    print('read truth values ...')
    y_truth = pd.read_csv('%s/testTargets.csv'%dataDir)['linkExists']
    
    print('computing performance ...')
    accuracy = [
        accuracy_score(y_truth, y_predicted), # accuracy of the current model
        accuracy_score(y_truth, np.zeros(len(y_truth))), # accuracy if you guessed 0 for all links
        accuracy_score(y_truth, np.ones(len(y_truth))), # accuracy if you guessed 1 for all links
        accuracy_score(y_truth, np.random.choice([0, 1], size=(len(y_truth),))),] # accuracy if you randomly guessed 0 or 1
    print('accuracy',accuracy[0])
    test_performance = OrderedDict([
        ('test', OrderedDict([
            ('score', OrderedDict([
                    ('metric', 'accuracy'),
                    ('value', accuracy[0])])
            )
        ]))
    ])
except:
    print('Looks like this is a redacted dataset. testTargets is unavailable. cannot complete this step ...')

In [None]:
overall_performance = OrderedDict()
overall_performance.update(train_performance)
overall_performance.update(test_performance)

with open('performance.json', 'w', encoding='utf-8') as f:
    json.dump(overall_performance, f, indent=2)
print(json.dumps(overall_performance, indent=2))