In [1]:
%load_ext autoreload
%autoreload 2
import networkx as nx
import numpy as np
import pandas as pd
import os
import random
from scipy.sparse import csr_matrix
from sklearn.metrics import roc_auc_score
random.seed(4)
import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import tqdm

In [2]:
class DMBI_hackathon_ddi_utils():
    """Helper routines for the drug-drug-interaction link-prediction hackathon.

    Bundles loading the training edges into an adjacency matrix, carving out
    a random holdout set, scoring a ranked prediction list and writing the
    submission file.
    """
    # Column names of the two endpoint ids in the training data; they are also
    # combined into the header of the submission file.
    NODE_1 = 'node1'
    NODE_2 = 'node2'

    def __init__(self,number_of_drugs = 1434):
        """Remember how many drugs (graph nodes) the data set contains."""
        self.number_of_drugs = number_of_drugs

    def write_list_to_file(self, list, path):
        """Write each item of ``list`` to ``path``, one item per line.

        The file is opened in ``'w'`` mode, so existing content is replaced.
        """
        # The context manager closes the handle even if a write raises.
        with open(path, 'w') as thefile:
            for item in list:
                thefile.write("%s\n" % item)

    def read_sparse_matrix(self,train_data):
        """Build the dense adjacency matrix from the training edge table.

        ``train_data`` is indexed with ``NODE_1`` and ``NODE_2`` to obtain the
        row and column indices of the existing edges. Returns the densified
        ``number_of_drugs x number_of_drugs`` matrix.

        Raises ``AssertionError`` if the two index columns differ in length or
        the resulting matrix is not symmetric.
        """
        print('creating matrix')
        x = train_data[self.NODE_1]
        y = train_data[self.NODE_2]
        assert len(x) == len(y)
        data = [1] * len(x)
        m = csr_matrix((data,(x,y)), shape=(self.number_of_drugs, self.number_of_drugs),dtype='f')
        print('m shape:', m.shape, 'm non zeros:', m.nnz)
        # The matrix is small, so a sparse representation is not necessary;
        # densify once and reuse the result for the check and the return value.
        dense = m.todense()
        assert np.allclose(dense, dense.T, atol=1e-8) #matrix is symmetric
        return dense

    def write_solution_to_file(self,preds,file_path, num_interactions_train):
        """Write the ranked predictions to ``file_path`` in submission format.

        ``preds`` is assumed to be ordered by confidence level and to hold
        ``(u, v)`` pairs. The output starts with the header ``node1_node2``
        followed by one ``u_v`` line per prediction.
        ``num_interactions_train`` is the number of non-zero cells of the
        training matrix.

        The asserts are important. Note them: every pair must satisfy
        ``u < v`` and the number of predictions must equal the number of
        non-existing edges.
        """
        print('writing predictions to file: ',file_path)
        for u, v in preds:
            assert u < v, 'graph is undirected, predict edges where the first node id is smaller than the second only'
        n = self.number_of_drugs
        assert len(preds) == (n * n - n - num_interactions_train) / 2, "number of predictions is equal to number of non existing edges"
        header = self.NODE_1 + '_' + self.NODE_2
        output = [header] + [str(p[0]) + '_' + str(p[1]) for p in preds]
        self.write_list_to_file(output,file_path)

    def create_holdout_set(self, m_train, train_percent = 0.9):
        """Randomly hide node pairs from a copy of ``m_train``.

        Every pair ``(i, j)`` with ``i < j`` is moved to the validation set
        when ``random.random() > train_percent``; both symmetric cells of the
        copy are then set to 0. The set therefore contains both existing and
        non-existing edges.

        Returns ``(m_train_holdout, validation_set)``; ``m_train`` itself is
        left unmodified.
        """
        m_train_holdout = np.matrix(m_train)
        validation_set = set()
        for i in range(self.number_of_drugs):
            for j in range(i+1, self.number_of_drugs):
                if random.random() > train_percent:
                    validation_set.add((i, j))
                    m_train_holdout[i, j] = 0
                    m_train_holdout[j, i] = 0
        return m_train_holdout, validation_set

    def average_precision_at_k(self, k, class_correct):
        """Return the average precision at ``k``.

        ``class_correct`` is a list with the binary correct label ordered by
        confidence level. The precision after each of the first ``k`` ranks is
        summed (at every rank, not only at hits) and divided by ``k``.

        More examples:
        https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/average_precision.py
        and: https://www.kaggle.com/c/avito-prohibited-content#evaluation

        Raises ``AssertionError`` unless ``0 < k <= len(class_correct)``.
        """
        assert k <= len(class_correct) and k > 0
        score = 0.0
        hits = 0.0
        for i in range(k):
            if class_correct[i] == 1:
                hits += 1.0
            score += hits / (i+1.0)
        score /= k
        return score



In [3]:
#simple prediction class
class link_prediction_predictor:
    """Baseline link predictor built on networkx's resource-allocation index."""

    def __init__(self, number_of_drugs):
        """Create an edgeless undirected graph with nodes 0..number_of_drugs-1."""
        graph = nx.Graph()
        graph.add_nodes_from(range(number_of_drugs))
        self.G = graph

    def fit(self, edge_list):
        """Add the known interactions in ``edge_list`` to the graph."""
        self.G.add_edges_from(edge_list)

    def predict(self,prediction_set=None):
        """Return candidate ``(u, v)`` pairs ordered by descending confidence.

        If ``prediction_set`` is None then all non-existent edges in the graph
        will be used. The result has the shape expected by
        ``write_solution_to_file``: node pairs only, best first.
        """
        scored = nx.resource_allocation_index(self.G, ebunch=prediction_set)
        # Put the score first so the sort is driven by it (ties fall back to
        # the node ids), then drop it from the returned pairs.
        ranked = sorted(((score, u, v) for (u, v, score) in scored), reverse=True)
        return [(u, v) for (_, u, v) in ranked]

In [57]:
def jac_predict(A):
    """Rank the non-existing edges of ``A`` by a Jaccard-based score.

    The score matrix is ``J @ A @ J`` where ``J[i, j]`` is
    ``(A @ A)[i, j] / (deg[i] + deg[j] - (A @ A)[i, j])`` and ``deg`` is the
    row sum of ``A``. For a binary symmetric adjacency matrix this is the
    Jaccard coefficient of the two neighbourhoods.

    Parameters
    ----------
    A : square 2-D array-like (``np.matrix`` or ``np.ndarray``)
        Adjacency matrix; a zero cell means "no edge".

    Returns
    -------
    list of ``(i, j, score)`` tuples with ``i < j`` and ``A[i, j] == 0``,
    sorted by descending score (ties broken by descending ``i``, then ``j``).
    """
    # Use a plain ndarray so `@` is a matrix product for every input type;
    # with `*`, an ndarray input would silently multiply element-wise.
    adjacency = np.asarray(A)
    n = adjacency.shape[0]
    print('n:', n)

    # compute Jaccard coefficients
    common = adjacency @ adjacency
    degree = adjacency.sum(axis=1)
    union = degree[:, None] + degree[None, :] - common
    # 0/0 occurs for pairs of isolated nodes; silence the warning and map the
    # resulting NaN to 0 below.
    with np.errstate(divide='ignore', invalid='ignore'):
        jaccard = common / union
    np.nan_to_num(jaccard, copy=False)

    score = jaccard @ adjacency @ jaccard
    np.fill_diagonal(score, 0)

    # Candidate pairs: strict upper triangle cells that hold no edge.
    rows, cols = np.triu_indices(n, k=1)
    missing = adjacency[rows, cols] == 0
    candidates = zip(
        rows[missing].tolist(),
        cols[missing].tolist(),
        score[rows, cols][missing].tolist(),
    )
    return sorted(candidates, key=lambda t: (t[2], t[0], t[1]), reverse=True)

In [58]:
# Build the helper with its default drug count, then turn the training edge
# table into the adjacency matrix.
DMBI_hackathon_ddi = DMBI_hackathon_ddi_utils()
train_edges_df = pd.read_csv('data/train.csv')
train_matrix = DMBI_hackathon_ddi.read_sparse_matrix(train_edges_df)

creating matrix
m shape: (1434, 1434) m non zeros: 93200


In [59]:
# Evaluate the model on a holdout split of the training matrix.
# Note that the holdout is drawn at random, so the split depends on the state
# of the `random` module.
# The real test set contains new interactions, which random selection does not
# emulate; treat the holdout score as a rough estimate only.

m_train_holdout, validation_set = DMBI_hackathon_ddi.create_holdout_set(train_matrix)

In [60]:
# Rank every node pair that has no edge in the holdout matrix.
preds = jac_predict(m_train_holdout)
# Alternative baseline (disabled): resource-allocation index on the validation pairs only.
# x, y = m_train_holdout.nonzero() # x and y indices of nonzero cells (existing edges)
# edge_list = list(zip(x,y)) 
# link_prediction = link_prediction_predictor(DMBI_hackathon_ddi.number_of_drugs)
# link_prediction.fit(edge_list)
# preds = link_prediction.predict(validation_set)
# Look up the true label of each ranked pair in the full training matrix.
class_correct = [train_matrix[pair[0], pair[1]] for pair in preds]
average_precision = DMBI_hackathon_ddi.average_precision_at_k(
    k=100,
    class_correct=class_correct,
)
print('average precision @ 100: ', average_precision)

n: 1434




average precision @ 100:  0.7373423785971178


In [9]:
# Create the final submission file from the full training matrix.
x, y = train_matrix.nonzero()
num_interactions_train = len(x)
assert len(x) == len(y)
edge_list = list(zip(x, y))

# Fit the baseline predictor on all known interactions.
link_prediction = link_prediction_predictor(DMBI_hackathon_ddi.number_of_drugs)
link_prediction.fit(edge_list)
preds = link_prediction.predict()

DMBI_hackathon_ddi.write_solution_to_file(
    preds,
    'sample_predictions.csv',
    num_interactions_train=num_interactions_train,
)

writing predictions to file:  sample_predictions.csv
