In [1]:
import csv
import numpy as np
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

  from numpy.core.umath_tests import inner1d


In [9]:
# Read train/test sets and node embeddings/features files
def readFiles():
    """Load the labelled train/test edges and the per-node feature tables.

    All files are opened by bare file name, i.e. relative to the current
    working directory:

    * ``train.tsv`` / ``test.tsv`` -- tab-separated rows ``node1, node2, label``
      (no header row is skipped for these two files).
    * ``train_edges.emb`` -- space-separated rows ``nodeID v1 v2 ...``; the
      first row is treated as a header and ignored.
    * ``local_features_embeddings.txt`` -- tab-separated rows whose first
      column is the node ID and whose second column is a space-separated list
      of floats; the first row is ignored.
    * ``fingerprints.txt`` -- space-separated rows ``nodeID b1 b2 ...`` parsed
      as ints; the first row is ignored.

    Blank rows are skipped, and empty tokens (produced by trailing or doubled
    delimiters) are dropped wherever they occur, so a row parses the same way
    with or without a trailing space.

    Returns:
        tuple: ``(train_nodes, train_y, test_nodes, test_y, node_embeddings,
        node_network_features, node_fingerprints)`` where the ``*_nodes`` are
        lists of ``(int, int)`` pairs, the ``*_y`` are lists of ints and the
        last three are dicts mapping node ID (int) to a 1-D ``numpy`` array.

    Raises:
        IOError / OSError: if one of the files cannot be opened.
        ValueError: if a token cannot be converted to ``int`` / ``float``.
        IndexError: if a non-blank row has fewer columns than expected.
    """
    # Read train/test sets
    trainFile = "train.tsv"
    testFile = "test.tsv"
    train_nodes, train_y = _readLabelledEdges(trainFile)
    test_nodes, test_y = _readLabelledEdges(testFile)

    # Read node embeddings file
    node_embeddings = {}  # dict from node ID to embedding
    embeddingFile = "train_edges.emb"
    for row in _readRows(embeddingFile, " ", skip_header=True):
        values = _dropEmptyTokens(row[1:])
        node_embeddings[int(row[0])] = np.array([float(val) for val in values])

    # Read node network features file
    node_network_features = {}  # dict from node ID to network features
    networkFile = "local_features_embeddings.txt"
    for row in _readRows(networkFile, "\t", skip_header=True):
        # The feature vector is packed into the second column, space-separated.
        values = _dropEmptyTokens(row[1].split(" "))
        node_network_features[int(row[0])] = np.array([float(val) for val in values])

    # Read node chemical features file
    node_fingerprints = {}  # dict from node ID to fingerprint (for drug nodes)
    fingerprintsFile = "fingerprints.txt"
    for row in _readRows(fingerprintsFile, " ", skip_header=True):
        values = _dropEmptyTokens(row[1:])
        node_fingerprints[int(row[0])] = np.array([int(val) for val in values])

    return train_nodes, train_y, test_nodes, test_y, node_embeddings, node_network_features, node_fingerprints


def _readRows(path, delimiter, skip_header=False):
    """Yield the non-blank rows of a delimited text file, optionally dropping the first row."""
    with open(path) as handle:
        reader = csv.reader(handle, delimiter=delimiter)
        if skip_header:
            next(reader, None)  # tolerate a completely empty file
        for row in reader:
            if row:  # csv.reader yields [] for blank lines
                yield row


def _readLabelledEdges(path):
    """Read a tab-separated ``node1, node2, label`` file into ``(pairs, labels)`` lists."""
    pairs = []
    labels = []
    for row in _readRows(path, "\t"):
        pairs.append((int(row[0]), int(row[1])))
        labels.append(int(row[2]))
    return pairs, labels


def _dropEmptyTokens(tokens):
    """Return ``tokens`` without empty strings (left behind by trailing/double delimiters)."""
    return [tok for tok in tokens if tok != ""]

In [28]:
# Create train/test set features from node embeddings, network features, and molecular fingerprints
def createFeatures(feature_indices=(6,)):
    """Build per-edge feature matrices for the train and test sets.

    Every edge ``(node1, node2)`` is turned into one vector per requested
    feature type. The seven feature types, by index into ``feature_types``:

    0. concatenation of the two node embeddings
    1. element-wise (Hadamard) product of the two embeddings
    2. sum of the two embeddings
    3. absolute difference of the two embeddings
    4. concatenation of the two nodes' network-feature vectors
    5. concatenation of both embeddings and node2's fingerprint
    6. concatenation of both network-feature vectors and node2's fingerprint

    Edges whose ``node2`` has no entry in the fingerprint table are dropped
    from every feature type, and their labels are dropped with them, so all
    matrices stay aligned with the returned label lists.

    Args:
        feature_indices: iterable of indices (0-6) of the feature types to
            compute. Defaults to ``(6,)``, the only type the notebook had left
            enabled; matrices of types that are not requested stay empty.

    Returns:
        tuple: ``(train_x, train_y, test_x, test_y, feature_types)`` where
        ``train_x`` / ``test_x`` are lists with one ``numpy`` array per feature
        type, ``train_y`` / ``test_y`` are the labels of the kept edges and
        ``feature_types`` is the list of the seven display names.

    Raises:
        ValueError: if ``feature_indices`` contains an index outside 0-6.
        KeyError: if a node of an edge is missing from the embedding or
            network-feature tables.
    """
    train_nodes, train_y, test_nodes, test_y, node_embeddings, node_network_features, node_fingerprints = readFiles()
    print(len(train_nodes), len(train_y), len(test_nodes), len(test_y), len(node_embeddings), len(node_network_features), len(node_fingerprints))

    feature_types = ['node2vec Concatenation', 'node2vec Hadamard Product', 'node2vec Sum', 'node2vec Distance',
                    'node2vec + Network Features', 'node2vec + Molecular Fingerprints', 'node2vec + Network + Fingerprints']

    wanted = set(feature_indices)
    invalid = sorted(idx for idx in wanted if idx not in range(len(feature_types)))
    if invalid:
        raise ValueError("Unknown feature type indices: " + str(invalid))

    train_x, train_y = _buildLinkFeatureMatrices(
        train_nodes, train_y, node_embeddings, node_network_features, node_fingerprints,
        len(feature_types), wanted)
    test_x, test_y = _buildLinkFeatureMatrices(
        test_nodes, test_y, node_embeddings, node_network_features, node_fingerprints,
        len(feature_types), wanted)

    return train_x, train_y, test_x, test_y, feature_types


def _buildLinkFeatureMatrices(nodes, labels, node_embeddings, node_network_features,
                              node_fingerprints, num_types, wanted):
    """Return ``(matrices, kept_labels)`` for one edge list; only types in ``wanted`` are filled."""
    rows = [[] for _ in range(num_types)]  # one list of feature vectors per type
    kept_labels = []

    for (node1, node2), y in zip(nodes, labels):
        node1_embed = node_embeddings[node1]
        node2_embed = node_embeddings[node2]
        node1_network = node_network_features[node1]
        node2_network = node_network_features[node2]

        # Only use drugs with valid fingerprints
        if node2 not in node_fingerprints:
            continue

        node2_fingerprint = node_fingerprints[node2]  # only drugs have molecular fingerprints

        if 0 in wanted:  # node2vec Concatenate
            rows[0].append(np.concatenate((node1_embed, node2_embed)))
        if 1 in wanted:  # node2vec Hadamard Product
            rows[1].append(np.multiply(node1_embed, node2_embed))
        if 2 in wanted:  # node2vec Sum
            rows[2].append(node1_embed + node2_embed)
        if 3 in wanted:  # node2vec Absolute Distance
            rows[3].append(np.absolute(node1_embed - node2_embed))
        if 4 in wanted:
            # NOTE(review): uses the network-feature vectors only (no separate
            # embedding term), exactly as the previously commented-out code did.
            rows[4].append(np.concatenate((node1_network, node2_network)))
        if 5 in wanted:  # node2vec + Molecular Fingerprints
            rows[5].append(np.concatenate((node1_embed, node2_embed, node2_fingerprint)))
        if 6 in wanted:
            # NOTE(review): network features + fingerprint, without a separate
            # embedding term -- confirm this matches the intended feature set.
            rows[6].append(np.concatenate((node1_network, node2_network, node2_fingerprint)))

        # Keep the label only for edges that produced feature vectors
        kept_labels.append(y)

    return [np.array(matrix) for matrix in rows], kept_labels

In [29]:
# Predict links using various classifier models; for each classifier, predict on each feature type and evaluate
def predictClassifier():
    """Train each classifier on the selected link features and score it on the test set.

    A fresh ``LogisticRegression`` and a fresh ``RandomForestClassifier`` (both
    with default settings) are fitted per evaluated feature type on the
    matrices returned by ``createFeatures()`` and scored with accuracy,
    precision, recall and F1.

    F1 is computed with ``sklearn.metrics.f1_score`` rather than by hand so
    that a classifier with zero precision and zero recall does not trigger a
    division by zero.

    Returns:
        tuple: ``(classifiers, feature_types, metrics, performance)`` where
        ``classifiers`` and ``metrics`` are lists of display names,
        ``feature_types`` lists the names of the feature types that were
        actually evaluated, and ``performance[c][f]`` is the
        ``(accuracy, precision, recall, f1)`` tuple of classifier ``c`` on
        ``feature_types[f]``.
    """
    train_x, train_y, test_x, test_y, feature_types = createFeatures()

    classifiers = ['Logistic Regression', 'Random Forest']
    model_factories = [LogisticRegression, RandomForestClassifier]  # same order as `classifiers`
    metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score']

    # Only the combined network + fingerprint features (index 6) are evaluated
    # in this run; createFeatures() leaves the other matrices empty by default.
    evaluated = [6]

    # list of performance metrics by evaluated feature type for each classifier
    performance = [[] for _ in classifiers]

    for c, (name, make_model) in enumerate(zip(classifiers, model_factories)):
        print(name)
        for i in evaluated:
            print(i)
            model = make_model()

            # Train model and predict
            model.fit(train_x[i], train_y)
            predictions = model.predict(test_x[i])

            # Evaluate performance metrics
            accuracy = sklearn.metrics.accuracy_score(test_y, predictions)
            precision = sklearn.metrics.precision_score(test_y, predictions)
            recall = sklearn.metrics.recall_score(test_y, predictions)
            f1_score = sklearn.metrics.f1_score(test_y, predictions)

            # Record metrics (same container type for every classifier)
            performance[c].append((accuracy, precision, recall, f1_score))

    # Return names aligned with `performance`, so index j labels the right result.
    evaluated_feature_types = [feature_types[i] for i in evaluated]
    return classifiers, evaluated_feature_types, metrics, performance

In [30]:
# Write performance metrics to output file
def writeClassifierPerformanceOutput():
    """Run ``predictClassifier()`` and write its metrics to ``model_performance_all.txt``.

    The file (overwritten on each call, in the current working directory) has
    one section per classifier; each section lists every recorded feature type
    followed by one ``<metric>: <value>`` line per metric.

    Every result stored in ``performance`` is written, not just the first one.
    When ``performance`` holds fewer results than there are feature-type
    names, the results are labelled with the trailing names.
    """
    classifiers, feature_types, metrics, performance = predictClassifier()
    outputFileName = "model_performance_all.txt"

    with open(outputFileName, 'w') as output:
        for i in range(len(classifiers)):
            results = performance[i]
            # NOTE(review): assumes the evaluated feature types are the last
            # entries of `feature_types` (true for the current range(6, 7) run,
            # and a no-op when the two lists have equal length).
            offset = len(feature_types) - len(results)

            output.write(classifiers[i] + ":\n\n")
            for j in range(len(results)):
                output.write(feature_types[offset + j] + ":\n")
                for k in range(len(metrics)):
                    output.write(metrics[k] + ": " + str(results[j][k]) + "\n")
                output.write("\n")
            output.write("\n")
    # No explicit close(): the `with` block already closed the file.

In [None]:
writeClassifierPerformanceOutput()

926506 926506 4000 4000 6328 6328 1614


In [13]:
# Predict nodes simply based on embedding distance
def predictEmbedDistance():
    """Predict links purely from embedding distance and print the hit rate.

    Steps:
      1. Collect the positive (label == 1) edges of the train and test sets,
         together with every disease (first node) and drug (second node) that
         appears in one of them.
      2. For every disease/drug pair that is not a positive train edge,
         compute the L2 distance between the two node embeddings.
      3. Take the ``n`` closest pairs, where ``n`` is the number of positive
         test edges, and report the fraction that are positive test edges.

    Prints the set sizes, the five closest pairs and the resulting score;
    returns ``None``. When there are no positive test edges the score is
    reported as ``0.0`` instead of dividing by zero.

    Raises:
        KeyError: if a disease or drug has no entry in the embedding table.
    """
    # readFiles() also returns the network features and fingerprints; only the
    # first five values are needed here.
    train_nodes, train_y, test_nodes, test_y, node_embeddings = readFiles()[:5]

    # Obtain only real edges in train network
    diseases = set()
    drugs = set()
    positive_train_nodes = set()
    positive_test_nodes = set()

    # Train nodes
    for (disease, drug), label in zip(train_nodes, train_y):
        if label != 1:
            continue
        diseases.add(disease)
        drugs.add(drug)
        positive_train_nodes.add((disease, drug))

    # Test nodes
    for (disease, drug), label in zip(test_nodes, test_y):
        if label != 1:
            continue
        diseases.add(disease)
        drugs.add(drug)
        positive_test_nodes.add((disease, drug))

    print(len(positive_train_nodes), len(positive_test_nodes), len(diseases), len(drugs))

    # Calculate L2 distance between each disease/drug pair not already linked in train
    distances = []
    for disease in diseases:
        for drug in drugs:
            if (disease, drug) not in positive_train_nodes:
                distance = np.linalg.norm(node_embeddings[disease] - node_embeddings[drug])
                distances.append((distance, disease, drug))
    distances.sort()
    print(distances[:5])

    # Take the n closest pairs as predicted links, count how many are "correct" (in test set)
    n = len(positive_test_nodes)
    num_correct = 0
    for distance, disease, drug in distances[:n]:
        if (disease, drug) in positive_test_nodes:
            num_correct += 1

    # Guard against an empty positive test set
    accuracy = float(num_correct) / n if n else 0.0
    print("Predict Links via Embedding Distance Accuracy:", accuracy)

In [14]:
predictEmbedDistance()

463253 2000 4766 1562
[(0.1085827633657915, 12488, 602013), (0.11357551461669167, 12585, 600895), (0.11476520661489824, 12510, 603629), (0.11595785678059664, 12213, 603629), (0.11712053956092647, 7871, 605212)]
Predict Links via Embedding Distance Accuracy: 0.0
