In [7]:
import csv

import numpy as np
import sklearn
import sklearn.metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [8]:
# Read train/test sets and node embeddings files
def readFiles(trainFile="train.tsv", testFile="test.tsv", embeddingFile="train_edges.emb"):
    """Load the labelled train/test edge lists and the node embeddings.

    Args:
        trainFile: Path of the tab-separated training file. Each row holds
            two integer node IDs followed by an integer label.
        testFile: Path of the tab-separated test file, same layout.
        embeddingFile: Path of the whitespace-separated embeddings file. Its
            first line is treated as a header and skipped; every other line
            is an integer node ID followed by the embedding values.

    Returns:
        A tuple ``(train_nodes, train_y, test_nodes, test_y, node_embeddings)``
        where the ``*_nodes`` lists hold ``(node1, node2)`` int pairs, the
        ``*_y`` lists hold the matching int labels, and ``node_embeddings``
        maps node ID to a 1-D numpy array of floats.

    Raises:
        IOError/OSError: If one of the files cannot be opened.
        ValueError: If a field cannot be parsed as a number.
        IndexError: If a non-blank edge row has fewer than three columns.
    """

    def _readEdges(path):
        # Parse one edge-list file into parallel (node pair, label) lists.
        nodes = []
        labels = []
        with open(path) as handle:
            for row in csv.reader(handle, delimiter="\t"):
                if not row:
                    # csv.reader yields [] for blank lines (e.g. a trailing
                    # newline at the end of the file); nothing to parse.
                    continue
                nodes.append((int(row[0]), int(row[1])))
                labels.append(int(row[2]))
        return nodes, labels

    train_nodes, train_y = _readEdges(trainFile)
    test_nodes, test_y = _readEdges(testFile)

    node_embeddings = {}  # dict from node ID to embedding
    with open(embeddingFile) as handle:
        next(handle, None)  # ignore the header line
        for line in handle:
            # split() with no argument collapses repeated/trailing whitespace,
            # so a trailing space does not produce an empty field that would
            # make float('') fail.
            fields = line.split()
            if not fields:
                continue
            node_embeddings[int(fields[0])] = np.array([float(val) for val in fields[1:]])

    return train_nodes, train_y, test_nodes, test_y, node_embeddings

In [9]:
# Create train/test set features from embeddings
def createFeatures():
    """Build per-edge feature matrices from the node embeddings.

    Reads the data through ``readFiles()`` and, for every (node1, node2)
    pair, combines the two node embeddings in four ways: concatenation,
    element-wise (Hadamard) product, sum, and absolute difference.

    Returns:
        A tuple ``(train_x, train_y, test_x, test_y, feature_types)``.
        ``train_x`` and ``test_x`` are lists of numpy arrays, one array per
        entry of ``feature_types`` (same order), with one row per edge.
        ``train_y`` / ``test_y`` are the labels returned by ``readFiles()``.

    Raises:
        KeyError: If an edge references a node ID that has no embedding.
    """
    train_nodes, train_y, test_nodes, test_y, node_embeddings = readFiles()

    # A single-argument print(...) emits the same space-separated line under
    # both Python 2 and Python 3, unlike the `print a, b` statement form,
    # which is a SyntaxError on Python 3.
    sizes = (len(train_nodes), len(train_y), len(test_nodes), len(test_y), len(node_embeddings))
    print(" ".join(str(size) for size in sizes))

    # Order matters: position k in train_x/test_x corresponds to feature_types[k].
    feature_types = ['Concatenation', 'Hadamard Product', 'Sum', 'Distance']

    def _edgeFeatures(node_pairs):
        # Combine the two embeddings of every pair; one matrix per feature type.
        per_type = [[] for _ in feature_types]
        for node1, node2 in node_pairs:
            node1_embed = node_embeddings[node1]
            node2_embed = node_embeddings[node2]

            per_type[0].append(np.concatenate((node1_embed, node2_embed)))  # Concatenation
            per_type[1].append(np.multiply(node1_embed, node2_embed))       # Hadamard Product
            per_type[2].append(node1_embed + node2_embed)                   # Sum
            per_type[3].append(np.absolute(node1_embed - node2_embed))      # Absolute Distance
        return [np.array(rows) for rows in per_type]

    train_x = _edgeFeatures(train_nodes)
    test_x = _edgeFeatures(test_nodes)

    return train_x, train_y, test_x, test_y, feature_types

In [10]:
# Predict links using various classifier models; for each classifier, predict on each feature type and evaluate
def predict(random_state=None):
    """Train each classifier on each feature type and evaluate on the test set.

    Features and labels come from ``createFeatures()``. For every classifier
    (Logistic Regression, Random Forest) a fresh model is fitted per feature
    type and scored on the test edges.

    Args:
        random_state: Optional seed forwarded to ``RandomForestClassifier`` so
            a run can be reproduced. The default ``None`` keeps the
            library's unseeded behaviour.

    Returns:
        A tuple ``(classifiers, feature_types, metrics, performance)`` where
        ``performance[c][f]`` is the tuple
        ``(accuracy, precision, recall, f1_score)`` for classifier
        ``classifiers[c]`` on feature type ``feature_types[f]``, in the order
        given by ``metrics``.
    """
    train_x, train_y, test_x, test_y, feature_types = createFeatures()

    metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score']

    # (display name, factory) pairs; a new model is built for every feature type.
    models = [
        ('Logistic Regression', lambda: LogisticRegression()),
        ('Random Forest', lambda: RandomForestClassifier(random_state=random_state)),
    ]
    classifiers = [label for label, _ in models]

    def _evaluate(model, x_train, x_test):
        # Fit on the training edges, then score predictions for the test edges.
        model.fit(x_train, train_y)
        predictions = model.predict(x_test)

        accuracy = sklearn.metrics.accuracy_score(test_y, predictions)
        precision = sklearn.metrics.precision_score(test_y, predictions)
        recall = sklearn.metrics.recall_score(test_y, predictions)

        # F1 is undefined when precision and recall are both 0; report 0.0
        # instead of dividing by zero (which would give NaN or raise).
        denominator = precision + recall
        f1_score = 2 * (precision * recall) / denominator if denominator else 0.0

        return (accuracy, precision, recall, f1_score)

    # list of performance metrics by feature type for each classifier
    performance = []
    for label, makeModel in models:
        print(label)
        scores = []
        for i in range(len(feature_types)):
            print(i)  # progress indicator: index of the feature type being fitted
            scores.append(_evaluate(makeModel(), train_x[i], test_x[i]))
        performance.append(scores)

    return classifiers, feature_types, metrics, performance

In [11]:
# Write performance metrics to output file
def writePerformanceOutput(outputFileName="model_performance.txt"):
    """Run ``predict()`` and write its metrics to a plain-text report.

    The report has one section per classifier; each section lists, for every
    feature type, one ``<metric>: <value>`` line per metric.

    Args:
        outputFileName: Path of the report file. It is opened in ``'w'`` mode,
            so an existing file is overwritten.
    """
    # Train and evaluate before opening the file, so the report is not
    # truncated if that step fails.
    classifiers, feature_types, metrics, performance = predict()

    # The with-block closes the file on exit; no explicit close() is needed.
    with open(outputFileName, 'w') as output:
        for classifier, by_feature in zip(classifiers, performance):
            output.write(classifier + ":\n\n")
            for feature_type, scores in zip(feature_types, by_feature):
                output.write(feature_type + ":\n")
                for metric, value in zip(metrics, scores):
                    output.write(metric + ": " + str(value) + "\n")
                output.write("\n")
            output.write("\n")

In [12]:
# Run the whole pipeline: read the input files, build the edge features,
# train/evaluate the classifiers, and write the metrics report to disk.
writePerformanceOutput()

926506 926506 4000 4000 6328
Logistic Regression
0
1
2
3
Random Forest
0
1
2
3
