In [82]:
import csv
import networkx as nx
import numpy as np
from random import randint
from random import random
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import log_loss, accuracy_score
from random import choice
from gensim.models import Word2Vec
import keras

In [9]:
from scipy.sparse import identity, diags

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F


device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [10]:

G = nx.read_edgelist('edgelist.txt', delimiter=',', create_using=nx.Graph(), nodetype=int)
nodes = list(G.nodes())
n = G.number_of_nodes()
m = G.number_of_edges()
edges = list(G.edges())

val_edges = list()
G_train = G

for edge in edges:
    if random() < 0.1:
        val_edges.append(edge)

# We remove the val edges from the graph G
for edge in val_edges:
    G_train.remove_edge(edge[0], edge[1])

n = G_train.number_of_nodes()
m = G_train.number_of_edges()
train_edges = list(G_train.edges())
    
print('Number of nodes of training set:', n)
print('Number of edges of training set:', m)

y_val = [1]*len(val_edges)

n_val_edges = len(val_edges)

# Create random pairs of nodes
for i in range(n_val_edges):
    n1 = nodes[randint(0, n-1)]
    n2 = nodes[randint(0, n-1)]
    (n1, n2) = (min(n1, n2), max(n1, n2))
    val_edges.append((n1, n2))
    
# Remove from val_edges edges that exist in both train and val

for edge in list(set(val_edges) & set(train_edges)):
    val_edges.remove(edge)
    
n_val_edges = len(val_edges) - len(y_val) #because we removed from val_edges edges that exist in both
y_val.extend([0]*n_val_edges)

Number of nodes of training set: 138499
Number of edges of training set: 983233


In [57]:
adj = nx.adjacency_matrix(G) # Obtains the adjacency matrix
print(adj.shape[0], adj.shape[1])

  adj = nx.adjacency_matrix(G) # Obtains the adjacency matrix


138499 138499


In [12]:
def normalize_adjacency(A):
    n = A.shape[0]
    A = A + identity(n)
    degs = A.dot(np.ones(n))
    inv_degs = np.power(degs, -1)
    D_inv = diags(inv_degs)
    A_hat = D_inv.dot(A)
    return A_hat

In [13]:
def sparse_mx_to_torch_sparse_tensor(sparse_mx):
    sparse_mx = sparse_mx.tocoo().astype(np.float32)
    indices = torch.from_numpy(np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64))
    values = torch.from_numpy(sparse_mx.data)
    shape = torch.Size(sparse_mx.shape)
    return torch.sparse.FloatTensor(indices, values, shape)

# Is it equivalent to Word2vec for dimensionality reduction ?

In [58]:
adj = normalize_adjacency(adj) # Normalizes the adjacency matrix
indices = np.array(adj.nonzero())

# Do we create the adjencency matrix based on the Training G ? And then we need to create one for the val G ?

# Combine input and output to get indices matrix undirected for the undirected Graph.
# You need to experiment with different hyperparameters and number of layers.

In [59]:
features_np = np.random.randn(G.number_of_edges(), 32) # Generates node features

In [60]:
# Create class labels
y = np.zeros(2*G.number_of_edges())
y[:G.number_of_edges()] = 1

# Transforms the numpy matrices/vectors to torch tensors
features = torch.FloatTensor(features_np).to(device)
y = torch.LongTensor(y).to(device)
adj = sparse_mx_to_torch_sparse_tensor(adj).to(device)
indices = torch.LongTensor(indices).to(device)

# What does it mean a Torch Vector ?

In [61]:
adj.size()

torch.Size([138499, 138499])

In [17]:
class GNN(nn.Module):
    def __init__(self, n_feat, n_hidden, n_class, dropout):
        super(GNN, self).__init__()
        self.fc1 = nn.Linear(n_feat, n_hidden)
        self.fc2 = nn.Linear(n_hidden, n_hidden)
        self.fc3 = nn.Linear(n_hidden, n_hidden)
        self.fc4 = nn.Linear(n_hidden, n_class)
        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()

    def forward(self, x_in, adj, pairs):
        
        h1 = self.fc1(x_in)
        z1 = self.relu(torch.mm(adj, h1))
        z1 = self.dropout(z1)

        h2 = self.fc2(z1)
        z2 = self.relu(torch.mm(adj, h2))
        
        x = z2[pairs[0,:],:] - z2[pairs[1,:],:]
        x = self.relu(self.fc3(x))
        x = self.dropout(x)
        x = self.fc4(x)

        return F.log_softmax(x, dim=1)

In [62]:
# Hyperparameters
epochs = 25
n_hidden = 128
dropout_rate = 0.2
n_class = 2

# Creates the model and specifies the optimizer
model = GNN(features.shape[1], n_hidden, n_class, dropout_rate).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [73]:
# adj.size()
# pairs.size()
# features.size()
# features.shape[0]
help(model)

Help on GNN in module __main__ object:

class GNN(torch.nn.modules.module.Module)
 |  GNN(n_feat, n_hidden, n_class, dropout)
 |  
 |  Base class for all neural network modules.
 |  
 |  Your models should also subclass this class.
 |  
 |  Modules can also contain other Modules, allowing to nest them in
 |  a tree structure. You can assign the submodules as regular attributes::
 |  
 |      import torch.nn as nn
 |      import torch.nn.functional as F
 |  
 |      class Model(nn.Module):
 |          def __init__(self):
 |              super().__init__()
 |              self.conv1 = nn.Conv2d(1, 20, 5)
 |              self.conv2 = nn.Conv2d(20, 20, 5)
 |  
 |          def forward(self, x):
 |              x = F.relu(self.conv1(x))
 |              return F.relu(self.conv2(x))
 |  
 |  Submodules assigned in this way will be registered, and will have their
 |  parameters converted too when you call :meth:`to`, etc.
 |  
 |  .. note::
 |      As per the example above, an ``__init__()`` ca

In [74]:
import time

# Train model
model.train()
for epoch in range(epochs):
    t = time.time()
    optimizer.zero_grad()
    rand_indices = torch.randint(0, features.size(0), (indices.size(0),indices.size(1)), device=adj.device)
    pairs = torch.cat((indices, rand_indices), dim=1)
    output = model(features, adj, pairs)
    loss_train = F.nll_loss(output, y)
    acc_train = accuracy_score(torch.argmax(output, dim=1).detach().cpu().numpy(), y.cpu().numpy())
    loss_train.backward()
    optimizer.step()
    
    if epoch % 5 == 0:
        print('Epoch: {:03d}'.format(epoch+1),
              'loss_train: {:.4f}'.format(loss_train.item()),
              'acc_train: {:.4f}'.format(acc_train.item()),
              'time: {:.4f}s'.format(time.time() - t))

print("Optimization Finished!")
print()

RuntimeError: addmm: Argument #3 (dense): Expected dim 0 size 138499, got 983233

In [None]:
# Create a non(?) directed graph
G = nx.read_edgelist('edgelist.txt', delimiter=',', create_using=nx.Graph(), nodetype=int)
nodes = list(G.nodes())
node_to_idx = dict()
for i, node in enumerate(nodes):
    node_to_idx[node] = i
n = G.number_of_nodes()
m = G.number_of_edges()
print('Number of nodes:', n)
print('Number of edges:', m)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Hyperparameters
epochs = 200
n_hidden = 128
dropout_rate = 0.2

n_class = 2
n_nodes = G.number_of_nodes()
adj = nx.adjacency_matrix(G) # Obtains the adjacency matrix
indices = np.array(adj.nonzero())
adj = normalize_adjacency(adj) # Normalizes the adjacency matrix
features_np = np.random.randn(n_nodes, 32) # Generates node features

# Create class labels
y = np.zeros(4*m)
y[:2*m] = 1

# Transforms the numpy matrices/vectors to torch tensors
features = torch.FloatTensor(features_np).to(device)
y = torch.LongTensor(y).to(device)
adj = sparse_mx_to_torch_sparse_tensor(adj).to(device)
indices = torch.LongTensor(indices).to(device)

# Creates the model and specifies the optimizer
model = GNN(features.shape[1], n_hidden, n_class, dropout_rate).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Train model
model.train()
for epoch in range(epochs):
    t = time.time()
    optimizer.zero_grad()
    rand_indices = torch.randint(0, features.size(0), (indices.size(0),indices.size(1)), device=adj.device)
    pairs = torch.cat((indices, rand_indices), dim=1)
    output = model(features, adj, pairs)
    loss_train = F.nll_loss(output, y)
    acc_train = accuracy_score(torch.argmax(output, dim=1).detach().cpu().numpy(), y.cpu().numpy())
    loss_train.backward()
    optimizer.step()
    
    if epoch % 5 == 0:
        print('Epoch: {:03d}'.format(epoch+1),
              'loss_train: {:.4f}'.format(loss_train.item()),
              'acc_train: {:.4f}'.format(acc_train.item()),
              'time: {:.4f}s'.format(time.time() - t))

print("Optimization Finished!")
print()

# Read test data. Each sample is a pair of nodes
node_pairs = list()
with open('test.txt', 'r') as f:
    for line in f:
        t = line.split(',')
        node_pairs.append((node_to_idx[int(t[0])], node_to_idx[int(t[1])]))

# Testing
model.eval()
node_pairs = np.array(np.transpose(node_pairs))
pairs = torch.LongTensor(node_pairs).to(device)
output = model(features, adj, pairs)
y_pred = torch.exp(output)
y_pred = y_pred.detach().cpu().numpy()

# Compute log loss
y_test = np.loadtxt('y_test.txt', delimiter=',')[:,1]
y_pred = y_pred[:,1]
y_pred[y_pred>0.9999] = 0.9999
y_pred[y_pred<0.0001] = 0.0001
print('Log loss:', log_loss(y_test, y_pred))

In [88]:
# Read test data. Each sample is a pair of nodes
node_pairs = list()
with open('test.txt', 'r') as f:
    for line in f:
        t = line.split(',')
        node_pairs.append((node_to_idx[int(t[0])], node_to_idx[int(t[1])]))

# Testing
model.eval()
node_pairs = np.array(np.transpose(node_pairs))
pairs = torch.LongTensor(node_pairs).to(device)
output = model(features, adj, pairs)
y_pred = torch.exp(output)
y_pred = y_pred.detach().cpu().numpy()

In [163]:
import pandas as pd

y_pred_true = list()
for element in y_pred:
    y_pred_true.append(element[1])

In [141]:
# y_pred_df.head()

y_pred_array = np.column_stack((range(len(y_pred)), y_pred_true))
# y_pred_df = pd.DataFrame(y_pred)
df_pred = pd.DataFrame({range(len(y_pred)), y_pred_true}, columns={'id','predicted'})#, columns={'id', 'predicted'}).astype({'id':'int'})
df_pred.head()

TypeError: unhashable type: 'list'

In [164]:
# pd.DataFrame(y_pred_array, columns={'id', 'predicted'}).astype({'id':'int'}).head()

# pd.DataFrame(y_pred_array, columns={'id', 'predicted'}).astype({'id':'int'}).to_csv(
# "submission.csv", header=True, index=False
# )

pd.DataFrame(y_pred_true, columns={'predicted'}).to_csv(
"submission.csv", header=True, index=True, index_label='id'
)

In [144]:
# Write predictions to a file
predictions = zip(range(len(y_pred)), y_pred)
with open("submission.csv","w") as pred:
    csv_out = csv.writer(pred)
    csv_out.writerow(['id','predicted'])
    for row in y_pred_array:
        csv_out.writerow(row) 

CNN with Sigmoid to predict if they connected based on the abstract.

A good way to aggregate the text is to calculate the average (mean)

Another approach is to use directly a GNN. Take the abstract, compute the embeddng of the words. Take the mean of the node.
Then you can take the features.

For thr GN, we have only the 

pairs is a tensor, contains a pair of nodes that contains all the positive samples and some of the negative samples. y: half of them are equal to one, and half of them are connected. rand_indices are random pairs that are considered as not connected.

As we have a non directed. We can take twice every edge (2*m instead of 2*m for y. Or we can take the edges only once.

One vector for the abstract using the word2vec embedding or any other similar approach.

Or we can directly use a CNN.We can take a CNN and feed pais of abstracts in the CNN, the CNN will produce one vector for the first abstract and one vector for the second. We can combine these two vectors.

Then we can use an MLP to produce a vector, and then we can concatenate the two vectors from CNN and MLP.

There is a pretrained word embedding (Google provided a pretrained embedding).

Embedding of each word. Then the CNN will provide one vector for the abstract.

Each abstract has a different number of words. the representation of the CNN will have a fixed size of the embedding vector.




In [6]:
import pandas as pd
import numpy as np
from scipy.sparse import identity, diags

def normalize_adjacency(A):
    n = A.shape[0]
    A = A + identity(n)
    degs = A.dot(np.ones(n))
    inv_degs = np.power(degs, -1)
    D_inv = diags(inv_degs)
    A_hat = D_inv.dot(A)
    return A_hat

In [22]:
A = np.array([[0, 1, 0, 0],
            [1, 0, 1, 0],
            [0, 1, 0, 1],
            [0, 0, 1, 0]])
A = identity(4)*6
print(A.shape)
A_normalized = normalize_adjacency(A)
print(A_normalized.shape)

(4, 4)
(4, 4)


In [26]:
print(A_normalized)

  (0, 0)	1.0
  (1, 1)	1.0
  (2, 2)	1.0
  (3, 3)	1.0
