In [8]:
import csv
import networkx as nx
import numpy as np
from random import randint
from random import random
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import log_loss, accuracy_score
from random import choice
from gensim.models import Word2Vec
import keras
import pandas as pd
from unidecode import unidecode


from scipy.sparse import identity, diags

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F


device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [6]:
def normalize_adjacency(A):
    n = A.shape[0]
    A = A + 2*identity(n)
    degs = A.dot(np.ones(n))
    inv_degs = np.power(degs, -1)
    D_inv = diags(inv_degs)
    A_hat = D_inv.dot(A)
    return A_hat

def sparse_mx_to_torch_sparse_tensor(sparse_mx):
    print(type(sparse_mx))
    sparse_mx = sparse_mx.tocoo().astype(np.float32)
    indices = torch.from_numpy(np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64))
    values = torch.from_numpy(sparse_mx.data)
    shape = torch.Size(sparse_mx.shape)
    return torch.sparse.FloatTensor(indices, values, shape)

def random_walk(G, node, walk_length):
    walk = [node]
  
    for i in range(walk_length-1):
        neibor_nodes = list(G.neighbors(walk[-1]))
        if len(neibor_nodes) > 0:
            next_node = choice(neibor_nodes)
            walk.append(next_node)
    walk = [str(node) for node in walk] # in case the nodes are in string format, we don't need to cast into string, but if the nodes are in numeric or integer, we need this line to cast into string
    return walk

def generate_walks(G, num_walks, walk_length):
  # Runs "num_walks" random walks from each node, and returns a list of all random walk
    walks = list()  
    for i in range(num_walks):
        for node in G.nodes():
            walk = random_walk(G, node, walk_length)
            walks.append(walk)
        #print('walks : ', walks)
    return walks

def text_to_list(text):
    return unidecode(text).split(',')

def intersection(lst1, lst2):
    lst3 = [value for value in lst1 if value in lst2]
    
    return 



In [None]:
class GNN(nn.Module):
    def __init__(self, n_feat, n_hidden, n_class, dropout):
        super(GNN, self).__init__()
        self.fc1 = nn.Linear(n_feat, n_hidden)
        self.fc2 = nn.Linear(n_hidden, n_hidden)
        self.fc3 = nn.Linear(n_hidden, n_hidden)
        self.fc3 = nn.Linear(n_hidden, n_hidden)
        self.fc4 = nn.Linear(n_hidden, n_class)
        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()

    def forward(self, x_in, adj, pairs):
        
        h1 = self.fc1(x_in)
        z1 = self.relu(torch.mm(adj, h1))
        z1 = self.dropout(z1)

        h2 = self.fc2(z1)
        z2 = self.relu(torch.mm(adj, h2))
        
        x = z2[pairs[0,:],:] - z2[pairs[1,:],:] # embedded features (z2) of node 0 - embedded features of node 1
        # could we add a new dimension to pairs to specify if same author(s)? and then what could we do?
        x = self.relu(self.fc3(x))
        x = self.dropout(x)
        x = self.fc4(x)

        return F.log_softmax(x, dim=1)

### could we add a new dimension to pairs to specify if same author(s)? and then what could we do?


In [147]:
(features[pairs[0,:],:] - features[pairs[1,:],:]).shape

torch.Size([3931636, 128])

In [150]:
len(pairs[0,:])

3931636

In [128]:
pairs[0,0]

tensor(0)

In [10]:
G = nx.read_edgelist('../input_data/edgelist.txt', delimiter=',', create_using=nx.Graph(), nodetype=int)
nodes = list(G.nodes())
n = G.number_of_nodes()
m = G.number_of_edges()
edges = list(G.edges())

print('Number of nodes of total set:', n)
print('Number of edges of total set:', m)

node_to_idx = dict()
for i, node in enumerate(nodes):
    node_to_idx[node] = i

val_edges = list()
G_train = G

for edge in edges:
    if random() < 0.1:
        val_edges.append(edge)

# We remove the val edges from the graph G
for edge in val_edges:
    G_train.remove_edge(edge[0], edge[1])

n = G_train.number_of_nodes()
m = G_train.number_of_edges()
train_edges = list(G_train.edges())
    
print('Number of nodes of training set:', n)
print('Number of edges of training set:', m)

y_val = [1]*len(val_edges)

n_val_edges = len(val_edges)

# Create random pairs of nodes (testing negative edges)
for i in range(n_val_edges):
    n1 = nodes[randint(0, n-1)]
    n2 = nodes[randint(0, n-1)]
    (n1, n2) = (min(n1, n2), max(n1, n2))
    val_edges.append((n1, n2))
    
# Remove from val_edges edges that exist in both train and val
# for edge in list(set(val_edges) & set(train_edges)):
#     val_edges.remove(edge)
    
# n_val_edges = len(val_edges) # - len(y_val) #because we removed from val_edges edges that exist in both
y_val.extend([0]*(n_val_edges))

Number of nodes of total set: 138499
Number of edges of total set: 1091955
Number of nodes of training set: 138499
Number of edges of training set: 982849


In [19]:
authors = pd.read_csv('../input_data/authors.txt', sep = '|', header=None)
authors = authors.rename(columns={0: "paper_id", 2: "authors"})
authors['authors'] = authors['authors'].apply(text_to_list)
authors.drop(columns={1}, inplace=True)
authors.head()

Unnamed: 0,paper_id,authors
0,0,"[James H. Niblock, Jian-Xun Peng, Karen R. McM..."
1,1,"[Jian-Xun Peng, Kang Li, De-Shuang Huang]"
2,2,[J. Heikkila]
3,3,"[L. Teslic, B. Hartmann, O. Nelles, I. Skrjanc]"
4,4,"[Long Zhang, Kang Li, Er-Wei Bai, George W. Ir..."


In [20]:
authors_np = np.array(authors)
authors_np

array([[0,
        list(['James H. Niblock', 'Jian-Xun Peng', 'Karen R. McMenemy', 'George W. Irwin'])],
       [1, list(['Jian-Xun Peng', 'Kang Li', 'De-Shuang Huang'])],
       [2, list(['J. Heikkila'])],
       ...,
       [138496,
        list(['Hongge Chen', 'Huan Zhang', 'Pin-Yu Chen', 'Jinfeng Yi', 'Cho-Jui Hsieh'])],
       [138497, list(['Sanjeev Arora', 'Andrej Risteski', 'Yi Zhang'])],
       [138498,
        list(['Pietro Morerio', 'Jacopo Cavazza', 'Vittorio Murino'])]],
      dtype=object)

In [38]:
indices_pd.columns

Index(['two', 'one'], dtype='object')

In [54]:
indices_pd = pd.DataFrame(np.transpose(indices), columns={'one', 'two'})
indices_pd = pd.merge(indices_pd, authors, left_on='one', right_on='paper_id').drop(columns='paper_id').rename(columns={'authors':'one_authors'})
indices_pd = pd.merge(indices_pd, authors, left_on='two', right_on='paper_id').drop(columns='paper_id').rename(columns={'authors':'two_authors'})
indices_pd['intersection'] = indices_pd.apply(lambda x: intersection(x['one_authors'], x['two_authors']), axis=1)

#indices_pd['intersection'] = indices_pd['one_authors'] & indices_pd['two_authors']
indices_pd.head()

Unnamed: 0,two,one,one_authors,two_authors,intersection
0,0,1,"[Jian-Xun Peng, Kang Li, De-Shuang Huang]","[James H. Niblock, Jian-Xun Peng, Karen R. McM...",[Jian-Xun Peng]
1,0,2,[J. Heikkila],"[James H. Niblock, Jian-Xun Peng, Karen R. McM...",[]
2,3,1,"[Jian-Xun Peng, Kang Li, De-Shuang Huang]","[L. Teslic, B. Hartmann, O. Nelles, I. Skrjanc]",[]
3,3,60,"[Heriberto Cruz-Hernandez, Luis Gerardo de la ...","[L. Teslic, B. Hartmann, O. Nelles, I. Skrjanc]",[]
4,3,61,[Evgeniy Martyushev],"[L. Teslic, B. Hartmann, O. Nelles, I. Skrjanc]",[]


In [11]:
adj = nx.adjacency_matrix(G_train) # Obtains the adjacency matrix of the training graph
indices = np.array(adj.nonzero()) # Gets the positions of non zeros of adj into indices (without or with diags ??)
adj = normalize_adjacency(adj) # Normalizes the adjacency matrix only by adding ones to diag


  adj = nx.adjacency_matrix(G_train) # Obtains the adjacency matrix of the training graph


In [None]:
authors[authors.index==indices[0, 1]]['authors']
authors[authors.index==indices[1, 1]]['authors']

# With Giannis

How to do if the edge

To learn some low dimension of the authors and then concatenate them to the features with word2vec.
features of pairs of nodes. treat them and then concatenate.

NLP: directly the CNN and classify the nodes and train the model to predict if they are connected. And then combine with the GNN. From this model, once it's trained, we can use the embedded representation. Or combine the results.

The second thing to do: in a single model CNN to produce some embedding of the abstract. Annotate based on the features.
If you plan to do, he can take a 


It takes a pair of abstract and then predict if they site each other.

Keep seperated abstracts.
Element wide multiplication product or the difference of the two vectors. Multiplication of elements of vectors. Or sum or the absolute difference of the vectors. Once we have this vector, we apply a fc to predict the result.

We can give the model the same abstract twice and it will give the same vector twice.

We can use the word embedding of Google to have a matrix of the abstract. We need to use the embedding of the words and
Or we can use a random embdedding of the words to have an intial embedding. And then we can fine tune the embedding during the training.


For the seperation of validation and training, we need to make sure that there is no isolated nodes.


Message passing then attention.



We can create a graph of authors linked by the apers. And then make an embedding of the authors. Deep walk.

We can have one graph of papers and papers. authors is connected to a paper. edges = the ones already exist from G. Then add extra nodes and edges. new nodes = authors and new edges = author -> paper.


We can seperate the graph: unsupervised algorithm clustering. --> louvain python package. We can apply it to some graph. We can have one hot vector and each dimension correspond to one community.


To take the last 

## Amin
Get red of the authors that occured below a certain threshold and keep authors occuring a minimum number of times

### Is it fine to create walks only from the G_train?

In [98]:
walks = generate_walks(G=G_train, num_walks=10, walk_length=15)

In [99]:
wv_model = Word2Vec(vector_size=128, window=5, min_count=0, sg=1, workers=8)
wv_model.build_vocab(walks)
wv_model.train(walks, total_examples=wv_model.corpus_count, epochs=5) 

(102892850, 102892850)

In [101]:
# features initializaed from deepwalk wv embedding
features_np = []
for node in G_train.nodes():
    features_np.append(wv_model.wv[str(node)])
features_np = np.array(features_np)

In [102]:
# Create class labels
y = np.zeros(4*G_train.number_of_edges())
y[:2*G_train.number_of_edges()] = 1 # Concatenated ones for edges indices and zeros for random indices.

# Transforms the numpy matrices/vectors to torch tensors.
features = torch.FloatTensor(features_np).to(device)
y = torch.LongTensor(y).to(device)
adj = sparse_mx_to_torch_sparse_tensor(adj).to(device)
indices = torch.LongTensor(indices).to(device)



<class 'scipy.sparse._csr.csr_matrix'>
<class 'scipy.sparse._csr.csr_matrix'>
<class 'torch.Tensor'>


torch.Size([138499, 138499])

In [104]:
# Hyperparameters
epochs = 30
n_hidden = 128
dropout_rate = 0.2
n_class = 2
n_features = features.shape[1]

# Creates the model and specifies the optimizer
model = GNN(n_features, n_hidden, n_class, dropout_rate).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [105]:
import time

# Train model
model.train()
start_time = time.time()
for epoch in range(epochs):
    t = time.time()
    optimizer.zero_grad()
    rand_indices = torch.randint(0, features.size(0), (indices.size(0),indices.size(1)), device=adj.device)# We take random indices each time we run an epoch
    pairs = torch.cat((indices, rand_indices), dim=1) # Concatenate the edges indices and random indices.   
    output = model(features, adj, pairs) # we run the model that gives the output.
    loss_train = F.nll_loss(output, y) # we are using nll_loss as loss to optimize, we store it in loss_train. We compare to y which is stable and contains the tag ones and zeros.
    #print(type(loss_train), '\n', loss_train.shape)
    acc_train = accuracy_score(torch.argmax(output, dim=1).detach().cpu().numpy(), y.cpu().numpy())# just to show it in the out put message of the training
    loss_train.backward() # The back propagation ? --> Computes the gradient of current tensor w.r.t. graph leaves
    optimizer.step() # Performs a single optimization step (parameter update).
    
    if epoch % 5 == 0:
        print('Epoch: {:03d}'.format(epoch+1),
              'loss_train: {:.4f}'.format(loss_train.item()),
              'acc_train: {:.4f}'.format(acc_train.item()),
              'time: {:.4f}s'.format(time.time() - t),
             'total_time: {}min'.format(round((time.time() - start_time)/60)))

print("Optimization Finished in {} min!".format(round((time.time() - start_time)/60)))
print()

Epoch: 001 loss_train: 0.6932 acc_train: 0.5271 time: 38.7843s total_time: 1min
Epoch: 006 loss_train: 0.5129 acc_train: 0.6261 time: 18.3107s total_time: 2min
Epoch: 011 loss_train: 0.4141 acc_train: 0.8407 time: 15.7362s total_time: 4min
Epoch: 016 loss_train: 0.3081 acc_train: 0.9000 time: 18.0251s total_time: 5min
Epoch: 021 loss_train: 0.2308 acc_train: 0.9194 time: 15.7148s total_time: 7min
Epoch: 026 loss_train: 0.1880 acc_train: 0.9290 time: 21.7787s total_time: 8min
Optimization Finished in 10 min!



In [113]:
len(y_val)-len(val_edges)

0

In [107]:
# Evaluating the model
model.eval()
node_pairs = np.array(np.transpose(val_edges))
pairs = torch.LongTensor(node_pairs).to(device)
output = model(features, adj, pairs)
y_pred = torch.exp(output)
y_pred = y_pred.detach().cpu().numpy()

y_pred_true = list()
for element in y_pred:
    y_pred_true.append(element[1])
    
print('Log loss:', log_loss(y_val, y_pred_true))

Log loss: 2.2495268918639555


In [108]:
np.array(np.transpose(y_val)).shape[0] - 219308

-1216

In [109]:
node_pairs = np.array(np.transpose(val_edges))
pairs = torch.LongTensor(node_pairs).to(device)
#features of the nodes
#pairs of the val edges
output = model(features, adj, pairs)
y_pred_val = torch.exp(output)
y_pred_val = y_pred_val.detach().cpu().numpy()

In [110]:
y_pred_val[:, 1]

array([0.97891575, 0.63894755, 0.9650434 , ..., 0.03416799, 0.8449153 ,
       0.00570035], dtype=float32)

### Why is it too big compared to loss obtained in Kaggle?! And that the log loss > 1 even if values are between 0 and 1?

In [111]:
print('Log loss:', log_loss(y_val, y_pred_val[:,1]))

Log loss: 2.2495268918639555


In [112]:
from datetime import datetime
import pandas as pd



# Read test data. Each sample is a pair of nodes
node_pairs = list()
with open('../test_data/test.txt', 'r') as f:
    for line in f:
        t = line.split(',')
        node_pairs.append((node_to_idx[int(t[0])], node_to_idx[int(t[1])]))

# Testing
model.eval()
node_pairs = np.array(np.transpose(node_pairs))
pairs = torch.LongTensor(node_pairs).to(device)
output = model(features, adj, pairs)
y_pred = torch.exp(output)
y_pred = y_pred.detach().cpu().numpy()

y_pred_true = list()
for element in y_pred:
    y_pred_true.append(element[1])
    

    
today = datetime.today().strftime('%Y-%m-%d')
random_nb = randint(0, 100000)

pd.DataFrame(y_pred_true, columns={'predicted'}).to_csv(
"../submissions_files/{}-submission-{}.csv".format(today, random_nb), header=True, index=True, index_label='id'
)

CNN with Sigmoid to predict if they connected based on the abstract.

A good way to aggregate the text is to calculate the average (mean)

Another approach is to use directly a GNN. Take the abstract, compute the embeddng of the words. Take the mean of the node.
Then you can take the features.

For thr GN, we have only the 

pairs is a tensor, contains a pair of nodes that contains all the positive samples and some of the negative samples. y: half of them are equal to one, and half of them are connected. rand_indices are random pairs that are considered as not connected.

As we have a non directed. We can take twice every edge (2*m instead of 2*m for y. Or we can take the edges only once.

One vector for the abstract using the word2vec embedding or any other similar approach.

Or we can directly use a CNN.We can take a CNN and feed pais of abstracts in the CNN, the CNN will produce one vector for the first abstract and one vector for the second. We can combine these two vectors.

Then we can use an MLP to produce a vector, and then we can concatenate the two vectors from CNN and MLP.

There is a pretrained word embedding (Google provided a pretrained embedding).

Embedding of each word. Then the CNN will provide one vector for the abstract.

Each abstract has a different number of words. the representation of the CNN will have a fixed size of the embedding vector.




### Embedding of abstract? Unsupervised? Or get it inside the global MLP?