# Training GNN



In [None]:
%%capture
import pandas as pd
import numpy as np
import scipy
from scipy.sparse.csgraph import shortest_path
import seaborn as sns
from matplotlib import pyplot as plt
import pickle
import json
import math
from numpy import ma
from glob import glob
from scipy.sparse import load_npz

from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.utils import Sequence
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Dropout,BatchNormalization,Conv1D,MaxPool1D,Flatten

import torch

!pip install spektral
import spektral
from spektral.layers import GCNConv, GlobalSumPool,GraphSageConv,GeneralConv
from spektral.data import Dataset, DisjointLoader, Graph
from spektral.layers import GCSConv, GlobalAvgPool,SortPool
from spektral.layers.pooling import TopKPool
from spektral.transforms.normalize_adj import NormalizeAdj
from spektral.transforms import GCNFilter


# Root folder (with trailing slash) that the cells below prefix to every
# data location they read: the train/test graph files and the target arrays.
path = 'data_test/'


In [None]:
def func_aux(codigo):
    """Return the integer id encoded at the start of a graph file name.

    The file name is expected to look like ``<id>_<kind>...`` (for example
    ``12_x.npy``); everything before the first underscore is parsed as an int.
    Used as the sort key so that files belonging to the same id end up
    adjacent.

    Args:
        codigo: Path to the file, using ``/`` or ``\\`` as separator.

    Returns:
        The numeric id as ``int``.

    Raises:
        ValueError: If the file name does not start with an integer.
    """
    # glob() returns OS-native separators, so normalise backslashes before
    # isolating the file name; otherwise int() fails on Windows paths.
    file_name = codigo.replace('\\', '/').split('/')[-1]
    return int(file_name.split('_')[0])

In [None]:

# Read the training data: list every graph file, order the list by the
# numeric id at the start of each file name, and load the target vector.
matrices_entrenamiento = sorted(
    glob(path + 'train/kaggle/working/train/*'),
    key=func_aux,
)
targets_entrenamiento = np.load(path + 'targets_train.npy')

# Sanity-check expression kept from the original cell (file count vs. target count).
len(matrices_entrenamiento), len(targets_entrenamiento)

# Consecutive entries are taken as the two files of one graph: even positions
# go to the adjacency list, odd positions to the feature list.
matrices_x = matrices_entrenamiento[1::2]
matrices_adj = matrices_entrenamiento[0::2]


In [None]:
# Files sharing an id can come out of the sort in either order, so whenever the
# "adjacency" slot actually holds the '<id>_x...' file, swap the pair in place.
for posicion, ruta in enumerate(matrices_adj):
    nombre_archivo = ruta.split('/')[-1]
    tipo = nombre_archivo.split('_')[1]
    if tipo != 'x':
        continue
    matrices_adj[posicion], matrices_x[posicion] = (
        matrices_x[posicion],
        matrices_adj[posicion],
    )
        


In [None]:
# Hold out 10% of the graphs for validation. The split is shuffled with a fixed
# seed and stratified on the targets; adjacency paths, feature paths and
# targets are split together so they stay aligned index by index.
(
    matrices_adj_train,
    matrices_adj_val,
    matrices_x_train,
    matrices_x_val,
    y_train,
    y_val,
) = train_test_split(
    matrices_adj,
    matrices_x,
    targets_entrenamiento,
    test_size=0.1,
    random_state=0,
    shuffle=True,
    stratify=targets_entrenamiento,
)
                                      

In [None]:
class Custom_Sequence(Sequence):
    """Keras ``Sequence`` that loads one graph per step from files on disk.

    Each step reads a sparse adjacency matrix (``.npz``) and a node-feature
    array (``.npy``), wraps them in a Spektral ``Graph``, applies
    ``GCNFilter`` and returns ``((x, a), y)``.

    Args:
        adj_set: Sequence of paths to scipy sparse adjacency files.
        x_set: Sequence of paths to NumPy node-feature files, aligned with
            ``adj_set``.
        y_set: Sequence of targets, aligned with ``adj_set``.
        batch_size: Must be 1. A step always yields a single graph, so any
            other value would silently skip every graph that is not the
            first one of its batch.

    Raises:
        ValueError: If ``batch_size`` is not 1.
    """

    def __init__(self, adj_set, x_set, y_set, batch_size):
        if batch_size != 1:
            raise ValueError(
                "Custom_Sequence yields exactly one graph per step; "
                f"batch_size must be 1, got {batch_size!r}"
            )
        # The Keras base class may own state of its own (newer Keras versions
        # warn when the parent constructor is skipped).
        super().__init__()
        self.adj, self.x, self.y = adj_set, x_set, y_set
        self.batch_size = batch_size

    def __len__(self):
        """Return the number of steps per epoch."""
        return math.ceil(len(self.x) / self.batch_size)

    def __getitem__(self, idx):
        """Load and return the graph for step ``idx`` as ``((x, a), y)``.

        ``x`` is the feature file reshaped to a single column, ``a`` is the
        adjacency after ``GCNFilter`` and ``y`` is the target reshaped to a
        column vector.
        """
        window = slice(idx * self.batch_size, (idx + 1) * self.batch_size)
        batch_adj = self.adj[window]
        batch_x = self.x[window]
        batch_y = self.y[window]

        # The adjacency is stored sparse on disk and densified here.
        a = load_npz(batch_adj[0]).toarray()

        # One scalar feature per node.
        # NOTE(review): the original code called spektral.utils.one_hot(dists, 10)
        # here and discarded the result, so the raw values have always been
        # the model input. The dead call is removed; confirm whether a one-hot
        # encoding was actually intended.
        dists = np.load(batch_x[0]).reshape(-1, 1)

        graph = Graph(x=dists, a=a, y=batch_y[0])
        # Adjacency preprocessing required by GCNConv (see Spektral GCNFilter).
        graph = GCNFilter()(graph)

        return (graph.x, graph.a), np.array(graph.y).reshape(-1, 1)
    
    

In [None]:
# One graph per step (batch_size=1). Only the first 9000 training graphs and
# the first 1000 validation graphs are served.
train_iterator = Custom_Sequence(
    adj_set=matrices_adj_train[:9000],
    x_set=matrices_x_train[:9000],
    y_set=y_train[:9000],
    batch_size=1,
)
validation_iterator = Custom_Sequence(
    adj_set=matrices_adj_val[:1000],
    x_set=matrices_x_val[:1000],
    y_set=y_val[:1000],
    batch_size=1,
)

In [None]:
from tensorflow.keras import backend as K
class MyGNN(Model):
    """Graph classifier: stacked GCN layers, SortPool and a 1-D CNN head.

    Works on a single graph at a time. The outputs of five GCN layers are
    concatenated per node, pooled to a fixed number of nodes with
    ``SortPool``, flattened into one long sequence and classified by
    ``Conv1D``/``Dense`` layers.

    Args:
        n_labels: Number of sigmoid output units. The original code ignored
            this argument and always used one unit; ``MyGNN(1)`` is unchanged.
    """

    def __init__(self, n_labels):
        super().__init__()
        # Output widths of the five GCN layers. Their sum is the width of the
        # concatenated per-node embedding consumed by the first Conv1D.
        gcn_channels = (128, 128, 128, 128, 1)
        node_width = sum(gcn_channels)

        self.graph_conv = GCNConv(gcn_channels[0], activation='tanh')
        self.graph_conv2 = GCNConv(gcn_channels[1], activation='tanh')
        self.graph_conv3 = GCNConv(gcn_channels[2], activation='tanh')
        self.graph_conv4 = GCNConv(gcn_channels[3], activation='tanh')
        self.graph_conv5 = GCNConv(gcn_channels[4], activation='tanh')

        # Fixed-size readout with k=400 (see Spektral SortPool for the exact
        # sort and padding rules).
        self.pool = SortPool(400)

        # kernel_size == strides == node_width, so each convolution step
        # covers one node embedding of the flattened sequence built in call().
        self.conv1 = Conv1D(filters=64, kernel_size=node_width,
                            strides=node_width)
        self.conv2 = Conv1D(filters=128, kernel_size=5, strides=1)
        self.conv3 = Conv1D(filters=128, kernel_size=5, strides=1)
        self.conv4 = Conv1D(filters=64, kernel_size=5, strides=1)

        self.maxpool = MaxPool1D(pool_size=2)
        self.flatten = Flatten()
        self.dense1 = Dense(units=256, activation="relu")
        self.densex1 = Dense(units=128, activation="relu")

        self.dense2 = Dense(units=n_labels, activation="sigmoid")

    def call(self, inputs):
        """Run the forward pass for one graph.

        Args:
            inputs: Pair ``(x, a)`` with node features and a dense adjacency
                matrix.

        Returns:
            Sigmoid outputs produced by the final dense layer.
        """
        x, a = inputs

        # The adjacency arrives dense and is converted to a sparse tensor
        # before it is passed to the GCN layers.
        a = tf.sparse.from_dense(a)

        x1 = self.graph_conv([x, a])
        x2 = self.graph_conv2([x1, a])
        x3 = self.graph_conv3([x2, a])
        x4 = self.graph_conv4([x3, a])
        x5 = self.graph_conv5([x4, a])

        # Concatenate the output of every GCN depth per node.
        x = tf.concat([x1, x2, x3, x4, x5], axis=-1)

        out = self.pool(x)

        # Flatten the pooled embeddings into one sequence with batch size 1
        # and one channel. -1 lets TensorFlow infer the length, which equals
        # the original out.shape[0] * out.shape[-1] without needing both
        # static dimensions to be known.
        out = tf.reshape(out, [1, -1, 1])

        x_out = self.conv1(out)
        x_out = self.conv2(x_out)
        x_out = self.maxpool(x_out)
        x_out = self.conv3(x_out)
        x_out = self.conv4(x_out)
        x_out = self.flatten(x_out)

        x_out = self.dense1(x_out)
        x_out = self.densex1(x_out)
        predictions = self.dense2(x_out)

        return predictions
# Binary classifier with a single sigmoid output.
model = MyGNN(1)
# `metrics` is passed as a list: Keras documents a list of metrics, and newer
# Keras versions reject a bare string.
model.compile(
    optimizer=tf.keras.optimizers.Nadam(learning_rate=0.00001),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# One step per item of each Sequence, so every epoch covers each iterator once.
STEP_SIZE_TRAIN = len(train_iterator)
STEP_SIZE_VALID = len(validation_iterator)

history = model.fit(
    x=train_iterator,
    steps_per_epoch=STEP_SIZE_TRAIN,
    validation_data=validation_iterator,
    validation_steps=STEP_SIZE_VALID,
    epochs=5,
    verbose=1,
)

# Training loss first, validation loss second, matching the legend order.
for curva in ('loss', 'val_loss'):
    plt.plot(history.history[curva])
plt.title('Model loss')
plt.xlabel('epoch')
plt.ylabel('binary_crossentropy')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

In [None]:

def calculate_ROC(data_vertex_pairs, data_solution):
    """Plot the ROC curve of a ranking and return its area under the curve.

    The curve is normalised so that it starts at (0, 0) and ends at (1, 1).
    AUC lies in [0, 1]: 0.5 corresponds to random predictions and 1.0 to a
    perfect ranking.

    Args:
        data_vertex_pairs: Indices into ``data_solution`` ordered from the
            most to the least confident prediction. Column-shaped ``(N, 1)``
            index arrays are accepted and flattened.
        data_solution: Ground-truth labels. Only entries equal to 0 or 1
            take part in the curve; any other value is ignored.

    Returns:
        The area under the normalised ROC curve.

    Raises:
        ValueError: If the ranked labels lack positives or negatives. The AUC
            is undefined in that case; the original code either divided by
            zero or returned NaN.
    """
    data_solution = np.asarray(data_solution)
    ranked_labels = data_solution[data_vertex_pairs].ravel()

    # Keep only the binary labels, as the original per-element checks did.
    ranked_labels = ranked_labels[(ranked_labels == 0) | (ranked_labels == 1)]
    is_positive = ranked_labels == 1
    is_negative = ~is_positive

    n_positive = int(is_positive.sum())
    n_negative = int(is_negative.sum())
    if n_positive == 0 or n_negative == 0:
        raise ValueError(
            "ROC/AUC is undefined when only one class is present "
            f"(positives={n_positive}, negatives={n_negative})"
        )

    # Walking down the ranking, a positive moves the curve one step up and a
    # negative moves it one step to the right.
    true_pos = np.cumsum(is_positive)
    false_pos = np.cumsum(is_negative)
    ypos = np.concatenate(([0], true_pos)) / n_positive
    xpos = np.concatenate(([0], false_pos)) / n_negative

    # Curve height sampled at every step to the right; the mean of these
    # heights is the area under the normalised curve.
    ROC_vals = true_pos[is_negative] / n_positive

    plt.plot(xpos, ypos)
    plt.show()

    AUC = ROC_vals.mean()
    return AUC

In [None]:

from sklearn.metrics import confusion_matrix

# Predictions for the graphs actually served by each iterator.
preds_train = model.predict(train_iterator)
preds_val = model.predict(validation_iterator)

# Indices ordered from the highest to the lowest predicted score.
sorted_predictions_train = np.flip(np.argsort(preds_train, axis=0))
sorted_predictions_val = np.flip(np.argsort(preds_val, axis=0))

# The iterators were built from a prefix of the split (first 9000 / 1000
# samples), so trim the labels to the number of predictions. Comparing against
# the full label arrays fails whenever the split holds more samples than that.
y_train_eval = y_train[:len(preds_train)]
y_val_eval = y_val[:len(preds_val)]

# Confusion matrices with a 0.5 decision threshold.
print(confusion_matrix(y_train_eval.astype(int),
                       (np.ravel(preds_train) > 0.5).astype(int)))
print(confusion_matrix(y_val_eval.astype(int),
                       (np.ravel(preds_val) > 0.5).astype(int)))


AUC = calculate_ROC(sorted_predictions_train, y_train_eval)
print('Area Under Curve for Evaluation: ', AUC, '\n\n\n')

AUC = calculate_ROC(sorted_predictions_val, y_val_eval)
print('Area Under Curve for Evaluation: ', AUC, '\n\n\n')


In [None]:

# Running offset into the labelled test targets. Each func_test call consumes
# the next slice, so calls must happen in the same order as the targets.
# (The original module-level `global` statement was a no-op and is dropped.)
indexer = 0


def func_test(numero):
    """Build the test ``Custom_Sequence`` for one test folder.

    Lists the graph files under ``path + numero``, pairs adjacency and feature
    files the same way as the training data, and attaches the next slice of
    non-negative entries of the module-level ``targets_test``.

    Side effect: advances the module-level ``indexer`` by the number of
    feature files found.

    Args:
        numero: Name of the test sub-folder (e.g. ``"cero"``).

    Returns:
        A ``Custom_Sequence`` with batch size 1 over the folder's graphs.
    """
    global indexer
    matrices_test = sorted(glob(path + numero + '/kaggle/working/*/*'),
                           key=func_aux)
    matrices_adj_test = matrices_test[0::2]
    matrices_x_test = matrices_test[1::2]

    for cont, elem in enumerate(matrices_adj_test):
        if elem.split('/')[-1].split('_')[1] != 'x':
            continue
        try:
            matrices_adj_test[cont], matrices_x_test[cont] = (
                matrices_x_test[cont],
                matrices_adj_test[cont],
            )
        except IndexError:
            # Odd number of files: the last adjacency slot has no partner.
            # Only IndexError can occur here, so nothing else is swallowed.
            print("error")
            break

    # Only non-negative targets are treated as labelled samples.
    targets_numero = targets_test[targets_test >= 0]
    targets_numero = targets_numero[indexer:indexer + len(matrices_x_test)]
    indexer = indexer + len(matrices_x_test)
    test_iterator = Custom_Sequence(matrices_adj_test, matrices_x_test,
                                    targets_numero, 1)
    return test_iterator
    

# Targets for the whole test set; func_test reads this module-level array.
targets_test = np.load(path + 'targets_test.npy')

# One iterator per test folder, built strictly in this order because
# func_test consumes the targets sequentially through `indexer`.
lista = list(map(func_test, (
    "cero", "uno", "dos", "tres", "cuatro", "cinco",
    "seis", "siete", "ocho", "nueve", "diez",
)))

In [None]:
# Collect the predictions of every test iterator, in order, into one flat list.
preds_finales = []
for cont, test_iterator in enumerate(lista):
    print(cont)
    try:
        preds_val = list(model.predict(test_iterator))
    except Exception as exc:
        # Still best-effort (skip this set), but report the failure: a skipped
        # set leaves fewer predictions than labelled targets, which otherwise
        # only shows up later as an unexplained size mismatch. Catching
        # Exception rather than everything lets Ctrl-C interrupt the loop.
        print(f"prediction failed for test set {cont}: {exc!r}")
        continue

    # extend() appends in place instead of rebuilding the list on each pass.
    preds_finales.extend(preds_val)


In [None]:
preds_finales = np.array(preds_finales)
np.save("preds_finales", preds_finales)

# Labelled positions are the non-negative targets; compute the mask once,
# before anything is overwritten.
mascara_validos = targets_test >= 0
preds_planas = preds_finales.ravel()
if preds_planas.size != int(mascara_validos.sum()):
    raise ValueError(
        f"got {preds_planas.size} predictions for "
        f"{int(mascara_validos.sum())} labelled targets; "
        "was a test set skipped during prediction?"
    )

# Build the score vector as float: writing probabilities into a copy of
# targets_test would truncate them if that array has an integer dtype.
# Unlabelled positions (negative targets) get score 0, as before.
# The predictions are flattened because model.predict returns a column
# (presumably shape (N, 1)), and boolean-mask assignment needs 1-D values.
targets_finales = np.zeros(targets_test.shape, dtype=float)
targets_finales[mascara_validos] = preds_planas

# Indices ordered from the highest to the lowest score.
sorted_predictions_eval = np.flip(np.argsort(targets_finales, axis=0))

submit_file = f"model_all_idx{2017}_{3}.json"
# NumPy integers are not JSON serialisable, so the indices are written as floats.
all_idx_list_float = list(map(float, sorted_predictions_eval))
with open(submit_file, "w", encoding="utf8") as json_file:
    json.dump(all_idx_list_float, json_file)
