# Import Packages and environmental setup

In [40]:
import numpy as np
import torch
torch.manual_seed(0)
import torch.nn as nn
from torch_geometric.data import Data
import itertools
import json
import sys
import time
import os
from os import path
sys.path.insert(0, '../')
import gc
import torch_geometric.transforms as T
from torch_geometric.nn.conv import TransformerConv
from torch_geometric.nn import VGAE
from torch_geometric.loader import DataLoader
from torch_geometric.utils import negative_sampling
gc.collect()

484

# Functions

In [2]:
# function to convert string to numbers
def convert_string_to_numbers(str, dict):
    ''' str: string to convert
        dict dictionary with the relative ordering of each char'''
            # create a map iterator using a lambda function
    # lambda x -> return dict[x]
    # This return the value for each key in dict based on str
    numbers = map(lambda x: dict[x], str)
    # return an array of int64 numbers
    return np.fromiter(numbers, dtype=np.int64)

In [3]:
# function to create a graph for each 
def construct_single_graph(idx, label):
    ''' idx: the current graph index w.r.t the label
        label: the current label'''
    # transform the character of amino acid in to numbers for all 5 sequences in this graph
    transformed_x = []
    for i in range(5):
        # get the index of the sequence from the original dataset
        seq_idx = 5*idx + i
        transformed_x.append(convert_string_to_numbers(seq_string[seq_idx][:-1], dict_amino))
        
    # set feature vectors of internal nodes to -1 with same length
    vec_len = len(transformed_x[0])
    internal_node_vec = np.full(vec_len, -1, dtype=np.int64)
    # append the three internal node
    for i in range(3):
        transformed_x.append(internal_node_vec)
    # create the node feature vector
    x = torch.tensor(transformed_x, dtype=torch.float)
    
    # now we create the edge set w.r.t the label
    # This part is quite dumb as I'm hard coding the 15 edge set
    if label == 0:
        edge_index = torch.tensor([[0,5],[5,0],[1,5],[5,1],
                                   [5,6],[6,5],[4,6],[6,4],
                                   [6,7],[7,6],[2,7],[7,2],
                                   [3,7],[7,3]], dtype=torch.long)
    elif label == 1:
        edge_index = torch.tensor([[0,5],[5,0],[1,5],[5,1],
                                   [5,6],[6,5],[3,6],[6,3],
                                   [6,7],[7,6],[2,7],[7,2],
                                   [4,7],[7,4]], dtype=torch.long)
    elif label == 2:
        edge_index = torch.tensor([[0,5],[5,0],[1,5],[5,1],
                                   [5,6],[6,5],[2,6],[6,2],
                                   [6,7],[7,6],[3,7],[7,3],
                                   [4,7],[7,4]], dtype=torch.long)
    elif label == 3:
        edge_index = torch.tensor([[0,5],[5,0],[2,5],[5,2],
                                   [5,6],[6,5],[4,6],[6,4],
                                   [6,7],[7,6],[3,7],[7,3],
                                   [1,7],[7,1]], dtype=torch.long)
    elif label == 4:
        edge_index = torch.tensor([[0,5],[5,0],[2,5],[5,2],
                                   [5,6],[6,5],[3,6],[6,3],
                                   [6,7],[7,6],[4,7],[7,4],
                                   [1,7],[7,1]], dtype=torch.long)
    elif label == 5:
        edge_index = torch.tensor([[0,5],[5,0],[2,5],[5,2],
                                   [5,6],[6,5],[1,6],[6,1],
                                   [6,7],[7,6],[4,7],[7,4],
                                   [3,7],[7,3]], dtype=torch.long)
    elif label == 6:
        edge_index = torch.tensor([[0,5],[5,0],[3,5],[5,3],
                                   [5,6],[6,5],[4,6],[6,4],
                                   [6,7],[7,6],[1,7],[7,1],
                                   [2,7],[7,2]], dtype=torch.long)
    elif label == 7:
        edge_index = torch.tensor([[0,5],[5,0],[3,5],[5,3],
                                   [5,6],[6,5],[2,6],[6,2],
                                   [6,7],[7,6],[1,7],[7,1],
                                   [4,7],[7,4]], dtype=torch.long)
    elif label == 8:
        edge_index = torch.tensor([[0,5],[5,0],[3,5],[5,3],
                                   [5,6],[6,5],[1,6],[6,1],
                                   [6,7],[7,6],[2,7],[7,2],
                                   [4,7],[7,4]], dtype=torch.long)
    elif label == 9:
        edge_index = torch.tensor([[0,5],[5,0],[4,5],[5,4],
                                   [5,6],[6,5],[3,6],[6,3],
                                   [6,7],[7,6],[2,7],[7,2],
                                   [1,7],[7,1]], dtype=torch.long)
    elif label == 10:
        edge_index = torch.tensor([[0,5],[5,0],[4,5],[5,4],
                                   [5,6],[6,5],[2,6],[6,2],
                                   [6,7],[7,6],[3,7],[7,3],
                                   [1,7],[7,1]], dtype=torch.long)
    elif label == 11:
        edge_index = torch.tensor([[0,5],[5,0],[4,5],[5,4],
                                   [5,6],[6,5],[1,6],[6,1],
                                   [6,7],[7,6],[3,7],[7,3],
                                   [2,7],[7,2]], dtype=torch.long)
    elif label == 12:
        edge_index = torch.tensor([[1,5],[5,1],[2,5],[5,2],
                                   [5,6],[6,5],[0,6],[6,0],
                                   [6,7],[7,6],[3,7],[7,3],
                                   [4,7],[7,4]], dtype=torch.long)
    elif label == 13:
        edge_index = torch.tensor([[1,5],[5,1],[3,5],[5,3],
                                   [5,6],[6,5],[0,6],[6,0],
                                   [6,7],[7,6],[2,7],[7,2],
                                   [4,7],[7,4]], dtype=torch.long)
    else:
        edge_index = torch.tensor([[1,5],[5,1],[4,5],[5,4],
                                   [5,6],[6,5],[0,6],[6,0],
                                   [6,7],[7,6],[2,7],[7,2],
                                   [3,7],[7,3]], dtype=torch.long)
    
    # Now we create the graph object as Data
    data = Data(x=x, edge_index=edge_index.t().contiguous())
    return data

# File inputs

In [30]:
# get name of the script
# nameScript = sys.argv[0].split('/')[-1]
nameScript = "gae_model.py"
# get json file name of the script
nameJson = "gae.json"
# nameJson = sys.argv[1]
print("------------------------------------------------------------------------")
print("Training the Garph Auto Encoder for 5-taxa dataset")
print("------------------------------------------------------------------------")
print("Executing " + nameScript + " following " + nameJson, flush = True)

# opening Json file 
jsonFile = open(nameJson) 
dataJson = json.load(jsonFile)

# loading the input data from the json file
ngpu = dataJson["ngpu"]                  # number of GPUS
lr = dataJson["lr"]                      # learning rate
embedSize = dataJson["embedSize"]        # Embedding size
nEpochs = dataJson["nEpochs"]            # Number of Epochs
batchSize = dataJson["batchSize"]        # batchSize


data_root = dataJson["dataRoot"]         # data folder
model_root = dataJson["modelRoot"]       # folder to save the data

label_files = dataJson["labelFile"]      # file with labels
sequence_files = dataJson["matFile"]     # file with sequences

if "summaryFile" in dataJson:
    summary_file = dataJson["summaryFile"]
else :
    summary_file = "summary_file.txt"


print("------------------------------------------------------------------------")
print("Loading Sequence Data in " + sequence_files, flush = True)
print("Loading Label Data in " + label_files, flush = True)

# we read the labels as list of strings
with open(data_root+label_files, 'r') as f:
    label_char = f.readlines()

# we read the sequence as a list of strings
with open(data_root+sequence_files, 'r') as f:
    seq_string = f.readlines()

n_samples = len(label_char)
seq_length = len(seq_string[0])-1
print("Number of samples:{}; Sequence length of each sample:{}"
        .format(n_samples, seq_length))
print("------------------------------------------------------------------------")

------------------------------------------------------------------------
Training the Garph Auto Encoder for 5-taxa dataset
------------------------------------------------------------------------
Executing gae_model.py following gae.json
------------------------------------------------------------------------
Loading Sequence Data in sequences12062021.in
Loading Label Data in labels12062021.in
Number of samples:10000; Sequence length of each sample:1550
------------------------------------------------------------------------


# Data pre-processing

In [5]:
# We need to extract the dictionary with the relative positions
# for each aminoacid

# first we need to extract all the different chars
strL = ""
for c in seq_string[0][:-1]:
    if not c in strL:
        strL += c

# we sort them
strL = sorted(strL)

# we give them a relative order
dict_amino = {}
for ii, c in enumerate(strL):
    dict_amino[c] = ii

# looping over the labels and create array. Here each element of the
# label_char has the form "1\n", so we only take the first one
labels = np.fromiter(map(lambda x: int(x[0])-1,
                         label_char), dtype= np.int64)

In [6]:
# Create all graphs from raw dataset
# empty dataset for all graphs
dataset = []
for i in range(n_samples):
    data = construct_single_graph(i, labels[i])
    if (not data.validate(raise_on_error=True)):
        print("Error! Node number and edge set does not match!")
        break
    if (not data.is_undirected()):
        print("Error! Incorrect edge set!")
        break
    dataset.append(data)

  x = torch.tensor(transformed_x, dtype=torch.float)


# Model

In [21]:
class VariationalGCNEncoder(torch.nn.Module):
    def __init__(self, in_channels, out_channels):
        super(VariationalGCNEncoder, self).__init__()
        self.conv1 = TransformerConv(in_channels, 3 * out_channels, heads=4, concat=False, beta=True)
        self.conv_mu = TransformerConv(3 * out_channels, out_channels, heads=4, concat=False, beta=True)
        self.conv_logstd = TransformerConv(3 * out_channels, out_channels, heads=4, concat=False, beta=True)
        
    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        return self.conv_mu(x, edge_index), self.conv_logstd(x, edge_index)

In [22]:
in_channels = seq_length
out_channels = embedSize

model = VGAE(VariationalGCNEncoder(in_channels, out_channels))
device = torch.device("cuda:0" if (torch.cuda.is_available() and ngpu > 0) else "cpu")
model = model.to(device)

# Training

In [39]:
train_dataset = dataset[:9000]
test_dataset = dataset[9000:]
train_loader = DataLoader(train_dataset, batch_size=batchSize, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batchSize, shuffle=True)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [42]:
a = negative_sampling(dataset[0].edge_index)
#TODO need batch negative sampling

In [43]:
a

tensor([[4, 2, 7, 7, 5, 5, 0, 3, 4, 1, 2, 4, 6, 2],
        [2, 5, 4, 1, 7, 4, 7, 4, 7, 6, 0, 1, 3, 6]])