# Testing a DeeProtGO model 

This notebook tests a trained DeeProtGO model for predicting Biological Process (BP) Gene Ontology (GO) terms for the *NK* proteins from Eukarya organisms in the CAFA3 benchmark dataset. 

## Requirements

In [1]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
#from google.colab import drive


In [2]:
# drive.mount('/content/drive')

In [2]:
# Move to the DeeProtGO project root so that the relative paths used in this
# notebook (src2/, examples/, data/) resolve.
# Set the DEEPROTGO_ROOT environment variable to run on another machine;
# when it is unset, the original author's checkout location is used.
os.chdir(os.environ.get('DEEPROTGO_ROOT', '/home/gabriela/Insync/gmerino@sinc.unl.edu.ar/Google Drive/EBI-EMBL/GOAnnot/DeeProtGO/'))

In [3]:
from src2.sampler import Sampler
from src2.dataloader import Dataloader
from src2.DNNModel import DNNModel
from src2.logger import Logger
from src2.DNN import DNN
from src2.earlyStop import EarlyStopping

### 1. Setting global parameters

In [4]:
# Re-apply PyTorch's current thread count (edit the argument to cap CPU usage).
torch.set_num_threads(torch.get_num_threads())

# Use the GPU only when it was requested AND CUDA is actually available.
# Logical `and` is the boolean idiom here (the original used bitwise `&`).
use_GPU = True
use_GPU = use_GPU and torch.cuda.is_available()

# Seed every random number generator in use so that runs are repeatable.
SEED = 1
np.random.seed(SEED)
if use_GPU:
    torch.cuda.manual_seed_all(SEED)
# Kept as the last expression: the cell displays the returned Generator.
torch.manual_seed(SEED)


<torch._C.Generator at 0x7fcdd0144f60>

In [5]:
# res_dir: directory handed to DNNModel and holding the trained weights file.
# dirData: root of the processed data files read by the cells below.
res_dir, dirData = "examples/train_NK_EUKA_BP/", 'data/processed/'

### 2. Setup parameters and model hyperparameters

In [6]:
# --- Network building blocks (passed to DNNModel as-is) ---
criterion = nn.BCELoss
optimMethod = torch.optim.Adam
activFunc = F.elu

# --- Optimisation and decision settings ---
learning_rate, nbatch = 0.005, 128
pDrop = 0.5    # presumably the dropout probability -- confirm in DNNModel
thresh = 0.2

# --- Per-branch factors, forwarded to DNNModel as pN<k>_1 / pN<k>_2 ---
# NOTE(review): their exact meaning is defined inside DNNModel (not shown here).
pPSD1, pPSD2 = 0.5, 0.35
pEmb1, pEmb2 = 0.7, 0.5
pTaxon1, pTaxon2 = 0.7, 0.5

# --- Factors for the shared part, forwarded as pNO_1 / pNO_2 ---
pHidden1, pHidden2 = 1, 0.7

# Fraction of all negative examples to sample (0.1399, i.e. 13.99 %).
samplingNegPerc = 0.1399

### 3.  Model loading

In [7]:
# File handed to DNNModel through its `propOutFile` argument.
propOutFile = f"{dirData}Training/GOTermsPropRel_Euka_BP_train.tab"

nCases = 7180

# Sizes forwarded to DNNModel as N_in_1 / N_in_2 / N_in_3.
N_in_1, N_in_2, N_in_3 = 5308, 408, 1024

In [8]:
# Collect the keyword arguments first so that each setting sits on its own line.
# NOTE(review): N_in_2 = 408 and N_in_3 = 1024, while the data cells load the
# taxon table as inData2 and the embedding table (columns labelled 0..1023) as
# inData3. The pEmb*/pTaxon* names therefore look swapped relative to the input
# order. This is harmless as long as both pairs stay (0.7, 0.5) -- confirm.
model_kwargs = dict(
    propOutFile = propOutFile,
    nbatch = nbatch,
    N_in_1 = N_in_1,
    N_in_2 = N_in_2,
    N_in_3 = N_in_3,
    pN1_1 = pPSD1,
    pN1_2 = pPSD2,
    pN2_1 = pEmb1,
    pN2_2 = pEmb2,
    pN3_1 = pTaxon1,
    pN3_2 = pTaxon2,
    pNO_1 = pHidden1,
    pNO_2 = pHidden2,
    thresh = thresh,
    pDrop = pDrop,
    activFunc = activFunc,
    optimMethod = optimMethod,
    criterion = criterion,
    learning_rate = learning_rate,
    useGPU = use_GPU,
)
DeeProtGO_Euka_NK_BP = DNNModel(res_dir, **model_kwargs)



In [9]:
# Restore the trained weights into the network built above.
# map_location = 'cpu' lets a checkpoint that holds CUDA tensors be read on a
# CPU-only machine; load_state_dict then copies the values into the network's
# existing parameters, whichever device those live on.
# NOTE: torch.load unpickles the file -- only load checkpoints you trust.
#DeeProtGO_Euka_NK_BP.net.load_state_dict(torch.load(res_dir + 'PSD_Emb_Taxoncheckpoint.pt'))
weights_path = res_dir + 'DeeProtGO_PSD_Emb_Taxon_Euka_BP_NK.pt'
DeeProtGO_Euka_NK_BP.net.load_state_dict(torch.load(weights_path, map_location = 'cpu'))


<All keys matched successfully>

### 4. Data loading


In [10]:
# Tab-separated files listing the benchmark entries (positives and negatives).
posProteinsFilePath = dirData + "Benchmark/PosEntries_Euka_BP.tab"
negProteinsFilePath = dirData + "Benchmark/NegEntries_Euka_BP.tab"

# ndmin = 1 keeps a one-entry file as a length-1 array; without it loadtxt
# returns a 0-d array, .tolist() yields a bare string and the concatenation
# below fails.
posNames = np.loadtxt(posProteinsFilePath, delimiter = '\t', dtype = 'str', ndmin = 1).tolist()
negNames = np.loadtxt(negProteinsFilePath, delimiter = '\t', dtype = 'str', ndmin = 1).tolist()

# All target names: positives first, then negatives.
targetNames = np.concatenate([posNames, negNames], axis = 0)


In [21]:
# First network input: the table stored in LevSim_BP_Euka.h5.
PSDFilePath = f"{dirData}Benchmark/LevSim_BP_Euka.h5"
inData1 = pd.read_hdf(PSDFilePath)

# Preview the first five rows.
inData1.head(n = 5)

Entry,A0A075F932,A0A0C5B5G6,A0A0K3AV08,A0A0N9E2K8,A0A0R4IBK5,A0AVF1,A0FGR8,A0FGR9,A0FLQ6,A0JMQ9,...,Q8GWB2,Q5SV66,Q8NBF2,Q8TF61,Q6PKX4,P0CU05,Q3TY65,Q96ME1,Q9FHK4,Q8IUR7
Entry.name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AB17B_MOUSE,0.831279,0.799405,0.645559,0.80068,0.0,0.809605,0.698258,0.711432,0.41989,0.764981,...,0.837654,0.836804,0.763281,0.708457,0.836804,0.832554,0.830004,0.736507,0.793455,0.779006
AB17C_MOUSE,0.855504,0.870803,0.628559,0.800255,0.0,0.814705,0.683383,0.697408,0.39014,0.759881,...,0.891628,0.883978,0.754356,0.704632,0.883553,0.858904,0.857629,0.726732,0.795155,0.773056
ABHD4_MOUSE,0.855504,0.861028,0.630259,0.80153,0.0,0.81683,0.682958,0.699108,0.398215,0.762431,...,0.883978,0.879303,0.757756,0.698258,0.878028,0.856354,0.852104,0.727157,0.79303,0.776031
ABHDD_MOUSE,0.857204,0.863578,0.631109,0.80153,0.0,0.818105,0.685508,0.699108,0.398215,0.757756,...,0.884828,0.881853,0.759456,0.698258,0.879303,0.857204,0.855929,0.728432,0.792605,0.779006
ADPGK_MOUSE,0.831279,0.796005,0.647259,0.79643,0.0,0.81088,0.703782,0.711857,0.422439,0.765406,...,0.835954,0.835954,0.762856,0.713557,0.838079,0.828729,0.830429,0.739482,0.796855,0.778581


In [22]:
# Second network input: the table stored in Taxon_BP_Euka.h5.
TaxonFilePath = f"{dirData}Benchmark/Taxon_BP_Euka.h5"
inData2 = pd.read_hdf(TaxonFilePath)

# Preview the first five rows.
inData2.head(n = 5)

Unnamed: 0_level_0,2762,2769,2903,3037,3039,3055,3311,3469,3490,3498,...,746128,756487,756488,763456,1077530,1108046,1176036,1221240,1234705,1260784
Entry.name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AB17B_MOUSE,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AB17C_MOUSE,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ABHD4_MOUSE,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ABHDD_MOUSE,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ADPGK_MOUSE,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# Third network input: the table stored in Emb_BP_Euka.h5.
EmbFilePath = f"{dirData}Benchmark/Emb_BP_Euka.h5"
inData3 = pd.read_hdf(EmbFilePath)

# Preview the first five rows.
inData3.head(n = 5)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
Entry.name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AB17B_MOUSE,0.001692,-0.015048,0.004948,-0.035817,-0.009328,-0.036168,0.039034,0.024736,0.043102,-0.115905,...,0.021268,0.058611,0.077691,0.01091,-0.025115,0.05452,-0.042507,-0.041234,0.026941,0.01507
AB17C_MOUSE,-0.033374,-0.00757,-0.094278,0.082492,-0.032613,-0.1042,-0.059237,0.013373,0.041502,-0.053527,...,0.025641,-0.011655,-0.115871,-0.236579,0.162707,-0.19604,-0.027725,-0.012942,-0.135077,-0.082106
ABHD4_MOUSE,-0.071416,-0.111335,-0.129712,0.006021,-0.094052,-0.149953,0.091222,-0.12461,-0.123191,0.046677,...,0.031299,0.078272,-0.078293,-0.097091,0.073202,-0.104448,0.003256,-0.049058,-0.072878,0.046521
ABHDD_MOUSE,-0.102275,-0.032009,-0.197966,0.073784,-0.009253,-0.085855,0.109043,-0.091667,-0.040165,-0.048966,...,0.067552,0.018883,-0.184525,-0.10495,0.001222,-0.09621,-0.004161,0.028244,-0.150359,-0.05571
ADPGK_MOUSE,0.044418,0.006028,-0.01029,0.010217,0.007213,-0.047875,-0.051736,0.082782,0.118333,-0.023534,...,0.031114,0.022265,-0.007227,-0.022782,-0.013508,-0.037946,0.038871,-0.053519,-0.025445,-0.020773


In [25]:
# Expected network output: one column per GO term, stored in netOut_BP_Euka.h5.
netOutFilePath = f"{dirData}Benchmark/netOut_BP_Euka.h5"
outData = pd.read_hdf(netOutFilePath)

# Preview the first five rows.
outData.head(n = 5)

Terms,GO:0000002,GO:0000003,GO:0000011,GO:0000018,GO:0000019,GO:0000023,GO:0000025,GO:0000027,GO:0000028,GO:0000038,...,GO:2001279,GO:2001280,GO:2001293,GO:2001295,GO:2001305,GO:2001307,GO:2001308,GO:2001310,GO:2001316,GO:2001317
Entry.name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AB17B_MOUSE,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AB17C_MOUSE,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ABHD4_MOUSE,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ABHDD_MOUSE,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ADPGK_MOUSE,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
def _as_tensor(data):
    """Return `data` as a torch tensor, moved to the GPU when `use_GPU` is set.

    Objects exposing `to_numpy` (the DataFrames loaded above) are converted
    through NumPy. Anything else is assumed to be a tensor already and is
    passed through, so re-running this cell no longer raises AttributeError.
    """
    if hasattr(data, 'to_numpy'):
        data = torch.from_numpy(data.to_numpy())
    return data.cuda() if use_GPU else data


# The names are rebound on purpose: the testing cell expects tensors under them.
inData1 = _as_tensor(inData1)
inData2 = _as_tensor(inData2)
inData3 = _as_tensor(inData3)
outData = _as_tensor(outData)

### 5. Model testing

In [27]:
# Evaluate the loaded model on the benchmark tensors.
# The targets are cast to float; the returned list is displayed as the cell output.
DeeProtGO_Euka_NK_BP.test(
    outData.float(),
    inData1,
    inData2,
    inData3,
    propPrediction = True,
    CAFAerror = True,
)
# Presumably outputs recorded from earlier runs, kept for comparison:
#[0.05734247341752052, 0.3484107198364361, 0.37816292896326836, 0.32299859944507636, 0.11, 0.35835534541132785, 0.27919059989866296, 0.5001825597465603]
#[0.0534910224378109, 0.3322698569595002, 0.3653985442688523, 0.3046489986559331, 0.11, 0.3471119027311259, 0.26976689401098863, 0.4866353935389506]


[0.050914280116558075,
 0.343065386440793,
 0.37382353484192504,
 0.3169839880372515,
 0.12,
 0.35457609079393126,
 0.27598690658251496,
 0.49574208491812705]

In [17]:
# NOTE(review): f_Max, p_F_Max, r_F_Max and t_F_Max are not assigned in any
# cell visible in this notebook, so this cell would raise NameError on a fresh
# kernel (Restart & Run All). Presumably they are leftovers from an earlier
# session -- confirm where they come from, or remove the cell.
f_Max, p_F_Max, r_F_Max, t_F_Max

(0.25968856524516215, 0.2771147557996426, 0.24432437741729712, 0.11)