# Compare with flDPnn and IUPred3

In [49]:
from tqdm.auto import tqdm

from bin.disorder.data_utils import load_disprot_dataset
from bin.iupred3.iupred3_lib import iupred
import torch
import numpy as np
from torchmetrics import Accuracy
from torchmetrics.classification import BinaryF1Score, BinaryMatthewsCorrCoef

In [44]:
# Load the DisProt 2022 test split. `.rows` is displayed further down as a
# list of dicts carrying 'acc' (FASTA-style header), 'seq' (space-separated
# residues) and 'label' (a string of 0/1 characters).
# NOTE(review): the path is relative to the notebook's working directory.
disprot_test_sequences = load_disprot_dataset('../data/disprot/2022/disprot-disorder-2022-test.txt').rows

In [45]:
# Attach IUPred3 output and integer ground-truth labels to every record.
for record in disprot_test_sequences:
    # Sequences are stored with a space between residues; strip them first.
    residues = record['seq'].replace(' ', '')
    # Only the first element of iupred's return value is kept
    # (presumably the per-residue disorder scores — TODO confirm).
    record['iupred'] = torch.tensor(iupred(residues, 'long')[0])
    # 'label' is a string of digit characters, one per position.
    record['true_labels'] = torch.tensor([int(flag) for flag in record['label']])

In [46]:
# Preview only the first record: displaying the whole list would dump every
# sequence and score tensor into the notebook output.
disprot_test_sequences[:1]

[{'acc': '>disprot|DP03211|full acc=Q9BRT9',
  'seq': 'M T E E V D F L G Q D S D G G S E E V V L T P A E L I E R L E Q A W M N E K F A P E L L E S K P E I V E C V M E Q L E H M E E N L R R A K R E D L K V S I H Q M E M E R I R Y V L S S Y L R C R L M K I E K F F P H V L E K E K T R P E G E P S S L S P E E L A F A R E F M A N T E S Y L K N V A L K H M P P N L Q K V D L F R A V P K P D L D S Y V F L R V R E R Q E N I L V E P D T D E Q R D Y V I D L E K G S Q H L I R Y K T I A P L V A S G A V Q L I',
  'label': '1111111111111111111100000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000',
  'iupred': tensor([0.3779, 0.4230, 0.4656, 0.5020, 0.5298, 0.5476, 0.5549, 0.5520, 0.5403,
          0.5215, 0.4897, 0.4640, 0.4448, 0.4352, 0.4321, 0.4483, 0.4722, 0.4856,
          0.4881, 0.4773, 0.4735, 0.4624, 0.4346, 0.3873, 0.3358, 0.2990, 0.2879,
   

In [38]:
# Metric objects applied to each sequence's predictions in the next cell.
# Macro-averaged two-class accuracy; reported as "BAC" in the summary below.
# NOTE(review): `multiclass=`/`num_classes=` on `Accuracy` look like the
# legacy torchmetrics signature — confirm against the installed version.
metric_bac = Accuracy(
    multiclass=True,
    average='macro',
    num_classes=2,
)
metric_mcc = BinaryMatthewsCorrCoef()
metric_f1 = BinaryF1Score()

In [50]:
# Score IUPred3 against the ground truth, one sequence at a time, and store
# the three per-sequence metric values back on the record.
for record in tqdm(disprot_test_sequences):
    scores = record['iupred']
    truth = record['true_labels']
    record.update(
        iupred_bac=metric_bac(scores, truth),
        iupred_f1=metric_f1(scores, truth),
        iupred_mcc=metric_mcc(scores, truth),
    )

  0%|          | 0/383 [00:00<?, ?it/s]

In [51]:
# Spot-check a single record rather than printing the entire list, which
# would flood the notebook with sequences and tensors.
disprot_test_sequences[:1]

[{'acc': '>disprot|DP03211|full acc=Q9BRT9',
  'seq': 'M T E E V D F L G Q D S D G G S E E V V L T P A E L I E R L E Q A W M N E K F A P E L L E S K P E I V E C V M E Q L E H M E E N L R R A K R E D L K V S I H Q M E M E R I R Y V L S S Y L R C R L M K I E K F F P H V L E K E K T R P E G E P S S L S P E E L A F A R E F M A N T E S Y L K N V A L K H M P P N L Q K V D L F R A V P K P D L D S Y V F L R V R E R Q E N I L V E P D T D E Q R D Y V I D L E K G S Q H L I R Y K T I A P L V A S G A V Q L I',
  'label': '1111111111111111111100000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000',
  'iupred': tensor([0.3779, 0.4230, 0.4656, 0.5020, 0.5298, 0.5476, 0.5549, 0.5520, 0.5403,
          0.5215, 0.4897, 0.4640, 0.4448, 0.4352, 0.4321, 0.4483, 0.4722, 0.4856,
          0.4881, 0.4773, 0.4735, 0.4624, 0.4346, 0.3873, 0.3358, 0.2990, 0.2879,
   

In [55]:
# Unweighted mean of each per-sequence metric over the test set
# (every sequence counts equally, regardless of its length).
mean_iupred_bac, mean_iupred_f1, mean_iupred_mcc = (
    np.mean([record[metric_key] for record in disprot_test_sequences])
    for metric_key in ('iupred_bac', 'iupred_f1', 'iupred_mcc')
)
summary_template = "IUPred3 on DisProt 9.2: BAC={}, F1={}, MCC={}"
print(summary_template.format(mean_iupred_bac, mean_iupred_f1, mean_iupred_mcc))

IUPred3 on DisProt 9.2: BAC=0.6164774894714355, F1=0.3577887713909149, MCC=0.2194518439274758


In [57]:
# Export for flDPnn
# Each record becomes two lines: its 'acc' field (the displayed record's
# value already starts with '>') followed by the sequence with the
# inter-residue spaces removed.
with open('../data/disprot/2022/fldpnn_sequences.fasta', 'w') as fasta_out:
    fasta_out.writelines(
        '{}\n{}\n'.format(record['acc'], record['seq'].replace(' ', ''))
        for record in disprot_test_sequences
    )