In [24]:
import pandas as pd
import numpy as np
import h5py

from selfpeptide.utils.data_utils import Self_NonSelf_PeptideDataset

import torch
import torch.nn.functional as F
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from tqdm.notebook import tqdm
from selfpeptide.utils.training_utils import eval_classification_metrics

In [26]:
N_SAMPLES = 50000
hdf5_file = "../processed_data/pre_tokenized_peptides_dataset.hdf5"
dset = Self_NonSelf_PeptideDataset(hdf5_file, gen_size=N_SAMPLES, val_size=0)

In [27]:
data_matrix = dset.peptides
labels = dset.labels.detach().numpy()

In [28]:
labels = (labels+1)/2
labels

array([1., 0., 1., ..., 0., 1., 0.])

In [29]:
onehot_data_matrix = F.one_hot(data_matrix)
onehot_data_matrix

tensor([[[0, 1, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 1, 0, 0],
         [0, 0, 1,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 1],
         [0, 0, 0,  ..., 0, 0, 1],
         [0, 0, 0,  ..., 0, 0, 1]],

        [[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 1, 0, 0],
         [1, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]],

        [[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 1, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]],

        ...,

        [[0, 1, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]],

        [[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0,

In [30]:
onehot_data_matrix = onehot_data_matrix.reshape(N_SAMPLES, -1).detach().numpy()
onehot_data_matrix.shape

(50000, 276)

In [31]:
onehot_data_matrix[0].sum()

12

In [32]:
skf = StratifiedKFold(n_splits=10)


results = []
for i, (train_index, test_index) in tqdm(enumerate(skf.split(onehot_data_matrix, labels))):
    X_train = onehot_data_matrix[train_index]
    X_test = onehot_data_matrix[test_index]
    y_train = labels[train_index]
    y_test = labels[test_index]
    clf = LogisticRegression()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    
    metrics = eval_classification_metrics(y_test, y_pred, is_logit=False, threshold=0.5)
    results.append(metrics)
results = pd.DataFrame(results)
results

0it [00:00, ?it/s]

Unnamed: 0,MCC,FPR,FNR,Specificity,NPV,FDR,Precision,Recall,F1,BalancedAccuracy,AUROC,AUPRC
0,0.124116,0.4596,0.4164,0.5404,0.564799,0.440567,0.559433,0.5836,0.571261,0.562,0.562,0.534685
1,0.089651,0.472,0.4384,0.528,0.546358,0.456656,0.543344,0.5616,0.552321,0.5448,0.5448,0.524342
2,0.094167,0.4828,0.4232,0.5172,0.549979,0.455644,0.544356,0.5768,0.560109,0.547,0.547,0.525585
3,0.09208,0.4748,0.4332,0.5252,0.547997,0.455837,0.544163,0.5668,0.555251,0.546,0.546,0.525031
4,0.084538,0.4864,0.4292,0.5136,0.54476,0.460083,0.539917,0.5708,0.554929,0.5422,0.5422,0.522784
5,0.099717,0.4744,0.426,0.5256,0.552333,0.452499,0.547501,0.574,0.560437,0.5498,0.5498,0.527266
6,0.0909,0.478,0.4312,0.522,0.547629,0.45663,0.54337,0.5688,0.555794,0.5454,0.5454,0.524669
7,0.092027,0.466,0.442,0.534,0.547131,0.455078,0.544922,0.558,0.551383,0.546,0.546,0.525066
8,0.097273,0.4708,0.432,0.5292,0.550562,0.453215,0.546785,0.568,0.557191,0.5486,0.5486,0.526574
9,0.113653,0.4584,0.428,0.5416,0.558581,0.444876,0.555124,0.572,0.563436,0.5568,0.5568,0.531531
