In [1]:
import os
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib
from matplotlib import pyplot as plt
from tqdm.auto import tqdm
import sklearn
from sklearn.metrics import roc_curve, auc, average_precision_score, matthews_corrcoef, f1_score, precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42

In [45]:
def _load_embedding_file(folder, stem, variant, trial_num):
    """Load ``{folder}/{stem}_emb_{variant}_{trial_num}.npy`` as a numpy array."""
    # NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects,
    # which can execute code. Only point this at .npy files you produced yourself.
    return np.load(f'{folder}/{stem}_emb_{variant}_{trial_num}.npy', allow_pickle=True)


def _test_rows_seen_in_training(x_test, train):
    """Return the indices of the rows of ``x_test`` that also occur in ``train``."""
    # Rows are turned into tuples so they can be hashed; a single set lookup per
    # test row replaces the pairwise comparison of every test/train row.
    train_rows = {tuple(row) for row in train}
    return [idx for idx in tqdm(range(len(x_test))) if tuple(x_test[idx]) in train_rows]


def _save_unique_test_set(folder, variant, trial_num, x_test, y_test, duplicate_idx):
    """Write the ``*_unique.npy`` test files with the ``duplicate_idx`` rows removed.

    Nothing is written when ``duplicate_idx`` is empty.
    """
    if not duplicate_idx:
        return
    np.save(f"{folder}/x_test_emb_{variant}_{trial_num}_unique.npy",
            np.delete(x_test, duplicate_idx, axis=0))
    np.save(f"{folder}/y_test_emb_{variant}_{trial_num}_unique.npy",
            np.delete(y_test, duplicate_idx, axis=0))


def check_duplicates_fast(folder, trial_num):
    """
    Checks if there are sequences in the test dataset identical to the train/validation datasets
    Saves new test sets if there are identical sequences
    Faster version for larger datasets

    The check is run independently for the "no_labels" and the "with_labels"
    embedding files of the trial.

    Parameters
    ----------
    folder : str
        Directory holding the ``x_train``/``x_validation``/``x_test``/``y_test``
        ``*_emb_{no_labels,with_labels}_{trial_num}.npy`` files.
    trial_num : int
        Trial number used in the file names.

    Side effects
    ------------
    Prints the shape of the combined labeled train+validation array and the
    number of duplicated test rows for each variant. For every variant with at
    least one duplicate, saves ``x_test_emb_<variant>_<trial_num>_unique.npy``
    and ``y_test_emb_<variant>_<trial_num>_unique.npy`` into ``folder``.
    Returns None.
    """
    # Load every input up front so a missing file fails before anything is written.
    splits = {}
    for variant in ('no_labels', 'with_labels'):
        x_train = _load_embedding_file(folder, 'x_train', variant, trial_num)
        x_val = _load_embedding_file(folder, 'x_validation', variant, trial_num)
        x_test = _load_embedding_file(folder, 'x_test', variant, trial_num)
        y_test = _load_embedding_file(folder, 'y_test', variant, trial_num).astype('float32')
        # Train and validation are pooled: a test row leaking into either counts.
        splits[variant] = (np.concatenate((x_train, x_val), axis=0), x_test, y_test)

    print(np.shape(splits['with_labels'][0]))

    for variant, report_name in (('no_labels', 'Unlabeled'), ('with_labels', 'Labeled')):
        train, x_test, y_test = splits[variant]
        # Each variant gets its own index list; the labeled pass previously
        # collected into the unlabeled list and therefore removed nothing.
        duplicate_idx = _test_rows_seen_in_training(x_test, train)
        print(f"{report_name} duplicates: {len(duplicate_idx)}")
        _save_unique_test_set(folder, variant, trial_num, x_test, y_test, duplicate_idx)

In [48]:
# Files live under <dataset>/<ptm>/<org>/Trial<k>/ for trials 1 through 10.
dataset = "LMNglyPred"
ptm = "N-Gly_sequon_specific"
org = "Humans"  # overwritten by the loop below before it is ever read
orgs = ["NGlycoSite", "NGlyDE"]

# Visit every (organism, trial) pair, organisms in list order, trials ascending.
for org, trial_num in ((name, k) for name in orgs for k in range(1, 11)):
    directory = f"{dataset}/{ptm}/{org}/Trial{trial_num}"
    check_duplicates_fast(directory, trial_num)

(24176, 53)


  0%|          | 0/2476 [00:00<?, ?it/s]

Unlabeled duplicates: 0


  0%|          | 0/2476 [00:00<?, ?it/s]

Labeled duplicates: 0
(23986, 53)


  0%|          | 0/2666 [00:00<?, ?it/s]

Unlabeled duplicates: 0


  0%|          | 0/2666 [00:00<?, ?it/s]

Labeled duplicates: 0
(23986, 53)


  0%|          | 0/2666 [00:00<?, ?it/s]

Unlabeled duplicates: 0


  0%|          | 0/2666 [00:00<?, ?it/s]

Labeled duplicates: 0
(23986, 53)


  0%|          | 0/2666 [00:00<?, ?it/s]

Unlabeled duplicates: 0


  0%|          | 0/2666 [00:00<?, ?it/s]

Labeled duplicates: 0
(23986, 53)


  0%|          | 0/2666 [00:00<?, ?it/s]

Unlabeled duplicates: 0


  0%|          | 0/2666 [00:00<?, ?it/s]

Labeled duplicates: 0
(23986, 53)


  0%|          | 0/2666 [00:00<?, ?it/s]

Unlabeled duplicates: 0


  0%|          | 0/2666 [00:00<?, ?it/s]

Labeled duplicates: 0
(23986, 53)


  0%|          | 0/2666 [00:00<?, ?it/s]

Unlabeled duplicates: 0


  0%|          | 0/2666 [00:00<?, ?it/s]

Labeled duplicates: 0
(23986, 53)


  0%|          | 0/2666 [00:00<?, ?it/s]

Unlabeled duplicates: 0


  0%|          | 0/2666 [00:00<?, ?it/s]

Labeled duplicates: 0
(23986, 53)


  0%|          | 0/2666 [00:00<?, ?it/s]

Unlabeled duplicates: 0


  0%|          | 0/2666 [00:00<?, ?it/s]

Labeled duplicates: 0
(23986, 53)


  0%|          | 0/2666 [00:00<?, ?it/s]

Unlabeled duplicates: 0


  0%|          | 0/2666 [00:00<?, ?it/s]

Labeled duplicates: 0
(2722, 53)


  0%|          | 0/302 [00:00<?, ?it/s]

Unlabeled duplicates: 0


  0%|          | 0/302 [00:00<?, ?it/s]

Labeled duplicates: 0
(2721, 53)


  0%|          | 0/303 [00:00<?, ?it/s]

Unlabeled duplicates: 1


  0%|          | 0/303 [00:00<?, ?it/s]

Labeled duplicates: 1
(2721, 53)


  0%|          | 0/303 [00:00<?, ?it/s]

Unlabeled duplicates: 1


  0%|          | 0/303 [00:00<?, ?it/s]

Labeled duplicates: 2
(2721, 53)


  0%|          | 0/303 [00:00<?, ?it/s]

Unlabeled duplicates: 0


  0%|          | 0/303 [00:00<?, ?it/s]

Labeled duplicates: 3
(2721, 53)


  0%|          | 0/303 [00:00<?, ?it/s]

Unlabeled duplicates: 1


  0%|          | 0/303 [00:00<?, ?it/s]

Labeled duplicates: 5
(2721, 53)


  0%|          | 0/303 [00:00<?, ?it/s]

Unlabeled duplicates: 3


  0%|          | 0/303 [00:00<?, ?it/s]

Labeled duplicates: 3
(2721, 53)


  0%|          | 0/303 [00:00<?, ?it/s]

Unlabeled duplicates: 0


  0%|          | 0/303 [00:00<?, ?it/s]

Labeled duplicates: 2
(2721, 53)


  0%|          | 0/303 [00:00<?, ?it/s]

Unlabeled duplicates: 3


  0%|          | 0/303 [00:00<?, ?it/s]

Labeled duplicates: 4
(2721, 53)


  0%|          | 0/303 [00:00<?, ?it/s]

Unlabeled duplicates: 4


  0%|          | 0/303 [00:00<?, ?it/s]

Labeled duplicates: 3
(2721, 53)


  0%|          | 0/303 [00:00<?, ?it/s]

Unlabeled duplicates: 2


  0%|          | 0/303 [00:00<?, ?it/s]

Labeled duplicates: 3
