In [10]:
import os
import numpy as np
import pandas as pd
from ast import literal_eval
from tqdm import tqdm
import random
import tensorflow as tf
import re
import matplotlib
from matplotlib import pyplot as plt
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42

In [128]:

#kmers=np.load(f'{folder}/{ptm}_kmers_Pos.npy', allow_pickle=True)
ptm="Phosphorylation_ST"
dataset="MusiteDeep"
org="All_org"
PR="S"
PR2="T"
sym="@"
sym2="&"
trial_num=1
trial="Trial"+str(trial_num)
directory=f"{dataset}/{ptm}/{org}/{trial}"

def unlabeling(dataset, k=53, k0=53):
    """
    Removes labels from a dataset so can be used in CD-Hit
    """
    new = []
    for i in tqdm(range(len(dataset))):
        data = pd.DataFrame(dataset)
        if k!=53:
            short=np.int64((k0-k)/2)
            data=data[short:k0-short]
        else:
            short=0
        string=dataset[i][short:k0-short].replace('-', '').replace(sym, PR).replace(sym2, PR2) #Add correct PTM
        new.append(string)
    return np.array(new)

#test if any overlap
#ukmers=unlabeling(kmers)
#print (ukmers)

def to_fasta (list, out_file, directory=directory):
    """
    Converts a list of kmers to a fasta where the name is the index of the kmer
    """
    out_file = open(f'{directory}/{out_file}', 'w+')
    out = '\n'.join(['>'+str(i) + "\n" + j for i,j in enumerate(list)])
    out_file.write(out)
    out_file.close()


In [124]:
#Remove similar sequences in test set
alphabet_with_XZ = "ARNDCEQGHILKMFPSTWYV@&-UX" # remove & for all but phos S/T and O-gly
alphabet_without_XZ = "ARNDCEQGHILKMFPSTWYV-UX"

def convert_to_kmer (array, alphabet, k=53, k0=53, labels="no"):
    """
    k0 is the length of the kmer in the data
    k is the desired kmer length where k<=k0
    """
    kmers=[]
    for i in range(len(array)):
        new_string=''
        for j in range(k0):
            idx=array[i, j]
            try:
                char=alphabet[idx]
                new_string=new_string+char
            except IndexError: 
                print (str(i))
                print (str(array[i,j]))
        kmers.append(new_string)
    return unlabeling(kmers, k)



trial_num=1
#without labels
def convert_datasets(trial_num, extra='', k=53):
    """
    Given a dataset of encoded kmers, this takes converts it into fasta files, where the name is the index of the kmer
    -Validation and train sets are combined
    -Labels are removed so CD-Hit can can run. Once CD-Hit removes similar kmers, the indexes (sequence names) can be used to remove similar sequences
    in the dataset
    """
    trial="Trial"+str(trial_num)
    directory=f"{dataset}/{ptm}/{org}/{trial}"
    folder=directory

    x_train =np.load(f'{folder}/x_train_emb_no_labels_{trial_num}.npy', allow_pickle = True)
    y_train = np.load(f'{folder}/y_train_emb_no_labels_{trial_num}.npy', allow_pickle = True).astype('float32')
    x_val =np.load(f'{folder}/x_validation_emb_no_labels_{trial_num}.npy', allow_pickle = True)
    y_val = np.load(f'{folder}/y_validation_emb_no_labels_{trial_num}.npy', allow_pickle = True).astype('float32')
    x_test =np.load(f'{folder}/x_test_emb_no_labels_{trial_num}.npy', allow_pickle = True)
    y_test = np.load(f'{folder}/y_test_emb_no_labels_{trial_num}.npy', allow_pickle = True).astype('float32')


    x_trainl =np.load(f'{folder}/x_train_emb_with_labels_{trial_num}.npy', allow_pickle = True)
    y_trainl = np.load(f'{folder}/y_train_emb_with_labels_{trial_num}.npy', allow_pickle = True).astype('float32')
    x_vall =np.load(f'{folder}/x_validation_emb_with_labels_{trial_num}.npy', allow_pickle = True)
    y_vall = np.load(f'{folder}/y_validation_emb_with_labels_{trial_num}.npy', allow_pickle = True).astype('float32')
    x_testl =np.load(f'{folder}/x_test_emb_with_labels_{trial_num}.npy', allow_pickle = True)
    y_testl = np.load(f'{folder}/y_test_emb_with_labels_{trial_num}.npy', allow_pickle = True).astype('float32')
    


    #print (x_test)

    #with labels


    x_testk=convert_to_kmer(x_test, alphabet_without_XZ, k=k)
    to_fasta (x_testk, f"Test_kmers_no_labels{extra}.fasta", directory=folder)
    
    x_traink=convert_to_kmer(x_train, alphabet_without_XZ, k=k)
    x_valk=convert_to_kmer(x_val, alphabet_without_XZ, k=k)
    x_traink=[*x_valk, *x_traink]
    to_fasta (x_traink, f"Train_kmers_no_labels{extra}.fasta", directory=folder)

    x_testkl=convert_to_kmer(x_testl, alphabet_with_XZ, labels="yes", k=k)
    x_trainkl=convert_to_kmer(x_trainl, alphabet_with_XZ, labels="yes", k=k)
    x_valkl=convert_to_kmer(x_vall, alphabet_with_XZ, labels="yes", k=k)
    x_trainkl=[*x_trainkl, *x_valkl]

    to_fasta (x_testkl, f"Test_kmers_with_labels{extra}.fasta", directory=folder)
    to_fasta (x_trainkl, f"Train_kmers_with_labels{extra}.fasta", directory=folder)
    

In [129]:
alphabet_with_XZ = "ARNDCEQGHILKMFPSTWYV@&-UX" # removed & for all but phos S/T and O-gly
alphabet_without_XZ = "ARNDCEQGHILKMFPSTWYV-UX"

ptm="O-GlcNAc"
dataset="O-GlcNAc"
org="Humans"
PR="S"
PR2="T"
sym="@"
sym2="&"
orgs=["Humans"]

for org in orgs:
    for i in range(1, 2):
        convert_datasets(i, extra='_29', k=29)

100%|██████████| 2038/2038 [00:00<00:00, 2937.82it/s]
100%|██████████| 16297/16297 [00:20<00:00, 800.32it/s]
100%|██████████| 2037/2037 [00:00<00:00, 2386.44it/s]
100%|██████████| 2038/2038 [00:00<00:00, 2529.27it/s]
100%|██████████| 16297/16297 [00:24<00:00, 656.27it/s]
100%|██████████| 2037/2037 [00:00<00:00, 3686.26it/s]
