In [1]:
import numpy as np
import tensorflow as tf
import pandas as pd
from typing import List
from dataclasses import dataclass, field
from typing import List

In [28]:
#Some helper functions
def convert_to_onehot(data, alphabet):
    """
    Translates every character of data into its integer position in alphabet.

    Despite the name, the result is a plain list of integer indices, not a
    one-hot matrix.

    data: Iterable of characters (e.g. a string) to encode
    alphabet: Iterable of characters; a character's index is its code. If a
              character occurs more than once, its last position wins.

    Returns a list of ints with one entry per character of data.
    Raises KeyError if data contains a character that is not in alphabet.

    Side effect: rebinds the module-level name char_to_int to the lookup table
    built from alphabet on every call.
    """
    global char_to_int
    #Lookup table: character -> position inside alphabet
    char_to_int = {symbol: position for position, symbol in enumerate(alphabet)}
    return [char_to_int[symbol] for symbol in data]

def tensor_encoding(x_data, depth, type, alphabet, k=53):
    """
    Encodes a collection of sequences either as integer indices or as one-hot tensors.

    x_data: Collection of sequences that supports len() and integer indexing
    depth: Passed to tf.one_hot as the one-hot depth; not used when type == 'emb'
    type: 'emb' returns integer indices, any other value returns one-hot tensors
    alphabet: Characters used to build the character -> index mapping
    k: Expected length of every sequence. A mismatch is only reported by
       printing the sequence followed by "Length off"; it does not stop the run.

    Returns:
        type == 'emb': numpy array with one row of indices per sequence
                       (an empty (0, k) array when x_data is empty)
        otherwise:     list with one tf.one_hot tensor per sequence
                       (an empty list when x_data is empty)
    """
    indices = []
    #Index-based access is kept so x_data is read exactly as before
    for i in range(len(x_data)):
        #Encode once and reuse the result for the length check
        encoded = convert_to_onehot(x_data[i], alphabet)
        indices.append(encoded)
        if len(encoded) != k:
            print (x_data[i])
            print ("Length off")
    if type == 'emb':
        if not indices:
            #np.stack refuses an empty list, so build the empty result directly
            return np.empty((0, k), dtype=int)
        return np.stack(indices, axis=0)
    #Plain iteration: tqdm is not imported in this notebook, so wrapping the
    #loop in it raised NameError on a fresh kernel
    return [tf.one_hot(encoded, depth) for encoded in indices]

In [29]:
#Change these for your particular model

##############################################################################################################
#Use the alphabets that were used in constructing the datasets; these are the example alphabets
alphabet_with_labels = "ARNDCEQGHILKMFPSTWYV@&-UX"
alphabet_without_labels = "ARNDCEQGHILKMFPSTWYV-UX"
#Characters whose positions in the sequence are treated as candidate sites by predict()
PR = "S"
PR2 = "T"
#Location pieces used by predict() to build the model directory and the results file name
folder = "Example_data"
model = "PreSprint"
sequence = ""
#Select model number to use, usually choose 1 as default or the upper median model in metric of choice
model_num = 4
##############################################################################################################

In [50]:
def get_kmer(seq, location, k=53):
    """
    Returns the kmer of length k centred on a 1-based location of seq.
    Positions that fall before the start or after the end of seq are padded with "-".

    seq: Sequence string
    location: 1-based position of the central character (1 <= location <= len(seq))
    k: Window length; must be a positive odd integer so the site sits in the centre

    Returns the kmer (always exactly k characters), or '' when location lies
    outside seq. In that case a message is printed and no kmer is produced.
    Raises ValueError if k is not a positive odd number.
    """
    if k < 1 or k % 2 == 0:
        raise ValueError("k must be a positive odd integer, got "+str(k))
    half=int((k-1)//2)
    if location < 1 or location > len(seq): #Will not produce a kmer
        print ("Site outside of seq bounds, site: "+str(location)+", sequnce length: "+str(len(seq)))
        return ''
    #Padding both ends by half lets one slice cover sites near the N terminus,
    #sites near the C terminus and sequences shorter than k
    padded="-"*half+seq+"-"*half
    start=location-1 #index of the first window character inside padded
    kmer=padded[start : start+2*half+1]
    #Sanity check against the requested length
    assert len(kmer) == k
    return kmer

def predict (sequence, model_num, seq_name, folder=folder, model=model, PR=PR, PR2=PR2, k=53):
    """
    Predicts the probability of a modification occuring at a given site
    Runs the with and no labels model (to compare if there is a difference)

    sequence: Sequence containing only characters in the alphabet
    model_num: Integer for which model to use from the 10-fold validation. Use 1 if unsure
    seq_name: Name of the sequence to report (used in the results file name)
    folder: Directory holding the model folders; the results CSV is written here
    model: Name of the model folder inside folder; also part of the results file name
    PR: Character whose positions in sequence are scored
    PR2: Optional second character to score; pass '' to disable
    k: Length of the kmer extracted around every site

    Note: the folder/model/PR/PR2 defaults are captured when this cell is run,
    so re-run it after editing the configuration cell.

    Returns a DataFrame with the columns "Site" (1-based position),
    "No labels model" and "With PTM labels model". Sites of PR are listed
    first, followed by the sites of PR2. The same table is printed and saved to
    {folder}/{model}_{seq_name}_prediction_results.csv. If sequence contains
    no PR/PR2 characters the table is empty and the models are not loaded.
    """
    trial_num=model_num
    directory=f'{folder}/{model}/Trial{trial_num}'

    #1-based positions of every candidate character, PR first and then PR2
    sites=[]
    for residue in (PR, PR2):
        if residue == '':
            continue
        sites.extend(i + 1 for i, char in enumerate(sequence) if char == residue)
    kmers=[get_kmer(sequence, s, k=k) for s in sites]

    if kmers:
        labeled_model = tf.keras.models.load_model(f"{directory}/emb_CNN_with_labels_{trial_num}.h5")
        unlabeled_model = tf.keras.models.load_model(f"{directory}/emb_CNN_no_labels_{trial_num}.h5")

        #Each model gets the kmers encoded with its own alphabet
        tensor1 = tensor_encoding(kmers, 23, 'emb', alphabet_without_labels, k=k)
        tensor2 = tensor_encoding(kmers, 23, 'emb', alphabet_with_labels, k=k)

        #Keep only the first output column of every prediction
        nl_y_pred = unlabeled_model.predict(tensor1)[:,0]
        l_y_pred = labeled_model.predict(tensor2)[:,0]
    else:
        #Nothing to score: stacking an empty kmer list would raise, so return an empty table
        nl_y_pred = np.empty(0, dtype=float)
        l_y_pred = np.empty(0, dtype=float)

    results={"Site": sites, "No labels model":nl_y_pred, "With PTM labels model":l_y_pred}
    df = pd.DataFrame(results)

    df.to_csv(f"{folder}/{model}_{seq_name}_prediction_results.csv")
    print (df)
    return (df)
    
    

In [51]:
#Example run: score the TNNC2 sequence with two different trained models
test_seq_TNNC2="MTDQQAEARSYLSEEMIAEFKAAFDMFDADGGGDISVKELGTVMRMLGQTPTKEELDAIIEEVDEDGSGTIDFEEFLVMMVRQMKEDAKGKSEEELAECFRIFDRNADGYIDPEELAEIFRASGEHVTDEEIESLMKDGDKNNDGRIDFDEFLKMMEGVQ"
#Trial 2 of the default model
predict(sequence=test_seq_TNNC2, model_num=2, seq_name="TNNC2")
#Trial 9 of the PostSprint model
predict(sequence=test_seq_TNNC2, model_num=9, seq_name="TNNC2", model="PostSprint")

    Site  No labels model  With PTM labels model
0     10         0.661830               0.502870
1     13         0.672783               0.469449
2     36         0.430824               0.408572
3     68         0.161512               0.077070
4     92         0.452066               0.207154
5    123         0.626476               0.641393
6    134         0.631561               0.488063
7      2         0.602398               0.584023
8     42         0.483407               0.305094
9     50         0.746502               0.762836
10    52         0.621401               0.499851
11    70         0.449655               0.540768
12   128         0.773318               0.738562
    Site  No labels model  With PTM labels model
0     10         0.545871               0.700567
1     13         0.566638               0.671037
2     36         0.281840               0.265242
3     68         0.454883               0.455231
4     92         0.470122               0.673074
5    123         0.5

Unnamed: 0,Site,No labels model,With PTM labels model
0,10,0.545871,0.700567
1,13,0.566638,0.671037
2,36,0.28184,0.265242
3,68,0.454883,0.455231
4,92,0.470122,0.673074
5,123,0.574206,0.800132
6,134,0.425192,0.553525
7,2,0.709808,0.735771
8,42,0.66921,0.644399
9,50,0.826953,0.843088
