# import

In [1]:
import tensorflow as tf
import torch
import numpy as np
import pandas as pd
import re
import os
from scipy.io import savemat,loadmat

# my functions and constants

In [11]:
AAs = ['Q', 'L', 'N', 'G', 'R', 'F', '_', 'W', 'T', 'E', 'K', 'I', 'D', 'V', 'Y', 'S', 'A', 'C', 'M', 'H', 'P']

columns = ["Entry","Sequence","Position","SeqWin","Type","Species","PMIDs"]

# dict_Acc_to_Entry
with open("./orig_dataset/uniprot-human-filtered-reviewed_yes+AND+organism__Homo+sapiens+(Human)--.txt") as f:
    text = f.read()

p5 = re.compile("ID   .+\nAC   (.+);\n")

Accessions = [i.split("; ") for i in p5.findall(text)]

dict_Acc_to_Entry = dict()

for Accs in Accessions:
    for Acc in Accs:
        dict_Acc_to_Entry[Acc] = Accs[0]
# dict_Acc_to_Entry done

# df_0
with open("./orig_dataset/uniprot_sp_isoform.fasta") as f:
    text = f.read()

p1 = re.compile(">sp\|(.+?)\|")
p2 = re.compile(">sp\|.+?\n(.+?)(?=\n>sp|\n$)",re.DOTALL)

Entrys = p1.findall(text)

Sequences = [i.replace("\n","") for i in p2.findall(text)]

df_0 = pd.DataFrame(dict(Entry=Entrys,Sequence=Sequences))
# df_0 done

def fun_searchID_in_uniprot(ACC_ID:str):
    import requests
    url = 'https://www.uniprot.org/uploadlists/' 

    params = {
    'from': 'ACC+ID',
    'to': 'ACC',
    'format': 'tab',
    'query': ACC_ID,
    'head': "chrome"
    }

    res_get = requests.request("GET",url,params=params)  
    return res_get

def F_pad(s,pad="_",n=20):
    return pad*n+s+pad*n

def fun_reassign_Position(df,n=7):
    df1 = df[df.apply(lambda x: F_pad(x.Sequence,n=n)[x.Position-1:x.Position+2*n]==x.SeqWin, 1)].copy()
    df2 = df[df.apply(lambda x: F_pad(x.Sequence,n=n)[x.Position-1:x.Position+2*n]!=x.SeqWin, 1)].copy()
    if len(df2)!=0:
        df2 = df2[df2.apply(lambda x: F_pad(x.Sequence,n=n).find(x.SeqWin)!=-1, 1)].copy()
        if len(df2)!=0:
            df2.Position = df2.apply(lambda x: F_pad(x.Sequence,n=n).find(x.SeqWin)+1, 1)
        df = pd.concat([df1,df2])
    else:
        df = df1
    df["SeqWin"] = df.apply(lambda x: F_pad(x.Sequence,n=30)[x.Position-1:x.Position+2*30], 1)
    return df

def fun_Km123e(df,TYPE):  
    temp = df.copy()

    temp["Accession"] = temp.Protein.str.split("|").str[1]

    temp.Position = temp.Position.astype(int)
    temp["SeqWin"] = temp["Sequence"].str.split(";").str[0]
    temp["Species"] = temp.Protein.str.split("_").str[-1]
    temp["Type"] = TYPE
    temp["PMID"] = "30395435"
    temp = temp[["Accession","Position","SeqWin","Type","Species","PMID"]].copy()

    temp = fun_ID_convert_sp_isoform(temp).merge(df_0)
    temp = fun_reassign_Position(temp,n=15)
    temp["Evidence"] = "PMID:"+temp.PMID
    temp = temp[["Entry","Position","Type","Evidence","Species","Sequence","SeqWin"]].copy()
    temp.drop_duplicates(["Entry","Position","Type","Evidence"],inplace=True)
    temp.Species = "human"
    return temp

def fun_reassign_Position(df,n=7):
    df1 = df[df.apply(lambda x: F_pad(x.Sequence,n=n)[x.Position-1:x.Position+2*n]==x.SeqWin, 1)].copy()
    df2 = df[df.apply(lambda x: F_pad(x.Sequence,n=n)[x.Position-1:x.Position+2*n]!=x.SeqWin, 1)].copy()
    if len(df2)!=0:
        df2 = df2[df2.apply(lambda x: F_pad(x.Sequence,n=n).find(x.SeqWin)!=-1, 1)].copy()
        df2.Position = df2.apply(lambda x: F_pad(x.Sequence,n=n).find(x.SeqWin)+1, 1)
        df = pd.concat([df1,df2])
    else:
        df = df1
    df["SeqWin"] = df.apply(lambda x: F_pad(x.Sequence,n=30)[x.Position-1:x.Position+2*30], 1)
    return df

def fun_ID_convert_sp_isoform(df,Acc_col="Accession",inplace=False):
    if inplace==False:
        df = df.copy()
    
    isoform = df[Acc_col].apply(lambda x: "-"+x.split("-")[1] if "-" in x else "")
    
    df[Acc_col] = df[Acc_col].str.split("-").str[0].apply(lambda x: dict_Acc_to_Entry.get(x))
    df[Acc_col] = df[Acc_col]+isoform
    
    df.rename(columns={Acc_col:"Entry"},inplace=True)
    
    if inplace==False:
        return df
def fun_split_PMIDs(df,PMID_col="PMIDs"):
    pmids = re.split("\W",df[PMID_col].iloc[0])
    dfs = list()
    for pmid in pmids:
        temp = df.copy()
        temp[PMID_col] = pmid
        dfs.append(temp)
    return pd.concat(dfs).rename(columns={PMID_col:"PMID"})

dict_Type = {"K.all":"Kme","K.mono":"Km1","K.di":"Km2","K.tri":"Km3"}
def fun_split_Types(df,Type_col="Type"):
    types = re.split(";",df[Type_col].iloc[0])
    dfs = list()
    for tp in types:
        temp = df.copy()
        temp[Type_col] = dict_Type.get(tp)
        dfs.append(temp)
    return pd.concat(dfs)

def fun_keepsame(df1,df2,cols=["Entry","Position","Type"]):
    key1 = df1.apply(lambda x: x[cols[0]]+"_"+str(x[cols[1]])+"_"+x[cols[2]],1)
    key2 = df2.apply(lambda x: x[cols[0]]+"_"+str(x[cols[1]])+"_"+x[cols[2]],1)
    
    return df1[key1.isin(key2)]
def fun_keepdiff(df1,df2,cols=["Entry","Position","Type"]):
    key1 = df1.apply(lambda x: x[cols[0]]+"_"+str(x[cols[1]])+"_"+x[cols[2]],1)
    key2 = df2.apply(lambda x: x[cols[0]]+"_"+str(x[cols[1]])+"_"+x[cols[2]],1)
    
    return df1[~key1.isin(key2)]

# orig_dataset

## PLMD: a database for protein lysine modifications

In [22]:
df_1 = pd.read_csv("./orig_dataset/Methylation.zip",sep='\t').iloc[:,[1,2,3,4,5,6]].query("Species=='Homo sapiens'")

# ID_convert
df_1 = fun_ID_convert_sp_isoform(df_1,"Uniprot Accession").merge(df_0)

# "K"
df_1.Position = df_1.Position.astype(int)
df_1 = df_1[df_1.apply(lambda x: x.Sequence[x.Position-1:x.Position]=="K",1)]

# Km1,Km2,Km3,Kme
print(df_1.Type.unique())

# human
print(df_1.Species.unique())

df_1.Type = "Kme"

df_1.Species = "human"

# PMID
df_1 = df_1.groupby(level=0).apply(lambda x: fun_split_PMIDs(x,"PMIDs"))
df_1.index = range(len(df_1))

df_1["SeqWin"] = df_1.apply(lambda x: F_pad(x.Sequence,n=30)[x.Position-1:x.Position+2*30], 1)

# Evidence
df_1["Evidence"] = "PMID:"+df_1.PMID

df_1 = df_1[["Entry","Position","Type","Evidence","Species","Sequence","SeqWin"]].copy()

df_1.drop_duplicates(["Entry","Position","Type","Evidence"],inplace=True)

df_1

['Methylation']
['Homo sapiens']


Unnamed: 0,Entry,Position,Type,Evidence,Species,Sequence,SeqWin
0,O00139,161,Kme,PMID:23644510,human,MATANFGKIQIGIYVEIKRSDGRIHQAMVTSLNEDNESVTVEWIEN...,QQNGSVSDISPVQAAKKEFGPPSRRKSNCVKEVEKLQEKREKRRLQ...
1,O00139,161,Kme,PMID:25514926,human,MATANFGKIQIGIYVEIKRSDGRIHQAMVTSLNEDNESVTVEWIEN...,QQNGSVSDISPVQAAKKEFGPPSRRKSNCVKEVEKLQEKREKRRLQ...
2,O00139,169,Kme,PMID:23644510,human,MATANFGKIQIGIYVEIKRSDGRIHQAMVTSLNEDNESVTVEWIEN...,ISPVQAAKKEFGPPSRRKSNCVKEVEKLQEKREKRRLQQQELREKR...
3,O00139,169,Kme,PMID:25514926,human,MATANFGKIQIGIYVEIKRSDGRIHQAMVTSLNEDNESVTVEWIEN...,ISPVQAAKKEFGPPSRRKSNCVKEVEKLQEKREKRRLQQQELREKR...
4,O00159,383,Kme,PMID:23161681,human,MALQVELVPTGEIIRVVHPHRPCKLALGSDGVRVTMESALTARDRV...,LTHRKIIAKGEELLSPLNLEQAAYARDALAKAVYSRTFTWLVGKIN...
...,...,...,...,...,...,...,...
2595,B0I1T2,518,Kme,PMID:25514926,human,MEDEEGPEYGKPDFVLLDQVTMEDFMRNLQLRFEKGRIYTYIGEVL...,DMHHRHHLHYTSRQLCPTDKTMEFGRDFRIKHYAGDVTYSVEGFID...
2596,B2RTY4,1885,Kme,PMID:25514926,human,MNINDGGRRRFEDNEHTLRIYPGAISEGTIYCPIPARKNSTAAEVI...,VQIIASVSDLKSMDEFLLKKVNDLDNEDSKKDTLVDVVFKKALKEF...
2597,B2RXF5,247,Kme,PMID:25514926,human,MEFPEHGGRLLGRLRQQRELGFLCDCTVLVGDARFPAHRAVLAACS...,PGAQPLVKDERDSLSEQEESSSSRSPHSPPKPPPVPAAKGLVVGLQ...
2598,B4DYI2,863,Kme,PMID:23644510,human,MENLPFPLKLLSASSLNTPSSTPWVLDIFLTLVFALGFFFLLLPYF...,KSSLLPRMSVSQDPRKLCLMEEAVSEFEPGKATKSETQPQVSATVV...


In [173]:
df_1["Source"] = "PLMD"

In [175]:
df_1.to_csv("./datasets/PLMD.csv",index=False)

## dataset of lysine methylation sites predictor GPS-MSP

In [24]:
df_2 = pd.read_csv("./orig_dataset/k.all.txt",sep='\t',header=None)[[0,1,2,4,5,6]].drop_duplicates()

df_2.columns = ["Accession","Sequence","Position","PMIDs","Type","Species"]

# ID_convert
df_2 = fun_ID_convert_sp_isoform(df_2,"Accession").merge(df_0)

# "K"
df_2.Position = df_2.Position.astype(int)
df_2 = df_2[df_2.apply(lambda x: x.Sequence[x.Position-1:x.Position]=="K",1)]

# Km1,Km2,Km3,Kme
print(df_2.Type.unique())

df_2 = df_2.groupby(level=0).apply(lambda x: fun_split_Types(x,"Type"))
df_2.index = range(len(df_2))

# human
print(df_2.Species.unique())

df_2.Species = "human"

# PMID
df_2 = df_2.groupby(level=0).apply(lambda x: fun_split_PMIDs(x,"PMIDs"))
df_2.index = range(len(df_2))

df_2["SeqWin"] = df_2.apply(lambda x: F_pad(x.Sequence,n=30)[x.Position-1:x.Position+2*30], 1)

# Evidence
df_2["Evidence"] = "PMID:"+df_2.PMID

df_2 = df_2[["Entry","Position","Type","Evidence","Species","Sequence","SeqWin"]].copy()

df_2.drop_duplicates(["Entry","Position","Type","Evidence"],inplace=True)

df_2

['K.all' 'K.mono;K.all' 'K.di;K.all' 'K.tri;K.di;K.all'
 'K.tri;K.di;K.mono;K.all' 'K.tri;K.mono;K.all' 'K.di;K.tri;K.all'
 'K.tri;K.all' 'K.di;K.mono;K.all' 'K.di;K.mono;K.tri;K.all'
 'K.mono;K.tri;K.all' 'K.di;K.tri;K.mono;K.all']
['Homo sapiens']


Unnamed: 0,Entry,Position,Type,Evidence,Species,Sequence,SeqWin
0,O95256,554,Kme,PMID:23644510,human,MLCLGWIFLWLVAGERIKGFNISGCSTKKLLWTYSTRSEEEFVLFC...,PHLVKKALRVLPTVTWRGLKSVPPNSRFWAKMRYHMPVKNSQGFTW...
1,P28845,56,Kme,PMID:23644510,human,MAFMKKYLLPILGLFMAYYYYSANEEFRPEMLQGKKVIVTGASKGI...,EFRPEMLQGKKVIVTGASKGIGREMAYHLAKMGAHVVVTARSKETL...
2,P23527,47,Km1,PMID:uniprot,human,MPDPAKSAPAPKKGSKKAVTKAQKKDGKKRKRSRKESYSIYVYKVL...,KAVTKAQKKDGKKRKRSRKESYSIYVYKVLKQVHPDTGISSKAMGI...
3,P23527,47,Kme,PMID:uniprot,human,MPDPAKSAPAPKKGSKKAVTKAQKKDGKKRKRSRKESYSIYVYKVL...,KAVTKAQKKDGKKRKRSRKESYSIYVYKVLKQVHPDTGISSKAMGI...
4,P23527,58,Km2,PMID:uniprot,human,MPDPAKSAPAPKKGSKKAVTKAQKKDGKKRKRSRKESYSIYVYKVL...,KKRKRSRKESYSIYVYKVLKQVHPDTGISSKAMGIMNSFVNDIFER...
...,...,...,...,...,...,...,...
1484,P05496,104,Kme,PMID:7575423,human,MQTAGALFISPALIRCCTRGLIRPVSASFLNSPVNSSKQPSYSNFP...,AATVGVAGSGAGIGTVFGSLIIGYARNPSLKQQLFSYAILGFALSE...
1485,Q6XPR3,233,Kme,PMID:23644510,human,MAQLLNSILSVIDVFHKYAKGNGDCALLCKEELKQLLLAEFGDILQ...,DSSSGKKVSHKSTSGQAKWQGHIFALNRCEKPIQDSHYGQSERHTQ...
1486,Q9Y603,310,Kme,PMID:23644510,human,MQEGELAISPISPVAAMPPLGTHVQARCEAQINLLGEGGICKLPGR...,SRALRHYYKLNIIKKEPGQKLLFRFLKTPGKMVQDKHSHLEPLESQ...
1487,Q5VSL9,722,Kme,PMID:23644510,human,MEPAVGGPGPLIVNNKQPQPPPPPPPAAAQPPPGAPRAAAGLLPGG...,FKSAPILKRALKVKQAMMQLYVLKLLKVQTKYLGRQWRKSNMKTMS...


In [177]:
df_2["Source"] = "GPS-MSP"

In [178]:
df_2.to_csv("./datasets/GPS-MSP.csv",index=False)

## PhosphoSitePlus

In [84]:
df_3 = pd.read_csv("./orig_dataset/Methylation_site_dataset.gz",sep='\t',header=2)[["ACC_ID","MOD_RSD","ORGANISM","SITE_+/-7_AA"]]

df_3 = df_3[(df_3.MOD_RSD.str[0]=="K")].copy()

df_3["Type"] = "K"+df_3.MOD_RSD.str.split("-").str[1]

df_3["Position"] = df_3.MOD_RSD.str.extract("(\d+)").astype(int)

df_3["SeqWin"] = df_3["SITE_+/-7_AA"].str.upper()

df_3 = df_3[["ACC_ID","Position","SeqWin","Type","ORGANISM"]].copy()

df_3.columns = ["Accession","Position","SeqWin","Type","Species"]

df_3 = fun_ID_convert_sp_isoform(df_3,"Accession").merge(df_0)

# human
print(df_3.Species.unique())

# initial SeqWin
print(df_3.SeqWin.str.len().unique(),df_3.SeqWin.str[7].unique())

df_3 = fun_reassign_Position(df_3)

df_3 = df_3[["Entry","Position","Type","Species","Sequence","SeqWin"]].copy()
df_3.drop_duplicates(["Entry","Position","Type"],inplace=True)
df_3

['human']
[15] ['K']


Unnamed: 0,Entry,Position,Type,Species,Sequence,SeqWin
0,P31946,51,Km1,human,MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLL...,YDDMAAAMKAVTEQGHELSNEERNLLSVAYKNVVGARRSSWRVISS...
1,P62258,153,Km1,human,MDDREDLVYQAKLAEQAERYDEMVESMKKVAGMDVELTVEERNLLS...,KMKGDYHRYLAEFATGNDRKEAAENSLVAYKAASDIAMTELPPTHP...
2,Q04917,50,Km1,human,MGDREQLLQRARLAEQAERYDDMASAMKAVTELNEPLSNEDRNLLS...,YDDMASAMKAVTELNEPLSNEDRNLLSVAYKNVVGARRSSWRVISS...
3,P61981,50,Km1,human,MVDREQLVQKARLAEQAERYDDMAAAMKNVTELNEPLSNEERNLLS...,YDDMAAAMKNVTELNEPLSNEERNLLSVAYKNVVGARRSSWRVISS...
4,P27348,212,Km1,human,MEKTELIQKAKLAEQAERYDDMATCMKAVTEQGAELSNEERNLLSV...,LNNPELACTLAKTAFDEAIAELDTLNEDSYKDSTLIMQLLRDNLTL...
...,...,...,...,...,...,...
5045,P49750,781,Km1,human,MYPNWGRYGGSSHYPPPPVPPPPPVALPEASPGPGYSSSTTPAAPS...,SSYLESPRGPRFDGPRRFEDLGSRCEGPRPKGPRFEGNRPDGPRPR...
5046,P49750,808,Km1,human,MYPNWGRYGGSSHYPPPPVPPPPPVALPEASPGPGYSSSTTPAAPS...,PRPKGPRFEGNRPDGPRPRYEGHPAEGTKSKWGMIPRGPASQFYIT...
5047,P49750,838,Km1,human,MYPNWGRYGGSSHYPPPPVPPPPPVALPEASPGPGYSSSTTPAAPS...,KWGMIPRGPASQFYITPSTSLSPRQSGPQWKGPKPAFGQQHQQQPK...
5048,P49750,1053,Km1,human,MYPNWGRYGGSSHYPPPPVPPPPPVALPEASPGPGYSSSTTPAAPS...,PGQSRMEDTRDKGLVNRGRGQAISRGPGLVKQEDFRDKMMGRREDS...


In [85]:
import os

path = pd.Series(os.listdir("./ID_convert_list/"))

CSTCS = path[path.str.startswith("PSP-CuratedInfo-CSTCS")]
PMID = path[path.str.startswith("PSP-CuratedInfo-PMID")]

dict1 = dict(Monomethyl="Km1",Dimethyl="Km2",Trimethyl="Km3")

In [86]:
df_3_withEvidence = list()
for i in PMID.values:
    temp = pd.read_excel(f"./ID_convert_list/{i}",header=3)[["ACC#","RSD","ORGANISM","MOD_TYPE","SITE_+/-7_AA"]]
    temp = temp[temp.RSD.str[0]=="K"].copy()
    temp["Type"] = temp.MOD_TYPE.apply(lambda x:dict1.get(x))
    temp["Species"] = temp.ORGANISM
    temp["Position"] = temp.RSD.str[1:].astype(int)
    temp["SeqWin"] = temp["SITE_+/-7_AA"].str.upper()
    temp = fun_ID_convert_sp_isoform(temp,"ACC#")[["Entry","Position","SeqWin","Type","Species"]].merge(df_0)
    if len(temp)==0:
        continue
    temp = fun_reassign_Position(temp)
    temp["PMID"] = i.split("#")[-1].split(".")[0]
    temp["Evidence"] = "PMID:"+temp.PMID
    temp = temp[["Entry","Position","Type","Evidence","Species","Sequence","SeqWin"]].copy()
    temp.drop_duplicates(["Entry","Position","Type","Evidence"],inplace=True)
    df_3_withEvidence.append(temp)
    
for i in CSTCS.values:
    temp = pd.read_excel(f"./ID_convert_list/{i}",header=3)[["ACC#","RSD","ORGANISM","MOD_TYPE","SITE_+/-7_AA"]]
    temp = temp[temp.RSD.str[0]=="K"].copy()
    temp["Type"] = temp.MOD_TYPE.apply(lambda x:dict1.get(x))
    temp["Species"] = temp.ORGANISM
    temp["Position"] = temp.RSD.str[1:].astype(int)
    temp["SeqWin"] = temp["SITE_+/-7_AA"].str.upper()
    temp = fun_ID_convert_sp_isoform(temp,"ACC#")[["Entry","Position","SeqWin","Type","Species"]].merge(df_0)
    temp = fun_reassign_Position(temp)
    temp["CSTCS"] = i.split("#")[-1].split(".")[0]
    temp["Evidence"] = "CSTCS:"+temp.CSTCS
    temp = temp[["Entry","Position","Type","Evidence","Species","Sequence","SeqWin"]].copy()
    temp.drop_duplicates(["Entry","Position","Type","Evidence"],inplace=True)
    
    df_3_withEvidence.append(temp)

df_3_withEvidence = pd.concat(df_3_withEvidence)

df_3_withEvidence.dropna(inplace=True)

In [187]:
df_3_withEvidence["Source"] = "PhosphoSitePlus_withEvidence"
df_3["Source"] = "PhosphoSitePlus"

In [181]:
df_3_withEvidence.to_csv("./datasets/PhosphoSitePlus_withEvidence.csv",index=False)

In [180]:
df_3.to_csv("./datasets/PhosphoSitePlus.csv",index=False)

## dbPTM: an integrated resource for protein post-translational modifications (PTMs)

In [87]:
df_4 = pd.read_csv("./orig_dataset/Methylation.txt.gz",sep='\t',header=None)
df_4 = df_4[df_4[5].str[10]=="K"].copy()
df_4["Species"] = df_4[0].str.split("_").str[1]

df_4["Position"] = df_4[2].astype(int)

df_4["Accession"] = df_4[1]

df_4["Type"] = df_4[3].replace("Methylation","Kme")

df_4["PMIDs"] = df_4[4]
df_4["SeqWin"] = df_4[5]
df_4 = df_4[["Accession","Position","SeqWin","Type","Species","PMIDs"]].copy()

df_4 = fun_ID_convert_sp_isoform(df_4,"Accession").merge(df_0)

df_4 = fun_reassign_Position(df_4,10)
df_4 = df_4.groupby(level=0).apply(lambda x: fun_split_PMIDs(x,"PMIDs"))
df_4["Evidence"] = "PMID:"+df_4.PMID
df_4 = df_4[["Entry","Position","Type","Evidence","Species","Sequence","SeqWin"]].copy()
df_4.drop_duplicates(["Entry","Position","Type","Evidence"],inplace=True)
df_4.Species = "human"
df_4

Unnamed: 0,Unnamed: 1,Entry,Position,Type,Evidence,Species,Sequence,SeqWin
0,0,P62258,215,Kme,PMID:23644510,human,MDDREDLVYQAKLAEQAERYDEMVESMKKVAGMDVELTVEERNLLS...,LNSPDRACRLAKAAFDDAIAELDTLSEESYKDSTLIMQLLRDNLTL...
1,1,P27348,212,Kme,PMID:25514926,human,MEKTELIQKAKLAEQAERYDDMATCMKAVTEQGAELSNEERNLLSV...,LNNPELACTLAKTAFDEAIAELDTLNEDSYKDSTLIMQLLRDNLTL...
2,2,P63104,212,Kme,PMID:25514926,human,MDKNELVQKAKLAEQAERYDDMAACMKSVTEQGAELSNEERNLLSV...,LNSPEKACSLAKTAFDEAIAELDTLSEESYKDSTLIMQLLRDNLTL...
3,3,Q96QU6,43,Kme,PMID:23644510,human,MFTLPQKDFRAPTTCLGPTCMQDLGSSHGEDLEGECSRKLDQKLPE...,TTCLGPTCMQDLGSSHGEDLEGECSRKLDQKLPELRGVGDPAMISS...
4,4,Q16537,449,Kme,PMID:23644510,human,MSSAPTTPPSVDKVDGFSRKSVRKARQKRSQSSSQFRSQGKPIELT...,STMFDELTATYKSDRQREKKKEKEREELWKKLEDLELKRGLRRDGI...
...,...,...,...,...,...,...,...,...
3702,3702,Q15942,165,Kme,PMID:7960499,human,MAAPRPSPAISVSVSAPAFYAPQKKFGPVVAPKPKVNPFRPGDSEP...,PQPREKVSSIDLEIDSLSSLLDDMTKNDPFKARVSSGYVPPPVATP...
3703,3703,Q15942,201,Kme,PMID:115978009,human,MAAPRPSPAISVSVSAPAFYAPQKKFGPVVAPKPKVNPFRPGDSEP...,GYVPPPVATPFSSKSSTKPAAGGTAPLPPWKSPSSSQPLPQVPAPA...
3704,3704,Q15942,272,Kme,PMID:87857,human,MAAPRPSPAISVSVSAPAFYAPQKKFGPVVAPKPKVNPFRPGDSEP...,TQPVSLANTQPRGPPASSPAPAPKFSPVTPKFTPVASKFSPGAPGG...
3705,3705,Q15942,279,Kme,PMID:70773,human,MAAPRPSPAISVSVSAPAFYAPQKKFGPVVAPKPKVNPFRPGDSEP...,NTQPRGPPASSPAPAPKFSPVTPKFTPVASKFSPGAPGGSGSQPNQ...


In [None]:
df_4["Source"] = "dbPTM"

In [182]:
df_4.to_csv("./datasets/dbPTM.csv",index=False)

## Affinity Purification of Methyllysine Proteome by Site-Specific Covalent Conjugation

In [151]:
df_Km1 = pd.read_excel("./orig_dataset/Copy+of+proteinGroups+in+solution_wechat_lilei_wangrui.xlsx","K(me)1Sites")

df_Km2 = pd.read_excel("./orig_dataset/Copy+of+proteinGroups+in+solution_wechat_lilei_wangrui.xlsx","K(me)2Sites")

df_Km3 = pd.read_excel("./orig_dataset/Copy+of+proteinGroups+in+solution_wechat_lilei_wangrui.xlsx","k(me)3Sites")

df_Kme = pd.read_excel("./orig_dataset/Copy+of+proteinGroups+in+solution_wechat_lilei_wangrui.xlsx","Kme")

df_Km1 = df_Km1[df_Km1.Specificity.astype(str).str.contains("HP1b")].drop_duplicates(["Proteins","Positions within proteins"])[["Protein","Position",'Sequence window']].copy()

df_Km2 = df_Km2[df_Km2.Specificity.astype(str).str.contains("HP1b")].drop_duplicates(["Proteins","Positions within proteins"])[["Protein","Position",'Sequence window']].copy()

df_Km3 = df_Km3[df_Km3.Specificity.astype(str).str.contains("HP1b")].drop_duplicates(["Proteins","Positions within proteins"])[["Protein","Position",'Sequence window']].copy()

df_Kme = df_Kme[df_Kme.Specificity.astype(str).str.contains("HP1b")].drop_duplicates(["Proteins","Positions within proteins"])[["Protein","Position",'Sequence window']].copy()

df_Km1 = fun_Km123e(df_Km1.rename(columns={"Sequence window":"Sequence"}),"Km1")

df_Km2 = fun_Km123e(df_Km2.rename(columns={"Sequence window":"Sequence"}),"Km2")

df_Km3 = fun_Km123e(df_Km3.rename(columns={"Sequence window":"Sequence"}),"Km3")

df_Kme = fun_Km123e(df_Kme.rename(columns={"Sequence window":"Sequence"}),"Kme")

df_5 = pd.concat([df_Km1,df_Km2,df_Km3,df_Kme])

df_5.drop_duplicates(["Entry","Position","Type","Evidence"],inplace=True)
df_5

Unnamed: 0,Entry,Position,Type,Evidence,Species,Sequence,SeqWin
0,A2RU48,82,Km1,PMID:30395435,human,MAQSDFLYPENPKRREEVNRLHQQLLDCLSDSFDVTNKLTEVLNMH...,ASIEMKRDGTIKENCDLIIQAIMKIQKELQKVDEALKDKLEPTLYR...
1,O14793,141,Km1,PMID:30395435,human,MQKLQLCVYIYLFMLIVAGPVDLNENSEQKENVEKEGLCNACTWRQ...,YHATTETIITMPTESDFLMQVDGKPKCCFFKFSSKIQYNKVVKAQL...
2,O14793,145,Km1,PMID:30395435,human,MQKLQLCVYIYLFMLIVAGPVDLNENSEQKENVEKEGLCNACTWRQ...,TETIITMPTESDFLMQVDGKPKCCFFKFSSKIQYNKVVKAQLWIYL...
3,O14979-3,42,Km1,PMID:30395435,human,MEDMNEYSNIEEFAEGSKINASKNQQDDGKMFIGGLSWDTSKKDLT...,EFAEGSKINASKNQQDDGKMFIGGLSWDTSKKDLTEYLSRFGEVVD...
4,O43395,458,Km1,PMID:30395435,human,MALSKRELDELKPWIEKTVKRVLGFSEPTVVTAALNCVGKGMDKKK...,VDNDTPVTLGVYLTKKEQKKLRRQTRREAQKELQEKVRLGLMPPPE...
...,...,...,...,...,...,...,...
169,Q9UGT4,151,Kme,PMID:30395435,human,MKPALLPWALLLLATALGPGPGPTADAQESCSMRCGALDGPCSCHP...,YESGRIPFTVSLDNGHSFPRAGTWLAVHPNKVSMMEKSELVNETRW...
170,Q9UMZ2-7,1129,Kme,PMID:30395435,human,MALRPGAGSGGGGAAGAGAGSAGGGGFMFPVAGGIRPPQAGLMPMQ...,VYRVTKRVELGIKATAVCSEKLQQLLKDIDKVWNNLIGFMSLATLT...
171,Q9UMZ2-7,1149,Kme,PMID:30395435,human,MALRPGAGSGGGGAAGAGAGSAGGGGFMFPVAGGIRPPQAGLMPMQ...,KLQQLLKDIDKVWNNLIGFMSLATLTCCWEKMTVITKHLSPYHELL...
172,Q9UQ07-4,8,Kme,PMID:30395435,human,MNFDFPFKKGSGIPLLTTNLSPQCLSLLHAMVAYDPDERIAAHQAL...,_______________________MNFDFPFKKGSGIPLLTTNLSPQ...


In [183]:
df_5["Source"] = "Literature"

In [184]:
df_5.to_csv("./datasets/Literature.csv",index=False)

## iPTMnet: a database for PTM

In [152]:
df_6 = pd.read_csv("./orig_dataset/ptm.txt",sep='\t',header=None,low_memory=False)

df_6.columns = ["Type","source","Accession","substrate_genename","Species","Position","enzyme_UniProtAC","enzyme_genename","note","PMIDs"]

columns

df_6 = df_6[["Accession","Position","Type","Species","PMIDs"]].copy()

df_6.query("Type=='METHYLATION'",inplace=True)

df_6 = df_6[df_6.Position.str[0]=="K"].copy()

df_6.Position = df_6.Position.str[1:].astype(int)

df_6.Type = "Kme"

df_6 = fun_ID_convert_sp_isoform(df_6).merge(df_0)

df_6.Species = "human"

df_6 = df_6.groupby(level=0).apply(lambda x: fun_split_PMIDs(x,"PMIDs"))

df_6.index = range(len(df_6))

df_6["SeqWin"] = df_6.apply(lambda x: F_pad(x.Sequence,n=30)[x.Position-1:x.Position+2*30], 1)

df_6["Evidence"] = "PMID:"+df_6.PMID
df_6 = df_6[["Entry","Position","Type","Evidence","Species","Sequence","SeqWin"]].copy()
df_6.drop_duplicates(["Entry","Position","Type","Evidence"],inplace=True)
df_6

Unnamed: 0,Entry,Position,Type,Evidence,Species,Sequence,SeqWin
0,O43169,39,Kme,PMID:24129315,human,MSGSMATAEASGSDGKGQEVETSVTYYRLEEVAKRNSLKELWLVIH...,EASGSDGKGQEVETSVTYYRLEEVAKRNSLKELWLVIHGRVYDVTR...
1,O43524,149,Kme,PMID:22820736,human,MAEAPASPAPLSPLEVELDPEFEPQSRPRSCTWPLQRPELQASPAK...,LSGGTQALLQPQQPLPPPQPGAAGGSGQPRKCSSRRNAWGNLSYAD...
2,O43524,230,Kme,PMID:22820736,human,MAEAPASPAPLSPLEVELDPEFEPQSRPRSCTWPLQRPELQASPAK...,SNSSAGWKNSIRHNLSLHSRFMRVQNEGTGKSSWWIINPDGGKSGK...
3,O43524,262,Kme,PMID:22820736,human,MAEAPASPAPLSPLEVELDPEFEPQSRPRSCTWPLQRPELQASPAK...,SWWIINPDGGKSGKAPRRRAVSMDNSNKYTKSRGRAAKKKAALQTA...
4,O43524,271,Kme,PMID:22820736,human,MAEAPASPAPLSPLEVELDPEFEPQSRPRSCTWPLQRPELQASPAK...,GKSGKAPRRRAVSMDNSNKYTKSRGRAAKKKAALQTAPESADDSPS...
...,...,...,...,...,...,...,...
276,Q9Y232,135,Kme,PMID:18438403,human,MTFQASHRSAWGKSRKKNWQYEGPTQKLFLKRNNVSAPDGPSDPSI...,YIHDFNRRHTEKQKESTLTRTNRTSPNNARKQISRSTNSNFSKTSP...
277,Q9Y2W1,252,Kme,PMID:24129315,human,MSKTNKSKSGSRSSRSRSASRSRSRSFSKSRSRSRSLSRSRKRRLS...,PWPDATYGTGSASRASAVSELSPRERSPALKSPLQSVVVRRRSPRP...
278,Q9Y2Z2,533,Kme,PMID:24129315,human,MFYFRGCGRWVAVSFTKQQFPLARLSSDSAAPRTPHFDVIVIGGGH...,PDNADSRLTLRGYKDAGCVSQQRYERACWMKSSLEEGISVLKSIEF...
279,Q9Y4X4,313,Kme,PMID:18438403,human,MNIHMKRKTIKNINTFENRMLMLDGMPAVRVKTELLESEQGSPNVH...,MNNQKFPCSISPFSIESTRRQRRSESPDSRKRRIHRCDFEGCNKVY...


In [185]:
df_6["Source"] = "iPTMnet"

In [186]:
df_6.to_csv("./datasets/iPTMnet.csv",index=False)

## uniprot

In [11]:
df_0

Unnamed: 0,Entry,Sequence
0,Q9H9K5,MGSLSNYALLQLTLTAFLTILVQPQHLLAPVFRTLSILTNQSNCWL...
1,P04439,MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRF...
2,P04439-2,MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRF...
3,P01889,MLVMAPRTVLLLLSAALALTETWAGSHSMRYFYTSVSRPGRGEPRF...
4,P31689,MVKETTYYDVLGVKPNATQEELKKAYRKLALKYHPDKNPNEGEKFK...
...,...,...
42362,P54803-5,MGFMVADLWATSRLLVNYPEPYRSQILDYLFKPNFGASLHILKVEI...
42363,Q99999,MLPPQKKPWESMAKGLVLGALFTSFLLLVYSYAVPPLHAGLASTTP...
42364,Q75VX8,MEKLAAGLAGLRWSMGAFPLDLIVSRCRLPTLACLGPGEYAEGVSE...
42365,Q75VX8-2,MEKLAAGLAGLRWSMGAFPLDLIVSRCRLPTLACLGPGEYAEGVSE...


In [153]:
with open("./orig_dataset/uniprot-human-filtered-reviewed_yes+AND+organism__Homo+sapiens+(Human)--.txt") as f:
    text = f.read()

p1 = re.compile("\n//\n")
p5 = re.compile("ID   .+\nAC   (.+);\n")
p2 = re.compile("\nSQ   SEQUENCE   ")
p3 = re.compile("FT   MOD_RES         ")
p4 = re.compile("FT   MOD_RES         (?=\d)(.+?\n)(?=FT   \S)",re.DOTALL)
p6 = re.compile('^(\d+)\nFT                   /note=\"(.+?)\"(.+)',re.DOTALL)
p7 = re.compile("PubMed:(\d+)")

Proteins = text.split("\n//\n")[:-1]
Entrys = [i.split("; ")[0] for i in p5.findall(text)]
Accessions = [i.split("; ") for i in p5.findall(text)]

Sequences = ["".join(i.split("\nSQ   SEQUENCE   ")[-1].split("\n")[1:]).replace(" ","") for i in Proteins]

MOD_Positionss = [[int(p6.findall(i)[0][0]) for i in p4.findall(protein)] for protein in Proteins]
MOD_notess = [[p6.findall(i)[0][1].replace("\nFT                   ","") for i in p4.findall(protein)] for protein in Proteins]
MOD_evidencess = [[p6.findall(i)[0][2].replace("\nFT                   "," ")[12:-1] for i in p4.findall(protein)] for protein in Proteins]

["By similarity","Potential","Probable"]
["Omega-N-methylated arginine","symmetric dimethylarginine","Omega-N-methylarginine","asymmetric dimethylarginine"]
["N6,N6,N6-trimethyllysine","N6,N6-dimethyllysine","N6-methylated lysine","N6-methyllysine"]

count_Km1 = 0
count_Km2 = 0
count_Km3 = 0
Pos_Km1 = list()
Pos_Km2 = list()
Pos_Km3 = list()
for i in range(20375):
    for j,note in enumerate(MOD_notess[i]):
        if "N6,N6,N6-trimethyllysine" in note and "PubMed" in MOD_evidencess[i][j]:
            #print(i,"Km3")
            count_Km3 += 1
            Pos_Km3.append((i,j,p7.findall(MOD_evidencess[i][j])))
        elif "N6,N6-dimethyllysine" in note and "PubMed" in MOD_evidencess[i][j]:
            #print(i,"Km2")
            count_Km2 += 1
            Pos_Km2.append((i,j,p7.findall(MOD_evidencess[i][j])))
        elif ("N6-methylated lysine" in note or "N6-methyllysine" in note) and "PubMed" in MOD_evidencess[i][j]:
            #print(i,"Km1")
            count_Km1 += 1
            Pos_Km1.append((i,j,p7.findall(MOD_evidencess[i][j])))
print(count_Km1,count_Km2,count_Km3)            

Entry_Km1 = [Entrys[i] for i,j,k in Pos_Km1]
Position_Km1 = [MOD_Positionss[i][j] for i,j,k in Pos_Km1]
PMID_Km1 = [";".join(k) for i,j,k in Pos_Km1]

Entry_Km2 = [Entrys[i] for i,j,k in Pos_Km2]
Position_Km2 = [MOD_Positionss[i][j] for i,j,k in Pos_Km2]
PMID_Km2 = [";".join(k) for i,j,k in Pos_Km2]

Entry_Km3 = [Entrys[i] for i,j,k in Pos_Km3]
Position_Km3 = [MOD_Positionss[i][j] for i,j,k in Pos_Km3]
PMID_Km3 = [";".join(k) for i,j,k in Pos_Km3]

df_uniprot_Km1 = pd.DataFrame(dict(Entry_Km1=Entry_Km1,Position_Km1=Position_Km1,PMID_Km1=PMID_Km1))
df_uniprot_Km1["Type"] = "Km1"
df_uniprot_Km1.columns = ["Entry","Position","PMIDs","Type"]

df_uniprot_Km2 = pd.DataFrame(dict(Entry_Km2=Entry_Km2,Position_Km2=Position_Km2,PMID_Km1=PMID_Km2))
df_uniprot_Km2["Type"] = "Km2"
df_uniprot_Km2.columns = ["Entry","Position","PMIDs","Type"]

df_uniprot_Km3 = pd.DataFrame(dict(Entry_Km3=Entry_Km3,Position_Km3=Position_Km3,PMID_Km1=PMID_Km3))
df_uniprot_Km3["Type"] = "Km3"
df_uniprot_Km3.columns = ["Entry","Position","PMIDs","Type"]

df_7 = pd.concat([df_uniprot_Km1,df_uniprot_Km2,df_uniprot_Km3])

df_7 = df_7.groupby(level=0).apply(lambda x: fun_split_PMIDs(x,"PMIDs"))

df_7.index = range(len(df_7))

df_7 = df_7.merge(df_0)

df_7["SeqWin"] = df_7.apply(lambda x: F_pad(x.Sequence,n=30)[x.Position-1:x.Position+2*30], 1)
df_7["Species"] = "human"
df_7["Evidence"] = "PMID:"+df_7.PMID
df_7 = df_7[["Entry","Position","Type","Evidence","Species","Sequence","SeqWin"]].copy()
df_7.drop_duplicates(["Entry","Position","Type","Evidence"],inplace=True)
df_7

175 65 59


Unnamed: 0,Entry,Position,Type,Evidence,Species,Sequence,SeqWin
0,Q12802,1670,Km1,PMID:24129315,human,MKLNPQQAPLYGDCVVTVLLAEEDKAEDDVVFYLVFLGSTLRHCTS...,VDSLVSLSEEDLESDQREHRMFDQQICHRSKQQGFNYCTSAISSPL...
1,O95785,1162,Km2,PMID:24129315,human,MEGSLAGSLAAPDRPQGPERLPGPAPRENIEGGAEAAEGEGGIFRS...,HISPLAKKLPPPPGSPLGHSPTASPPPTARKMFPGLAAPSLPKKLK...
2,O95785,1162,Km3,PMID:24129315,human,MEGSLAGSLAAPDRPQGPERLPGPAPRENIEGGAEAAEGEGGIFRS...,HISPLAKKLPPPPGSPLGHSPTASPPPTARKMFPGLAAPSLPKKLK...
3,P05141,52,Km3,PMID:24129315,human,MTDAAVSFAKDFLAGGVAAAISKTAVAPIERVKLLLQVQHASKQIT...,SKTAVAPIERVKLLLQVQHASKQITADKQYKGIIDCVVRIPKEQGV...
4,P05141,147,Km1,PMID:24129315,human,MTDAAVSFAKDFLAGGVAAAISKTAVAPIERVKLLLQVQHASKQIT...,LASGGAAGATSLCFVYPLDFARTRLAADVGKAGAEREFRGLGDCLV...
...,...,...,...,...,...,...,...
360,O43169,39,Km1,PMID:24129315,human,MSGSMATAEASGSDGKGQEVETSVTYYRLEEVAKRNSLKELWLVIH...,EASGSDGKGQEVETSVTYYRLEEVAKRNSLKELWLVIHGRVYDVTR...
361,Q9NVM6,264,Km1,PMID:24129315,human,MAVTKELLQMDLYALLGIEEKAADKEVKKAYRQKALSCHPDKNPDN...,EVGLVDNPLKISWLEGQPQDAVGRSHSGLSKGSVLSERDYESLVMM...
362,Q08211,146,Km1,PMID:24129315,human,MGDVKNFLYAWCGKRKMTPSYEIRAVGNKNRQKFMCEVQVEGYNYT...,HLALKAENNSEVGASGYGVPGPTWDRGANLKDYYSRKEEQEVQATL...
363,Q9NQS1,230,Km1,PMID:24129315,human,MQAERGARGGRGRRPGRGRPGGDRHSERPGAAAAVARGGGGGGGGD...,AELVQGTVPLEVPQVKPKRTDDGKGLGMQLKGPLGPGGRGPIFELK...


In [188]:
df_7["Source"] = "Uniprot"

In [189]:
df_7.to_csv("./datasets/Uniprot.csv",index=False)

# datasets

## positive datasets

In [208]:
df_all = pd.concat([df_1,df_2,df_3_withEvidence,df_4,df_5,df_6,df_7,df_3])

In [160]:
df_all.to_csv("./datasets/KmeSites_Collected.csv", index=False)

In [161]:
pd.read_csv("./datasets/KmeSites_Collected.csv")

Unnamed: 0,Entry,Position,Type,Evidence,Species,Sequence,SeqWin,Source
0,O00139,161,Kme,PMID:23644510,human,MATANFGKIQIGIYVEIKRSDGRIHQAMVTSLNEDNESVTVEWIEN...,QQNGSVSDISPVQAAKKEFGPPSRRKSNCVKEVEKLQEKREKRRLQ...,PLMD
1,O00139,161,Kme,PMID:25514926,human,MATANFGKIQIGIYVEIKRSDGRIHQAMVTSLNEDNESVTVEWIEN...,QQNGSVSDISPVQAAKKEFGPPSRRKSNCVKEVEKLQEKREKRRLQ...,PLMD
2,O00139,169,Kme,PMID:23644510,human,MATANFGKIQIGIYVEIKRSDGRIHQAMVTSLNEDNESVTVEWIEN...,ISPVQAAKKEFGPPSRRKSNCVKEVEKLQEKREKRRLQQQELREKR...,PLMD
3,O00139,169,Kme,PMID:25514926,human,MATANFGKIQIGIYVEIKRSDGRIHQAMVTSLNEDNESVTVEWIEN...,ISPVQAAKKEFGPPSRRKSNCVKEVEKLQEKREKRRLQQQELREKR...,PLMD
4,O00159,383,Kme,PMID:23161681,human,MALQVELVPTGEIIRVVHPHRPCKLALGSDGVRVTMESALTARDRV...,LTHRKIIAKGEELLSPLNLEQAAYARDALAKAVYSRTFTWLVGKIN...,PLMD
...,...,...,...,...,...,...,...,...
21580,P49750,781,Km1,,human,MYPNWGRYGGSSHYPPPPVPPPPPVALPEASPGPGYSSSTTPAAPS...,SSYLESPRGPRFDGPRRFEDLGSRCEGPRPKGPRFEGNRPDGPRPR...,PhosphoSitePlus
21581,P49750,808,Km1,,human,MYPNWGRYGGSSHYPPPPVPPPPPVALPEASPGPGYSSSTTPAAPS...,PRPKGPRFEGNRPDGPRPRYEGHPAEGTKSKWGMIPRGPASQFYIT...,PhosphoSitePlus
21582,P49750,838,Km1,,human,MYPNWGRYGGSSHYPPPPVPPPPPVALPEASPGPGYSSSTTPAAPS...,KWGMIPRGPASQFYITPSTSLSPRQSGPQWKGPKPAFGQQHQQQPK...,PhosphoSitePlus
21583,P49750,1053,Km1,,human,MYPNWGRYGGSSHYPPPPVPPPPPVALPEASPGPGYSSSTTPAAPS...,PGQSRMEDTRDKGLVNRGRGQAISRGPGLVKQEDFRDKMMGRREDS...,PhosphoSitePlus


## negative datasets

In [192]:
df_all = pd.read_csv("./datasets/KmeSites_Collected.csv").drop_duplicates(["Entry"])

Neg_train,Neg_test,Neg_valid = fun_build_NegSample(df_0,df_all)

In [195]:
Neg_test = Neg_test.to_frame()

Neg_test.columns = ["Negative_samples_for_test"]

Neg_test.to_csv("./datasets/Negative_samples_for_test.csv",index=False)

In [198]:
Neg_train = Neg_train.to_frame()

Neg_train.columns = ["Negative_samples_for_train"]

Neg_train.to_csv("./datasets/Negative_samples_for_train.csv",index=False)

In [200]:
Neg_valid = Neg_valid.to_frame()

Neg_valid.columns = ["Negative_samples_for_valid"]

Neg_valid.to_csv("./datasets/Negative_samples_for_valid.csv",index=False) # To simplyfy the process, we use Neg_test to replace the Neg_valid in our actual practice.

In [201]:
pd.read_csv("./datasets/Negative_samples_for_test.csv")

Unnamed: 0,Negative_samples_for_test
0,_____________MAKQKRKVPEVTEKKNKKLKKASAEGPLLGPEA...
1,TGTDVAIEAADVVLIRNDLLDVVASIDLSRKTVKRIRINFVFALIY...
2,GAINVTYRYLAATPLQRKRYLTIGLSSVKRKKGNYLLETIKSIFEQ...
3,GVHFQSYPFDFLEFLNHQRFEPMELYGEHAKAVAALPCAPGPPPQP...
4,QMNGNQEKGDKTDRKKDKTGKEKKKDRDKEKDKMKAKKGMLKGLGD...
...,...
19995,CGPCSESPEHMAHSHSPIGWAAEECREKLIKEMDYLWEINQETRNN...
19996,LQNRRVQWLQGFAKLHRSAALVLASNLTELKEQQEMECNEATFQLQ...
19997,TGPAQLETSSEVQSEPAVPKPEDDTPVQDTKM______________...
19998,MNTFLIIYLVILISEAVISTILKYTWQAEEKWDEPWYNQKTEHQRN...


# data

In [214]:
# Randomly deduplicates the sample in different Evidence. Although we set the random seed, when we done this again, the samples reserved still were different.
df_all = pd.read_csv("./datasets/KmeSites_Collected.csv")
df_all = df_all.drop_duplicates()

df_all["id"] = df_all.Evidence.replace(df_all.Evidence.unique(),range(len(df_all.Evidence.unique())))

df_all.id += 1

df_all_Kme = df_all.copy()

df_all_Kme.Type = "Kme"

df_all = pd.concat([df_all,df_all_Kme]).drop_duplicates()

df_all.index = range(len(df_all))

df_all = df_all.groupby(["SeqWin"]).apply(fun_create_label)

df_all = df_all.sample(len(df_all),random_state=2021).drop_duplicates(["SeqWin","Type","Evidence"])

df_all = df_all.sample(len(df_all),random_state=2021).drop_duplicates(["SeqWin","Evidence"])

df_all_NoEvidence = df_all[df_all.Evidence.isna()]

df_all_Evidence = df_all[~df_all.Evidence.isna()]

df_all_Evidence = df_all_Evidence.sample(len(df_all_Evidence),random_state=2021).drop_duplicates(["SeqWin"])

df_all = pd.concat([df_all_Evidence,df_all_NoEvidence]).drop_duplicates(["SeqWin"])

df_Pos = df_all[df_all.SeqWin.apply(lambda x: set(x).issubset(set(AAs)), 1)].copy() # To remove rare amino acids.

#df_Pos.to_csv("./data/df_Pos.csv", index=False)

In [287]:
df_Pos = pd.read_csv("./data/df_Pos.csv")
df_Neg_train = pd.read_csv("./datasets/Negative_samples_for_train.csv")
df_Neg_test = pd.read_csv("./datasets/Negative_samples_for_test.csv")
df_Neg_valid = pd.read_csv("./datasets/Negative_samples_for_valid.csv")

In [288]:
df_Neg_train.columns = ["SeqWin"]
df_Neg_test.columns = ["SeqWin"]
df_Neg_valid.columns = ["SeqWin"]

In [289]:
df_data = pd.concat([df_Pos,df_Neg_train,df_Neg_test,df_Neg_valid]).fillna(0)

In [290]:
df_Evidence_id = df_data[["Evidence","id"]].drop_duplicates().sort_values("id")

df_Evidence_id.id = df_Evidence_id.id.astype(int)

#df_Evidence_id.to_csv("./data/df_Evidence_id.csv",index=False)

In [291]:
df_Evidence_id = pd.read_csv("./data/df_Evidence_id.csv")

In [295]:
df_data.id = df_data.id.astype(int)

df_data.Km1 = df_data.Km1.astype(int)

df_data.Km2 = df_data.Km2.astype(int)

df_data.Km3 = df_data.Km3.astype(int)

df_data.Kme = df_data.Kme.astype(int)

In [298]:
#df_data.to_csv("./data/df_data.csv",index=False)

In [319]:
df_data = pd.read_csv("./data/df_data.csv")

In [320]:
df_data = pd.concat([df_data.SeqWin.apply(lambda x: pd.Series(list(x))).replace(AAs,range(len(AAs))),df_data[["id","Km1","Km2","Km3","Kme"]]],1)

  df_data = pd.concat([df_data.SeqWin.apply(lambda x: pd.Series(list(x))).replace(AAs,range(len(AAs))),df_data[["id","Km1","Km2","Km3","Kme"]]],1)


In [321]:
np_data = df_data.to_numpy()

np_pos = np_data[:len(df_Pos)]

np_neg_train = np_data[len(df_Pos):len(df_Pos)+20000]
np_neg_test = np_data[len(df_Pos)+20000:len(df_Pos)+40000]
np_neg_valid = np_data[len(df_Pos)+40000:]

np.random.shuffle(np_pos)

Len = len(np_pos)//10

In [322]:
np_tests = list()
np_trains = list()
for i in range(10):
    np_pos_test = np_pos[i*Len:(i+1)*Len]
    np_test = np.concatenate([np_pos_test,np_neg_test])
    np_tests.append(np_test)
    
    np_pos_train = np_pos[np.r_[0:i*Len,(i+1)*Len:10*Len]]
    np_train = np.concatenate([np_pos_train,np_neg_train])
    np_trains.append(np_train)

In [323]:
#torch.save((np_tests,np_trains,np_data),"./data/np_tests_trains_data")
#savemat("./data/np_tests_trains_data.mat",{"np_tests":np_tests,"np_trains":np_trains,"np_data":np_data})
np_tests = loadmat("./data/np_tests_trains_data.mat")["np_tests"]
np_trains = loadmat("./data/np_tests_trains_data.mat")["np_trains"]
np_data = loadmat("./data/np_tests_trains_data.mat")["np_data"]

np_tests,np_trains,np_data = torch.load("./data/np_tests_trains_data")

In [13]:
np_tests,np_trains,np_data = torch.load("./data/np_tests_trains_data")
np_pos = np_data[:-40000]
np_neg = np_data[-40000:]
np.random.shuffle(np_pos)

df_Evidence_id = pd.read_csv("./data/df_Evidence_id.csv")
df_Evidence_id.index = df_Evidence_id.id

index_Km1 = pd.DataFrame(np_pos).groupby(61).apply(lambda x: x.sum()[-4:]).sort_values(62,ascending=False).iloc[3:30].index

index_Km2 = pd.DataFrame(np_pos).groupby(61).apply(lambda x: x.sum()[-4:]).sort_values(63,ascending=False).iloc[1:13].index

index_Km3 = pd.DataFrame(np_pos).groupby(61).apply(lambda x: x.sum()[-4:]).sort_values(64,ascending=False).iloc[1:10].index

index_Kme = pd.DataFrame(np_pos).groupby(61).apply(lambda x: x.sum()[-4:]).sort_values(65,ascending=False).iloc[3:43].index

torch.save([index_Km1,index_Km2,index_Km3,index_Kme],"./data/index_Km")
savemat("./data/index_Km.mat",{"index_Km1":np.array(index_Km1),"index_Km2":np.array(index_Km2),"index_Km3":np.array(index_Km3),"index_Kme":np.array(index_Kme)})
index_Km1 = loadmat("./data/index_Km.mat")["index_Km1"][0]
index_Km2 = loadmat("./data/index_Km.mat")["index_Km2"][0]
index_Km3 = loadmat("./data/index_Km.mat")["index_Km3"][0]
index_Kme = loadmat("./data/index_Km.mat")["index_Kme"][0]


index_Km1,index_Km2,index_Km3,index_Kme = torch.load("./data/index_Km")

len(index_Km1),len(index_Km2),len(index_Km3),len(index_Kme)

(27, 12, 9, 40)

# model training

All models were saved to the "Model_split" directory.
if you have used torch just now, please restart the kernel and not import torch like belows, or the the cuDNN may be failed to initialize.

In [1]:
import tensorflow as tf
#import torch
import numpy as np
import pandas as pd
import re
import os
from scipy.io import savemat,loadmat

In [2]:
class Model(tf.keras.Model):
    def __init__(self):
        super().__init__()
        self.compile(optimizer=tf.optimizers.Adam()
                     ,loss=tf.losses.BinaryCrossentropy()
                     ,metrics=[tf.metrics.AUC(1000)]
                    )

        self.cnn1 = tf.keras.Sequential([
            tf.keras.layers.Reshape([61,21]),
            tf.keras.layers.Conv1D(256,9,1,"valid"),
            tf.keras.layers.BatchNormalization(),
            tf.keras.layers.ReLU(),
            tf.keras.layers.MaxPool1D(),
            tf.keras.layers.Dropout(0.7),
        ])
        self.cnn2 = tf.keras.Sequential([
            tf.keras.layers.Reshape([26,256]),
            tf.keras.layers.Conv1D(32,7,1,"valid"),
            tf.keras.layers.BatchNormalization(),
            tf.keras.layers.ReLU(),
            tf.keras.layers.MaxPool1D(),
            tf.keras.layers.Dropout(0.5),
        ])
        self.simple = tf.keras.Sequential([
            tf.keras.layers.Reshape([10*32]),
            tf.keras.layers.Dense(128),
            tf.keras.layers.ReLU(),
            tf.keras.layers.Dense(4,activation="sigmoid"),
        ])
    def call(self, inputs):

        x = self.cnn1(inputs)
        x = self.cnn2(x)
        x = self.simple(x)

        return x
model = Model()
model.build((None,61*21))

In [3]:
df_Evidence_id = pd.read_csv("./data/df_Evidence_id.csv")

np_tests = loadmat("./data/np_tests_trains_data.mat")["np_tests"]
np_trains = loadmat("./data/np_tests_trains_data.mat")["np_trains"]
np_data = loadmat("./data/np_tests_trains_data.mat")["np_data"]
np_pos = np_data[:-40000]
np_neg = np_data[-40000:]

index_Km1 = loadmat("./data/index_Km.mat")["index_Km1"][0]
index_Km2 = loadmat("index_Km1.mat")["index_Km2"][0]
index_Km3 = loadmat("index_Km1.mat")["index_Km3"][0]
index_Kme = loadmat("index_Km1.mat")["index_Kme"][0]

## Kme1

In [8]:
earlystopping = tf.keras.callbacks.EarlyStopping("val_loss",patience=15,mode="min",restore_best_weights=True)

for ID in index_Km1:
    name = df_Evidence_id.query(f"id=={ID}").iloc[0,0]
    if os.path.exists(f"./Model_split/Km1_{name}.hdf5".replace(":","_"))==True:
        continue
    
    np_train = np.concatenate([np_pos[np_pos[:,61]!=ID],np_neg[:20000]])
    np.random.shuffle(np_train)
    np_test = np.concatenate([np_pos[np_pos[:,61]==ID],np_neg[20000:]])

    x_test = tf.one_hot(np_test[:,:61],21).numpy().reshape([-1,61*21])
    y_test = np_test[:,-4:]

    x_train = tf.one_hot(np_train[:,:61],21).numpy().reshape([-1,61*21])
    y_train = np_train[:,-4:]


    sample_weight = np.array([(1/3 if i==0 else 1) for i in np_train[:,-1]])

    tf.keras.backend.clear_session()
    tf.random.set_seed(2021)
    model = Model()
    
    model.fit(x_train,y_train,512,200,2,validation_split=0.1
              ,sample_weight=sample_weight
              ,callbacks=[earlystopping]
             )
    
    model.save_weights(f"./Model_split/Km1_{name}.hdf5".replace(":","_"))
    
    y_pred = model.predict(x_test,1500)
    y_true = y_test

    auc_Km1 = tf.metrics.AUC(1000)(y_true[:,0],y_pred[:,0])
    auc_Km2 = tf.metrics.AUC(1000)(y_true[:,1],y_pred[:,1])
    auc_Km3 = tf.metrics.AUC(1000)(y_true[:,2],y_pred[:,2])
    auc_Kme = tf.metrics.AUC(1000)(y_true[:,3],y_pred[:,3])

    print("Km1: %.3f\nKm2: %.3f\nKm3: %.3f\nKme: %.3f\n"%(auc_Km1,auc_Km2,auc_Km3,auc_Kme))

## Kme2

In [9]:
earlystopping = tf.keras.callbacks.EarlyStopping("val_loss",patience=15,mode="min",restore_best_weights=True)


for ID in index_Km2:
    name = df_Evidence_id.query(f"id=={ID}").iloc[0,0]
    if os.path.exists(f"./Model_split/Km2_{name}.hdf5".replace(":","_"))==True:
        continue
    np_train = np.concatenate([np_pos[np_pos[:,61]!=ID],np_neg[:20000]])
    np.random.shuffle(np_train)
    np_test = np.concatenate([np_pos[np_pos[:,61]==ID],np_neg[20000:]])

    x_test = tf.one_hot(np_test[:,:61],21).numpy().reshape([-1,61*21])
    y_test = np_test[:,-4:]

    x_train = tf.one_hot(np_train[:,:61],21).numpy().reshape([-1,61*21])
    y_train = np_train[:,-4:]


    sample_weight = np.array([(1/3 if i==0 else 1) for i in np_train[:,-1]])

    tf.keras.backend.clear_session()
    tf.random.set_seed(2021)
    model = Model()
    
    model.fit(x_train,y_train,512,200,2,validation_split=0.1
              ,sample_weight=sample_weight
              ,callbacks=[earlystopping]
             )
    
    model.save_weights(f"./Model_split/Km2_{name}.hdf5".replace(":","_"))
    
    y_pred = model.predict(x_test,1500)
    y_true = y_test

    auc_Km1 = tf.metrics.AUC(1000)(y_true[:,0],y_pred[:,0])
    auc_Km2 = tf.metrics.AUC(1000)(y_true[:,1],y_pred[:,1])
    auc_Km3 = tf.metrics.AUC(1000)(y_true[:,2],y_pred[:,2])
    auc_Kme = tf.metrics.AUC(1000)(y_true[:,3],y_pred[:,3])

    print("Km1: %.3f\nKm2: %.3f\nKm3: %.3f\nKme: %.3f\n"%(auc_Km1,auc_Km2,auc_Km3,auc_Kme))

## Kme3

In [10]:
earlystopping = tf.keras.callbacks.EarlyStopping("val_loss",patience=15,mode="min",restore_best_weights=True)


for ID in index_Km3:
    name = df_Evidence_id.query(f"id=={ID}").iloc[0,0]
    if os.path.exists(f"./Model_split/Km3_{name}.hdf5".replace(":","_"))==True:
        continue
    np_train = np.concatenate([np_pos[np_pos[:,61]!=ID],np_neg[:20000]])
    np.random.shuffle(np_train)
    np_test = np.concatenate([np_pos[np_pos[:,61]==ID],np_neg[20000:]])

    x_test = tf.one_hot(np_test[:,:61],21).numpy().reshape([-1,61*21])
    y_test = np_test[:,-4:]

    x_train = tf.one_hot(np_train[:,:61],21).numpy().reshape([-1,61*21])
    y_train = np_train[:,-4:]


    sample_weight = np.array([(1/3 if i==0 else 1) for i in np_train[:,-1]])

    tf.keras.backend.clear_session()
    tf.random.set_seed(2021)
    model = Model()
    
    model.fit(x_train,y_train,512,200,2,validation_split=0.1
              ,sample_weight=sample_weight
              ,callbacks=[earlystopping]
             )
    
    model.save_weights(f"./Model_split/Km3_{name}.hdf5".replace(":","_"))
    
    y_pred = model.predict(x_test,1500)
    y_true = y_test

    auc_Km1 = tf.metrics.AUC(1000)(y_true[:,0],y_pred[:,0])
    auc_Km2 = tf.metrics.AUC(1000)(y_true[:,1],y_pred[:,1])
    auc_Km3 = tf.metrics.AUC(1000)(y_true[:,2],y_pred[:,2])
    auc_Kme = tf.metrics.AUC(1000)(y_true[:,3],y_pred[:,3])

    print("Km1: %.3f\nKm2: %.3f\nKm3: %.3f\nKme: %.3f\n"%(auc_Km1,auc_Km2,auc_Km3,auc_Kme))

## Kme

In [11]:
earlystopping = tf.keras.callbacks.EarlyStopping("val_loss",patience=15,mode="min",restore_best_weights=True)


for ID in index_Kme:
    name = df_Evidence_id.query(f"id=={ID}").iloc[0,0]
    if os.path.exists(f"./Model_split/Kme_{name}.hdf5".replace(":","_"))==True:
        continue
    np_train = np.concatenate([np_pos[np_pos[:,61]!=ID],np_neg[:20000]])
    np.random.shuffle(np_train)
    np_test = np.concatenate([np_pos[np_pos[:,61]==ID],np_neg[20000:]])

    x_test = tf.one_hot(np_test[:,:61],21).numpy().reshape([-1,61*21])
    y_test = np_test[:,-4:]

    x_train = tf.one_hot(np_train[:,:61],21).numpy().reshape([-1,61*21])
    y_train = np_train[:,-4:]


    sample_weight = np.array([(1/3 if i==0 else 1) for i in np_train[:,-1]])

    tf.keras.backend.clear_session()
    tf.random.set_seed(2021)
    model = Model()
    
    model.fit(x_train,y_train,512,200,2,validation_split=0.1
              ,sample_weight=sample_weight
              ,callbacks=[earlystopping]
             )
    
    model.save_weights(f"./Model_split/Kme_{name}.hdf5".replace(":","_"))
    
    y_pred = model.predict(x_test,1500)
    y_true = y_test

    auc_Km1 = tf.metrics.AUC(1000)(y_true[:,0],y_pred[:,0])
    auc_Km2 = tf.metrics.AUC(1000)(y_true[:,1],y_pred[:,1])
    auc_Km3 = tf.metrics.AUC(1000)(y_true[:,2],y_pred[:,2])
    auc_Kme = tf.metrics.AUC(1000)(y_true[:,3],y_pred[:,3])

    print("Km1: %.3f\nKm2: %.3f\nKm3: %.3f\nKme: %.3f\n"%(auc_Km1,auc_Km2,auc_Km3,auc_Kme))

# model test

if you have used torch just now, please restart the kernel and not import torch like belows, or the the cuDNN may be failed to initialize.

In [1]:
import tensorflow as tf
#import torch
import numpy as np
import pandas as pd
import re
import os
from scipy.io import savemat,loadmat

In [2]:
class Model(tf.keras.Model):
    def __init__(self):
        super().__init__()
        self.compile(optimizer=tf.optimizers.Adam()
                     ,loss=tf.losses.BinaryCrossentropy()
                     ,metrics=[tf.metrics.AUC(1000)]
                    )

        self.cnn1 = tf.keras.Sequential([
            tf.keras.layers.Reshape([61,21]),
            tf.keras.layers.Conv1D(256,9,1,"valid"),
            tf.keras.layers.BatchNormalization(),
            tf.keras.layers.ReLU(),
            tf.keras.layers.MaxPool1D(),
            tf.keras.layers.Dropout(0.7),
        ])
        self.cnn2 = tf.keras.Sequential([
            tf.keras.layers.Reshape([26,256]),
            tf.keras.layers.Conv1D(32,7,1,"valid"),
            tf.keras.layers.BatchNormalization(),
            tf.keras.layers.ReLU(),
            tf.keras.layers.MaxPool1D(),
            tf.keras.layers.Dropout(0.5),
        ])
        self.simple = tf.keras.Sequential([
            tf.keras.layers.Reshape([10*32]),
            tf.keras.layers.Dense(128),
            tf.keras.layers.ReLU(),
            tf.keras.layers.Dense(4,activation="sigmoid"),
        ])
    def call(self, inputs):

        x = self.cnn1(inputs)
        x = self.cnn2(x)
        x = self.simple(x)

        return x
model = Model()
model.build((None,61*21))

In [4]:
df_Pos = pd.read_csv("./data/df_Pos.csv")
df_Evidence_id = pd.read_csv("./data/df_Evidence_id.csv")

np_tests = loadmat("./data/np_tests_trains_data.mat")["np_tests"]
np_trains = loadmat("./data/np_tests_trains_data.mat")["np_trains"]
np_data = loadmat("./data/np_tests_trains_data.mat")["np_data"]
np_pos = np_data[:-40000]
np_neg = np_data[-40000:]

index_Km1 = loadmat("./data/index_Km.mat")["index_Km1"][0]
index_Km2 = loadmat("./data/index_Km.mat")["index_Km2"][0]
index_Km3 = loadmat("./data/index_Km.mat")["index_Km3"][0]
index_Kme = loadmat("./data/index_Km.mat")["index_Kme"][0]

## Kme1

In [6]:
aucs = list()
sizes = list()
for ID in index_Km1:
    name = df_Evidence_id.query(f"id=={ID}").iloc[0,0]
    model.load_weights(f"./Model_split/Km1_{name}.hdf5".replace(":","_"))
    
    np_test = np.concatenate([np_pos[np_pos[:,61]==ID],np_neg[20000:]])
    size = df_Pos.query("Km1==1").Evidence.value_counts()[name]
    sizes.append(size)
    x_test = tf.one_hot(np_test[:,:61],21).numpy().reshape([-1,61*21])
    y_test = np_test[:,-4:]
    
    y_pred = model.predict(x_test,1500)[:,0]
    y_true = y_test[:,0]

    auc = tf.metrics.AUC(1000)(y_true,y_pred) 
    SnSp90 = tf.metrics.SensitivityAtSpecificity(0.9)(y_true,y_pred)
    SnSp95 = tf.metrics.SensitivityAtSpecificity(0.95)(y_true,y_pred)
    SnSp99 = tf.metrics.SensitivityAtSpecificity(0.99)(y_true,y_pred)
    aucs.append(auc)
    print(np_pos[np_pos[:,61]!=ID].shape[0],"\t",np_pos[np_pos[:,61]==ID].shape[0],"\t","%d\t%s"%(size,name),"\tAUC=%5.3f, Sn(Sp=0.9)=%5.3f, Sn(Sp=0.95)=%5.3f, Sn(Sp=0.99)=%5.3f"%(
        auc,SnSp90,SnSp95,SnSp99))

5013 	 216 	 216	PMID:25505155 	AUC=0.809, Sn(Sp=0.9)=0.431, Sn(Sp=0.95)=0.245, Sn(Sp=0.99)=0.093
4948 	 281 	 156	PMID:23644510 	AUC=0.606, Sn(Sp=0.9)=0.173, Sn(Sp=0.95)=0.083, Sn(Sp=0.99)=0.026
5162 	 67 	 67	PMID:27577262 	AUC=0.707, Sn(Sp=0.9)=0.328, Sn(Sp=0.95)=0.194, Sn(Sp=0.99)=0.104
5063 	 166 	 52	PMID:30395435 	AUC=0.577, Sn(Sp=0.9)=0.154, Sn(Sp=0.95)=0.019, Sn(Sp=0.99)=0.000
5169 	 60 	 50	PMID:24129315 	AUC=0.843, Sn(Sp=0.9)=0.660, Sn(Sp=0.95)=0.460, Sn(Sp=0.99)=0.180
5201 	 28 	 28	CSTCS:9897 	AUC=0.929, Sn(Sp=0.9)=0.750, Sn(Sp=0.95)=0.607, Sn(Sp=0.99)=0.321
5201 	 28 	 28	CSTCS:20129 	AUC=0.956, Sn(Sp=0.9)=0.893, Sn(Sp=0.95)=0.750, Sn(Sp=0.99)=0.607
5202 	 27 	 27	CSTCS:20132 	AUC=0.882, Sn(Sp=0.9)=0.815, Sn(Sp=0.95)=0.556, Sn(Sp=0.99)=0.333
5203 	 26 	 26	CSTCS:18852 	AUC=0.906, Sn(Sp=0.9)=0.692, Sn(Sp=0.95)=0.615, Sn(Sp=0.99)=0.385
5204 	 25 	 25	CSTCS:9896 	AUC=0.833, Sn(Sp=0.9)=0.640, Sn(Sp=0.95)=0.560, Sn(Sp=0.99)=0.480
5205 	 24 	 24	CSTCS:20128 	AUC=0.836, Sn(Sp=0.

## Kme2

In [7]:
aucs = list()
sizes = list()
for ID in index_Km2:
    name = df_Evidence_id.query(f"id=={ID}").iloc[0,0]
    model.load_weights(f"./Model_split/Km2_{name}.hdf5".replace(":","_"))
    
    np_test = np.concatenate([np_pos[np_pos[:,61]==ID],np_neg[20000:]])
    size = df_Pos.query("Km2==1").Evidence.value_counts()[name]
    sizes.append(size)
    x_test = tf.one_hot(np_test[:,:61],21).numpy().reshape([-1,61*21])
    y_test = np_test[:,-4:]
    
    y_pred = model.predict(x_test,1500)[:,1]
    y_true = y_test[:,1]

    auc = tf.metrics.AUC(1000)(y_true,y_pred) 
    SnSp90 = tf.metrics.SensitivityAtSpecificity(0.9)(y_true,y_pred)
    SnSp95 = tf.metrics.SensitivityAtSpecificity(0.95)(y_true,y_pred)
    SnSp99 = tf.metrics.SensitivityAtSpecificity(0.99)(y_true,y_pred)
    aucs.append(auc)
    print(np_pos[np_pos[:,61]!=ID].shape[0],"\t",np_pos[np_pos[:,61]==ID].shape[0],"\t","%d\t%s"%(size,name),"\tAUC=%5.3f, Sn(Sp=0.9)=%5.3f, Sn(Sp=0.95)=%5.3f, Sn(Sp=0.99)=%5.3f"%(
        auc,SnSp90,SnSp95,SnSp99))

4948 	 281 	 56	PMID:23644510 	AUC=0.669, Sn(Sp=0.9)=0.232, Sn(Sp=0.95)=0.161, Sn(Sp=0.99)=0.071
5063 	 166 	 52	PMID:30395435 	AUC=0.639, Sn(Sp=0.9)=0.250, Sn(Sp=0.95)=0.115, Sn(Sp=0.99)=0.058
5205 	 24 	 24	CSTCS:5153 	AUC=0.688, Sn(Sp=0.9)=0.208, Sn(Sp=0.95)=0.042, Sn(Sp=0.99)=0.000
5210 	 19 	 19	CSTCS:5154 	AUC=0.596, Sn(Sp=0.9)=0.105, Sn(Sp=0.95)=0.105, Sn(Sp=0.99)=0.053
5211 	 18 	 18	CSTCS:5995 	AUC=0.457, Sn(Sp=0.9)=0.056, Sn(Sp=0.95)=0.056, Sn(Sp=0.99)=0.000
5213 	 16 	 16	CSTCS:3750 	AUC=0.651, Sn(Sp=0.9)=0.188, Sn(Sp=0.95)=0.062, Sn(Sp=0.99)=0.062
5215 	 14 	 14	CSTCS:8357 	AUC=0.647, Sn(Sp=0.9)=0.214, Sn(Sp=0.95)=0.143, Sn(Sp=0.99)=0.143
5215 	 14 	 14	CSTCS:3777 	AUC=0.645, Sn(Sp=0.9)=0.071, Sn(Sp=0.95)=0.071, Sn(Sp=0.99)=0.071
5217 	 12 	 12	CSTCS:5156 	AUC=0.666, Sn(Sp=0.9)=0.167, Sn(Sp=0.95)=0.083, Sn(Sp=0.99)=0.000
5218 	 11 	 11	CSTCS:8356 	AUC=0.812, Sn(Sp=0.9)=0.545, Sn(Sp=0.95)=0.364, Sn(Sp=0.99)=0.091
5212 	 17 	 11	PMID:23161681 	AUC=0.807, Sn(Sp=0.9)=0.545, Sn(

## Kme3

In [9]:
aucs = list()
sizes = list()
for ID in index_Km3:
    name = df_Evidence_id.query(f"id=={ID}").iloc[0,0]
    model.load_weights(f"./Model_split/Km3_{name}.hdf5".replace(":","_"))
    
    np_test = np.concatenate([np_pos[np_pos[:,61]==ID],np_neg[20000:]])
    size = df_Pos.query("Km3==1").Evidence.value_counts()[name]
    sizes.append(size)
    x_test = tf.one_hot(np_test[:,:61],21).numpy().reshape([-1,61*21])
    y_test = np_test[:,-4:]
    
    y_pred = model.predict(x_test,1500)[:,2]
    y_true = y_test[:,2]

    auc = tf.metrics.AUC(1000)(y_true,y_pred) 
    SnSp90 = tf.metrics.SensitivityAtSpecificity(0.9)(y_true,y_pred)
    SnSp95 = tf.metrics.SensitivityAtSpecificity(0.95)(y_true,y_pred)
    SnSp99 = tf.metrics.SensitivityAtSpecificity(0.99)(y_true,y_pred)
    aucs.append(auc)
    print(np_pos[np_pos[:,61]!=ID].shape[0],"\t",np_pos[np_pos[:,61]==ID].shape[0],"\t","%d\t%s"%(size,name),"\tAUC=%5.3f, Sn(Sp=0.9)=%5.3f, Sn(Sp=0.95)=%5.3f, Sn(Sp=0.99)=%5.3f"%(
        auc,SnSp90,SnSp95,SnSp99))

5063 	 166 	 63	PMID:30395435 	AUC=0.588, Sn(Sp=0.9)=0.175, Sn(Sp=0.95)=0.143, Sn(Sp=0.99)=0.016
4948 	 281 	 48	PMID:23644510 	AUC=0.699, Sn(Sp=0.9)=0.229, Sn(Sp=0.95)=0.146, Sn(Sp=0.99)=0.021
5202 	 27 	 27	CSTCS:7364 	AUC=0.800, Sn(Sp=0.9)=0.519, Sn(Sp=0.95)=0.444, Sn(Sp=0.99)=0.407
5218 	 11 	 11	CSTCS:7363 	AUC=0.767, Sn(Sp=0.9)=0.455, Sn(Sp=0.95)=0.455, Sn(Sp=0.99)=0.091
5220 	 9 	 9	CSTCS:8358 	AUC=0.830, Sn(Sp=0.9)=0.444, Sn(Sp=0.95)=0.444, Sn(Sp=0.99)=0.111
5169 	 60 	 8	PMID:24129315 	AUC=0.668, Sn(Sp=0.9)=0.625, Sn(Sp=0.95)=0.375, Sn(Sp=0.99)=0.250
3818 	 1411 	 6	PMID:26750096 	AUC=0.879, Sn(Sp=0.9)=0.500, Sn(Sp=0.95)=0.500, Sn(Sp=0.99)=0.500
5224 	 5 	 5	CSTCS:8359 	AUC=0.738, Sn(Sp=0.9)=0.400, Sn(Sp=0.95)=0.200, Sn(Sp=0.99)=0.000
5211 	 18 	 5	CSTCS:16504 	AUC=0.589, Sn(Sp=0.9)=0.400, Sn(Sp=0.95)=0.200, Sn(Sp=0.99)=0.200


## Kme

In [10]:
aucs = list()
sizes = list()
for ID in index_Kme:
    name = df_Evidence_id.query(f"id=={ID}").iloc[0,0]
    model.load_weights(f"./Model_split/Kme_{name}.hdf5".replace(":","_"))
    
    np_test = np.concatenate([np_pos[np_pos[:,61]==ID],np_neg[20000:]])
    size = df_Pos.query("Kme==1").Evidence.value_counts()[name]
    sizes.append(size)
    x_test = tf.one_hot(np_test[:,:61],21).numpy().reshape([-1,61*21])
    y_test = np_test[:,-4:]
    
    y_pred = model.predict(x_test,1500)[:,3]
    y_true = y_test[:,3]

    auc = tf.metrics.AUC(1000)(y_true,y_pred) 
    SnSp90 = tf.metrics.SensitivityAtSpecificity(0.9)(y_true,y_pred)
    SnSp95 = tf.metrics.SensitivityAtSpecificity(0.95)(y_true,y_pred)
    SnSp99 = tf.metrics.SensitivityAtSpecificity(0.99)(y_true,y_pred)
    aucs.append(auc)
    print(np_pos[np_pos[:,61]!=ID].shape[0],"\t",np_pos[np_pos[:,61]==ID].shape[0],"\t","%d\t%s"%(size,name),"\tAUC=%5.3f, Sn(Sp=0.9)=%5.3f, Sn(Sp=0.95)=%5.3f, Sn(Sp=0.99)=%5.3f"%(
        auc,SnSp90,SnSp95,SnSp99))

4948 	 281 	 281	PMID:23644510 	AUC=0.623, Sn(Sp=0.9)=0.149, Sn(Sp=0.95)=0.100, Sn(Sp=0.99)=0.007
5013 	 216 	 216	PMID:25505155 	AUC=0.789, Sn(Sp=0.9)=0.412, Sn(Sp=0.95)=0.278, Sn(Sp=0.99)=0.106
5063 	 166 	 166	PMID:30395435 	AUC=0.583, Sn(Sp=0.9)=0.133, Sn(Sp=0.95)=0.060, Sn(Sp=0.99)=0.006
5162 	 67 	 67	PMID:27577262 	AUC=0.714, Sn(Sp=0.9)=0.373, Sn(Sp=0.95)=0.254, Sn(Sp=0.99)=0.090
5169 	 60 	 60	PMID:24129315 	AUC=0.807, Sn(Sp=0.9)=0.533, Sn(Sp=0.95)=0.467, Sn(Sp=0.99)=0.233
5172 	 57 	 57	PMID:23748837 	AUC=0.753, Sn(Sp=0.9)=0.368, Sn(Sp=0.95)=0.263, Sn(Sp=0.99)=0.140
5201 	 28 	 28	CSTCS:9897 	AUC=0.928, Sn(Sp=0.9)=0.679, Sn(Sp=0.95)=0.536, Sn(Sp=0.99)=0.429
5201 	 28 	 28	CSTCS:20129 	AUC=0.974, Sn(Sp=0.9)=0.929, Sn(Sp=0.95)=0.857, Sn(Sp=0.99)=0.571
5202 	 27 	 27	CSTCS:20132 	AUC=0.869, Sn(Sp=0.9)=0.741, Sn(Sp=0.95)=0.556, Sn(Sp=0.99)=0.333
5202 	 27 	 27	CSTCS:7364 	AUC=0.817, Sn(Sp=0.9)=0.407, Sn(Sp=0.95)=0.259, Sn(Sp=0.99)=0.074
5203 	 26 	 26	CSTCS:18852 	AUC=0.867, Sn(Sp