In [1]:
import os
import numpy as np
import pandas as pd
import json
import random

from utils.file import load_tab, save_np, load_np, read_json2dict, dump_list2json, read_json2list
import params.PDBparser as paramsPDB

# 1. Add cluster information (skippable)

**IDR dataset**

In [4]:
# Read the per-entity PDB table from the CSV configured in params.PDBparser
# and preview its first two rows.
df_pdbEntityInfo = pd.read_csv(filepath_or_buffer=paramsPDB.path_pdb_featureEntity100)
df_pdbEntityInfo.head(2)

Unnamed: 0,rcsb_id,entity_id,uniprot_accession,sequence_length,chain_id,auth_chain_id,id
0,120L,1,P00720,164,A,A,120L_1
1,13GS,1,P09211,210,A,A,13GS_1


In [35]:
# Load the IDR dataset records into a DataFrame and discard any cluster assignment
# left over from a previous run of this notebook; the 'clstr_id' column is attached
# again by the merge with df_clstr further down.
list_entityDataset = read_json2list(paramsPDB.path_IDRdataset_100)
df_entityDataset = pd.DataFrame(list_entityDataset)
# errors='ignore': when the JSON file has no 'clstr_id' column yet (first run), a plain
# drop raises KeyError and this section could never be executed on a fresh dataset.
df_entityDataset = df_entityDataset.drop(columns=['clstr_id'], errors='ignore')
df_entityDataset[:2]

Unnamed: 0,id,sequence,reference,fold
0,120L_1,MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNSAKSEL...,0000000000000000000000000000000000000000000000...,5
1,13PK_1,EKKSINECDLKGKKVLIRVDFNVPVKNGKITNDYRIRSALPTLKKV...,0000000000000000000000000000000000000000000000...,4


In [36]:
# Sanity counts: rows in the entity table, distinct 'id' values, distinct 'rcsb_id'
# values, and rows in the IDR dataset.
(
    len(df_pdbEntityInfo),
    df_pdbEntityInfo['id'].nunique(dropna=False),
    df_pdbEntityInfo['rcsb_id'].nunique(dropna=False),
    len(df_entityDataset),
)

(470006, 231624, 168082, 78968)

**Clustering**

In [37]:
# MMseqs2 clustering result with the minimum sequence identity set to 1
# (presumably the `--min-seq-id 1` option — confirm against the clustering command).
# The file is a headerless TSV; its two columns are read as 'clstr_id' and 'id'.
df_clstr_100 = pd.read_csv(
    paramsPDB.path_cls100_tab,
    sep='\t',
    header=None,
    names=['clstr_id', 'id'],
)
df_clstr_100.head(2)

Unnamed: 0,clstr_id,id
0,120L_1,120L_1
1,13PK_1,13PK_1


In [38]:
# Second cluster table (path_cls30_tab — presumably the 30% identity clustering,
# judging by the name; confirm). Same headerless two-column TSV layout.
df_clstr = pd.read_csv(
    paramsPDB.path_cls30_tab,
    sep='\t',
    header=None,
    names=['clstr_id', 'id'],
)
df_clstr.head(2)

Unnamed: 0,clstr_id,id
0,1O6P_2,1O6P_2
1,1O82_1,1O82_1


In [39]:
# Rows in df_clstr, distinct cluster ids in df_clstr, distinct cluster ids in
# df_clstr_100 ('clstr_id' is the first column of both tables).
(
    len(df_clstr),
    df_clstr['clstr_id'].nunique(dropna=False),
    df_clstr_100['clstr_id'].nunique(dropna=False),
)

(231624, 23581, 78968)

In [40]:
# Number of dataset entries before filtering.
len(df_entityDataset)

78968

**Merge & Save**

In [41]:
# Keep only the dataset entries whose 'id' appears as a 'clstr_id' in the
# MMseqs2 100% clustering table.
df_entityDataset = df_entityDataset[
    df_entityDataset['id'].isin(df_clstr_100['clstr_id'])
]

In [42]:
# Number of dataset entries left after the filter.
len(df_entityDataset)

78968

In [43]:
# Display the filtered dataset; pandas abbreviates the rendering of a frame this
# large, so only the first and last rows are shown.
df_entityDataset

Unnamed: 0,id,sequence,reference,fold
0,120L_1,MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNSAKSEL...,0000000000000000000000000000000000000000000000...,5
1,13PK_1,EKKSINECDLKGKKVLIRVDFNVPVKNGKITNDYRIRSALPTLKKV...,0000000000000000000000000000000000000000000000...,4
2,155L_1,MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSEL...,0000000000000000000000000000000000000000000000...,5
3,15C8_1,DIVLTQSPAIMSASLGERVTMTCTASSSVSSSNLHWYQQKPGSSPK...,0000000000000000000000000000000000000000000000...,5
4,15C8_2,EVQLQQSGAELVKPGASVKLSCTASGFNIKDTYMHWVKQKPEQGLE...,0000000000000000000000000000000000000000000000...,5
...,...,...,...,...
78963,8I2E_2,GSQSIKVKKGDTLWDLSRKYDTTISKIKSENHLRSDIIYVGQTLSI...,1111111111111111111111111111111111111111111111...,2
78964,8IBS_1,MEHRAFKWPQPLAGNKPRIWYGGDYNPDQWPEEVWDEDVALMQQAG...,0000000000000000000000000000000000000000000000...,4
78965,8OEP_2,RQERLQRRRETQV,1100000000000,2
78966,8P5O_1,GPDSITEYPDKTIHQLFTEQVEKTPEHVAVVFEDEKVTYRELHERS...,1111111000000000000000000000000000000000000000...,1


In [44]:
# Attach the cluster id of every entity by joining the dataset with df_clstr on 'id'.
# - A pre-existing 'clstr_id' column is dropped first so the cell can be re-run;
#   merging twice would otherwise produce 'clstr_id_x' / 'clstr_id_y' columns.
# - validate='many_to_one' makes pandas raise MergeError when an id occurs more than
#   once in df_clstr, which would silently duplicate dataset rows.
df_merged = pd.merge(
    df_entityDataset.drop(columns=['clstr_id'], errors='ignore'),
    df_clstr,
    on='id',
    how='inner',
    validate='many_to_one',
)

# An inner join silently discards entities without a cluster; fail loudly instead,
# and only rebind df_entityDataset once the result is known to be complete.
n_lost = len(df_entityDataset) - len(df_merged)
if n_lost:
    raise ValueError(f"{n_lost} dataset entries have no cluster in df_clstr")
df_entityDataset = df_merged

In [45]:
# Preview the merged frame (it now carries a 'clstr_id' column).
df_entityDataset.head(2)

Unnamed: 0,id,sequence,reference,fold,clstr_id
0,120L_1,MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNSAKSEL...,0000000000000000000000000000000000000000000000...,5,7DDZ_1
1,13PK_1,EKKSINECDLKGKKVLIRVDFNVPVKNGKITNDYRIRSALPTLKKV...,0000000000000000000000000000000000000000000000...,4,1LTK_1


In [46]:
# Save: convert the frame to one dict per row and write the records to the IDR
# dataset JSON. Note that this is the same path the dataset was loaded from, so the
# input file is overwritten.
list_entityDataset = df_entityDataset.to_dict('records')
dump_list2json(list_entityDataset, paramsPDB.path_IDRdataset_100)

# 2. Separate dataset into 5 folds

In [47]:
# Reload the IDR dataset records (including 'clstr_id') and preview two rows.
list_entityDataset = read_json2list(paramsPDB.path_IDRdataset_100)
df_entityDataset = pd.DataFrame(data=list_entityDataset)
df_entityDataset.head(2)

Unnamed: 0,id,sequence,reference,fold,clstr_id
0,120L_1,MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNSAKSEL...,0000000000000000000000000000000000000000000000...,5,7DDZ_1
1,13PK_1,EKKSINECDLKGKKVLIRVDFNVPVKNGKITNDYRIRSALPTLKKV...,0000000000000000000000000000000000000000000000...,4,1LTK_1


In [48]:
# Read the cluster table again so this section also works when section 1 was skipped.
# Headerless TSV; its two columns are read as 'clstr_id' and 'id'.
df_clstr = pd.read_csv(
    paramsPDB.path_cls30_tab,
    sep='\t',
    header=None,
    names=['clstr_id', 'id'],
)
df_clstr.head(2)

Unnamed: 0,clstr_id,id
0,1O6P_2,1O6P_2
1,1O82_1,1O82_1


In [49]:
# Distinct cluster ids, in order of first appearance in the cluster table.
list_clstr = list(pd.unique(df_clstr['clstr_id']))
# Clusters per fold: floor of a 5-way split. The remainder goes to the last fold
# when the folds are assigned.
num_k = len(list_clstr) // 5
(len(list_clstr), num_k)

(23581, 4716)

In [50]:
# Reset the fold column: 0 is the initial placeholder value; the actual fold numbers
# (1-5) are written by the fold-assignment loop further down.
df_entityDataset['fold'] = 0

In [51]:
# Shuffle the clusters before they are cut into folds.
# - A fixed seed makes the fold assignment reproducible across kernel restarts;
#   an unseeded shuffle yields a different train/test split on every run.
# - Sorting first makes the outcome independent of how many times this cell is
#   re-run and of the row order in the cluster file.
# - A dedicated Random instance leaves the global `random` state untouched.
FOLD_SHUFFLE_SEED = 42
list_clstr.sort(key=str)
random.Random(FOLD_SHUFFLE_SEED).shuffle(list_clstr)

In [52]:
# Assign fold numbers 1..n_folds by cutting the shuffled cluster list into
# consecutive chunks of num_k clusters; the last fold also takes the remainder.
# Folds are assigned per cluster, so all entries of one cluster share a fold.
n_folds = 5  # must match the divisor used to compute num_k
for k in range(1, n_folds + 1):
    start = (k - 1) * num_k
    stop = None if k == n_folds else k * num_k  # open-ended slice for the last fold
    clstr_foldK = list_clstr[start:stop]
    df_entityDataset.loc[df_entityDataset['clstr_id'].isin(clstr_foldK), 'fold'] = k

# Every entry must end up in a real fold. A leftover placeholder value means its
# 'clstr_id' is missing from the cluster table, and it would silently fall outside
# every train/validation split.
n_unassigned = int((~df_entityDataset['fold'].isin(list(range(1, n_folds + 1)))).sum())
if n_unassigned:
    raise ValueError(f"{n_unassigned} dataset entries were not assigned to any fold")

In [53]:
# Number of distinct clusters that ended up in fold 4.
df_entityDataset.loc[df_entityDataset['fold'] == 4, 'clstr_id'].nunique(dropna=False)

4702

In [54]:
# Save: convert the frame (now with the new fold assignment) to one dict per row and
# write the records back to the IDR dataset JSON, overwriting the file that was
# loaded at the start of this section.
list_entityDataset = df_entityDataset.to_dict('records')
dump_list2json(list_entityDataset, paramsPDB.path_IDRdataset_100)