In [1]:
import pandas as pd
import numpy as np

## Get Data

In [2]:
# !wget https://scop2.mrc-lmb.cam.ac.uk/files/scop-cla-latest.txt
# !wget https://scop2.mrc-lmb.cam.ac.uk/files/scop-des-latest.txt
# !wget https://scop2.mrc-lmb.cam.ac.uk/files/scop-represented-structures-latest.txt
# !wget https://scop2.mrc-lmb.cam.ac.uk/files/scop_fa_represeq_lib_latest.fa
# !wget https://scop2.mrc-lmb.cam.ac.uk/files/scop_sf_represeq_lib_latest.fa

### SCOP domain definitions and classification.

In [3]:
# FA-DOMID FA-PDBID FA-PDBREG FA-UNIID FA-UNIREG SF-DOMID SF-PDBID SF-PDBREG SF-UNIID SF-UNIREG SCOPCLA
lines = []
with open('scop/scop-cla-latest.txt', 'r') as r:
    for line in r.readlines()[6:]:
        split_lines = line.split(' ')
        assert len(split_lines) == 11
        lines.append({
            "FA-DOMID": split_lines[0],
            "FA-PDBID": split_lines[1],
            "FA-PDBREG": split_lines[2],
            "FA-UNIID": split_lines[3],
            "FA-UNIREG": split_lines[4],
            "SF-DOMID": split_lines[5],
            "SF-PDBID": split_lines[6],
            "SF-PDBREG": split_lines[7],
            "SF-UNIID": split_lines[8],
            "SF-UNIREG": split_lines[9],
            "SCOPCLA": split_lines[10].strip(),
        })
        # break

In [4]:
len(lines)

36900

In [5]:
cla = pd.DataFrame(lines)
cla.shape

(36900, 11)

In [6]:
cla.head()

Unnamed: 0,FA-DOMID,FA-PDBID,FA-PDBREG,FA-UNIID,FA-UNIREG,SF-DOMID,SF-PDBID,SF-PDBREG,SF-UNIID,SF-UNIREG,SCOPCLA
0,8045703,3H8D,C:1143-1264,Q64331,1143-1264,8091604,3H8D,C:1143-1264,Q64331,1143-1264,"TP=1,CL=1000003,CF=2001470,SF=3002524,FA=4004627"
1,8094330,6J56,A:1158-1282,Q9UM54,1167-1291,8094331,6J56,A:1158-1282,Q9UM54,1167-1291,"TP=1,CL=1000003,CF=2001470,SF=3002524,FA=4004627"
2,8017835,3FKQ,A:1-116,D0VX10,1-116,8017836,3FKQ,A:1-116,D0VX10,1-116,"TP=1,CL=1000002,CF=2000016,SF=3001156,FA=4003986"
3,8021315,1XHF,A:2-122,P0A9Q1,2-122,8033695,1XHF,A:2-122,P0A9Q1,2-122,"TP=1,CL=1000002,CF=2000016,SF=3001156,FA=4003632"
4,8021787,1Y7P,B:79-215,O28869,79-215,8034167,1Y7P,B:79-215,O28869,79-215,"TP=1,CL=1000002,CF=2000016,SF=3001156,FA=4002481"


In [7]:
cla.to_csv('scop/scop-cla-latest.csv', index=None)

### SCOP node descriptions.

In [11]:
# NODE_ID NODE_NAME
lines = []
with open('scop/scop-des-latest.txt', 'r') as r:
    for line in r.readlines()[6:]:
        split_lines = line.split(' ')
        lines.append({
            "NODE_ID": split_lines[0],
            "NODE_NAME": " ".join(split_lines[1:]).strip(),
        })
        # break

In [12]:
len(lines), lines[:3]

(10340,
 [{'NODE_ID': '1', 'NODE_NAME': 'Globular proteins'},
  {'NODE_ID': '2', 'NODE_NAME': 'Membrane proteins'},
  {'NODE_ID': '3', 'NODE_NAME': 'Fibrous proteins'}])

In [13]:
des = pd.DataFrame(lines)
des.shape

(10340, 2)

In [14]:
des.head()

Unnamed: 0,NODE_ID,NODE_NAME
0,1,Globular proteins
1,2,Membrane proteins
2,3,Fibrous proteins
3,4,Non-globular/Intrinsically unstructured proteins
4,1000000,All alpha proteins


In [15]:
des.to_csv('scop/scop-des-latest.csv', index=None)

### SCOP represented structures.

In [18]:
# DOMID REPRESENTED-PDBID REPRESENTED-PDBCHAIN
lines = []
with open('scop/scop-represented-structures-latest.txt', 'r') as r:
    for line in r.readlines()[6:]:
        split_lines = line.split(' ')
        assert len(split_lines) == 3
        lines.append({
            "DOMID": split_lines[0],
            "REPRESENTED-PDBID": split_lines[1],
            "REPRESENTED-PDBCHAIN": split_lines[2].strip(),
        })
        # break

In [19]:
len(lines), lines[:3]

(861539,
 [{'DOMID': '8000061',
   'REPRESENTED-PDBID': '2DT5',
   'REPRESENTED-PDBCHAIN': 'B'},
  {'DOMID': '8000376',
   'REPRESENTED-PDBID': '2FR1',
   'REPRESENTED-PDBCHAIN': 'A'},
  {'DOMID': '8000376',
   'REPRESENTED-PDBID': '6W7S',
   'REPRESENTED-PDBCHAIN': 'A'}])

In [20]:
rep = pd.DataFrame(lines)
rep.shape

(861539, 3)

In [21]:
rep.head()

Unnamed: 0,DOMID,REPRESENTED-PDBID,REPRESENTED-PDBCHAIN
0,8000061,2DT5,B
1,8000376,2FR1,A
2,8000376,6W7S,A
3,8000376,6WH9,A
4,8000376,6WH9,D


In [22]:
rep.to_csv('scop/scop-represented-structures-latest.csv', index=None)

### SCOP sequence libraries

In [23]:
from Bio.SeqIO.FastaIO import SimpleFastaParser

#### FA

In [29]:
lines = []
with open('scop/scop_fa_represeq_lib_latest.fa','r') as ffile:
    for title, sequence in SimpleFastaParser(ffile):
        title_split = title.split(' ')
        lines.append({
            "DOMID": title_split[0],
            "FA": title_split[1].split('=')[1],
            "FA-PDBID": title_split[2].split('=')[1],
            "FA-UNIID": title_split[3].split('=')[1],
            "sequence": sequence,
        })
        # break

In [30]:
len(lines), lines[:3]

(35644,
 [{'DOMID': '8072807',
   'FA': '4003632',
   'FA-PDBID': '3KCN_A',
   'FA-UNIID': 'Q7UJS6',
   'sequence': 'NERILLVDDDYSLLNTLKRNLSFDFEVTTCESGPEALACIKKSDPFSVIMVDMRMPGMEGTEVIQKARLISPNSVYLMLTGNQDLTTAMEAVNEGQVFRFLNKPCQMSDIKAAINAGIKQYDLVTSKEELLKKT'},
  {'DOMID': '8072797',
   'FA': '4003632',
   'FA-PDBID': '3HZH_A',
   'FA-UNIID': 'O51615',
   'sequence': 'SKPRGINYDTGIPFNVLIVDDSVFTVKQLTQIFTSEGFNIIDTAADGEEAVIKYKNHYPNIDIVTLDITMPKMDGITCLSNIMEFDKNARVIMISALGKEQLVKDCLIKGAKTFIVKPLDRAKVLQRVMSVFVK'},
  {'DOMID': '8022848',
   'FA': '4001912',
   'FA-PDBID': '2OP5_A',
   'FA-UNIID': '2OP5_A',
   'sequence': 'TDETAFLNSLFMDFTSENELELFLKSLDEVWSEDLYSRLSAAGLIRHVISKVWNKEQHRISMVFEYDSKEGYQKCQEIIDKEFGITLKEKLKKFVFKIHNNRGVVVSEFIRS'}])

In [31]:
far = pd.DataFrame(lines)
far.shape

(35644, 5)

In [32]:
far.head()

Unnamed: 0,DOMID,FA,FA-PDBID,FA-UNIID,sequence
0,8072807,4003632,3KCN_A,Q7UJS6,NERILLVDDDYSLLNTLKRNLSFDFEVTTCESGPEALACIKKSDPF...
1,8072797,4003632,3HZH_A,O51615,SKPRGINYDTGIPFNVLIVDDSVFTVKQLTQIFTSEGFNIIDTAAD...
2,8022848,4001912,2OP5_A,2OP5_A,TDETAFLNSLFMDFTSENELELFLKSLDEVWSEDLYSRLSAAGLIR...
3,8107333,4007725,7V9L_R,Q9HB45,FSTVKIIYTVGHSISIVALFVAITILVALRRLHCPRNYVHTQLFTT...
4,8097333,4000755,6DRS_A,B8NBN5,PTNPLTLIVATTPIPTREKTLLGIGLNGTLPWPRIKADMSFFARVT...


In [33]:
far.to_csv('scop/scop_fa_represeq_lib_latest.csv', index=None)

#### SF

In [34]:
lines = []
with open('scop/scop_sf_represeq_lib_latest.fa','r') as ffile:
    for title, sequence in SimpleFastaParser(ffile):
        title_split = title.split(' ')
        lines.append({
            "DOMID": title_split[0],
            "SF": title_split[1].split('=')[1],
            "SF-PDBID": title_split[2].split('=')[1],
            "SF-UNIID": title_split[3].split('=')[1],
            "sequence": sequence,
        })
        # break

In [35]:
len(lines), lines[:3]

(36900,
 [{'DOMID': '8107332',
   'SF': '3002300',
   'SF-PDBID': '7ECQ_A',
   'SF-UNIID': 'Q9D8T0',
   'sequence': 'KYKCGLPQPCPEEHLSFRIVSGAANVIGPKICLEDKMLMSSVKDNVGRGLNIALVNGVSGELLEARAFDMWAGDVNDLLKFIRPLHEGTLVFVASYDDPATKMNEETRKLFSELGSRNAKDLAFRDSWVFVGAKGVQNKSPFEQHMKNSKHTNKYEGWPEALEMEGCIPRRSIAG'},
  {'DOMID': '8097332',
   'SF': '3001746',
   'SF-PDBID': '5ZKU_F',
   'SF-UNIID': 'B8HDZ1',
   'sequence': 'PSNNRYDVTEWPAGNPAKDIGEVINSIIADIKARQGAADVDDGGKPGAVIYLPPGDYHLRTQVLIDISFLRIEGSGHGFTSSSIRFNVPEEEWPDLHELWPGGSRVIVDLPAGGAGDSAAGAAFLVAREGSPRISSVEFSNFCIDGLHFTADGSGRHPENTYANGKTGIHVASANDSFRVTDMGFVYLENALTIHKADALSIHHNFIAECGSCIELRGWGQASKITDNLVGAGPRGHSIYAENHGGLLVTANNVFPRGASSVHFKGVTRSSVTNNRLHAFYPGMVRLEENSSENLVATNHFLRDHEPWTPFFGVDNGLDDLTGLLSISGNNNSVIGNHFSEVVDANEIRPEGATPVIIRLTAGTGNFVSTNHVVAMDVDAASSDSCFEAQVDALLATEAADLAVTAVLVDPGSARNTILDSGSDTQVVADRAVNAIRATPTVG'},
  {'DOMID': '8072808',
   'SF': '3001156',
   'SF-PDBID': '3KCN_A',
   'SF-UNIID': 'Q7UJS6',
   'sequence': 'NERILLVDDDYSLLNTLKRNLSFDFEVTTCESGPEALAC

In [36]:
sfr = pd.DataFrame(lines)
sfr.shape

(36900, 5)

In [37]:
sfr.head()

Unnamed: 0,DOMID,SF,SF-PDBID,SF-UNIID,sequence
0,8107332,3002300,7ECQ_A,Q9D8T0,KYKCGLPQPCPEEHLSFRIVSGAANVIGPKICLEDKMLMSSVKDNV...
1,8097332,3001746,5ZKU_F,B8HDZ1,PSNNRYDVTEWPAGNPAKDIGEVINSIIADIKARQGAADVDDGGKP...
2,8072808,3001156,3KCN_A,Q7UJS6,NERILLVDDDYSLLNTLKRNLSFDFEVTTCESGPEALACIKKSDPF...
3,8072798,3001156,3HZH_A,O51615,SKPRGINYDTGIPFNVLIVDDSVFTVKQLTQIFTSEGFNIIDTAAD...
4,8107334,3000473,7V9L_R,Q9HB45,FSTVKIIYTVGHSISIVALFVAITILVALRRLHCPRNYVHTQLFTT...


In [38]:
sfr.to_csv('scop/scop_sf_represeq_lib_latest.csv', index=None)

## Process Data

In [2]:
cla = pd.read_csv('scop/scop-cla-latest.csv', usecols=['FA-DOMID', 'FA-PDBID', 'SF-DOMID', 'SF-PDBID', 'SF', 'FA'])
cla.shape

(35644, 6)

In [3]:
cla.head()

Unnamed: 0,FA-DOMID,FA-PDBID,SF-DOMID,SF-PDBID,SF,FA
0,8045703,3H8D,8091604,3H8D,3002524,4004627
1,8094330,6J56,8094331,6J56,3002524,4004627
2,8017835,3FKQ,8017836,3FKQ,3001156,4003986
3,8021315,1XHF,8033695,1XHF,3001156,4003632
4,8021787,1Y7P,8034167,1Y7P,3001156,4002481


In [4]:
cla['SF-DOMID'].nunique()

35644

In [5]:
cla = cla.drop_duplicates(subset=['FA-DOMID', 'FA-PDBID', 'SF-DOMID', 'SF-PDBID', 'SF', 'FA'])

In [6]:
cla.shape

(35644, 6)

In [7]:
cla.head()

Unnamed: 0,FA-DOMID,FA-PDBID,SF-DOMID,SF-PDBID,SF,FA
0,8045703,3H8D,8091604,3H8D,3002524,4004627
1,8094330,6J56,8094331,6J56,3002524,4004627
2,8017835,3FKQ,8017836,3FKQ,3001156,4003986
3,8021315,1XHF,8033695,1XHF,3001156,4003632
4,8021787,1Y7P,8034167,1Y7P,3001156,4002481


In [7]:
# cla['TP'] = cla['SCOPCLA'].apply(lambda x: x.split(',')[0].split('=')[1])
# cla['CL'] = cla['SCOPCLA'].apply(lambda x: x.split(',')[1].split('=')[1])
# cla['CF'] = cla['SCOPCLA'].apply(lambda x: x.split(',')[2].split('=')[1])
# cla['SF'] = cla['SCOPCLA'].apply(lambda x: x.split(',')[3].split('=')[1])
# cla['FA'] = cla['SCOPCLA'].apply(lambda x: x.split(',')[4].split('=')[1])

In [8]:
# cla.to_csv('scop/scop-cla-latest.csv', index=None)

In [9]:
cla = cla.reset_index()
cla.rename({'index':'uid'}, axis=1, inplace=True)

In [10]:
cla.head()

Unnamed: 0,uid,CL,CF,SF,FA
0,0,1000003,2001470,3002524,4004627
1,2,1000002,2000016,3001156,4003986
2,3,1000002,2000016,3001156,4003632
3,4,1000002,2000016,3001156,4002481
4,20,1000002,2000016,3001156,4002478


In [11]:
from itertools import combinations

In [12]:
%%time
uids = cla.uid.values.tolist()
pairs = list(combinations(uids, 2))

CPU times: user 1.22 s, sys: 413 ms, total: 1.63 s
Wall time: 1.8 s


In [13]:
len(uids), len(pairs)

(5945, 17668540)

In [14]:
pairs[:5]

[(0, 2), (0, 3), (0, 4), (0, 20), (0, 25)]

In [15]:
# pd.concat([cla[cla.uid==0].add_suffix('_query'),cla[cla.uid==1].add_suffix('_context')], axis=1)
# (cla[cla.uid==0].add_suffix('_query')).join(cla[cla.uid==1].add_suffix('_context')).values
# np.hstack([cla[cla.uid==0].values,cla[cla.uid==1].values])
cla[cla.uid==0].values.tolist()[0] + cla[cla.uid==2].values.tolist()[0]

[0, 1000003, 2001470, 3002524, 4004627, 2, 1000002, 2000016, 3001156, 4003986]

In [16]:
columns = [f'{col}_{suf}' for suf in ['query','context'] for col in cla.columns]
columns

['uid_query',
 'CL_query',
 'CF_query',
 'SF_query',
 'FA_query',
 'uid_context',
 'CL_context',
 'CF_context',
 'SF_context',
 'FA_context']

In [17]:
import multiprocessing

def double(a, b):
    return cla[cla.uid==a].values.tolist()[0] + cla[cla.uid==b].values.tolist()[0]

def driver_func():
    PROCESSES = 4
    with multiprocessing.Pool(PROCESSES) as pool:
        results = [pool.apply_async(double, q, c) for q, c in pairs]

    return results

In [18]:
%%time
results = driver_func()
results = pd.DataFrame(results, columns=columns)

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/Users/sasu/anaconda3/envs/llm-stock-screener/lib/python3.11/site-packages/IPython/core/magics/execution.py", line 1340, in time
    exec(code, glob, local_ns)
  File "<timed exec>", line 2, in <module>
  File "/Users/sasu/anaconda3/envs/llm-stock-screener/lib/python3.11/site-packages/pandas/core/frame.py", line 841, in __init__
    raise ValueError("DataFrame constructor not properly called!")
ValueError: DataFrame constructor not properly called!

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/sasu/anaconda3/envs/llm-stock-screener/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 2144, in showtraceback
    stb = self.InteractiveTB.structured_traceback(
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sasu/anaconda3/envs/llm-stock-screener/lib/python3.11/site-packages/IPython/core/ultratb.py", line 1435, in structured_traceback
    ret

In [None]:
def calculate_remote_homologs(sfi, fai, sfj, faj):
    if sfi == sfj and fai != faj:
        return True
    else:
        return False

In [None]:
%%time
pairs['remote_homologs'] = pairs.apply(lambda row: calculate_remote_homologs(row['SF_query'], row['FA_query'], row['SF_context'], row['FA_context']), axis=1)

## Pre Process

In [2]:
data = pd.read_csv('scop/scop_dataset.csv')
data.shape

(139296, 19)

In [3]:
data.head()

Unnamed: 0,uid_query,FA-DOMID_query,FA-PDBID_query,SF-DOMID_query,SF-PDBID_query,SF_query,FA_query,uid_context,FA-DOMID_context,FA-PDBID_context,SF-DOMID_context,SF-PDBID_context,SF_context,FA_context,remote_homologs,FA_seq_query,FA_seq_context,SF_seq_query,SF_seq_context
0,6034,8063839,5O5J,8063840,5O5J,3001397,4003744,6021,8063711,5UQI,8063712,5UQI,3001397,4003354,True,MAVVTMKQLLDSGAHFGHQTRRWNPKMKRFIFTDRNGIYIIDLQQT...,MNNTDLIHLIKHFMHNELKAVEEVIDSPLSEFANLIKVLQSCQGKV...,MAVVTMKQLLDSGAHFGHQTRRWNPKMKRFIFTDRNGIYIIDLQQT...,MNNTDLIHLIKHFMHNELKAVEEVIDSPLSEFANLIKVLQSCQGKV...
1,6034,8063839,5O5J,8063840,5O5J,3001397,4003744,6039,8081229,4S12,8081230,4S12,3001397,4003354,True,MAVVTMKQLLDSGAHFGHQTRRWNPKMKRFIFTDRNGIYIIDLQQT...,MKLGALISESRNPDTMDLDTLSTLEMLTRINDEDRKVPEAIRLVIP...,MAVVTMKQLLDSGAHFGHQTRRWNPKMKRFIFTDRNGIYIIDLQQT...,MKLGALISESRNPDTMDLDTLSTLEMLTRINDEDRKVPEAIRLVIP...
2,6034,8063839,5O5J,8063840,5O5J,3001397,4003744,6000,8024588,1JEO,8036967,1JEO,3001397,4003354,True,MAVVTMKQLLDSGAHFGHQTRRWNPKMKRFIFTDRNGIYIIDLQQT...,LEELDIVSNNILILKKFYTNDEWKNKLDSLIDRIIKAKKIFIFGVG...,MAVVTMKQLLDSGAHFGHQTRRWNPKMKRFIFTDRNGIYIIDLQQT...,LEELDIVSNNILILKKFYTNDEWKNKLDSLIDRIIKAKKIFIFGVG...
3,6034,8063839,5O5J,8063840,5O5J,3001397,4003744,6009,8031210,1WIW,8043588,1WIW,3001397,4000802,True,MAVVTMKQLLDSGAHFGHQTRRWNPKMKRFIFTDRNGIYIIDLQQT...,MRDLDREETYLVDRTGLALELRDLVGTGPVPGEAYPGPHAALGYGE...,MAVVTMKQLLDSGAHFGHQTRRWNPKMKRFIFTDRNGIYIIDLQQT...,GPVPGEAYPGPHAALGYGEGQFAALLSGLPDWGEEGTLFLLEGGYD...
4,6034,8063839,5O5J,8063840,5O5J,3001397,4003744,5997,8021436,1IAT,8033816,1IAT,3001397,4000799,True,MAVVTMKQLLDSGAHFGHQTRRWNPKMKRFIFTDRNGIYIIDLQQT...,AALTRDPQFQKLQQWYREHRSELNLRRLFDANKDRFNHFSLTLNTN...,MAVVTMKQLLDSGAHFGHQTRRWNPKMKRFIFTDRNGIYIIDLQQT...,VNKVLDKMKSFCQRVRSGDWKGYTGKTITDVINIGIGGSDLGPLMV...


In [4]:
data = data.sample(frac=1)
data.shape

(139296, 19)

In [5]:
data = data.reset_index()
data.rename({'index': 'pair_id'}, axis=1, inplace=True)

In [6]:
data.head()

Unnamed: 0,pair_id,uid_query,FA-DOMID_query,FA-PDBID_query,SF-DOMID_query,SF-PDBID_query,SF_query,FA_query,uid_context,FA-DOMID_context,FA-PDBID_context,SF-DOMID_context,SF-PDBID_context,SF_context,FA_context,remote_homologs,FA_seq_query,FA_seq_context,SF_seq_query,SF_seq_context
0,23139,11924,8028950,1VFG,8041329,1VFG,3000129,4003222,11927,8029746,1HUZ,8042125,1HUZ,3000129,4001836,True,VGQIAKEMGLRAYIVGGVVRDILLGKEVWDVDFVVEGNAIELAKEL...,RIPREEMLQMQDIVLNEVKKLDPEYIATVCGSFRRGAESSGDMDVL...,VGQIAKEMGLRAYIVGGVVRDILLGKEVWDVDFVVEGNAIELAKEL...,RIPREEMLQMQDIVLNEVKKLDPEYIATVCGSFRRGAESSGDMDVL...
1,18554,3530,8048302,3UFI,8102549,3UFI,3002736,4006044,13415,8067337,5D66,8067338,5D66,3000154,4000366,False,RAYKPIEIYGNINEVVNNVQETRAVGAAWGSDDRIGVTVEADEDNA...,DEYPLHMAAANDDIQLIKHILSQKTLIDARDETGSTALMVATRANN...,RAYKPIEIYGNINEVVNNVQETRAVGAAWGSDDRIGVTVEADEDNA...,DEYPLHMAAANDDIQLIKHILSQKTLIDARDETGSTALMVATRANN...
2,35966,34174,8096125,6L3A,8096126,6L3A,3001061,4001206,5864,8020531,1DXH,8032911,1DXH,3001396,4000794,False,GKACPYPFAEMERLEIHPEYNRLRDAGELGRVLMPYGGETWLATSW...,KPLHDISYAYLGDARNNMGNSLLLIGAKLGMDVRIAAPKALWPHDE...,GKACPYPFAEMERLEIHPEYNRLRDAGELGRVLMPYGGETWLATSW...,KPLHDISYAYLGDARNNMGNSLLLIGAKLGMDVRIAAPKALWPHDE...
3,121396,5837,8102161,7ANC,8102162,7ANC,3000038,4000088,1464,8065459,3BX4,8065460,3BX4,3001261,4002453,False,MQTNSKIYIAGHKGTAGTALVENLQKRGFNNLVLKTRQELDLVNQQ...,CPSGWSSYEGHCYKPFNEPKNWADAERFCKLQPKHSHLVSFQSAEE...,MQTNSKIYIAGHKGTAGTALVENLQKRGFNNLVLKTRQELDLVNQQ...,CPSGWSSYEGHCYKPFNEPKNWADAERFCKLQPKHSHLVSFQSAEE...
4,25085,568,8048542,3P4H,8092048,3P4H,3002555,4006141,14649,8027365,1SW3,8039744,1SW3,3000190,4003038,False,MPRFVVQEHHARRLHWDLRLEMDNVLKSWALPKGVPEKRGVKRLAI...,APRKFFVGGNWKMNGDKKSLGELIHTLNGAKLSADTEVVCGAPSIY...,MPRFVVQEHHARRLHWDLRLEMDNVLKSWALPKGVPEKRGVKRLAI...,APRKFFVGGNWKMNGDKKSLGELIHTLNGAKLSADTEVVCGAPSIY...


In [7]:
data.remote_homologs.value_counts()

remote_homologs
True     69648
False    69648
Name: count, dtype: int64

In [10]:
data[data.remote_homologs == True].sample(10)

Unnamed: 0,pair_id,uid_query,FA-DOMID_query,FA-PDBID_query,SF-DOMID_query,SF-PDBID_query,SF_query,FA_query,uid_context,FA-DOMID_context,FA-PDBID_context,SF-DOMID_context,SF-PDBID_context,SF_context,FA_context,remote_homologs,FA_seq_query,FA_seq_context,SF_seq_query,SF_seq_context
20375,117190,10891,8029339,2G72,8041718,2G72,3000118,4003307,10824,8022132,2IH2,8034512,2IH2,3000118,4003613,True,PGQAAVASAYQRFEPRAYLRNNYAPPRGDLCNPNGVGPWKLRCLAQ...,VETPPEVVDFMVSLAEAPRGGRVLEPACAHGPFLRAFREAHGTAYR...,PGQAAVASAYQRFEPRAYLRNNYAPPRGDLCNPNGVGPWKLRCLAQ...,VETPPEVVDFMVSLAEAPRGGRVLEPACAHGPFLRAFREAHGTAYR...
109589,95396,17887,8065753,5CG0,8065754,5CG0,3000313,4003184,18082,8098077,6SAV,8098078,6SAV,3000313,4003138,True,RRFPDDFLFGTATASYQIEGAWDEDGKGENIWDYMVHNTPEVIRDL...,ATSDDWKSKAIYQLLTDRFGRADDSTSNCSNLSNYCGGTYEGITKH...,RRFPDDFLFGTATASYQIEGAWDEDGKGENIWDYMVHNTPEVIRDL...,ATSDDWKSKAIYQLLTDRFGRADDSTSNCSNLSNYCGGTYEGITKH...
63009,82529,24458,8031938,1D2M,8044316,1D2M,3002019,4000282,24462,8031943,1HEI,8044321,1HEI,3002019,4000400,True,TFRYRGPSPKGDQPKAIAGLVEALRDGERFVTLLGATGTGKTVTMA...,NSSPPAVPQSFQVAHLHAPTGSGKSTKVPAAYAAKGYKVLVLNPSV...,TFRYRGPSPKGDQPKAIAGLVEALRDGERFVTLLGATGTGKTVTMA...,NSSPPAVPQSFQVAHLHAPTGSGKSTKVPAAYAAKGYKVLVLNPSV...
19907,11538,18619,8020264,1VQO,8032644,1VQO,3000334,4001752,18689,8082420,5MYJ,8082421,5MYJ,3000334,4001761,True,PSSNGPLEGTRGKLKNKPRDRGTSPPQRAVEEFDDGEKVHLKIDPS...,MNLIESINAAQLRTDIPDFRPGDTVRVHAKVVEGTRERIQIFEGVV...,PSSNGPLEGTRGKLKNKPRDRGTSPPQRAVEEFDDGEKVHLKIDPS...,MNLIESINAAQLRTDIPDFRPGDTVRVHAKVVEGTRERIQIFEGVV...
58108,114779,12668,8103209,7NHN,8103210,7NHN,3000135,4001909,12415,8029350,1UE1,8041729,1UE1,3000135,4000346,True,RNQRKVYTGRVVSDKMDKTITVVVETYKKHGLYGKRVKYSKKFKAH...,GDTTITIVGNLTADPELRFTPSGAAVANFTVASTPRIYDRQTGEWK...,RNQRKVYTGRVVSDKMDKTITVVVETYKKHGLYGKRVKYSKKFKAH...,GDTTITIVGNLTADPELRFTPSGAAVANFTVASTPRIYDRQTGEWK...
79882,37705,3258,8070987,2DMT,8070988,2DMT,3000001,4000291,3109,8002271,1ETO,8002272,1ETO,3000001,4000189,True,GEPGTKAKKGRRSRTVFTELQLMGLEKRFEKQKYLSTPDRIDLAES...,NDLYELVLAEVEQPLLDMVMQYTRGNQTRAALMMGINRGTLRKKLK...,GEPGTKAKKGRRSRTVFTELQLMGLEKRFEKQKYLSTPDRIDLAES...,NDLYELVLAEVEQPLLDMVMQYTRGNQTRAALMMGINRGTLRKKLK...
128906,51889,5763,8087300,5Y1D,8087301,5Y1D,3000038,4000088,5615,8066945,5U2W,8066946,5U2W,3000038,4000029,True,MILVTGALGQIGTELVLALQEKYGNDKIIASDLKEPENYHCKFEKC...,NRLQGKRALVTGGSRGIGAAIAKRLAADGADVAITYEKSAERAQAV...,MILVTGALGQIGTELVLALQEKYGNDKIIASDLKEPENYHCKFEKC...,NRLQGKRALVTGGSRGIGAAIAKRLAADGADVAITYEKSAERAQAV...
13053,27941,7683,8107519,7Q0F,8107520,7Q0F,3000068,4003647,7526,8027258,2D8Y,8039637,2D8Y,3000068,4003634,True,AHENVWFSHPRNFGKGSRQCRHCSSHSGLIRKYGLDLCRQCFREKA...,RCSYCNNKLSLGTYASLHGRIYCKPHFNQLFKSKGNYDEGFG,AHENVWFSHPRNFGKGSRQCRHCSSHSGLIRKYGLDLCRQCFREKA...,RCSYCNNKLSLGTYASLHGRIYCKPHFNQLFKSKGNYDEGFG
14876,119731,32305,8027633,1T62,8040012,1T62,3001015,4001910,32319,8060288,5J3E,8060289,5J3E,3001015,4002214,True,MLKNVEVFWQNFLDKHELDMLMPDVWMFGDGSSEMGNRLGQLVVSG...,SSHWLMKSEPESRLEKGVDVKFSIEDLKAQPKQTTCWDGVRNYQAR...,MLKNVEVFWQNFLDKHELDMLMPDVWMFGDGSSEMGNRLGQLVVSG...,SSHWLMKSEPESRLEKGVDVKFSIEDLKAQPKQTTCWDGVRNYQAR...
61002,95565,10551,8076478,3D23,8076479,3D23,3000114,4002259,10453,8029749,2CGA,8042128,2CGA,3000114,4000286,True,SGIVKMVSPTSKIEPCIVSVTYGSMTLNGLWLDDKVYCPRHVICSS...,CGVPAIQPVLSGLSRIVNGEEAVPGSWPWQVSLQDKTGFHFCGGSL...,SGIVKMVSPTSKIEPCIVSVTYGSMTLNGLWLDDKVYCPRHVICSS...,CGVPAIQPVLSGLSRIVNGEEAVPGSWPWQVSLQDKTGFHFCGGSL...


In [11]:
import jsonlines, json

In [12]:
from sklearn.model_selection import train_test_split

In [14]:
train, test = train_test_split(data, train_size=.8, stratify=data[['remote_homologs']])

In [15]:
train.shape, test.shape

((111436, 20), (27860, 20))

In [18]:
test.remote_homologs.value_counts()

remote_homologs
False    13930
True     13930
Name: count, dtype: int64

In [21]:
with jsonlines.open("scop/scop_train_dataset.json", 'w') as w:
    for idx, row in train.iterrows():
        record = {
            'fa_query': row['FA_seq_query'],
            'fa_context': row['FA_seq_context'],
            'sf_query': row['SF_seq_query'],
            'sf_context': row['SF_seq_context'],
            'output': row['remote_homologs']
        }
        w.write(record)

In [22]:
with jsonlines.open("scop/scop_test_dataset.json", 'w') as w:
    for idx, row in test.iterrows():
        record = {
            'fa_query': row['FA_seq_query'],
            'fa_context': row['FA_seq_context'],
            'sf_query': row['SF_seq_query'],
            'sf_context': row['SF_seq_context'],
            'output': row['remote_homologs']
        }
        w.write(record)

In [32]:
test_ = jsonlines.open("scop/scop_test_dataset.json", 'r').iter()