In [5]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Draw
import matplotlib.pyplot as plt
import os
import torch
import json


In [4]:
from chembl_webresource_client.new_client import new_client

available_resources = [resource for resource in dir(new_client) if not resource.startswith('_')]
print(available_resources)



In [4]:
from chembl_webresource_client.new_client import new_client

molecule = new_client.molecule
mols = molecule.filter(pref_name__iexact='aspirin')
mols



## DRUG - PROTEIN PAIR

In [8]:
# pip install chembl-webresource-client pandas tqdm rdkit-pypi
from chembl_webresource_client.new_client import new_client
import pandas as pd
from tqdm import tqdm
import random
from collections import defaultdict

# ---------- knobs (tune scale here) ----------
MAX_TARGETS            = 20      # ~80 human single proteins
MIN_ACTS_PER_TARGET    = 80      # skip sparse targets
MAX_ACTS_PER_TARGET    = 400     # cap per target to keep it light
MAKE_BALANCED_1TO1     = True    # add random negatives to balance
RAND_SEED              = 42
# --------------------------------------------

random.seed(RAND_SEED)

# Sanity: show that 'target_component' exists (not 'component')
# print('has target_component?', hasattr(new_client, 'target_component'))

target_api   = new_client.target
act_api      = new_client.activity
assay_api    = new_client.assay
mol_api      = new_client.molecule
tcomp_api    = new_client.target_component

# 1) pick human single-protein targets
targets = target_api.filter(target_type='SINGLE PROTEIN', organism='Homo sapiens') \
                    .only(['target_chembl_id','pref_name','target_components'])

print(targets)
print(type(targets))

# Shuffle for diversity and iterate
# targets = list(targets)


[{'pref_name': 'Maltase-glucoamylase', 'target_chembl_id': 'CHEMBL2074', 'target_components': [{'accession': 'O43451', 'component_description': 'Maltase-glucoamylase', 'component_id': 434, 'component_type': 'PROTEIN', 'relationship': 'SINGLE PROTEIN', 'target_component_synonyms': [{'component_synonym': '3.2.1.20', 'syn_type': 'EC_NUMBER'}, {'component_synonym': 'Alpha-1,4-glucosidase', 'syn_type': 'UNIPROT'}, {'component_synonym': 'Maltase-glucoamylase', 'syn_type': 'UNIPROT'}, {'component_synonym': 'MGA', 'syn_type': 'GENE_SYMBOL_OTHER'}, {'component_synonym': 'MGAM', 'syn_type': 'GENE_SYMBOL'}, {'component_synonym': 'MGAML', 'syn_type': 'GENE_SYMBOL_OTHER'}, {'component_synonym': 'Synonyms=MGA', 'syn_type': 'GENE_SYMBOL_OTHER'}], 'target_component_xrefs': [{'xref_id': 'O43451', 'xref_name': None, 'xref_src_db': 'AlphaFoldDB'}, {'xref_id': 'O43451', 'xref_name': None, 'xref_src_db': 'ExpressionAtlas'}, {'xref_id': 'GO:0005886', 'xref_name': 'plasma membrane', 'xref_src_db': 'GoCompone

In [10]:
print("TOTAL SINGLE PROTEIN organism 'Homo sapiens': ",len(targets))

sample_target = targets[0]
print('sample :',sample_target)
print(type(sample_target))

TOTAL SINGLE PROTEIN organism 'Homo sapiens':  4387
sample : {'pref_name': 'Maltase-glucoamylase', 'target_chembl_id': 'CHEMBL2074', 'target_components': [{'accession': 'O43451', 'component_description': 'Maltase-glucoamylase', 'component_id': 434, 'component_type': 'PROTEIN', 'relationship': 'SINGLE PROTEIN', 'target_component_synonyms': [{'component_synonym': '3.2.1.20', 'syn_type': 'EC_NUMBER'}, {'component_synonym': 'Alpha-1,4-glucosidase', 'syn_type': 'UNIPROT'}, {'component_synonym': 'Maltase-glucoamylase', 'syn_type': 'UNIPROT'}, {'component_synonym': 'MGA', 'syn_type': 'GENE_SYMBOL_OTHER'}, {'component_synonym': 'MGAM', 'syn_type': 'GENE_SYMBOL'}, {'component_synonym': 'MGAML', 'syn_type': 'GENE_SYMBOL_OTHER'}, {'component_synonym': 'Synonyms=MGA', 'syn_type': 'GENE_SYMBOL_OTHER'}], 'target_component_xrefs': [{'xref_id': 'O43451', 'xref_name': None, 'xref_src_db': 'AlphaFoldDB'}, {'xref_id': 'O43451', 'xref_name': None, 'xref_src_db': 'ExpressionAtlas'}, {'xref_id': 'GO:0005886

In [11]:
PICKED_TARGETS = []
for t in targets:
    if len(PICKED_TARGETS) >= MAX_TARGETS:
        break
    PICKED_TARGETS.append({'target_chembl_id': t['target_chembl_id'],
                   'pref_name': t.get('pref_name', None),
                   'accession':t['target_components'][0]['accession'],
                   'component_id':t['target_components'][0]['component_id'],
                   'component_type':t['target_components'][0]['component_type'],
                   'relationship':t['target_components'][0]['relationship'],

                   })
PICKED_TARGETS

[{'target_chembl_id': 'CHEMBL2074',
  'pref_name': 'Maltase-glucoamylase',
  'accession': 'O43451',
  'component_id': 434,
  'component_type': 'PROTEIN',
  'relationship': 'SINGLE PROTEIN'},
 {'target_chembl_id': 'CHEMBL1971',
  'pref_name': 'Sulfonylurea receptor 2',
  'accession': 'O60706',
  'component_id': 294,
  'component_type': 'PROTEIN',
  'relationship': 'SINGLE PROTEIN'},
 {'target_chembl_id': 'CHEMBL1827',
  'pref_name': 'Phosphodiesterase 5A',
  'accession': 'O76074',
  'component_id': 124,
  'component_type': 'PROTEIN',
  'relationship': 'SINGLE PROTEIN'},
 {'target_chembl_id': 'CHEMBL1859',
  'pref_name': 'Voltage-gated T-type calcium channel alpha-1H subunit',
  'accession': 'O95180',
  'component_id': 167,
  'component_type': 'PROTEIN',
  'relationship': 'SINGLE PROTEIN'},
 {'target_chembl_id': 'CHEMBL202',
  'pref_name': 'Dihydrofolate reductase',
  'accession': 'P00374',
  'component_id': 396,
  'component_type': 'PROTEIN',
  'relationship': 'SINGLE PROTEIN'},
 {'targ

In [None]:
picked_sample_target = PICKED_TARGETS[3]
# picked_sample_target['accession']

## Get sequence - organism from accession id: TESTER

In [14]:
all_seq = tcomp_api.filter(accession= 'B2RXH2').only(['organism','sequence'])
print("leng" ,len(all_seq))
print(all_seq)

leng 1
[{'organism': 'Homo sapiens', 'sequence': 'MKSVHSSPQNTSHTIMTFYPTMEEFADFNTYVAYMESQGAHQAGLAKVIPPKEWKARQMYDDIEDILIATPLQQVTSGQGGVFTQYHKKKKAMRVGQYRRLANSKKYQTPPHQNFADLEQRYWKSHPGNPPIYGADISGSLFEESTKQWNLGHLGTILDLLEQECGVVIEGVNTPYLYFGMWKTTFAWHTEDMDLYSINYLHFGEPKTWYVVPPEHGQHLERLARELFPDISRGCEAFLRHKVALISPTVLKENGIPFNCMTQEAGEFMVTFPYGYHAGFNHGFNCAEAINFATPRWIDYGKMASQCSCGESTVTFSMDPFVRIVQPESYELWKHRQDLAIVEHTEPRVAESQELSNWRDDIVLRRAALGLRLLPNLTAQCPTQPVSSGHCYNPKGCGTDAVPGSAFQSSAYHTQTQSLTLGMSARVLLPSTGSWGSGRGRGRGQGQGRGCSRGRGHGCCTRELGTEEPTVQPASKRRLLMGTRSRAQGHRPQLPLANDLMTNLSL'}]


In [9]:
homosep_seq = tcomp_api.filter(organism='Homo sapiens',target_chembl_id=picked_sample_target['component_id']).only(['organism','sequence'])
homosep_seq

[{'organism': 'Homo sapiens', 'sequence': 'MEPWPLLLLFSLCSAGLVLGSEHETRLVAKLFKDYSSVVRPVEDHRQVVEVTVGLQLIQLINVDEVNQIVTTNVRLKQQWVDYNLKWNPDDYGGVKKIHIPSEKIWRPDLVLYNNADGDFAIVKFTKVLLQYTGHITWTPPAIFKSYCEIIVTHFPFDEQNCSMKLGTWTYDGSVVAINPESDQPDLSNFMESGEWVIKESRGWKHSVTYSCCPDTPYLDITYHFVMQRLPLYFIVNVIIPCLLFSFLTGLVFYLPTDSGEKMTLSISVLLSLTVFLLVIVELIPSTSSAVPLIGKYMLFTMVFVIASIIITVIVINTHHRSPSTHVMPNWVRKVFIDTIPNIMFFSTMKRPSREKQDKKIFTEDIDISDISGKPGPPPMGFHSPLIKHPEVKSAIEGIKYIAETMKSDQESNNAAAEWKYVAMVMDHILLGVFMLVCIIGTLAVFAGRLIELNQQG'}, {'organism': 'Homo sapiens', 'sequence': 'MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLSPDDIEQWFTEDPGPDEAPRMPEAAPPVAPAPAAPTPAAPAPAPSWPLSSSVPSQKTYQGSYGFRLGFLHSGTAKSVTCTYSPALNKMFCQLAKTCPVQLWVDSTPPPGTRVRAMAIYKQSQHMTEVVRRCPHHERCSDSDGLAPPQHLIRVEGNLRVEYLDDRNTFRHSVVVPYEPPEVGSDCTTIHYNYMCNSSCMGGMNRRPILTIITLEDSSGNLLGRNSFEVRVCACPGRDRRTEEENLRKKGEPHHELPPGSTKRALPNNTSSSPQPKKKPLDGEYFTLQIRGRERFEMFRELNEALELKDAQAGKEPGGSRAHSSHLKSKKGQSTSRHKKLMFKTEGPDSD'}, {'organism': 'Homo sapiens', 'sequence': 'MRARPRPRPLWATVL

## PULL amino seq info list based on accession list

In [16]:
# pull amino sequence info
# Accession = UniProtein ID
def fetch_sequence_bundle(accession_ID):
    rows = tcomp_api.filter(accession = accession_ID, organism='Homo sapiens').only(['organism','sequence'])  # accession is UniProt
    
    if len(rows) == 1:
        rows = rows[0]
    else: 
        print("SO THERE IS A CASE WHERE 1 ACCESSION is >1 protein sequence???")

    print(f'AMINO SEQUENCE of Accession {accession_ID}: ', rows['sequence'])
    # organism = sorted({r.get('organism') for r in rows if r.get('organism')})
    # seqs       = [r.get('sequence') for r in rows if r.get('sequence')]
    # Prefer the longest sequence (single protein targets normally have one)
    # seq = max(seqs, key=len) if seqs else None
    # return organism, seq
    return rows

# 2) pull binding activities with strong QC & pChEMBL present
def fetch_clean_activities(tid):
    # Assay filter first (faster if you ever expand): relationship D, binding B
    assays = assay_api.filter(target_chembl_id=tid,
                              relationship_type='D',
                              assay_type='B').only(['assay_chembl_id'])
    assay_ids = [a['assay_chembl_id'] for a in assays]
    # print(assay_ids)
    # print("assay_ids: ",assay_ids)

    if not assay_ids:
        return pd.DataFrame()

    # Chunk assay_ids with __in; and require pChEMBL present & valid
    CHUNK = 200
    recs = []
    for i in range(0, len(assay_ids), CHUNK):
        chunk = assay_ids[i:i+CHUNK]
        # print("chunk:",chunk)
        acts = act_api.filter(assay_chembl_id__in=chunk, #";".join(chunk)
                              pchembl_value__isnull=False,
                              data_validity_comment__isnull=True) \
                      .only(['molecule_chembl_id','pchembl_value','standard_type',
                             'standard_units','standard_relation'])
        # print("acts:",acts)
        recs.extend(acts)

    if not recs:
        return pd.DataFrame()
    # print("RECS: ",recs)
    df = pd.DataFrame.from_records(recs)
    # Keep exact values (relation "=") if present; pChEMBL already standardizes, but keep clean
    if 'standard_relation' in df.columns:
        df = df[(df['standard_relation'].isna()) | (df['standard_relation'] == '=')]
    # Deduplicate molecule per target by best pChEMBL
    df = df.sort_values('pchembl_value', ascending=False) \
           .drop_duplicates(subset=['molecule_chembl_id'])
    return df[['molecule_chembl_id','pchembl_value']]

In [26]:

# 3) iterate targets → compute per-target median, label, keep label==1
all_pos = []
target_info = {}
for t in tqdm(PICKED_TARGETS, desc='Targets'):
    accession = t['accession']
    # print(t)
    org_seq = fetch_sequence_bundle(accession)
    # print(df,a)
    target_info[accession] = org_seq['sequence']
    t['sequence'] = org_seq['sequence']
    t['organism'] = org_seq['organism']
print("length:", len(PICKED_TARGETS))
# print('PICKED_TARGETS', json.dumps(PICKED_TARGETS,indent=2))
print('picked 2', json.dumps(PICKED_TARGETS[1],indent=2))
print('picked 16', json.dumps(PICKED_TARGETS[15],indent=2))
#     if df.shape[0] < MIN_ACTS_PER_TARGET:
#         continue
#     df = df.head(MAX_ACTS_PER_TARGET)  # cap
#     print(df.head())
#     df['pchembl_value'] = df['pchembl_value'].apply(lambda x: round(float(x),2))
#     med = df['pchembl_value'].median()
#     print("==== MEDIAN : ", med)
#     df['label'] = (df['pchembl_value'] > med).astype(int)
#     df = df[df['label'] == 1]  # keep above-median only (DeepTarget-style)
#     print("check df",df.head())
#     if df.empty:
#         continue
#     accs, seq = fetch_sequence_bundle(tid)
#     # Filter out sequences with unknown 'X'
#     if seq and ('X' in seq):
#         continue

#     for r in df.itertuples(index=False):
#         all_pos.append({
#             'target_chembl_id': tid,
#             'target_pref_name': t['pref_name'],
#             'uniprot_ids': ";".join(accs) if accs else None,
#             'protein_sequence': seq,
#             'molecule_chembl_id': r.molecule_chembl_id,
#             'pchembl_value': r.pchembl_value,
#             'label': 1
#         })
# print(df.head())



Targets:   0%|          | 0/20 [00:00<?, ?it/s]

Targets:   5%|▌         | 1/20 [00:01<00:19,  1.02s/it]

AMINO SEQUENCE of Accession O43451:  MARKKLKKFTTLEIVLSVLLLVLFIISIVLIVLLAKESLKSTAPDPGTTGTPDPGTTGTPDPGTTGTTHARTTGPPDPGTTGTTPVSAECPVVNELERINCIPDQPPTKATCDQRGCCWNPQGAVSVPWCYYSKNHSYHVEGNLVNTNAGFTARLKNLPSSPVFGSNVDNVLLTAEYQTSNRFHFKLTDQTNNRFEVPHEHVQSFSGNAAASLTYQVEISRQPFSIKVTRRSNNRVLFDSSIGPLLFADQFLQLSTRLPSTNVYGLGEHVHQQYRHDMNWKTWPIFNRDTTPNGNGTNLYGAQTFFLCLEDASGLSFGVFLMNSNAMEVVLQPAPAITYRTIGGILDFYVFLGNTPEQVVQEYLELIGRPALPSYWALGFHLSRYEYGTLDNMREVVERNRAAQLPYDVQHADIDYMDERRDFTYDSVDFKGFPEFVNELHNNGQKLVIIVDPAISNNSSSSKPYGPYDRGSDMKIWVNSSDGVTPLIGEVWPGQTVFPDYTNPNCAVWWTKEFELFHNQVEFDGIWIDMNEVSNFVDGSVSGCSTNNLNNPPFTPRILDGYLFCKTLCMDAVQHWGKQYDIHNLYGYSMAVATAEAAKTVFPNKRSFILTRSTFAGSGKFAAHWLGDNTATWDDLRWSIPGVLEFNLFGIPMVGPDICGFALDTPEELCRRWMQLGAFYPFSRNHNGQGYKDQDPASFGADSLLLNSSRHYLNIRYTLLPYLYTLFFRAHSRGDTVARPLLHEFYEDNSTWDVHQQFLWGPGLLITPVLDEGAEKVMAYVPDAVWYDYETGSQVRWRKQKVEMELPGDKIGLHLRGGYIFPTQQPNTTTLASRKNPLGLIIALDENKEAKGELFWDNGETKDTVANKVYLLCEFSVTQNRLEVNISQSTYKDPNNLAFNEIKILGTEEPSNVTVKHNGVPSQTSPTVTYDSNLKVAIITDIDLLLGEAYTVEWSIKIRDEEK

Targets:  10%|█         | 2/20 [00:01<00:15,  1.19it/s]

AMINO SEQUENCE of Accession O60706:  MSLSFCGNNISSYNINDGVLQNSCFVDALNLVPHVFLLFITFPILFIGWGSQSSKVQIHHNTWLHFPGHNLRWILTFALLFVHVCEIAEGIVSDSRRESRHLHLFMPAVMGFVATTTSIVYYHNIETSNFPKLLLALFLYWVMAFITKTIKLVKYCQSGLDISNLRFCITGMMVILNGLLMAVEINVIRVRRYVFFMNPQKVKPPEDLQDLGVRFLQPFVNLLSKATYWWMNTLIISAHKKPIDLKAIGKLPIAMRAVTNYVCLKDAYEEQKKKVADHPNRTPSIWLAMYRAFGRPILLSSTFRYLADLLGFAGPLCISGIVQRVNETQNGTNNTTGISETLSSKEFLENAYVLAVLLFLALILQRTFLQASYYVTIETGINLRGALLAMIYNKILRLSTSNLSMGEMTLGQINNLVAIETNQLMWFLFLCPNLWAMPVQIIMGVILLYNLLGSSALVGAAVIVLLAPIQYFIATKLAEAQKSTLDYSTERLKKTNEILKGIKLLKLYAWEHIFCKSVEETRMKELSSLKTFALYTSLSIFMNAAIPIAAVLATFVTHAYASGNNLKPAEAFASLSLFHILVTPLFLLSTVVRFAVKAIISVQKLNEFLLSDEIGDDSWRTGESSLPFESCKKHTGVQPKTINRKQPGRYHLDSYEQSTRRLRPAETEDIAIKVTNGYFSWGSGLATLSNIDIRIPTGQLTMIVGQVGCGKSSLLLAILGEMQTLEGKVHWSNVNESEPSFEATRSRNRYSVAYAAQKPWLLNATVEENITFGSPFNKQRYKAVTDACSLQPDIDLLPFGDQTEIGERGINLSGGQRQRICVARALYQNTNIVFLDDPFSALDIHLSDHLMQEGILKFLQDDKRTLVLVTHKLQYLTHADWIIAMKDGSVLREGTLKDIQTKDVELYEHWKTLMNRQDQELEKDMEADQTTLERKTLRRAMYSREAKAQMEDEDEEEEEEEDE

Targets:  15%|█▌        | 3/20 [00:02<00:13,  1.28it/s]

AMINO SEQUENCE of Accession O76074:  MERAGPSFGQQRQQQQPQQQKQQQRDQDSVEAWLDDHWDFTFSYFVRKATREMVNAWFAERVHTIPVCKEGIRGHTESCSCPLQQSPRADNSAPGTPTRKISASEFDRPLRPIVVKDSEGTVSFLSDSEKKEQMPLTPPRFDHDEGDQCSRLLELVKDISSHLDVTALCHKIFLHIHGLISADRYSLFLVCEDSSNDKFLISRLFDVAEGSTLEEVSNNCIRLEWNKGIVGHVAALGEPLNIKDAYEDPRFNAEVDQITGYKTQSILCMPIKNHREEVVGVAQAINKKSGNGGTFTEKDEKDFAAYLAFCGIVLHNAQLYETSLLENKRNQVLLDLASLIFEEQQSLEVILKKIAATIISFMQVQKCTIFIVDEDCSDSFSSVFHMECEELEKSSDTLTREHDANKINYMYAQYVKNTMEPLNIPDVSKDKRFPWTTENTGNVNQQCIRSLLCTPIKNGKKNKVIGVCQLVNKMEENTGKVKPFNRNDEQFLEAFVIFCGLGIQNTQMYEAVERAMAKQMVTLEVLSYHASAAEEETRELQSLAAAVVPSAQTLKITDFSFSDFELSDLETALCTIRMFTDLNLVQNFQMKHEVLCRWILSVKKNYRKNVAYHNWRHAFNTAQCMFAALKAGKIQNKLTDLEILALLIAALSHDLDHRGVNNSYIQRSEHPLAQLYCHSIMEHHHFDQCLMILNSPGNQILSGLSIEEYKTTLKIIKQAILATDLALYIKRRGEFFELIRKNQFNLEDPHQKELFLAMLMTACDLSAITKPWPIQQRIAELVATEFFDQGDRERKELNIEPTDLMNREKKNKIPSMQVGFIDAICLQLYEALTHVSEDCFPLLDGCRKNRQKWQALAEQQEKMLINGESGQAKRN


Targets:  20%|██        | 4/20 [00:03<00:12,  1.32it/s]

AMINO SEQUENCE of Accession O95180:  MTEGARAADEVRVPLGAPPPGPAALVGASPESPGAPGREAERGSELGVSPSESPAAERGAELGADEEQRVPYPALAATVFFCLGQTTRPRSWCLRLVCNPWFEHVSMLVIMLNCVTLGMFRPCEDVECGSERCNILEAFDAFIFAFFAVEMVIKMVALGLFGQKCYLGDTWNRLDFFIVVAGMMEYSLDGHNVSLSAIRTVRVLRPLRAINRVPSMRILVTLLLDTLPMLGNVLLLCFFVFFIFGIVGVQLWAGLLRNRCFLDSAFVRNNNLTFLRPYYQTEEGEENPFICSSRRDNGMQKCSHIPGRRELRMPCTLGWEAYTQPQAEGVGAARNACINWNQYYNVCRSGDSNPHNGAINFDNIGYAWIAIFQVITLEGWVDIMYYVMDAHSFYNFIYFILLIIVGSFFMINLCLVVIATQFSETKQRESQLMREQRARHLSNDSTLASFSEPGSCYEELLKYVGHIFRKVKRRSLRLYARWQSRWRKKVDPSAVQGQGPGHRQRRAGRHTASVHHLVYHHHHHHHHHYHFSHGSPRRPGPEPGACDTRLVRAGAPPSPPSPGRGPPDAESVHSIYHADCHIEGPQERARVAHAAATAAASLRLATGLGTMNYPTILPSGVGSGKGSTSPGPKGKWAGGPPGTGGHGPLSLNSPDPYEKIPHVVGEHGLGQAPGHLSGLSVPCPLPSPPAGTLTCELKSCPYCTRALEDPEGELSGSESGDSDGRGVYEFTQDVRHGDRWDPTRPPRATDTPGPGPGSPQRRAQQRAAPGEPGWMGRLWVTFSGKLRRIVDSKYFSRGIMMAILVNTLSMGVEYHEQPEELTNALEISNIVFTSMFALEMLLKLLACGPLGYIRNPYNIFDGIIVVISVWEIVGQADGGLSVLRTFRLLRVLKLVRFLPALRRQLVVLVKTMDNVATFCTLLMLFIFIFSILGMHLFGCKFSLKTDTGDTVPDRKNFDSLLWA

Targets:  25%|██▌       | 5/20 [00:03<00:11,  1.36it/s]

AMINO SEQUENCE of Accession P00374:  MVGSLNCIVAVSQNMGIGKNGDLPWPPLRNEFRYFQRMTTTSSVEGKQNLVIMGKKTWFSIPEKNRPLKGRINLVLSRELKEPPQGAHFLSRSLDDALKLTEQPELANKVDMVWIVGGSSVYKEAMNHPGHLKLFVTRIMQDFESDTFFPEIDLEKYKLLPEYPGVLSDVQEEKGIKYKFEVYEKND


Targets:  30%|███       | 6/20 [00:04<00:10,  1.38it/s]

AMINO SEQUENCE of Accession P00519:  MLEICLKLVGCKSKKGLSSSSSCYLEEALQRPVASDFEPQGLSEAARWNSKENLLAGPSENDPNLFVALYDFVASGDNTLSITKGEKLRVLGYNHNGEWCEAQTKNGQGWVPSNYITPVNSLEKHSWYHGPVSRNAAEYLLSSGINGSFLVRESESSPGQRSISLRYEGRVYHYRINTASDGKLYVSSESRFNTLAELVHHHSTVADGLITTLHYPAPKRNKPTVYGVSPNYDKWEMERTDITMKHKLGGGQYGEVYEGVWKKYSLTVAVKTLKEDTMEVEEFLKEAAVMKEIKHPNLVQLLGVCTREPPFYIITEFMTYGNLLDYLRECNRQEVNAVVLLYMATQISSAMEYLEKKNFIHRDLAARNCLVGENHLVKVADFGLSRLMTGDTYTAHAGAKFPIKWTAPESLAYNKFSIKSDVWAFGVLLWEIATYGMSPYPGIDLSQVYELLEKDYRMERPEGCPEKVYELMRACWQWNPSDRPSFAEIHQAFETMFQESSISDEVEKELGKQGVRGAVSTLLQAPELPTKTRTSRRAAEHRDTTDVPEMPHSKGQGESDPLDHEPAVSPLLPRKERGPPEGGLNEDERLLPKDKKTNLFSALIKKKKKTAPTPPKRSSSFREMDGQPERRGAGEEEGRDISNGALAFTPLDTADPAKSPKPSNGAGVPNGALRESGGSGFRSPHLWKKSSTLTSSRLATGEEEGGGSSSKRFLRSCSASCVPHGAKDTEWRSVTLPRDLQSTGRQFDSSTFGGHKSEKPALPRKRAGENRSDQVTRGTVTPPPRLVKKNEEAADEVFKDIMESSPGSSPPNLTPKPLRRQVTVAPASGLPHKEEAGKGSALGTPAAAEPVTPTSKAGSGAPGGTSKGPAEESRVRRHKHSSESPGRDKGKLSRLKPAPPPPPAASAGKAGGKPSQSPSQEAAGEAVLGAKTKATSLVDAVNSDAAKPSQPGEGLKKPVLPAT

Targets:  35%|███▌      | 7/20 [00:05<00:09,  1.40it/s]

AMINO SEQUENCE of Accession P00533:  MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFEDHFLSLQRMFNNCEVVLGNLEITYVQRNYDLSFLKTIQEVAGYVLIALNTVERIPLENLQIIRGNMYYENSYALAVLSNYDANKTGLKELPMRNLQEILHGAVRFSNNPALCNVESIQWRDIVSSDFLSNMSMDFQNHLGSCQKCDPSCPNGSCWGAGEENCQKLTKIICAQQCSGRCRGKSPSDCCHNQCAAGCTGPRESDCLVCRKFRDEATCKDTCPPLMLYNPTTYQMDVNPEGKYSFGATCVKKCPRNYVVTDHGSCVRACGADSYEMEEDGVRKCKKCEGPCRKVCNGIGIGEFKDSLSINATNIKHFKNCTSISGDLHILPVAFRGDSFTHTPPLDPQELDILKTVKEITGFLLIQAWPENRTDLHAFENLEIIRGRTKQHGQFSLAVVSLNITSLGLRSLKEISDGDVIISGNKNLCYANTINWKKLFGTSGQKTKIISNRGENSCKATGQVCHALCSPEGCWGPEPRDCVSCRNVSRGRECVDKCNLLEGEPREFVENSECIQCHPECLPQAMNITCTGRGPDNCIQCAHYIDGPHCVKTCPAGVMGENNTLVWKYADAGHVCHLCHPNCTYGCTGPGLEGCPTNGPKIPSIATGMVGALLLLLVVALGIGLFMRRRHIVRKRTLRRLLQERELVEPLTPSGEAPNQALLRILKETEFKKIKVLGSGAFGTVYKGLWIPEGEKVKIPVAIKELREATSPKANKEILDEAYVMASVDNPHVCRLLGICLTSTVQLITQLMPFGCLLDYVREHKDNIGSQYLLNWCVQIAKGMNYLEDRRLVHRDLAARNVLVKTPQHVKITDFGLAKLLGAEEKEYHAEGGKVPIKWMALESILHRIYTHQSDVWSYGVTVWELMTFGSKPYDGIPASEISSILEKGERLPQPPICTIDVYMIMVKCWMIDADSRPKFRE

Targets:  40%|████      | 8/20 [00:05<00:08,  1.41it/s]

AMINO SEQUENCE of Accession P00734:  MAHVRGLQLPGCLALAALCSLVHSQHVFLAPQQARSLLQRVRRANTFLEEVRKGNLERECVEETCSYEEAFEALESSTATDVFWAKYTACETARTPRDKLAACLEGNCAEGLGTNYRGHVNITRSGIECQLWRSRYPHKPEINSTTHPGADLQENFCRNPDSSTTGPWCYTTDPTVRRQECSIPVCGQDQVTVAMTPRSEGSSVNLSPPLEQCVPDRGQQYQGRLAVTTHGLPCLAWASAQAKALSKHQDFNSAVQLVENFCRNPDGDEEGVWCYVAGKPGDFGYCDLNYCEEAVEEETGDGLDEDSDRAIEGRTATSEYQTFFNPRTFGSGEADCGLRPLFEKKSLEDKTERELLESYIDGRIVEGSDAEIGMSPWQVMLFRKSPQELLCGASLISDRWVLTAAHCLLYPPWDKNFTENDLLVRIGKHSRTRYERNIEKISMLEKIYIHPRYNWRENLDRDIALMKLKKPVAFSDYIHPVCLPDRETAASLLQAGYKGRVTGWGNLKETWTANVGKGQPSVLQVVNLPIVERPVCKDSTRIRITDNMFCAGYKPDEGKRGDACEGDSGGPFVMKSPFNNRWYQMGIVSWGEGCDRDGKYGFYTHVFRLKKWIQKVIDQFGE


Targets:  45%|████▌     | 9/20 [00:06<00:07,  1.42it/s]

AMINO SEQUENCE of Accession P00747:  MEHKEVVLLLLLFLKSGQGEPLDDYVNTQGASLFSVTKKQLGAGSIEECAAKCEEDEEFTCRAFQYHSKEQQCVIMAENRKSSIIIRMRDVVLFEKKVYLSECKTGNGKNYRGTMSKTKNGITCQKWSSTSPHRPRFSPATHPSEGLEENYCRNPDNDPQGPWCYTTDPEKRYDYCDILECEEECMHCSGENYDGKISKTMSGLECQAWDSQSPHAHGYIPSKFPNKNLKKNYCRNPDRELRPWCFTTDPNKRWELCDIPRCTTPPPSSGPTYQCLKGTGENYRGNVAVTVSGHTCQHWSAQTPHTHNRTPENFPCKNLDENYCRNPDGKRAPWCHTTNSQVRWEYCKIPSCDSSPVSTEQLAPTAPPELTPVVQDCYHGDGQSYRGTSSTTTTGKKCQSWSSMTPHRHQKTPENYPNAGLTMNYCRNPDADKGPWCFTTDPSVRWEYCNLKKCSGTEASVVAPPPVVLLPDVETPSEEDCMFGNGKGYRGKRATTVTGTPCQDWAAQEPHRHSIFTPETNPRAGLEKNYCRNPDGDVGGPWCYTTNPRKLYDYCDVPQCAAPSFDCGKPQVEPKKCPGRVVGGCVAHPHSWPWQVSLRTRFGMHFCGGTLISPEWVLTAAHCLEKSPRPSSYKVILGAHQEVNLEPHVQEIEVSRLFLEPTRKDIALLKLSSPAVITDKVIPACLPSPNYVVADRTECFITGWGETQGTFGAGLLKEAQLPVIENKVCNRYEFLNGRVQSTELCAGHLAGGTDSCQGDSGGPLVCFEKDKYILQGVTSWGLGCARPNKPGVYVRVSRFVTWIEGVMRNN


Targets:  50%|█████     | 10/20 [00:07<00:07,  1.42it/s]

AMINO SEQUENCE of Accession P00813:  MAQTPAFDKPKVELHVHLDGSIKPETILYYGRRRGIALPANTAEGLLNVIGMDKPLTLPDFLAKFDYYMPAIAGCREAIKRIAYEFVEMKAKEGVVYVEVRYSPHLLANSKVEPIPWNQAEGDLTPDEVVALVGQGLQEGERDFGVKARSILCCMRHQPNWSPKVVELCKKYQQQTVVAIDLAGDETIPGSSLLPGHVQAYQEAVKSGIHRTVHAGEVGSAEVVKEAVDILKTERLGHGYHTLEDQALYNRLRQENMHFEICPWSSYLTGAWKPDTEHAVIRLKNDQANYSLNTDDPLIFKSTLDTDYQMTKRDMGFTEEEFKRLNINAAKSSFLPEDEKRELLDLLYKAYGMPPSASAGQNL


Targets:  55%|█████▌    | 11/20 [00:08<00:06,  1.42it/s]

AMINO SEQUENCE of Accession P00918:  MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKPLSVSYDQATSLRILNNGHAFNVEFDDSQDKAVLKGGPLDGTYRLIQFHFHWGSLDGQGSEHTVDKKKYAAELHLVHWNTKYGDFGKAVQQPDGLAVLGIFLKVGSAKPGLQKVVDVLDSIKTKGKSADFTNFDPRGLLPESLDYWTYPGSLTTPPLLECVTWIVLKEPISVSSEQVLKFRKLNFNGEGEPEELMVDNWRPAQPLKNRQIKASFK


Targets:  60%|██████    | 12/20 [00:08<00:05,  1.43it/s]

AMINO SEQUENCE of Accession P01008:  MYSNVIGTVTSGKRKVYLLSLLLIGFWDCVTCHGSPVDICTAKPRDIPMNPMCIYRSPEKKATEDEGSEQKIPEATNRRVWELSKANSRFATTFYQHLADSKNDNDNIFLSPLSISTAFAMTKLGACNDTLQQLMEVFKFDTISEKTSDQIHFFFAKLNCRLYRKANKSSKLVSANRLFGDKSLTFNETYQDISELVYGAKLQPLDFKENAEQSRAAINKWVSNKTEGRITDVIPSEAINELTVLVLVNTIYFKGLWKSKFSPENTRKELFYKADGESCSASMMYQEGKFRYRRVAEGTQVLELPFKGDDITMVLILPKPEKSLAKVEKELTPEVLQEWLDELEEMMLVVHMPRFRIEDGFSLKEQLQDMGLVDLFSPEKSKLPGIVAEGRDDLYVSDAFHKAFLEVNEEGSEAAASTAVVIAGRSLNPNRVTFKANRPFLVFIREVPLNTIIFMGRVANPCVK


Targets:  65%|██████▌   | 13/20 [00:09<00:04,  1.44it/s]

AMINO SEQUENCE of Accession P63316:  MDDIYKAAVEQLTEEQKNEFKAAFDIFVLGAEDGCISTKELGKVMRMLGQNPTPEELQEMIDEVDEDGSGTVDFDEFLVMMVRCMKDDSKGKSEEELSDLFRMFDKNADGYIDLDELKIMLQATGETITEDDIEELMKDGDKNNDGRIDYDEFLEFMKGVE


Targets:  70%|███████   | 14/20 [00:10<00:04,  1.44it/s]

AMINO SEQUENCE of Accession P03372:  MTMTLHTKASGMALLHQIQGNELEPLNRPQLKIPLERPLGEVYLDSSKPAVYNYPEGAAYEFNAAAAANAQVYGQTGLPYGPGSEAAAFGSNGLGGFPPLNSVSPSPLMLLHPPPQLSPFLQPHGQQVPYYLENEPSGYTVREAGPPAFYRPNSDNRRQGGRERLASTNDKGSMAMESAKETRYCAVCNDYASGYHYGVWSCEGCKAFFKRSIQGHNDYMCPATNQCTIDKNRRKSCQACRLRKCYEVGMMKGGIRKDRRGGRMLKHKRQRDDGEGRGEVGSAGDMRAANLWPSPLMIKRSKKNSLALSLTADQMVSALLDAEPPILYSEYDPTRPFSEASMMGLLTNLADRELVHMINWAKRVPGFVDLTLHDQVHLLECAWLEILMIGLVWRSMEHPGKLLFAPNLLLDRNQGKCVEGMVEIFDMLLATSSRFRMMNLQGEEFVCLKSIILLNSGVYTFLSSTLKSLEEKDHIHRVLDKITDTLIHLMAKAGLTLQQQHQRLAQLLLILSHIRHMSNKGMEHLYSMKCKNVVPLYDLLLEMLDAHRLHAPTSRGGASVEETDQSHLATAGSTSSHSLQKYYITGEAEGFPATV


Targets:  75%|███████▌  | 15/20 [00:10<00:03,  1.43it/s]

AMINO SEQUENCE of Accession P03952:  MILFKQATYFISLFATVSCGCLTQLYENAFFRGGDVASMYTPNAQYCQMRCTFHPRCLLFSFLPASSINDMEKRFGCFLKDSVTGTLPKVHRTGAVSGHSLKQCGHQISACHRDIYKGVDMRGVNFNVSKVSSVEECQKRCTSNIRCQFFSYATQTFHKAEYRNNCLLKYSPGGTPTAIKVLSNVESGFSLKPCALSEIGCHMNIFQHLAFSDVDVARVLTPDAFVCRTICTYHPNCLFFTFYTNVWKIESQRNVCLLKTSESGTPSSSTPQENTISGYSLLTCKRTLPEPCHSKIYPGVDFGGEELNVTFVKGVNVCQETCTKMIRCQFFTYSLLPEDCKEEKCKCFLRLSMDGSPTRIAYGTQGSSGYSLRLCNTGDNSVCTTKTSTRIVGGTNSSWGEWPWQVSLQVKLTAQRHLCGGSLIGHQWVLTAAHCFDGLPLQDVWRIYSGILNLSDITKDTPFSQIKEIIIHQNYKVSEGNHDIALIKLQAPLNYTEFQKPICLPSKGDTSTIYTNCWVTGWGFSKEKGEIQNILQKVNIPLVTNEECQKRYQDYKITQRMVCAGYKEGGKDACKGDSGGPLVCKHNGMWRLVGITSWGEGCARREQPGVYTKVAEYMDWILEKTQSSDGKAQMQSPA


Targets:  80%|████████  | 16/20 [00:11<00:02,  1.43it/s]

AMINO SEQUENCE of Accession P04035:  MLSRLFRMHGLFVASHPWEVIVGTVTLTICMMSMNMFTGNNKICGWNYECPKFEEDVLSSDIIILTITRCIAILYIYFQFQNLRQLGSKYILGIAGLFTIFSSFVFSTVVIHFLDKELTGLNEALPFFLLLIDLSRASTLAKFALSSNSQDEVRENIARGMAILGPTFTLDALVECLVIGVGTMSGVRQLEIMCCFGCMSVLANYFVFMTFFPACVSLVLELSRESREGRPIWQLSHFARVLEEEENKPNPVTQRVKMIMSLGLVLVHAHSRWIADPSPQNSTADTSKVSLGLDENVSKRIEPSVSLWQFYLSKMISMDIEQVITLSLALLLAVKYIFFEQTETESTLSLKNPITSPVVTQKKVPDNCCRREPMLVRNNQKCDSVEEETGINRERKVEVIKPLVAETDTPNRATFVVGNSSLLDTSSVLVTQEPEIELPREPRPNEECLQILGNAEKGAKFLSDAEIIQLVNAKHIPAYKLETLMETHERGVSIRRQLLSKKLSEPSSLQYLPYRDYNYSLVMGACCENVIGYMPIPVGVAGPLCLDEKEFQVPMATTEGCLVASTNRGCRAIGLGGGASSRVLADGMTRGPVVRLPRACDSAEVKAWLETSEGFAVIKEAFDSTSRFARLQKLHTSIAGRNLYIRFQSRSGDAMGMNMISKGTEKALSKLHEYFPEMQILAVSGNYCTDKKPAAINWIEGRGKSVVCEAVIPAKVVREVLKTTTEAMIEVNINKNLVGSAMAGSIGGYNAHAANIVTAIYIACGQDAAQNVGSSNCITLMEASGPTNEDLYISCTMPSIEIGTVGGGTNLLPQQACLQMLGVQGACKDNPGENARQLARIVCGTVMAGELSLMAALAAGHLVKSHMIHNRSKINLQDLQGACTKKTA


Targets:  85%|████████▌ | 17/20 [00:12<00:02,  1.43it/s]

AMINO SEQUENCE of Accession P04150:  MDSKESLTPGREENPSSVLAQERGDVMDFYKTLRGGATVKVSASSPSLAVASQSDSKQRRLLVDFPKGSVSNAQQPDLSKAVSLSMGLYMGETETKVMGNDLGFPQQGQISLSSGETDLKLLEESIANLNRSTSVPENPKSSASTAVSAAPTEKEFPKTHSDVSSEQQHLKGQTGTNGGNVKLYTTDQSTFDILQDLEFSSGSPGKETNESPWRSDLLIDENCLLSPLAGEDDSFLLEGNSNEDCKPLILPDTKPKIKDNGDLVLSSPSNVTLPQVKTEKEDFIELCTPGVIKQEKLGTVYCQASFPGANIIGNKMSAISVHGVSTSGGQMYHYDMNTASLSQQQDQKPIFNVIPPIPVGSENWNRCQGSGDDNLTSLGTLNFPGRTVFSNGYSSPSMRPDVSSPPSSSSTATTGPPPKLCLVCSDEASGCHYGVLTCGSCKVFFKRAVEGQHNYLCAGRNDCIIDKIRRKNCPACRYRKCLQAGMNLEARKTKKKIKGIQQATTGVSQETSENPGNKTIVPATLPQLTPTLVSLLEVIEPEVLYAGYDSSVPDSTWRIMTTLNMLGGRQVIAAVKWAKAIPGFRNLHLDDQMTLLQYSWMFLMAFALGWRSYRQSSANLLCFAPDLIINEQRMTLPCMYDQCKHMLYVSSELHRLQVSYEEYLCMKTLLLLSSVPKDGLKSQELFDEIRMTYIKELGKAIVKREGNSSQNWQRFYQLTKLLDSMHEVVENLLNYCFQTFLDKTMSIEFPEMLAEIITNQIPKYSNGNIKKLLFHQK


Targets:  90%|█████████ | 18/20 [00:12<00:01,  1.44it/s]

AMINO SEQUENCE of Accession P04818:  MPVAGSELPRRPLPPAAQERDAEPRPPHGELQYLGQIQHILRCGVRKDDRTGTGTLSVFGMQARYSLRDEFPLLTTKRVFWKGVLEELLWFIKGSTNAKELSSKGVKIWDANGSRDFLDSLGFSTREEGDLGPVYGFQWRHFGAEYRDMESDYSGQGVDQLQRVIDTIKTNPDDRRIIMCAWNPRDLPLMALPPCHALCQFYVVNSELSCQLYQRSGDMGLGVPFNIASYALLTYMIAHITGLKPGDFIHTLGDAHIYLNHIEPLKIQLQREPRPFPKLRILRKVEKIDDFKAEDFQIEGYNPHPTIKMEMAV


Targets:  95%|█████████▌| 19/20 [00:13<00:00,  1.43it/s]

AMINO SEQUENCE of Accession P05023:  MGKGVGRDKYEPAAVSEQGDKKGKKGKKDRDMDELKKEVSMDDHKLSLDELHRKYGTDLSRGLTSARAAEILARDGPNALTPPPTTPEWIKFCRQLFGGFSMLLWIGAILCFLAYSIQAATEEEPQNDNLYLGVVLSAVVIITGCFSYYQEAKSSKIMESFKNMVPQQALVIRNGEKMSINAEEVVVGDLVEVKGGDRIPADLRIISANGCKVDNSSLTGESEPQTRSPDFTNENPLETRNIAFFSTNCVEGTARGIVVYTGDRTVMGRIATLASGLEGGQTPIAAEIEHFIHIITGVAVFLGVSFFILSLILEYTWLEAVIFLIGIIVANVPEGLLATVTVCLTLTAKRMARKNCLVKNLEAVETLGSTSTICSDKTGTLTQNRMTVAHMWFDNQIHEADTTENQSGVSFDKTSATWLALSRIAGLCNRAVFQANQENLPILKRAVAGDASESALLKCIELCCGSVKEMRERYAKIVEIPFNSTNKYQLSIHKNPNTSEPQHLLVMKGAPERILDRCSSILLHGKEQPLDEELKDAFQNAYLELGGLGERVLGFCHLFLPDEQFPEGFQFDTDDVNFPIDNLCFVGLISMIDPPRAAVPDAVGKCRSAGIKVIMVTGDHPITAKAIAKGVGIISEGNETVEDIAARLNIPVSQVNPRDAKACVVHGSDLKDMTSEQLDDILKYHTEIVFARTSPQQKLIIVEGCQRQGAIVAVTGDGVNDSPALKKADIGVAMGIAGSDVSKQAADMILLDDNFASIVTGVEEGRLIFDNLKKSIAYTLTSNIPEITPFLIFIIANIPLPLGTVTILCIDLGTDMVPAISLAYEQAESDIMKRQPRNPKTDKLVNERLISMAYGQIGMIQALGGFFTYFVILAENGFLPIHLLGLRVDWDDRWINDVEDSYGQQWTYEQRKIVEFTCHTAFFVSIVVVQWADLVICKTRRNSVFQQGMKNKILIFGLFEETA

Targets: 100%|██████████| 20/20 [00:14<00:00,  1.40it/s]

AMINO SEQUENCE of Accession P05091:  MLRAAARFGPRLGRRLLSAAATQAVPAPNQQPEVFCNQIFINNEWHDAVSRKTFPTVNPSTGEVICQVAEGDKEDVDKAVKAARAAFQLGSPWRRMDASHRGRLLNRLADLIERDRTYLAALETLDNGKPYVISYLVDLDMVLKCLRYYAGWADKYHGKTIPIDGDFFSYTRHEPVGVCGQIIPWNFPLLMQAWKLGPALATGNVVVMKVAEQTPLTALYVANLIKEAGFPPGVVNIVPGFGPTAGAAIASHEDVDKVAFTGSTEIGRVIQVAAGSSNLKRVTLELGGKSPNIIMSDADMDWAVEQAHFALFFNQGQCCCAGSRTFVQEDIYDEFVERSVARAKSRVVGNPFDSKTEQGPQVDETQFKKILGYINTGKQEGAKLLCGGGIAADRGYFIQPTVFGDVQDGMTIAKEEIFGPVMQILKFKTIEEVVGRANNSTYGLAAAVFTKDLDKANYLSQALQAGTVWVNCYDVFGAQSPFGGYKMSGSGRELGEYGLQAYTEVKTVTVKVPQKNS
length: 20
picked 2 {
  "target_chembl_id": "CHEMBL1971",
  "pref_name": "Sulfonylurea receptor 2",
  "accession": "O60706",
  "component_id": 294,
  "component_type": "PROTEIN",
  "relationship": "SINGLE PROTEIN",
  "sequence": "MSLSFCGNNISSYNINDGVLQNSCFVDALNLVPHVFLLFITFPILFIGWGSQSSKVQIHHNTWLHFPGHNLRWILTFALLFVHVCEIAEGIVSDSRRESRHLHLFMPAVMGFVATTTSIVYYHNIETSNFPKLLLALFLYWVMAFITKTIKLVKYCQSGLDISNLRFCITGMMVILNGLLMAVEINVIRVRRYVFFMNPQKVKPPEDLQDLGVRF




In [75]:
org_seq

{'organism': 'Homo sapiens',
 'sequence': 'MLRAAARFGPRLGRRLLSAAATQAVPAPNQQPEVFCNQIFINNEWHDAVSRKTFPTVNPSTGEVICQVAEGDKEDVDKAVKAARAAFQLGSPWRRMDASHRGRLLNRLADLIERDRTYLAALETLDNGKPYVISYLVDLDMVLKCLRYYAGWADKYHGKTIPIDGDFFSYTRHEPVGVCGQIIPWNFPLLMQAWKLGPALATGNVVVMKVAEQTPLTALYVANLIKEAGFPPGVVNIVPGFGPTAGAAIASHEDVDKVAFTGSTEIGRVIQVAAGSSNLKRVTLELGGKSPNIIMSDADMDWAVEQAHFALFFNQGQCCCAGSRTFVQEDIYDEFVERSVARAKSRVVGNPFDSKTEQGPQVDETQFKKILGYINTGKQEGAKLLCGGGIAADRGYFIQPTVFGDVQDGMTIAKEEIFGPVMQILKFKTIEEVVGRANNSTYGLAAAVFTKDLDKANYLSQALQAGTVWVNCYDVFGAQSPFGGYKMSGSGRELGEYGLQAYTEVKTVTVKVPQKNS'}

In [27]:
PICKED_TARGETS[0]['sequence']

'MARKKLKKFTTLEIVLSVLLLVLFIISIVLIVLLAKESLKSTAPDPGTTGTPDPGTTGTPDPGTTGTTHARTTGPPDPGTTGTTPVSAECPVVNELERINCIPDQPPTKATCDQRGCCWNPQGAVSVPWCYYSKNHSYHVEGNLVNTNAGFTARLKNLPSSPVFGSNVDNVLLTAEYQTSNRFHFKLTDQTNNRFEVPHEHVQSFSGNAAASLTYQVEISRQPFSIKVTRRSNNRVLFDSSIGPLLFADQFLQLSTRLPSTNVYGLGEHVHQQYRHDMNWKTWPIFNRDTTPNGNGTNLYGAQTFFLCLEDASGLSFGVFLMNSNAMEVVLQPAPAITYRTIGGILDFYVFLGNTPEQVVQEYLELIGRPALPSYWALGFHLSRYEYGTLDNMREVVERNRAAQLPYDVQHADIDYMDERRDFTYDSVDFKGFPEFVNELHNNGQKLVIIVDPAISNNSSSSKPYGPYDRGSDMKIWVNSSDGVTPLIGEVWPGQTVFPDYTNPNCAVWWTKEFELFHNQVEFDGIWIDMNEVSNFVDGSVSGCSTNNLNNPPFTPRILDGYLFCKTLCMDAVQHWGKQYDIHNLYGYSMAVATAEAAKTVFPNKRSFILTRSTFAGSGKFAAHWLGDNTATWDDLRWSIPGVLEFNLFGIPMVGPDICGFALDTPEELCRRWMQLGAFYPFSRNHNGQGYKDQDPASFGADSLLLNSSRHYLNIRYTLLPYLYTLFFRAHSRGDTVARPLLHEFYEDNSTWDVHQQFLWGPGLLITPVLDEGAEKVMAYVPDAVWYDYETGSQVRWRKQKVEMELPGDKIGLHLRGGYIFPTQQPNTTTLASRKNPLGLIIALDENKEAKGELFWDNGETKDTVANKVYLLCEFSVTQNRLEVNISQSTYKDPNNLAFNEIKILGTEEPSNVTVKHNGVPSQTSPTVTYDSNLKVAIITDIDLLLGEAYTVEWSIKIRDEEKIDCYPDENGASAENCTARGCIWEASNSSGVPFCYFV

In [28]:
import torch
from tape import ProteinBertModel, TAPETokenizer

model = ProteinBertModel.from_pretrained('bert-base')
tokenizer = TAPETokenizer(vocab='iupac')  # iupac is the vocab for TAPE models, use unirep for the UniRep model


In [None]:

# Pfam Family: Hexapep, Clan: CL0536
sequence = 'GCTVEDRCLIGMGAILLNGCVIGSGSLVAAGALITQ'
token_ids = torch.tensor([tokenizer.encode(sequence)])
output = model(token_ids)
sequence_output = output[0]
pooled_output = output[1]

# NOTE: pooled_output is *not* trained for the transformer, do not use
# w/o fine-tuning. A better option for now is to simply take a mean of
# the sequence output

In [None]:

# 4) fetch SMILES for all molecules (RDKit cleaning next)
pos_df = pd.DataFrame(all_pos)
if pos_df.empty:
    raise SystemExit("No positive pairs collected—try lowering MIN_ACTS_PER_TARGET or increase MAX_TARGETS.")

mol_ids = sorted(pos_df['molecule_chembl_id'].unique())
MCHUNK = 100
mol_rows = []
for i in tqdm(range(0, len(mol_ids), MCHUNK), desc='Molecules'):
    part = mol_ids[i:i+MCHUNK]
    mres = mol_api.filter(molecule_chembl_id__in=";".join(part)) \
                  .only(['molecule_chembl_id','molecule_structures'])
    for m in mres:
        mid = m['molecule_chembl_id']
        s   = (m.get('molecule_structures') or {}).get('canonical_smiles')
        mol_rows.append({'molecule_chembl_id': mid, 'canonical_smiles': s})

mol_df = pd.DataFrame(mol_rows)


In [None]:

# 5) RDKit cleaning (desalt, strip stereo, remove uncommon atoms)
from rdkit import Chem

ALLOWED = set(['B','C','N','O','P','S','F','Cl','Br','I','Si','H'])

def clean_smiles(smi):
    if not smi:
        return None
    # Keep largest fragment (desalt)
    largest = max(smi.split('.'), key=len)
    mol = Chem.MolFromSmiles(largest)
    if mol is None:
        return None
    # Remove stereochemistry
    Chem.RemoveStereochemistry(mol)
    # Remove uncommon atoms
    for atom in mol.GetAtoms():
        sym = atom.GetSymbol()
        if sym not in ALLOWED:
            return None
    return Chem.MolToSmiles(mol, isomericSmiles=False)

mol_df['smiles_clean'] = mol_df['canonical_smiles'].map(clean_smiles)
mol_df = mol_df.dropna(subset=['smiles_clean'])

# 6) join & (optionally) add random negatives for 1:1 balance
pairs = pos_df.merge(mol_df[['molecule_chembl_id','smiles_clean']],
                     on='molecule_chembl_id', how='inner')

if MAKE_BALANCED_1TO1:
    # Build candidate negatives as (target, molecule) combos not in positives
    pos_set = {(r.target_chembl_id, r.molecule_chembl_id) for r in pairs.itertuples()}
    all_targets = pairs['target_chembl_id'].unique().tolist()
    all_mols    = pairs['molecule_chembl_id'].unique().tolist()

    neg_rows = []
    tries = 0
    goal = pairs.shape[0]
    while len(neg_rows) < goal and tries < goal * 50:
        tries += 1
        t = random.choice(all_targets)
        m = random.choice(all_mols)
        if (t, m) in pos_set:
            continue
        neg_rows.append({'target_chembl_id': t, 'molecule_chembl_id': m, 'label': 0})
    neg_df = pd.DataFrame(neg_rows).drop_duplicates().head(goal)

    # Attach metadata & smiles
    meta_cols = ['target_pref_name','uniprot_ids','protein_sequence']
    target_meta = pairs[['target_chembl_id'] + meta_cols].drop_duplicates()
    neg_df = neg_df.merge(target_meta, on='target_chembl_id', how='left') \
                   .merge(mol_df[['molecule_chembl_id','smiles_clean']], on='molecule_chembl_id', how='left') \
                   .dropna(subset=['smiles_clean'])

    # Merge positives + negatives
    base_cols = ['target_chembl_id','target_pref_name','uniprot_ids','protein_sequence',
                 'molecule_chembl_id','smiles_clean','label']
    pos_out = pairs[base_cols + ['pchembl_value']].copy()
    neg_out = neg_df[base_cols].copy()
    neg_out['pchembl_value'] = None

    final_df = pd.concat([pos_out, neg_out], ignore_index=True)
else:
    final_df = pairs[['target_chembl_id','target_pref_name','uniprot_ids','protein_sequence',
                      'molecule_chembl_id','smiles_clean','pchembl_value','label']].copy()

# 7) write to CSV
final_df.to_csv('deeptarget_like_pairs.csv', index=False)
print("Wrote", final_df.shape, "to deeptarget_like_pairs.csv")