In [None]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Draw
import matplotlib.pyplot as plt
import os
import torch
import json


In [5]:
from chembl_webresource_client.new_client import new_client

available_resources = [resource for resource in dir(new_client) if not resource.startswith('_')]
print(available_resources)

  __version__ = __import__('pkg_resources').get_distribution('chembl_webresource_client').version




In [6]:
from chembl_webresource_client.new_client import new_client

molecule = new_client.molecule
mols = molecule.filter(pref_name__iexact='aspirin')
mols



## DRUG - PROTEIN PAIR

In [None]:
# pip install chembl-webresource-client pandas tqdm rdkit-pypi
from chembl_webresource_client.new_client import new_client

from tqdm import tqdm
import random
from collections import defaultdict

# ---------- knobs (tune scale here) ----------
MAX_TARGETS            = 20      # ~80 human single proteins
MIN_ACTS_PER_TARGET    = 10      # skip sparse targets
MAX_ACTS_PER_TARGET    = 100     # cap per target to keep it light
MAKE_BALANCED_1TO1     = True    # add random negatives to balance
RAND_SEED              = 42
# --------------------------------------------

random.seed(RAND_SEED)

# Sanity: show that 'target_component' exists (not 'component')
# print('has target_component?', hasattr(new_client, 'target_component'))

target_api   = new_client.target
act_api      = new_client.activity
assay_api    = new_client.assay
mol_api      = new_client.molecule
tcomp_api    = new_client.target_component

# 1) pick human single-protein targets
targets = target_api.filter(target_type='SINGLE PROTEIN', organism='Homo sapiens') \
                    .only(['target_chembl_id','pref_name','target_components'])

print(targets)
print(type(targets))

# Shuffle for diversity and iterate
# targets = list(targets)


[{'pref_name': 'Maltase-glucoamylase', 'target_chembl_id': 'CHEMBL2074', 'target_components': [{'accession': 'O43451', 'component_description': 'Maltase-glucoamylase', 'component_id': 434, 'component_type': 'PROTEIN', 'relationship': 'SINGLE PROTEIN', 'target_component_synonyms': [{'component_synonym': '3.2.1.20', 'syn_type': 'EC_NUMBER'}, {'component_synonym': 'Alpha-1,4-glucosidase', 'syn_type': 'UNIPROT'}, {'component_synonym': 'Maltase-glucoamylase', 'syn_type': 'UNIPROT'}, {'component_synonym': 'MGA', 'syn_type': 'GENE_SYMBOL_OTHER'}, {'component_synonym': 'MGAM', 'syn_type': 'GENE_SYMBOL'}, {'component_synonym': 'MGAML', 'syn_type': 'GENE_SYMBOL_OTHER'}, {'component_synonym': 'Synonyms=MGA', 'syn_type': 'GENE_SYMBOL_OTHER'}], 'target_component_xrefs': [{'xref_id': 'O43451', 'xref_name': None, 'xref_src_db': 'AlphaFoldDB'}, {'xref_id': 'O43451', 'xref_name': None, 'xref_src_db': 'ExpressionAtlas'}, {'xref_id': 'GO:0005886', 'xref_name': 'plasma membrane', 'xref_src_db': 'GoCompone

In [8]:
print("TOTAL SINGLE PROTEIN organism 'Homo sapiens': ",len(targets))

sample_target = targets[0]
print('sample :',sample_target)
print(type(sample_target))

TOTAL SINGLE PROTEIN organism 'Homo sapiens':  4387
sample : {'pref_name': 'Maltase-glucoamylase', 'target_chembl_id': 'CHEMBL2074', 'target_components': [{'accession': 'O43451', 'component_description': 'Maltase-glucoamylase', 'component_id': 434, 'component_type': 'PROTEIN', 'relationship': 'SINGLE PROTEIN', 'target_component_synonyms': [{'component_synonym': '3.2.1.20', 'syn_type': 'EC_NUMBER'}, {'component_synonym': 'Alpha-1,4-glucosidase', 'syn_type': 'UNIPROT'}, {'component_synonym': 'Maltase-glucoamylase', 'syn_type': 'UNIPROT'}, {'component_synonym': 'MGA', 'syn_type': 'GENE_SYMBOL_OTHER'}, {'component_synonym': 'MGAM', 'syn_type': 'GENE_SYMBOL'}, {'component_synonym': 'MGAML', 'syn_type': 'GENE_SYMBOL_OTHER'}, {'component_synonym': 'Synonyms=MGA', 'syn_type': 'GENE_SYMBOL_OTHER'}], 'target_component_xrefs': [{'xref_id': 'O43451', 'xref_name': None, 'xref_src_db': 'AlphaFoldDB'}, {'xref_id': 'O43451', 'xref_name': None, 'xref_src_db': 'ExpressionAtlas'}, {'xref_id': 'GO:0005886

In [5]:
PICKED_TARGETS = []
for t in targets:
    if len(PICKED_TARGETS) >= MAX_TARGETS:
        break
    PICKED_TARGETS.append({'target_chembl_id': t['target_chembl_id'],
                   'pref_name': t.get('pref_name', None),
                   'accession':t['target_components'][0]['accession'],
                   'component_id':t['target_components'][0]['component_id'],
                   'component_type':t['target_components'][0]['component_type'],
                   'relationship':t['target_components'][0]['relationship'],

                   })
PICKED_TARGETS

[{'target_chembl_id': 'CHEMBL2074',
  'pref_name': 'Maltase-glucoamylase',
  'accession': 'O43451',
  'component_id': 434,
  'component_type': 'PROTEIN',
  'relationship': 'SINGLE PROTEIN'},
 {'target_chembl_id': 'CHEMBL1971',
  'pref_name': 'Sulfonylurea receptor 2',
  'accession': 'O60706',
  'component_id': 294,
  'component_type': 'PROTEIN',
  'relationship': 'SINGLE PROTEIN'},
 {'target_chembl_id': 'CHEMBL1827',
  'pref_name': 'Phosphodiesterase 5A',
  'accession': 'O76074',
  'component_id': 124,
  'component_type': 'PROTEIN',
  'relationship': 'SINGLE PROTEIN'},
 {'target_chembl_id': 'CHEMBL1859',
  'pref_name': 'Voltage-gated T-type calcium channel alpha-1H subunit',
  'accession': 'O95180',
  'component_id': 167,
  'component_type': 'PROTEIN',
  'relationship': 'SINGLE PROTEIN'},
 {'target_chembl_id': 'CHEMBL202',
  'pref_name': 'Dihydrofolate reductase',
  'accession': 'P00374',
  'component_id': 396,
  'component_type': 'PROTEIN',
  'relationship': 'SINGLE PROTEIN'},
 {'targ

In [10]:
picked_sample_target = PICKED_TARGETS[3]
# picked_sample_target['accession']

## Get sequence - organism from accession id: TESTER

In [11]:
all_seq = tcomp_api.filter(accession= 'B2RXH2').only(['organism','sequence'])
print("leng" ,len(all_seq))
print(all_seq)

leng 1
[{'organism': 'Homo sapiens', 'sequence': 'MKSVHSSPQNTSHTIMTFYPTMEEFADFNTYVAYMESQGAHQAGLAKVIPPKEWKARQMYDDIEDILIATPLQQVTSGQGGVFTQYHKKKKAMRVGQYRRLANSKKYQTPPHQNFADLEQRYWKSHPGNPPIYGADISGSLFEESTKQWNLGHLGTILDLLEQECGVVIEGVNTPYLYFGMWKTTFAWHTEDMDLYSINYLHFGEPKTWYVVPPEHGQHLERLARELFPDISRGCEAFLRHKVALISPTVLKENGIPFNCMTQEAGEFMVTFPYGYHAGFNHGFNCAEAINFATPRWIDYGKMASQCSCGESTVTFSMDPFVRIVQPESYELWKHRQDLAIVEHTEPRVAESQELSNWRDDIVLRRAALGLRLLPNLTAQCPTQPVSSGHCYNPKGCGTDAVPGSAFQSSAYHTQTQSLTLGMSARVLLPSTGSWGSGRGRGRGQGQGRGCSRGRGHGCCTRELGTEEPTVQPASKRRLLMGTRSRAQGHRPQLPLANDLMTNLSL'}]


In [12]:
homosep_seq = tcomp_api.filter(organism='Homo sapiens',target_chembl_id=picked_sample_target['component_id']).only(['organism','sequence'])
homosep_seq

[{'organism': 'Homo sapiens', 'sequence': 'MEPWPLLLLFSLCSAGLVLGSEHETRLVAKLFKDYSSVVRPVEDHRQVVEVTVGLQLIQLINVDEVNQIVTTNVRLKQQWVDYNLKWNPDDYGGVKKIHIPSEKIWRPDLVLYNNADGDFAIVKFTKVLLQYTGHITWTPPAIFKSYCEIIVTHFPFDEQNCSMKLGTWTYDGSVVAINPESDQPDLSNFMESGEWVIKESRGWKHSVTYSCCPDTPYLDITYHFVMQRLPLYFIVNVIIPCLLFSFLTGLVFYLPTDSGEKMTLSISVLLSLTVFLLVIVELIPSTSSAVPLIGKYMLFTMVFVIASIIITVIVINTHHRSPSTHVMPNWVRKVFIDTIPNIMFFSTMKRPSREKQDKKIFTEDIDISDISGKPGPPPMGFHSPLIKHPEVKSAIEGIKYIAETMKSDQESNNAAAEWKYVAMVMDHILLGVFMLVCIIGTLAVFAGRLIELNQQG'}, {'organism': 'Homo sapiens', 'sequence': 'MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLSPDDIEQWFTEDPGPDEAPRMPEAAPPVAPAPAAPTPAAPAPAPSWPLSSSVPSQKTYQGSYGFRLGFLHSGTAKSVTCTYSPALNKMFCQLAKTCPVQLWVDSTPPPGTRVRAMAIYKQSQHMTEVVRRCPHHERCSDSDGLAPPQHLIRVEGNLRVEYLDDRNTFRHSVVVPYEPPEVGSDCTTIHYNYMCNSSCMGGMNRRPILTIITLEDSSGNLLGRNSFEVRVCACPGRDRRTEEENLRKKGEPHHELPPGSTKRALPNNTSSSPQPKKKPLDGEYFTLQIRGRERFEMFRELNEALELKDAQAGKEPGGSRAHSSHLKSKKGQSTSRHKKLMFKTEGPDSD'}, {'organism': 'Homo sapiens', 'sequence': 'MRARPRPRPLWATVL

## PULL amino seq info list based on accession list

In [7]:
import json

In [3]:
# pull amino sequence info
# Accession = UniProtein ID
def fetch_sequence_bundle(accession_ID):
    rows = tcomp_api.filter(accession = accession_ID, organism='Homo sapiens').only(['organism','sequence'])  # accession is UniProt
    
    if len(rows) == 1:
        rows = rows[0]
    else: 
        print("SO THERE IS A CASE WHERE 1 ACCESSION is >1 protein sequence???")

    print(f'AMINO SEQUENCE of Accession {accession_ID}: ', rows['sequence'])
    # organism = sorted({r.get('organism') for r in rows if r.get('organism')})
    # seqs       = [r.get('sequence') for r in rows if r.get('sequence')]
    # Prefer the longest sequence (single protein targets normally have one)
    # seq = max(seqs, key=len) if seqs else None
    # return organism, seq
    return rows

# 2) pull binding activities with strong QC & pChEMBL present
def fetch_clean_activities(tid):
    # Assay filter first (faster if you ever expand): relationship D, binding B
    assays = assay_api.filter(target_chembl_id=tid,
                              relationship_type='D',
                              assay_type='B').only(['assay_chembl_id'])
    assay_ids = [a['assay_chembl_id'] for a in assays]
    # print(assay_ids)
    # print("assay_ids: ",assay_ids)

    if not assay_ids:
        return pd.DataFrame()

    # Chunk assay_ids with __in; and require pChEMBL present & valid
    CHUNK = 200
    recs = []
    for i in range(0, len(assay_ids), CHUNK):
        chunk = assay_ids[i:i+CHUNK]
        # print("chunk:",chunk)
        acts = act_api.filter(assay_chembl_id__in=chunk, #";".join(chunk)
                              pchembl_value__isnull=False,
                              data_validity_comment__isnull=True) \
                      .only(['molecule_chembl_id','pchembl_value','standard_type',
                             'standard_units','standard_relation'])
        # print("acts:",acts)
        recs.extend(acts)

    if not recs:
        return pd.DataFrame()
    # print("RECS: ",recs)
    df = pd.DataFrame.from_records(recs)
    # Keep exact values (relation "=") if present; pChEMBL already standardizes, but keep clean
    if 'standard_relation' in df.columns:
        df = df[(df['standard_relation'].isna()) | (df['standard_relation'] == '=')]
    # Deduplicate molecule per target by best pChEMBL
    df = df.sort_values('pchembl_value', ascending=False) \
           .drop_duplicates(subset=['molecule_chembl_id'])
    return df[['molecule_chembl_id','pchembl_value']]

In [None]:

# 3) iterate targets → compute per-target median, label, keep label==1
all_pos = []
target_info = {}
for t in tqdm(PICKED_TARGETS, desc='Targets'):
    accession = t['accession']
    # print(t)
    org_seq = fetch_sequence_bundle(accession)
    # print(df,a)
    target_info[accession] = org_seq['sequence']
    t['sequence'] = org_seq['sequence']
    t['organism'] = org_seq['organism']
    
print("length:", len(PICKED_TARGETS))
# print('PICKED_TARGETS', json.dumps(PICKED_TARGETS,indent=2))
print('picked 2', json.dumps(PICKED_TARGETS[1],indent=2))
print('picked 16', json.dumps(PICKED_TARGETS[15],indent=2))
#     if df.shape[0] < MIN_ACTS_PER_TARGET:
#         continue
#     df = df.head(MAX_ACTS_PER_TARGET)  # cap
#     print(df.head())
#     df['pchembl_value'] = df['pchembl_value'].apply(lambda x: round(float(x),2))
#     med = df['pchembl_value'].median()
#     print("==== MEDIAN : ", med)
#     df['label'] = (df['pchembl_value'] > med).astype(int)
#     df = df[df['label'] == 1]  # keep above-median only (DeepTarget-style)
#     print("check df",df.head())
#     if df.empty:
#         continue
#     accs, seq = fetch_sequence_bundle(tid)
#     # Filter out sequences with unknown 'X'
#     if seq and ('X' in seq):
#         continue

#     for r in df.itertuples(index=False):
#         all_pos.append({
#             'target_chembl_id': tid,
#             'target_pref_name': t['pref_name'],
#             'uniprot_ids': ";".join(accs) if accs else None,
#             'protein_sequence': seq,
#             'molecule_chembl_id': r.molecule_chembl_id,
#             'pchembl_value': r.pchembl_value,
#             'label': 1
#         })
# print(df.head())



Targets: 100%|██████████| 20/20 [00:00<00:00, 510.06it/s]

AMINO SEQUENCE of Accession O43451:  MARKKLKKFTTLEIVLSVLLLVLFIISIVLIVLLAKESLKSTAPDPGTTGTPDPGTTGTPDPGTTGTTHARTTGPPDPGTTGTTPVSAECPVVNELERINCIPDQPPTKATCDQRGCCWNPQGAVSVPWCYYSKNHSYHVEGNLVNTNAGFTARLKNLPSSPVFGSNVDNVLLTAEYQTSNRFHFKLTDQTNNRFEVPHEHVQSFSGNAAASLTYQVEISRQPFSIKVTRRSNNRVLFDSSIGPLLFADQFLQLSTRLPSTNVYGLGEHVHQQYRHDMNWKTWPIFNRDTTPNGNGTNLYGAQTFFLCLEDASGLSFGVFLMNSNAMEVVLQPAPAITYRTIGGILDFYVFLGNTPEQVVQEYLELIGRPALPSYWALGFHLSRYEYGTLDNMREVVERNRAAQLPYDVQHADIDYMDERRDFTYDSVDFKGFPEFVNELHNNGQKLVIIVDPAISNNSSSSKPYGPYDRGSDMKIWVNSSDGVTPLIGEVWPGQTVFPDYTNPNCAVWWTKEFELFHNQVEFDGIWIDMNEVSNFVDGSVSGCSTNNLNNPPFTPRILDGYLFCKTLCMDAVQHWGKQYDIHNLYGYSMAVATAEAAKTVFPNKRSFILTRSTFAGSGKFAAHWLGDNTATWDDLRWSIPGVLEFNLFGIPMVGPDICGFALDTPEELCRRWMQLGAFYPFSRNHNGQGYKDQDPASFGADSLLLNSSRHYLNIRYTLLPYLYTLFFRAHSRGDTVARPLLHEFYEDNSTWDVHQQFLWGPGLLITPVLDEGAEKVMAYVPDAVWYDYETGSQVRWRKQKVEMELPGDKIGLHLRGGYIFPTQQPNTTTLASRKNPLGLIIALDENKEAKGELFWDNGETKDTVANKVYLLCEFSVTQNRLEVNISQSTYKDPNNLAFNEIKILGTEEPSNVTVKHNGVPSQTSPTVTYDSNLKVAIITDIDLLLGEAYTVEWSIKIRDEEK




## GET ACTIVATION (STANDARD TYPE IC50)

In [25]:
pairs = []   # (protein_id, smiles, activity_value)

for t in tqdm(PICKED_TARGETS):
    tid = t['target_chembl_id']

    acts = act_api.filter(target_chembl_id=t['target_chembl_id'], standard_type='IC50') \
                  .only(['molecule_chembl_id','canonical_smiles',
                         'standard_value','standard_units','pchembl_value'])

    acts = list(acts)

    # filter out missing/invalid
    acts = [a for a in acts if a.get('canonical_smiles') and a.get('standard_value')]

    # cap per target
    if len(acts) > MAX_ACTS_PER_TARGET:
        acts = random.sample(acts, MAX_ACTS_PER_TARGET)

    # skip sparse targets
    if len(acts) < MIN_ACTS_PER_TARGET:
        continue

    for a in acts:
        pairs.append({
            'protein_id': tid,
            'smiles': a['canonical_smiles'],
            'activity': float(a['standard_value']),
            'units': a['standard_units'],
            'pchembl': a.get('pchembl_value')
        })


 25%|██▌       | 5/20 [08:11<24:33, 98.20s/it]


KeyboardInterrupt: 

In [16]:
PICKED_TARGETS[0]['sequence']

'MARKKLKKFTTLEIVLSVLLLVLFIISIVLIVLLAKESLKSTAPDPGTTGTPDPGTTGTPDPGTTGTTHARTTGPPDPGTTGTTPVSAECPVVNELERINCIPDQPPTKATCDQRGCCWNPQGAVSVPWCYYSKNHSYHVEGNLVNTNAGFTARLKNLPSSPVFGSNVDNVLLTAEYQTSNRFHFKLTDQTNNRFEVPHEHVQSFSGNAAASLTYQVEISRQPFSIKVTRRSNNRVLFDSSIGPLLFADQFLQLSTRLPSTNVYGLGEHVHQQYRHDMNWKTWPIFNRDTTPNGNGTNLYGAQTFFLCLEDASGLSFGVFLMNSNAMEVVLQPAPAITYRTIGGILDFYVFLGNTPEQVVQEYLELIGRPALPSYWALGFHLSRYEYGTLDNMREVVERNRAAQLPYDVQHADIDYMDERRDFTYDSVDFKGFPEFVNELHNNGQKLVIIVDPAISNNSSSSKPYGPYDRGSDMKIWVNSSDGVTPLIGEVWPGQTVFPDYTNPNCAVWWTKEFELFHNQVEFDGIWIDMNEVSNFVDGSVSGCSTNNLNNPPFTPRILDGYLFCKTLCMDAVQHWGKQYDIHNLYGYSMAVATAEAAKTVFPNKRSFILTRSTFAGSGKFAAHWLGDNTATWDDLRWSIPGVLEFNLFGIPMVGPDICGFALDTPEELCRRWMQLGAFYPFSRNHNGQGYKDQDPASFGADSLLLNSSRHYLNIRYTLLPYLYTLFFRAHSRGDTVARPLLHEFYEDNSTWDVHQQFLWGPGLLITPVLDEGAEKVMAYVPDAVWYDYETGSQVRWRKQKVEMELPGDKIGLHLRGGYIFPTQQPNTTTLASRKNPLGLIIALDENKEAKGELFWDNGETKDTVANKVYLLCEFSVTQNRLEVNISQSTYKDPNNLAFNEIKILGTEEPSNVTVKHNGVPSQTSPTVTYDSNLKVAIITDIDLLLGEAYTVEWSIKIRDEEKIDCYPDENGASAENCTARGCIWEASNSSGVPFCYFV

In [9]:
import torch
from tape import ProteinBertModel, TAPETokenizer

model = ProteinBertModel.from_pretrained('bert-base')
tokenizer = TAPETokenizer(vocab='iupac')  # iupac is the vocab for TAPE models, use unirep for the UniRep model


In [None]:

# Pfam Family: Hexapep, Clan: CL0536
sequence = PICKED_TARGETS[0]['sequence']
token_ids = torch.tensor([tokenizer.encode(sequence)])
output = model(token_ids)
sequence_output = output[0]
pooled_output = output[1]

# NOTE: pooled_output is *not* trained for the transformer, do not use
# w/o fine-tuning. A better option for now is to simply take a mean of
# the sequence output

  token_ids = torch.tensor([tokenizer.encode(sequence)])


: 

In [None]:
for picked in tqdm(PICKED_TARGETS, desc="Embedded to seq"):
    picked_sequence = picked['sequence']
    token_ids = torch.tensor([tokenizer.encode(picked_sequence)])
    output = model(token_ids)
    picked['embsequence'] = output[1]
    
print(PICKED_TARGETS)
print("length:", len(PICKED_TARGETS))
# print('PICKED_TARGETS', json.dumps(PICKED_TARGETS,indent=2))
# print('picked 2', json.dumps(PICKED_TARGETS[1],indent=2))
# print('picked 16', json.dumps(PICKED_TARGETS[15],indent=2))

  token_ids = torch.tensor([tokenizer.encode(picked_sequence)])
Embedded to seq: 100%|██████████| 20/20 [01:27<00:00,  4.38s/it]


[{'target_chembl_id': 'CHEMBL2074', 'pref_name': 'Maltase-glucoamylase', 'accession': 'O43451', 'component_id': 434, 'component_type': 'PROTEIN', 'relationship': 'SINGLE PROTEIN', 'sequence': 'MARKKLKKFTTLEIVLSVLLLVLFIISIVLIVLLAKESLKSTAPDPGTTGTPDPGTTGTPDPGTTGTTHARTTGPPDPGTTGTTPVSAECPVVNELERINCIPDQPPTKATCDQRGCCWNPQGAVSVPWCYYSKNHSYHVEGNLVNTNAGFTARLKNLPSSPVFGSNVDNVLLTAEYQTSNRFHFKLTDQTNNRFEVPHEHVQSFSGNAAASLTYQVEISRQPFSIKVTRRSNNRVLFDSSIGPLLFADQFLQLSTRLPSTNVYGLGEHVHQQYRHDMNWKTWPIFNRDTTPNGNGTNLYGAQTFFLCLEDASGLSFGVFLMNSNAMEVVLQPAPAITYRTIGGILDFYVFLGNTPEQVVQEYLELIGRPALPSYWALGFHLSRYEYGTLDNMREVVERNRAAQLPYDVQHADIDYMDERRDFTYDSVDFKGFPEFVNELHNNGQKLVIIVDPAISNNSSSSKPYGPYDRGSDMKIWVNSSDGVTPLIGEVWPGQTVFPDYTNPNCAVWWTKEFELFHNQVEFDGIWIDMNEVSNFVDGSVSGCSTNNLNNPPFTPRILDGYLFCKTLCMDAVQHWGKQYDIHNLYGYSMAVATAEAAKTVFPNKRSFILTRSTFAGSGKFAAHWLGDNTATWDDLRWSIPGVLEFNLFGIPMVGPDICGFALDTPEELCRRWMQLGAFYPFSRNHNGQGYKDQDPASFGADSLLLNSSRHYLNIRYTLLPYLYTLFFRAHSRGDTVARPLLHEFYEDNSTWDVHQQFLWGPGLLITPVLDEGAEKVMAYVPDAVWYDYETGSQVRWRKQKVEMELP

TypeError: Object of type Tensor is not JSON serializable

In [14]:
import pickle

In [15]:
len(PICKED_TARGETS)
print(PICKED_TARGETS[0])
with open('saved_embed_picked_310825.pkl', 'wb') as f:
    pickle.dump(PICKED_TARGETS, f)

{'target_chembl_id': 'CHEMBL2074', 'pref_name': 'Maltase-glucoamylase', 'accession': 'O43451', 'component_id': 434, 'component_type': 'PROTEIN', 'relationship': 'SINGLE PROTEIN', 'sequence': 'MARKKLKKFTTLEIVLSVLLLVLFIISIVLIVLLAKESLKSTAPDPGTTGTPDPGTTGTPDPGTTGTTHARTTGPPDPGTTGTTPVSAECPVVNELERINCIPDQPPTKATCDQRGCCWNPQGAVSVPWCYYSKNHSYHVEGNLVNTNAGFTARLKNLPSSPVFGSNVDNVLLTAEYQTSNRFHFKLTDQTNNRFEVPHEHVQSFSGNAAASLTYQVEISRQPFSIKVTRRSNNRVLFDSSIGPLLFADQFLQLSTRLPSTNVYGLGEHVHQQYRHDMNWKTWPIFNRDTTPNGNGTNLYGAQTFFLCLEDASGLSFGVFLMNSNAMEVVLQPAPAITYRTIGGILDFYVFLGNTPEQVVQEYLELIGRPALPSYWALGFHLSRYEYGTLDNMREVVERNRAAQLPYDVQHADIDYMDERRDFTYDSVDFKGFPEFVNELHNNGQKLVIIVDPAISNNSSSSKPYGPYDRGSDMKIWVNSSDGVTPLIGEVWPGQTVFPDYTNPNCAVWWTKEFELFHNQVEFDGIWIDMNEVSNFVDGSVSGCSTNNLNNPPFTPRILDGYLFCKTLCMDAVQHWGKQYDIHNLYGYSMAVATAEAAKTVFPNKRSFILTRSTFAGSGKFAAHWLGDNTATWDDLRWSIPGVLEFNLFGIPMVGPDICGFALDTPEELCRRWMQLGAFYPFSRNHNGQGYKDQDPASFGADSLLLNSSRHYLNIRYTLLPYLYTLFFRAHSRGDTVARPLLHEFYEDNSTWDVHQQFLWGPGLLITPVLDEGAEKVMAYVPDAVWYDYETGSQVRWRKQKVEMELPG

In [None]:
with open('saved_embed_picked_310825.pkl', 'rb') as f:
    picked = pickle.load(f)

print(len(picked))

20


In [19]:
picked.keys()

dict_keys(['target_chembl_id', 'pref_name', 'accession', 'component_id', 'component_type', 'relationship', 'sequence', 'organism', 'embsequence'])

In [None]:
print(len(output))
print(output[0].shape)
print(output[1].shape)
print(output)

2
torch.Size([1, 2755, 768])
torch.Size([1, 768])
(tensor([[[ 0.7320,  0.9589,  0.6381,  ...,  0.5864,  0.1955, -1.6099],
         [ 1.7864, -0.2002, -0.4942,  ..., -0.1764, -0.5036, -1.2659],
         [ 2.0414,  0.1063, -0.2197,  ...,  0.4180, -0.4415, -1.9124],
         ...,
         [ 0.5134, -0.6142, -0.9401,  ..., -0.7539, -0.7913, -1.6626],
         [ 0.5750, -0.5357,  0.2142,  ..., -0.1438, -0.5304, -1.7309],
         [ 0.2703, -0.1470, -0.6470,  ...,  0.4946, -0.6886, -1.0927]]],
       grad_fn=<AddBackward0>), tensor([[-4.7742e-01, -7.9333e-01, -6.8756e-01,  1.3448e-01,  3.7441e-02,
         -1.2088e-01, -2.8503e-01, -9.1971e-03,  3.5296e-01,  6.0001e-01,
         -4.7859e-01,  5.0382e-01, -1.5400e-01,  2.8046e-01, -2.9455e-01,
          7.0898e-01, -5.2298e-01,  4.3997e-01, -6.3681e-01, -2.0339e-01,
         -1.0749e-02,  1.7008e-01, -7.2904e-01, -2.8344e-01,  3.7381e-01,
         -3.4308e-01, -4.1796e-01, -2.2071e-01,  2.2674e-01,  3.9903e-01,
         -3.6635e-01, -1.6068e-

## EMBEDDED P

In [None]:

# 4) fetch SMILES for all molecules (RDKit cleaning next)
pos_df = pd.DataFrame(all_pos)
if pos_df.empty:
    raise SystemExit("No positive pairs collected—try lowering MIN_ACTS_PER_TARGET or increase MAX_TARGETS.")

mol_ids = sorted(pos_df['molecule_chembl_id'].unique())
MCHUNK = 100
mol_rows = []
for i in tqdm(range(0, len(mol_ids), MCHUNK), desc='Molecules'):
    part = mol_ids[i:i+MCHUNK]
    mres = mol_api.filter(molecule_chembl_id__in=";".join(part)) \
                  .only(['molecule_chembl_id','molecule_structures'])
    for m in mres:
        mid = m['molecule_chembl_id']
        s   = (m.get('molecule_structures') or {}).get('canonical_smiles')
        mol_rows.append({'molecule_chembl_id': mid, 'canonical_smiles': s})

mol_df = pd.DataFrame(mol_rows)


In [None]:

# 5) RDKit cleaning (desalt, strip stereo, remove uncommon atoms)
from rdkit import Chem

ALLOWED = set(['B','C','N','O','P','S','F','Cl','Br','I','Si','H'])

def clean_smiles(smi):
    if not smi:
        return None
    # Keep largest fragment (desalt)
    largest = max(smi.split('.'), key=len)
    mol = Chem.MolFromSmiles(largest)
    if mol is None:
        return None
    # Remove stereochemistry
    Chem.RemoveStereochemistry(mol)
    # Remove uncommon atoms
    for atom in mol.GetAtoms():
        sym = atom.GetSymbol()
        if sym not in ALLOWED:
            return None
    return Chem.MolToSmiles(mol, isomericSmiles=False)

mol_df['smiles_clean'] = mol_df['canonical_smiles'].map(clean_smiles)
mol_df = mol_df.dropna(subset=['smiles_clean'])

# 6) join & (optionally) add random negatives for 1:1 balance
pairs = pos_df.merge(mol_df[['molecule_chembl_id','smiles_clean']],
                     on='molecule_chembl_id', how='inner')

if MAKE_BALANCED_1TO1:
    # Build candidate negatives as (target, molecule) combos not in positives
    pos_set = {(r.target_chembl_id, r.molecule_chembl_id) for r in pairs.itertuples()}
    all_targets = pairs['target_chembl_id'].unique().tolist()
    all_mols    = pairs['molecule_chembl_id'].unique().tolist()

    neg_rows = []
    tries = 0
    goal = pairs.shape[0]
    while len(neg_rows) < goal and tries < goal * 50:
        tries += 1
        t = random.choice(all_targets)
        m = random.choice(all_mols)
        if (t, m) in pos_set:
            continue
        neg_rows.append({'target_chembl_id': t, 'molecule_chembl_id': m, 'label': 0})
    neg_df = pd.DataFrame(neg_rows).drop_duplicates().head(goal)

    # Attach metadata & smiles
    meta_cols = ['target_pref_name','uniprot_ids','protein_sequence']
    target_meta = pairs[['target_chembl_id'] + meta_cols].drop_duplicates()
    neg_df = neg_df.merge(target_meta, on='target_chembl_id', how='left') \
                   .merge(mol_df[['molecule_chembl_id','smiles_clean']], on='molecule_chembl_id', how='left') \
                   .dropna(subset=['smiles_clean'])

    # Merge positives + negatives
    base_cols = ['target_chembl_id','target_pref_name','uniprot_ids','protein_sequence',
                 'molecule_chembl_id','smiles_clean','label']
    pos_out = pairs[base_cols + ['pchembl_value']].copy()
    neg_out = neg_df[base_cols].copy()
    neg_out['pchembl_value'] = None

    final_df = pd.concat([pos_out, neg_out], ignore_index=True)
else:
    final_df = pairs[['target_chembl_id','target_pref_name','uniprot_ids','protein_sequence',
                      'molecule_chembl_id','smiles_clean','pchembl_value','label']].copy()

# 7) write to CSV
final_df.to_csv('deeptarget_like_pairs.csv', index=False)
print("Wrote", final_df.shape, "to deeptarget_like_pairs.csv")