# Database

In [1]:
import src
from src import *
from src.creation import create_entry, combine_sites, NoModulator

In [2]:
# Point the project database handle at the local database file.
# NOTE(review): `db` is not defined in this notebook — presumably it arrives via `from src import *`; confirm.
db.init('database.db')
# NOTE(review): presumably makes src.allodb keep CIF files while entries are created — confirm in src.allodb.
src.allodb.save_cifs = True

In [3]:
import pandas as pd
# Short alias attached to the pandas module itself; used further down as pd.DF([row])
# to rebuild a one-row frame from a stored row dict.
pd.DF = pd.DataFrame

# Data in ACRALL

ACRALL is organised by molecule, so only molecules whose annotated effect on one of the three acetylcholine receptor types (alpha4beta2, alpha7 and muscle) contains "NAM" or "PAM" will be taken into account.

The UniProt accessions of each AChR type will be retrieved separately and used to query the PDB for structures that contain both a protein chain mapped to one of those accessions and a molecule that matches a SMILES from the database.

Each selected SMILES will be queried only with the Uniprot of the AChR type for which the "NAM" or "PAM" effect is annotated and not all 3.

In [4]:
# Load the ACRALL sheet; header=1 takes the column names from the second spreadsheet row.
df = pd.read_excel(
    "ACRALL-23Jun23.xlsx",
    sheet_name="ACRALL",
    header=1,
)
df

Unnamed: 0,cid,name,Smiles,inchikey,in_vivo_tested,phase,alpha4beta2,alpha7,muscletype,targetname,metric,value,reference,measure
0,187,ACETYLCHOLINE,CC(=O)OCC[N+](C)(C)C,OIPILFWXSMYKGL-UHFFFAOYSA-N,yes,4,agonist,agonist,agonist,nAChR a4b2,Emax_%,172.0000,16033252,efficacy
1,187,ACETYLCHOLINE,CC(=O)OCC[N+](C)(C)C,OIPILFWXSMYKGL-UHFFFAOYSA-N,yes,4,agonist,agonist,agonist,nAChR a7,Emax_%,110.0000,22468936,efficacy
2,305,CHOLINE,C[N+](C)(C)CCO,OEYIOHPDSNJKLS-UHFFFAOYSA-N,yes,4,Unknown activity,agonist,Unknown activity,nAChR a7,Emax_%,95.0000,16033252,efficacy
3,1204,EPIBATIDINE,C1CC2C(CC1N2)C3=CN=C(C=C3)Cl,NLPRAJRHRHZCQQ-UHFFFAOYSA-N,yes,0,agonist,agonist,agonist,nAChR a4b2,Emax_%,133.0000,19232492,efficacy
4,1204,EPIBATIDINE,C1CC2C(CC1N2)C3=CN=C(C=C3)Cl,NLPRAJRHRHZCQQ-UHFFFAOYSA-N,yes,0,agonist,agonist,agonist,nAChR a7,Emax_%,123.5000,16033252,efficacy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7888,155556155,"alphaO-conotoxin [C9S,C20S]GeXIVA[1,4]",peptide[TCRSSGRYSRSPYDRRRRYSRRITDACV],SCLCAVDKGIIDIR-VLSRZGDXSA-N,No,0,N.A.,antagonist,,nAChR a1b1de,IC50_uM,2.9900,31986036,potency
7889,155557819,N-[(2S)-1-[3-[4-(3-aminopropylamino)butylamino...,C1CCC(CC1)C[C@@H](C(=O)NCCCNCCCCNCCCN)NC(=O)C2...,FOBOXOHWDKQRAH-QFIPXVFZSA-N,No,0,,,,nAChR a4b2,IC50_uM,0.0208,31244109,potency
7890,155562264,"(1R,2R,3S,4S)-3-(6-chloropyridin-3-yl)bicyclo[...",C1C[C@@H]2C[C@H]1[C@H]([C@@H]2N)C3=CN=C(C=C3)Cl,NKHNTAYDCXZLFL-IZLWQMGWSA-N,No,0,,,,nAChR a7,EC50_uM,2.7100,30681854,potency
7891,155564442,N-[(2S)-1-[3-[4-(3-aminopropylamino)butylamino...,C1CCC(C1)C(=O)N[C@@H](CC2=CC=C(C=C2)O)C(=O)NCC...,QYFZGNQKVGYCKF-QHCPKHFHSA-N,No,0,,,,nAChR a4b2,IC50_uM,0.0225,31244109,potency


In [5]:
# Column names as parsed from the sheet.
df.columns

Index(['cid', 'name', 'Smiles', 'inchikey', 'in_vivo_tested', 'phase',
       'alpha4beta2', 'alpha7', 'muscletype', 'targetname', 'metric', 'value',
       'reference', 'measure'],
      dtype='object')

In [6]:
import numpy as np

In [7]:
# Every distinct activity label used across the three receptor-type columns.
receptor_cols = ['alpha4beta2', 'alpha7', 'muscletype']
set(df[receptor_cols].values.ravel())

{'N.A.',
 'NAM',
 'PAM',
 'PAM (type n.a.)',
 'PAM type I',
 'PAM type II',
 'Unknown activity',
 'ago - PAM type II',
 'agonist',
 'antagonist',
 'inactive',
 nan,
 'partial agonist',
 'silent agonist'}

The only interesting effects are those that contain "PAM" or "NAM" ('contain' because there is an effect annotated as "ago - PAM type II").

<br>

**ago - PAM type II**

In [8]:
# Look up the rows carrying the 'ago - PAM type II' annotation in any of the three receptor-type columns.
df.query("'ago - PAM type II' in alpha4beta2 or 'ago - PAM type II' in alpha7 or 'ago - PAM type II' in muscletype")

Unnamed: 0,cid,name,Smiles,inchikey,in_vivo_tested,phase,alpha4beta2,alpha7,muscletype,targetname,metric,value,reference,measure
5692,6554040,GAT-107,C1C=C[C@H]2[C@@H]1[C@H](NC3=C2C=C(C=C3)S(=O)(=...,YNCXHXYZTLIZTO-HDMKZQKVSA-N,No,0,inactive,ago - PAM type II,inactive,nAChR a7,EC50_uM,28.0,24090443,potency


Indeed this ligand can act as an agonist by itself, binding on its allosteric-agonist binding sites; and also as a PAM of the orthosteric ligand.

<br>

## Selection of "PAM" or "NAM"

In [9]:
# Keep every row whose annotation for at least one receptor type contains "AM"
# (case-sensitive substring, so it catches "PAM", "NAM" and "ago - PAM type II"; NaN counts as no match).
receptor_columns = ['alpha4beta2', 'alpha7', 'muscletype']
is_modulator = (
    df[receptor_columns]
    .apply(lambda col: col.str.contains("AM", case=True, na=False))
    .any(axis=1)
)
allosteric = df[is_modulator]
allosteric

Unnamed: 0,cid,name,Smiles,inchikey,in_vivo_tested,phase,alpha4beta2,alpha7,muscletype,targetname,metric,value,reference,measure
8,4032,MECAMYLAMINE,CC1(C2CCC(C2)C1(C)NC)C,IMYZQPCYWPFTAG-UHFFFAOYSA-N,yes,4,NAM,NAM,NAM,nAChR a4b2,Inhibition_%,91.8,23327794,efficacy
27,637026,DESFORMYLFLUSTRABROMINE,CC(C)(C=C)C1=C(C2=C(N1)C=C(C=C2)Br)CCNC,GQHSCJUTJKLZPX-UHFFFAOYSA-N,No,0,PAM,N.A.,Unknown activity,nAChR a4b2,Emax_%,295.0,17604168,efficacy
59,5980988,"4,2,5-Trihydroxychalcone",C1=CC(=CC=C1/C=C/C(=O)C2=C(C=CC(=C2)O)O)O,NMANELLSWUVZNL-XVNBXDOJSA-N,yes,0,N.A.,PAM type II,N.A.,nAChR a4b2,Emax_%,87.0,25232969,efficacy
255,11998180,NS-9283,C1=CC(=CC(=C1)C2=NC(=NO2)C3=CN=CC=C3)C#N,HGFXDSQLRSWUBO-UHFFFAOYSA-N,yes,0,PAM,inactive,N.A.,nAChR a4b2,P3_%,200.0,24365158,efficacy
470,44348296,KAB-18,C1CC(CN(C1)CCCC2=CC=CC=C2)COC(=O)C3=CC=CC=C3C4...,FSLPUZLMGFJUNB-UHFFFAOYSA-N,No,0,NAM,N.A.,N.A.,nAChR a4b2,Inhibition_%,39.0,23327794,efficacy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7605,118714575,"(Z)-3-(2,4-Dihydroxyphenyl)-1-(2,5-dihydroxyph...",C1=CC(=C(C=C1O)O)/C=C\C(=O)C2=C(C=CC(=C2)O)O,NSGNPNKKTAPFEM-DJWKRKHSSA-N,No,0,N.A.,PAM (type n.a.),N.A.,nAChR a7,EC50_uM,10.7,25232969,potency
7606,118714576,"(E)-1-(4-Hydroxy-2-methylphenyl)-3-(2,3,4-trim...",CC1=C(C=CC(=C1)O)C(=O)/C=C/C2=C(C(=C(C=C2)OC)O...,WYADPPPAHTYQNX-WEVVVXLNSA-N,No,0,N.A.,PAM (type n.a.),N.A.,nAChR a7,EC50_uM,7.3,25232969,potency
7607,118714577,Compound 111,COC1=CC(=C(C=C1)/C=C/C(=O)C2=C(C=CC(=C2)O)O)OC,HOMVMWKJDBDBSW-QPJJXVBHSA-N,No,0,N.A.,PAM type II,N.A.,nAChR a7,EC50_uM,3.9,25232969,potency
7608,118714578,"(E)-1-(2,5-Dihydroxyphenyl)-3-(2-hydroxy-4,6-d...",COC1=CC(=C(C(=C1)OC)/C=C/C(=O)C2=C(C=CC(=C2)O)O)O,GKWKPCQXKSIKEP-GQCTYLIASA-N,No,0,N.A.,PAM (type n.a.),N.A.,nAChR a7,EC50_uM,2.6,25232969,potency


**PAM or NAM in one type and (ant)agonist in other**

In [10]:
# Among the selected modulators, show the rows that are also annotated with an
# "agonist"-containing label (this substring also matches "antagonist") for any receptor type.
receptor_columns = ['alpha4beta2', 'alpha7', 'muscletype']
mentions_agonist = (
    allosteric[receptor_columns]
    .apply(lambda col: col.str.contains("agonist", case=True, na=False))
    .any(axis=1)
)
allosteric[mentions_agonist]

Unnamed: 0,cid,name,Smiles,inchikey,in_vivo_tested,phase,alpha4beta2,alpha7,muscletype,targetname,metric,value,reference,measure
5675,5360515,NALTREXONE,C1CC1CN2CC[C@]34[C@@H]5C(=O)CC[C@]3([C@H]2CC6=...,DQCKKXVULJGBQN-XFWGSAIBSA-N,yes,4,antagonist,NAM,N.A.,nAChR a4b2,IC50_uM,31.0,15050620,potency
5676,5360515,NALTREXONE,C1CC1CN2CC[C@]34[C@@H]5C(=O)CC[C@]3([C@H]2CC6=...,DQCKKXVULJGBQN-XFWGSAIBSA-N,yes,4,antagonist,NAM,N.A.,nAChR a7,IC50_uM,27.5,15050620,potency


Indeed, naltrexone has a different effect on each receptor type, but only the receptor type whose column is annotated as NAM (alpha7) will be used for the PDB query.

<br>

## Only fields of interest

In [11]:
# Reduce to the identifying fields plus the three activity columns and collapse repeated rows
# (one compound usually has several measurement rows).
fields_of_interest = ["name", "Smiles", 'alpha4beta2', 'alpha7', 'muscletype']
allo_unique = allosteric.loc[:, fields_of_interest].drop_duplicates()
allo_unique

Unnamed: 0,name,Smiles,alpha4beta2,alpha7,muscletype
8,MECAMYLAMINE,CC1(C2CCC(C2)C1(C)NC)C,NAM,NAM,NAM
27,DESFORMYLFLUSTRABROMINE,CC(C)(C=C)C1=C(C2=C(N1)C=C(C=C2)Br)CCNC,PAM,N.A.,Unknown activity
59,"4,2,5-Trihydroxychalcone",C1=CC(=CC=C1/C=C/C(=O)C2=C(C=CC(=C2)O)O)O,N.A.,PAM type II,N.A.
255,NS-9283,C1=CC(=CC(=C1)C2=NC(=NO2)C3=CN=CC=C3)C#N,PAM,inactive,N.A.
470,KAB-18,C1CC(CN(C1)CCCC2=CC=CC=C2)COC(=O)C3=CC=CC=C3C4...,NAM,N.A.,N.A.
...,...,...,...,...,...
7605,"(Z)-3-(2,4-Dihydroxyphenyl)-1-(2,5-dihydroxyph...",C1=CC(=C(C=C1O)O)/C=C\C(=O)C2=C(C=CC(=C2)O)O,N.A.,PAM (type n.a.),N.A.
7606,"(E)-1-(4-Hydroxy-2-methylphenyl)-3-(2,3,4-trim...",CC1=C(C=CC(=C1)O)C(=O)/C=C/C2=C(C(=C(C=C2)OC)O...,N.A.,PAM (type n.a.),N.A.
7607,Compound 111,COC1=CC(=C(C=C1)/C=C/C(=O)C2=C(C=CC(=C2)O)O)OC,N.A.,PAM type II,N.A.
7608,"(E)-1-(2,5-Dihydroxyphenyl)-3-(2-hydroxy-4,6-d...",COC1=CC(=C(C(=C1)OC)/C=C/C(=O)C2=C(C=CC(=C2)O)O)O,N.A.,PAM (type n.a.),N.A.


**Duplicated compounds**

In [12]:
# Names that still occur more than once after de-duplication (keep=False flags every occurrence).
shares_name = allo_unique.duplicated("name", keep=False)
allo_unique[shares_name]

Unnamed: 0,name,Smiles,alpha4beta2,alpha7,muscletype
8,MECAMYLAMINE,CC1(C2CCC(C2)C1(C)NC)C,NAM,NAM,NAM
5693,MECAMYLAMINE,C[C@]1([C@@H]2CC[C@@H](C2)C1(C)C)NC,NAM,NAM,NAM


In [13]:
import os, pickle, requests, json

##### Get Uniprots of AChRs

Entries with annotation score 5/5 and 4/5 are both retrieved (rather than 5/5 only), so that some Torpedo UniProt entries that do not reach 5/5 are included.

In [14]:
# Base URL of the UniProt streaming endpoint (plain list of accessions).
q_web = "https://rest.uniprot.org/uniprotkb/stream?format=list&"

# URL-encoded family query shared by every subunit; only the sub-subfamily label changes.
# All strings are built with annotation score 5.
_family_query = (
    "query=%28%28family%3A%22ligand-gated+ion+channel+%28TC+1.A.9%29+family"
    "+Acetylcholine+receptor+%28TC+1.A.9.1%29+subfamily+{subfamily}+sub-subfamily"
    "%22%29%29+AND+%28annotation_score%3A5%29"
)

# alpha4beta2 receptor subunits
q_alpha4 = _family_query.format(subfamily="Alpha-4%2FCHRNA4")
q_beta2 = _family_query.format(subfamily="Beta-2%2FCHRNB2")

# alpha7 receptor subunit
q_alpha7 = _family_query.format(subfamily="Alpha-7%2FCHRNA7")

# muscle-type receptor subunits
q_alpha = _family_query.format(subfamily="Alpha-1%2FCHRNA1")
q_beta = _family_query.format(subfamily="Beta-1%2FCHRNB1")
q_delta = _family_query.format(subfamily="Delta%2FCHRND")
q_epsilon = _family_query.format(subfamily="Epsilon%2FCHRNE")
q_gamma = _family_query.format(subfamily="Gamma%2FCHRNG")

In [15]:
file = "AChRs_uniprots.json"

# Explicit name -> query-string table; replaces eval() on a list of variable names.
# Keys and their order are unchanged, so the cached dictionary keeps the same layout.
uniprot_queries = {
    "q_alpha4": q_alpha4,
    "q_beta2": q_beta2,
    "q_alpha7": q_alpha7,
    "q_alpha": q_alpha,
    "q_beta": q_beta,
    "q_delta": q_delta,
    "q_epsilon": q_epsilon,
    "q_gamma": q_gamma,
}


def _fetch_accessions(query):
    """Run one UniProt stream query and return its non-empty response lines.

    Raises requests.HTTPError on a non-2xx answer so that an error page is
    never split into lines and cached as if it were a list of accessions.
    """
    response = requests.get(q_web + query)
    response.raise_for_status()
    return [line for line in response.content.decode().split("\n") if len(line) > 0]


if not os.path.isfile(file):
    uniprots = {}
    for name, query in uniprot_queries.items():
        # Annotation score 5 first, then the same query with the score switched to 4.
        uniprots[name] = _fetch_accessions(query)
        uniprots[name].extend(_fetch_accessions(query.replace("%3A5%29", "%3A4%29")))

    with open(file, "wb") as f:
        pickle.dump(uniprots, f)

# NOTE(review): despite the .json extension this cache is a pickle; the format is kept so an
# existing cache file still loads. pickle.load must only ever be used on a file written by
# this notebook (loading an untrusted pickle can execute arbitrary code).
with open(file, "rb") as f:
    uniprots = pickle.load(f)

uniprots

{'q_alpha4': ['O70174',
  'P09482',
  'P09483',
  'P43681',
  'Q5IS77',
  'A0A286ZKF4',
  'A0A2I2YY21',
  'E1BHK0',
  'F6QGI2',
  'F7DET6',
  'G1KER0',
  'M3WUG2',
  'Q19AE6'],
 'q_beta2': ['P09484', 'P12390', 'P17787', 'Q9ERK7', 'P19370'],
 'q_alpha7': ['A0A0P6K2L9',
  'A0A286XUB1',
  'A0A2K5U752',
  'A0A2K6GC14',
  'A0A2R8MV58',
  'A0A337SQQ4',
  'A0A3Q7R9T2',
  'A0A452VFF0',
  'A0A4X1U8B8',
  'A0A8C8WIX8',
  'A0A8I3PAT2',
  'F1SNR6',
  'F6RM03',
  'G1SHV0',
  'G3T2T7',
  'P22770',
  'P36544',
  'P49582',
  'P54131',
  'Q05941',
  'Q5UMH9',
  'Q866A2'],
 'q_alpha': ['P02708',
  'P02709',
  'P04756',
  'P25108',
  'Q98880',
  'A0A1S3G4Y4',
  'A0A286XU07',
  'A0A287D981',
  'A0A2I2ZA95',
  'A0A2J8VR53',
  'A0A2K6FUM4',
  'A0A2K6V5P4',
  'A0A2R8N8V0',
  'A0A2R8ZN82',
  'A0A2Y9F3J6',
  'A0A384BS09',
  'A0A452QP10',
  'A0A4W2H4C8',
  'A0A4X1URU6',
  'A0A4X2LMF8',
  'A0A5F5XL67',
  'A0A7N4NK50',
  'A0A8C0RN65',
  'A0A8C5Y2K4',
  'A0A8C8YA30',
  'A0A8I3P2C7',
  'F6XJA4',
  'F7C5D6',
  'F7E1

##### Query the PDB

In [16]:
from rcsbsearchapi.search import AttributeQuery, ChemSimilarityQuery

In [17]:
file = "AChRs_pdbs.json"

# Reuse the cached query results when present; `skip` tells the query cells below
# (and the cell that writes the cache) that there is nothing to do.
# NOTE(review): the file is a pickle despite its .json extension — only load caches written by this notebook.
if os.path.isfile(file):
    with open(file, "rb") as cache:
        pdb_query = pickle.load(cache)
    skip = True
else:
    pdb_query, skip = {}, False

In [18]:
if not skip:
    r = "alpha4beta2"
    pdb_query[r] = {}

    # Attribute holding the reference-sequence accession of a polymer entity.
    accession_attr = "rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers.database_accession"

    upq_alpha4 = AttributeQuery(
        attribute=accession_attr, operator="in", negation=False, value=uniprots["q_alpha4"]
    )
    upq_beta2 = AttributeQuery(
        attribute=accession_attr, operator="in", negation=False, value=uniprots["q_beta2"]
    )
    # Both subunit conditions must hold: alpha4 AND beta2.
    receptor_query = upq_alpha4 & upq_beta2

    # Only compounds whose annotation for this receptor type contains "AM".
    is_modulator = allo_unique[r].str.contains("AM", case=True, na=False)

    for _idx, m in allo_unique[is_modulator].iterrows():
        smiquery = ChemSimilarityQuery(
            value=m["Smiles"],
            query_type="descriptor",
            descriptor_type="SMILES",
            match_type="graph-relaxed",  # To search for similar molecules to the SMILES
        )
        combined = receptor_query & smiquery

        # Stored per compound: (row as dict, default-return-type hits, "mol_definition" hits).
        # NOTE(review): results are keyed by name only, so rows sharing a name
        # (MECAMYLAMINE is listed with two SMILES) overwrite each other and only the last survives.
        pdb_query[r][m["name"]] = (
            m.to_dict(),
            list(combined()),
            list(combined("mol_definition")),
        )

In [19]:
if not skip:
    r = "alpha7"
    pdb_query[r] = {}

    # Attribute holding the reference-sequence accession of a polymer entity.
    accession_attr = "rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers.database_accession"

    upq_alpha7 = AttributeQuery(
        attribute=accession_attr, operator="in", negation=False, value=uniprots["q_alpha7"]
    )

    # Only compounds whose annotation for this receptor type contains "AM".
    is_modulator = allo_unique[r].str.contains("AM", case=True, na=False)

    for _idx, m in allo_unique[is_modulator].iterrows():
        smiquery = ChemSimilarityQuery(
            value=m["Smiles"],
            query_type="descriptor",
            descriptor_type="SMILES",
            match_type="graph-relaxed",  # To search for similar molecules to the SMILES
        )
        combined = upq_alpha7 & smiquery

        # Stored per compound: (row as dict, default-return-type hits, "mol_definition" hits).
        # NOTE(review): results are keyed by name only, so rows sharing a name
        # (MECAMYLAMINE is listed with two SMILES) overwrite each other and only the last survives.
        pdb_query[r][m["name"]] = (
            m.to_dict(),
            list(combined()),
            list(combined("mol_definition")),
        )

In [20]:
if not skip:
    r = "muscletype"
    pdb_query[r] = {}

    # Attribute holding the reference-sequence accession of a polymer entity.
    accession_attr = "rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers.database_accession"

    def _subunit_query(key):
        """Build the accession 'in' query for the UniProt list stored under uniprots[key]."""
        return AttributeQuery(
            attribute=accession_attr, operator="in", negation=False, value=uniprots[key]
        )

    upq_alpha = _subunit_query("q_alpha")
    upq_beta = _subunit_query("q_beta")
    upq_delta = _subunit_query("q_delta")
    upq_epsilon = _subunit_query("q_epsilon")
    upq_gamma = _subunit_query("q_gamma")

    # alpha AND beta AND delta AND (epsilon OR gamma)
    receptor_query = upq_alpha & upq_beta & upq_delta & (upq_epsilon | upq_gamma)

    # Only compounds whose annotation for this receptor type contains "AM".
    is_modulator = allo_unique[r].str.contains("AM", case=True, na=False)

    for _idx, m in allo_unique[is_modulator].iterrows():
        smiquery = ChemSimilarityQuery(
            value=m["Smiles"],
            query_type="descriptor",
            descriptor_type="SMILES",
            match_type="graph-relaxed",  # To search for similar molecules to the SMILES
        )
        combined = receptor_query & smiquery

        # Stored per compound: (row as dict, default-return-type hits, "mol_definition" hits).
        # NOTE(review): results are keyed by name only, so rows sharing a name
        # (MECAMYLAMINE is listed with two SMILES) overwrite each other and only the last survives.
        pdb_query[r][m["name"]] = (
            m.to_dict(),
            list(combined()),
            list(combined("mol_definition")),
        )

In [21]:
# Write the cache only when the queries were actually run in this session
# (skip is True when the results were loaded from the cache file instead).
# NOTE(review): `file` still has to be "AChRs_pdbs.json" here; it is a shared global that an earlier cell also assigns.
if not skip:
    with open(file, "wb") as f:
        pickle.dump(pdb_query, f)
        
pdb_query

{'alpha4beta2': {'MECAMYLAMINE': ({'name': 'MECAMYLAMINE',
    'Smiles': 'C[C@]1([C@@H]2CC[C@@H](C2)C1(C)C)NC',
    'alpha4beta2': 'NAM',
    'alpha7': 'NAM',
    'muscletype': 'NAM'},
   [],
   []),
  'DESFORMYLFLUSTRABROMINE': ({'name': 'DESFORMYLFLUSTRABROMINE',
    'Smiles': 'CC(C)(C=C)C1=C(C2=C(N1)C=C(C=C2)Br)CCNC',
    'alpha4beta2': 'PAM',
    'alpha7': 'N.A.',
    'muscletype': 'Unknown activity'},
   [],
   []),
  'NS-9283': ({'name': 'NS-9283',
    'Smiles': 'C1=CC(=CC(=C1)C2=NC(=NO2)C3=CN=CC=C3)C#N',
    'alpha4beta2': 'PAM',
    'alpha7': 'inactive',
    'muscletype': 'N.A.'},
   [],
   []),
  'KAB-18': ({'name': 'KAB-18',
    'Smiles': 'C1CC(CN(C1)CCCC2=CC=CC=C2)COC(=O)C3=CC=CC=C3C4=CC=CC=C4',
    'alpha4beta2': 'NAM',
    'alpha7': 'N.A.',
    'muscletype': 'N.A.'},
   [],
   []),
  'buproprion': ({'name': 'buproprion',
    'Smiles': 'CC(C(=O)C1=CC(=CC=C1)Cl)NC(C)(C)C',
    'alpha4beta2': 'NAM',
    'alpha7': 'NAM',
    'muscletype': 'NAM'},
   [],
   []),
  'Galantamine'

In [22]:
# All stored (row, hits, "mol_definition" hits) records for the muscle-type receptor.
pdb_query["muscletype"].values()

dict_values([({'name': 'MECAMYLAMINE', 'Smiles': 'C[C@]1([C@@H]2CC[C@@H](C2)C1(C)C)NC', 'alpha4beta2': 'NAM', 'alpha7': 'NAM', 'muscletype': 'NAM'}, [], []), ({'name': 'buproprion', 'Smiles': 'CC(C(=O)C1=CC(=CC=C1)Cl)NC(C)(C)C', 'alpha4beta2': 'NAM', 'alpha7': 'NAM', 'muscletype': 'NAM'}, [], []), ({'name': '2-(Tert-butylamino)-1-(3-fluorophenyl)propan-1-one', 'Smiles': 'CC(C(=O)C1=CC(=CC=C1)F)NC(C)(C)C', 'alpha4beta2': 'NAM', 'alpha7': 'N.A.', 'muscletype': 'NAM'}, [], []), ({'name': '2-(Tert-butylamino)-1-(3-chlorophenyl)butan-1-one', 'Smiles': 'CCC(C(=O)C1=CC(=CC=C1)Cl)NC(C)(C)C', 'alpha4beta2': 'NAM', 'alpha7': 'N.A.', 'muscletype': 'NAM'}, [], []), ({'name': '2-(Tert-butylamino)-1-(3-chlorophenyl)pentan-1-one', 'Smiles': 'CCCC(C(=O)C1=CC(=CC=C1)Cl)NC(C)(C)C', 'alpha4beta2': 'NAM', 'alpha7': 'N.A.', 'muscletype': 'NAM'}, [], []), ({'name': '1-(3-Bromophenyl)-2-(tert-butylamino)propan-1-one', 'Smiles': 'CC(C(=O)C1=CC(=CC=C1)Br)NC(C)(C)C', 'alpha4beta2': 'NAM', 'alpha7': 'N.A.', 'mus

In [23]:
# Flat list of the retrieved PDB ids (second element of every record) per receptor type.
{
    receptor: [pdb_id for record in records.values() for pdb_id in record[1]]
    for receptor, records in pdb_query.items()
}

{'alpha4beta2': [],
 'alpha7': ['8UTB',
  '7EKT',
  '8V82',
  '8V8A',
  '8V8C',
  '8V8D',
  '8V80',
  '8F4V',
  '8UZJ',
  '8V86',
  '8V88'],
 'muscletype': []}

In [24]:
# Per receptor type, the compounds with at least one PDB hit: name -> ("mol_definition" hits, PDB ids).
{
    receptor: {
        compound: (comp_ids, pdb_ids)
        for compound, (_row, pdb_ids, comp_ids) in records.items()
        if pdb_ids
    }
    for receptor, records in pdb_query.items()
}

{'alpha4beta2': {},
 'alpha7': {'NS-1738': (['XG3'], ['8UTB']),
  'PNU-120596': (['I34'], ['7EKT', '8V82', '8V8A', '8V8C', '8V8D']),
  'TQS': (['YLR'], ['8V80']),
  'Ivermectin': (['IVM'], ['8F4V', '8UZJ']),
  'GAT-107': (['YLI'], ['8V86', '8V88'])},
 'muscletype': {}}

In [25]:
# Total number of PDB hits summed over all receptor types and compounds.
sum(len(record[1]) for records in pdb_query.values() for record in records.values()), "PDBs"

(11, 'PDBs')

## Comparison with papers

### Review

https://www.annualreviews.org/content/journals/10.1146/annurev-biochem-030122-033116

The only compounds annotated as possible "*AM"s in Table 1 are etomidate, PNU-120596 (7EKT and 7KOX) and the nanobodies C4 and E3. PNU-120596 is correctly identified in 7EKT; in 7KOX the ligand is not resolved. Etomidate is not annotated as "*AM" anywhere in the ACRALL database and was therefore not used for searching. Nanobodies are not in ACRALL.

### Paper with alpha7 structures

https://doi.org/10.1016/j.cell.2024.01.032

The structures with modulators 8UZJ (ivermectin), 8UTB (NS1738), 8V82 (PNU-120596), 8V80 (TQS), 8V86 (GAT107), 8V88 (GAT107), 8V8A (PNU-120596), 8V8C (PNU-120596) and 8V8D (PNU-120596) have been retrieved, plus 7EKT that is mentioned in the paper. 8UT1 and 8V89 deposited with the paper contain epibatidine (orthosteric) and no ligand, respectively.

### Other

8F4V is the only one that does not appear in either paper and it corresponds to ivermectin.

## Data already in the database

In [26]:
# Lower-cased ids of every retrieved PDB, then the ones that already exist in the database.
hit_ids = [
    pdb_id.lower()
    for records in pdb_query.values()
    for record in records.values()
    for pdb_id in record[1]
]
list(PDB.select().where(PDB.entry_id.in_(hit_ids)))

[<PDB: 8f4v>, <PDB: 8uzj>]

In [27]:
# For each retrieved PDB already in the database, put side by side:
# its id, the modulator component ids of each stored site, and the (compound, component ids) found by the query.
hit_ids = [
    pdb_id.lower()
    for records in pdb_query.values()
    for record in records.values()
    for pdb_id in record[1]
]
already_stored = PDB.select().where(PDB.entry_id.in_(hit_ids))

[
    (
        stored.entry_id,
        [site.modulator_residues.label_comp_id.unique().tolist() for site in stored.sites],
        [
            (compound, record[-1])
            for records in pdb_query.values()
            for compound, record in records.items()
            if stored.entry_id.upper() in record[-2]
        ],
    )
    for stored in list(already_stored)
]

[('8f4v', [['IVM']], [('Ivermectin', ['IVM'])]),
 ('8uzj', [['IVM']], [('Ivermectin', ['IVM'])])]

In [28]:
# Stored info of the first site of 8f4v, as it is before the processing below.
PDB.get(PDB.entry_id == "8f4v").sites[0].info

{'modulator_info': [{'modulator': [{'label_asym_id': 'F'}],
   'label_entity_id': '2',
   'type': 'non-polymer',
   'pdbx_description': "(2aE,4E,5'S,6S,6'R,7S,8E,11R,13R,15S,17aR,20R,20aR,20bS)-6'-[(2S)-butan-2-yl]-20,20b-dihydroxy-5',6,8,19-tetramethyl-17 -oxo-3',4',5',6,6',10,11,14,15,17,17a,20,20a,20b-tetradecahydro-2H,7H-spiro[11,15-methanofuro[4,3,2-pq][2,6]benzodioxacy clooctadecine-13,2'-pyran]-7-yl 2,6-dideoxy-4-O-(2,6-dideoxy-3-O-methyl-alpha-L-arabino-hexopyranosyl)-3-O-methyl-alpha-L-arabino-hexopyranoside"}],
 'interacting_chains_info': [{'label_entity_id': '1',
   'interacting_chains': {'label_asym_id': ['A', 'E']},
   'polymer_type': 'polypeptide(L)',
   'Uniprot': ['P36544']}],
 'source': {'grall': [{'entry': {'Name': 'ivermectin-b1a',
     'SMILES': 'CC[C@H](C)[C@H]1O[C@]2(CC[C@@H]1C)C[C@@H]1C[C@@H](C/C=C(\\C)[C@@H](O[C@H]3C[C@H](OC)[C@@H](O[C@H]4C[C@H](OC)[C@@H](O)[C@H](C)O4)[C@H](C)O3)[C@@H](C)/C=C/C=C3\\CO[C@@H]4[C@H](O)C(C)=C[C@@H](C(=O)O1)[C@]34O)O2',
     'Family'

In [29]:
# Stored info of the first site of 8uzj, as it is before the processing below.
PDB.get(PDB.entry_id == "8uzj").sites[0].info

{'modulator_info': [{'modulator': [{'label_asym_id': 'BA'}],
   'label_entity_id': '8',
   'type': 'non-polymer',
   'pdbx_description': "(2aE,4E,5'S,6S,6'R,7S,8E,11R,13R,15S,17aR,20R,20aR,20bS)-6'-[(2S)-butan-2-yl]-20,20b-dihydroxy-5',6,8,19-tetramethyl-17 -oxo-3',4',5',6,6',10,11,14,15,17,17a,20,20a,20b-tetradecahydro-2H,7H-spiro[11,15-methanofuro[4,3,2-pq][2,6]benzodioxacy clooctadecine-13,2'-pyran]-7-yl 2,6-dideoxy-4-O-(2,6-dideoxy-3-O-methyl-alpha-L-arabino-hexopyranosyl)-3-O-methyl-alpha-L-arabino-hexopyranoside"}],
 'interacting_chains_info': [{'label_entity_id': '1',
   'interacting_chains': {'label_asym_id': ['B', 'C']},
   'polymer_type': 'polypeptide(L)',
   'Uniprot': ['P36544']}],
 'source': {'grall': [{'entry': {'Name': 'ivermectin-b1a',
     'SMILES': 'CC[C@H](C)[C@H]1O[C@]2(CC[C@@H]1C)C[C@@H]1C[C@@H](C/C=C(\\C)[C@@H](O[C@H]3C[C@H](OC)[C@@H](O[C@H]4C[C@H](OC)[C@@H](O)[C@H](C)O4)[C@H](C)O3)[C@@H](C)/C=C/C=C3\\CO[C@@H]4[C@H](O)C(C)=C[C@@H](C(=O)O1)[C@]34O)O2',
     'Family

## Processing

In [30]:
# Filled by process_entry: lower-cased PDB id -> list of error messages recorded for it.
errors = {}
# Filled by process_entry: the `entry` argument of every call that ended in a recorded error.
error_entries = []

In [31]:
def process_entry(entry, update=None, auto_site_grouping=True, stringent_site_grouping=True):
    """Create the site for one PDB/modulator pair and tag it with its ACRALL source.

    Parameters
    ----------
    entry : pandas.DataFrame
        ACRALL rows of the compound; stored in the site info via ``to_dict(orient="list")``.
    update : dict, optional
        Must contain ``"pdb"`` (entry id) and ``"mod"`` (modulator selector); both are
        passed to ``create_entry`` and the whole dict is stored in the site info.
    auto_site_grouping, stringent_site_grouping : bool
        Forwarded unchanged to ``create_entry`` and ``combine_sites``.

    Returns
    -------
    The object returned by ``create_entry`` on success, ``None`` when an error was recorded.

    Notes
    -----
    ``AssertionError`` and ``KeyError`` are not propagated: the message is appended to the
    module-level ``errors`` dict (keyed by the lower-cased PDB id, or ``"unknown"`` when the
    id itself is missing from ``update``) and ``entry`` is appended to ``error_entries``.
    Any other exception propagates to the caller.
    """
    # A fresh dict per call instead of a shared mutable default argument.
    update = {} if update is None else update
    # Bound before the try block so the error handler can always refer to it.
    pdb = None
    try:
        pdb = update["pdb"]
        mod = update["mod"]
        print(pdb, mod)

        # If the PDB already exists, remember its current sites so they can be combined afterwards.
        existing = PDB.get_or_none(PDB.entry_id == pdb)
        old_sites = list(existing.sites) if existing is not None else None

        # Everything below runs in one transaction; a raised error leaves the `with` block.
        with db.atomic():
            sites = create_entry(db, pdb, mod, auto_site_grouping, stringent_site_grouping)

            # Explicit raise rather than `assert`, so the check survives `python -O`.
            if len(sites.objects()) != 1:
                raise AssertionError(
                    f"{pdb}, {mod}: using only residue name retrieves more than one site(group)"
                )

            for site in sites.objects():
                site.info["source"] = {
                    "acrall": [{
                        "entry": entry.to_dict(orient="list"),
                        "version": '23Jun23',
                        "date": "21-6-2024",
                        "update": update
                    }],
                }
                site.save()

            if old_sites is not None:
                try:
                    combine_sites(db, pdb.lower(), old_sites, sites.objects(), auto_site_grouping, stringent_site_grouping)
                except Exception as e:
                    # Re-raised as AssertionError so it is recorded by the handler below;
                    # tolerate exceptions that carry no args.
                    detail = e.args[0] if e.args else e
                    raise AssertionError("combine_sites failed; " + str(detail)) from e

        return sites

    except (AssertionError, KeyError) as error:
        key = pdb.lower() if pdb is not None else "unknown"
        message = str(error.args[0]) if error.args else repr(error)
        errors.setdefault(key, []).append(message)
        error_entries.append(entry)
        print(key, errors[key])
        return None

In [32]:
# Query results per receptor type: compound name -> (row dict, PDB ids, "mol_definition" hits).
pdb_query

{'alpha4beta2': {'MECAMYLAMINE': ({'name': 'MECAMYLAMINE',
    'Smiles': 'C[C@]1([C@@H]2CC[C@@H](C2)C1(C)C)NC',
    'alpha4beta2': 'NAM',
    'alpha7': 'NAM',
    'muscletype': 'NAM'},
   [],
   []),
  'DESFORMYLFLUSTRABROMINE': ({'name': 'DESFORMYLFLUSTRABROMINE',
    'Smiles': 'CC(C)(C=C)C1=C(C2=C(N1)C=C(C=C2)Br)CCNC',
    'alpha4beta2': 'PAM',
    'alpha7': 'N.A.',
    'muscletype': 'Unknown activity'},
   [],
   []),
  'NS-9283': ({'name': 'NS-9283',
    'Smiles': 'C1=CC(=CC(=C1)C2=NC(=NO2)C3=CN=CC=C3)C#N',
    'alpha4beta2': 'PAM',
    'alpha7': 'inactive',
    'muscletype': 'N.A.'},
   [],
   []),
  'KAB-18': ({'name': 'KAB-18',
    'Smiles': 'C1CC(CN(C1)CCCC2=CC=CC=C2)COC(=O)C3=CC=CC=C3C4=CC=CC=C4',
    'alpha4beta2': 'NAM',
    'alpha7': 'N.A.',
    'muscletype': 'N.A.'},
   [],
   []),
  'buproprion': ({'name': 'buproprion',
    'Smiles': 'CC(C(=O)C1=CC(=CC=C1)Cl)NC(C)(C)C',
    'alpha4beta2': 'NAM',
    'alpha7': 'NAM',
    'muscletype': 'NAM'},
   [],
   []),
  'Galantamine'

In [33]:
# Create a database entry for every (PDB, component) hit returned by the queries.
for r, records in pdb_query.items():
    for name, (row, pdb_ids, comp) in records.items():
        # Nothing to process unless both a structure and a matching component were found.
        if not pdb_ids or not comp:
            continue

        # All ACRALL rows of this compound: merge on the columns shared with the stored row.
        entry = allosteric.merge(pd.DF([row]))
        assert len(comp) == 1

        for pdb in pdb_ids:
            update = {"pdb": pdb.lower(), "mod": [[{"auth_comp_id": comp[0]}]]}
            process_entry(
                entry,
                update,
                auto_site_grouping=True,
                stringent_site_grouping=True,
            )

8utb [[{'auth_comp_id': 'XG3'}]]
7ekt [[{'auth_comp_id': 'I34'}]]
8v82 [[{'auth_comp_id': 'I34'}]]
8v8a [[{'auth_comp_id': 'I34'}]]
8v8c [[{'auth_comp_id': 'I34'}]]
8v8d [[{'auth_comp_id': 'I34'}]]
8v80 [[{'auth_comp_id': 'YLR'}]]
8f4v [[{'auth_comp_id': 'IVM'}]]
8uzj [[{'auth_comp_id': 'IVM'}]]
8v86 [[{'auth_comp_id': 'YLI'}]]
8v88 [[{'auth_comp_id': 'YLI'}]]


In [34]:
# Errors recorded by process_entry during the loop above (empty when every call succeeded).
errors

{}

In [35]:
# Same 8f4v site inspected again, now after the processing loop.
PDB.get(PDB.entry_id == "8f4v").sites[0].info

{'modulator_info': [{'modulator': [{'label_asym_id': 'F'}],
   'label_entity_id': '2',
   'type': 'non-polymer',
   'pdbx_description': "(2aE,4E,5'S,6S,6'R,7S,8E,11R,13R,15S,17aR,20R,20aR,20bS)-6'-[(2S)-butan-2-yl]-20,20b-dihydroxy-5',6,8,19-tetramethyl-17 -oxo-3',4',5',6,6',10,11,14,15,17,17a,20,20a,20b-tetradecahydro-2H,7H-spiro[11,15-methanofuro[4,3,2-pq][2,6]benzodioxacy clooctadecine-13,2'-pyran]-7-yl 2,6-dideoxy-4-O-(2,6-dideoxy-3-O-methyl-alpha-L-arabino-hexopyranosyl)-3-O-methyl-alpha-L-arabino-hexopyranoside"}],
 'interacting_chains_info': [{'label_entity_id': '1',
   'interacting_chains': {'label_asym_id': ['A', 'E']},
   'polymer_type': 'polypeptide(L)',
   'Uniprot': ['P36544']}],
 'source': {'grall': [{'entry': {'Name': 'ivermectin-b1a',
     'SMILES': 'CC[C@H](C)[C@H]1O[C@]2(CC[C@@H]1C)C[C@@H]1C[C@@H](C/C=C(\\C)[C@@H](O[C@H]3C[C@H](OC)[C@@H](O[C@H]4C[C@H](OC)[C@@H](O)[C@H](C)O4)[C@H](C)O3)[C@@H](C)/C=C/C=C3\\CO[C@@H]4[C@H](O)C(C)=C[C@@H](C(=O)O1)[C@]34O)O2',
     'Family'

In [36]:
# Same 8uzj site inspected again, now after the processing loop.
PDB.get(PDB.entry_id == "8uzj").sites[0].info

{'modulator_info': [{'modulator': [{'label_asym_id': 'BA'}],
   'label_entity_id': '8',
   'type': 'non-polymer',
   'pdbx_description': "(2aE,4E,5'S,6S,6'R,7S,8E,11R,13R,15S,17aR,20R,20aR,20bS)-6'-[(2S)-butan-2-yl]-20,20b-dihydroxy-5',6,8,19-tetramethyl-17 -oxo-3',4',5',6,6',10,11,14,15,17,17a,20,20a,20b-tetradecahydro-2H,7H-spiro[11,15-methanofuro[4,3,2-pq][2,6]benzodioxacy clooctadecine-13,2'-pyran]-7-yl 2,6-dideoxy-4-O-(2,6-dideoxy-3-O-methyl-alpha-L-arabino-hexopyranosyl)-3-O-methyl-alpha-L-arabino-hexopyranoside"}],
 'interacting_chains_info': [{'label_entity_id': '1',
   'interacting_chains': {'label_asym_id': ['B', 'C']},
   'polymer_type': 'polypeptide(L)',
   'Uniprot': ['P36544']}],
 'source': {'grall': [{'entry': {'Name': 'ivermectin-b1a',
     'SMILES': 'CC[C@H](C)[C@H]1O[C@]2(CC[C@@H]1C)C[C@@H]1C[C@@H](C/C=C(\\C)[C@@H](O[C@H]3C[C@H](OC)[C@@H](O[C@H]4C[C@H](OC)[C@@H](O)[C@H](C)O4)[C@H](C)O3)[C@@H](C)/C=C/C=C3\\CO[C@@H]4[C@H](O)C(C)=C[C@@H](C(=O)O1)[C@]34O)O2',
     'Family

In [37]:
# First site of 8v8d, one of the entries processed by the loop above.
PDB.get(PDB.entry_id == "8v8d").sites[0].info

{'modulator_info': [{'modulator': [{'label_asym_id': 'DA'}],
   'label_entity_id': '5',
   'type': 'non-polymer',
   'pdbx_description': "N-(5-Chloro-2,4-dimethoxyphenyl)-N'-(5-methyl-3-isoxazolyl)-urea"}],
 'interacting_chains_info': [{'label_entity_id': '1',
   'interacting_chains': {'label_asym_id': ['D', 'E']},
   'polymer_type': 'polypeptide(L)',
   'Uniprot': ['P36544']}],
 'source': {'acrall': [{'entry': {'cid': [311434],
     'name': ['PNU-120596'],
     'Smiles': ['CC1=CC(=NO1)NC(=O)NC2=CC(=C(C=C2OC)OC)Cl'],
     'inchikey': ['CEIIEALEIHQDBX-UHFFFAOYSA-N'],
     'in_vivo_tested': ['yes'],
     'phase': [0],
     'alpha4beta2': ['N.A.'],
     'alpha7': ['PAM type II'],
     'muscletype': ['N.A.'],
     'targetname': ['nAChR a7'],
     'metric': ['EC50_uM'],
     'value': [0.829245],
     'reference': [20591663],
     'measure': ['potency']},
    'version': '23Jun23',
    'date': '21-6-2024',
    'update': {'pdb': '8v8d', 'mod': [[{'auth_comp_id': 'I34'}]]}}]}}

In [39]:
# For every site whose source mentions "acrall", list the names of all its sources.
acrall_query = Site.select().where(Site.info["source"].contains("acrall"))
acrall_sites = [list(site.info["source"].keys()) for site in acrall_query]
acrall_sites

[['acrall'],
 ['acrall'],
 ['acrall'],
 ['acrall'],
 ['acrall'],
 ['acrall'],
 ['acrall'],
 ['grall', 'acrall'],
 ['grall', 'acrall'],
 ['acrall'],
 ['acrall']]

In [40]:
# Sites whose source keys are exactly 'grall' followed by 'acrall'.
acrall_sites.count(['grall', 'acrall'])

2

In [42]:
# Sites whose only source key is 'acrall'.
acrall_sites.count(['acrall'])

9

# Statistics

In [43]:
# Total number of sites
# Counted by the database (COUNT query) instead of loading every Site row just to take len().
Site.select().count()

3190

In [44]:
# Total number of different PDBs
# Counted by the database (COUNT query) instead of loading every PDB row just to take len().
PDB.select().count()

3050

In [45]:
# PDBs with no sites (expected 0)
# The element must use the comprehension variable `p`; the previous `pdb` was a leftover
# global from the processing loop and would have raised as soon as one PDB had no sites.
[(p.entry_id, p) for p in PDB.select() if len(p.sites) == 0]

[]

In [46]:
# Distinct numbers of sites found per PDB
{len(stored.sites) for stored in PDB.select()}

{1, 2, 3, 4}

In [47]:
# Distinct combinations of identifier fields used to describe a modulator
{tuple(site.modulator.keys()) for site in Site.select()}

{('label_asym_id',)}

In [48]:
# Site id -> number of "label_asym_id" values annotated as modulator, largest first (to spot outliers)
chains_per_site = {
    site.id: len(site.modulator["label_asym_id"])
    for site in Site.select()
}
modulator_chains = dict(
    sorted(chains_per_site.items(), key=lambda item: item[1], reverse=True)
)
modulator_chains

{520: 24,
 582: 8,
 608: 6,
 4806: 6,
 5088: 6,
 5246: 6,
 5320: 6,
 5365: 6,
 160: 4,
 187: 4,
 193: 4,
 368: 4,
 380: 4,
 412: 4,
 684: 4,
 741: 4,
 4493: 4,
 4767: 4,
 336: 3,
 419: 3,
 481: 3,
 482: 3,
 483: 3,
 552: 3,
 569: 3,
 570: 3,
 592: 3,
 599: 3,
 615: 3,
 624: 3,
 2319: 3,
 2327: 3,
 2334: 3,
 2633: 3,
 4542: 3,
 4559: 3,
 4577: 3,
 4590: 3,
 4636: 3,
 4733: 3,
 4834: 3,
 4842: 3,
 4863: 3,
 4886: 3,
 4907: 3,
 4929: 3,
 4949: 3,
 4966: 3,
 4979: 3,
 4992: 3,
 5002: 3,
 5003: 3,
 5122: 3,
 5143: 3,
 5190: 3,
 5200: 3,
 5267: 3,
 5272: 3,
 5382: 3,
 5463: 3,
 5471: 3,
 5474: 3,
 5: 2,
 14: 2,
 51: 2,
 75: 2,
 81: 2,
 91: 2,
 92: 2,
 93: 2,
 94: 2,
 95: 2,
 97: 2,
 136: 2,
 140: 2,
 144: 2,
 147: 2,
 150: 2,
 153: 2,
 168: 2,
 175: 2,
 198: 2,
 248: 2,
 254: 2,
 269: 2,
 272: 2,
 288: 2,
 318: 2,
 328: 2,
 339: 2,
 371: 2,
 383: 2,
 418: 2,
 432: 2,
 446: 2,
 463: 2,
 467: 2,
 480: 2,
 488: 2,
 491: 2,
 523: 2,
 526: 2,
 529: 2,
 530: 2,
 573: 2,
 585: 2,
 627: 2,
 748: 2,


In [49]:
# Close the database handle that was initialised at the top of the notebook.
db.close()

True