# Basic use of the relative feature tightness measure

This notebook measures relative feature tightness for 25 000 structures sampled from ChEMBL and 25 000 sampled from ZINC, against the PMI profiles of ChEMBL and ZINC individually, and also against the pointwise KL (pKL) divergence profile between the ChEMBL and ZINC profiles.

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import csv

from fip import profiles

## Dataset prep

In [2]:
# --- Primary dataset ---------------------------------------------------------
DATASET_NAME = 'ChEMBL28'
DATASET_LINK = '../../scratch/data/chembl_28/chembl_compounds_activities_features.csv'
OUTPUT_FIGURE_PATH = '../../scratch/data/chembl_28/pix'

# --- Features and their co-occurrence matrix (primary dataset) ---------------
FEATURES_NAME = 'ECFP1-like substructures'
COOCCURRENCE_MX_LINK = '../../scratch/data/chembl_28/ec_fragments_r1_coomx.csv'
# Passed as `vector_count` when the co-occurrence profile is built.
COOCCURRENCE_MX_VECTOR_COUNT = 2066376

# How many co-occurrences are needed to start tracking the relation
MINIMAL_COOCCURRENCE_CUTOFF = 100

# --- Reference dataset -------------------------------------------------------
REFERENCE_DATASET_NAME = 'ZINC_DM'
REFERENCE_DATASET_LINK = '../../scratch/data/zinc_dark_matter/zinc_dm_in_stock_features.csv'
REFERENCE_COOCCURRENCE_MX_LINK = '../../scratch/data/zinc_dark_matter/zinc_dm_in_stock_ec_fragments_r1_coomx.csv'
REFERENCE_COOCCURRENCE_MX_VECTOR_COUNT = 5063486

# The reference cutoff is not chosen independently: it is the primary cutoff
# scaled by the ratio of the two vector counts (reference / primary) and then
# truncated to an integer, to adjust for the size difference of the datasets.
REFERENCE_MINIMAL_COOCCURRENCE_CUTOFF = int(
    MINIMAL_COOCCURRENCE_CUTOFF
    * (REFERENCE_COOCCURRENCE_MX_VECTOR_COUNT / COOCCURRENCE_MX_VECTOR_COUNT)
)

# --- Sampling ----------------------------------------------------------------
# Number of structures drawn from each dataset.
# NOTE(review): the notebook introduction mentions 25 000 structures per
# dataset, while this value is 1000 -- confirm which one is intended.
DATASET_SAMPLE_SIZE = 1000

### Making feature co-occurrence probability profiles

In [3]:
# Step 1: read the feature co-occurrence matrix CSV of the primary dataset.
dataset_cooccurrence_df = pd.read_csv(COOCCURRENCE_MX_LINK)

# Step 2: wrap it in a co-occurrence profile, passing the minimal
# co-occurrence cutoff and the vector count from the config cell.
dataset_cooccurrence_profile = profiles.CooccurrenceProfile.from_dataframe(
    dataset_cooccurrence_df,
    min_cutoff_value=MINIMAL_COOCCURRENCE_CUTOFF,
    vector_count=COOCCURRENCE_MX_VECTOR_COUNT,
)

# Step 3: derive the co-occurrence probability profile and show its frame.
probability_profile = (
    profiles.CooccurrenceProbabilityProfile
    .from_cooccurrence_profile(dataset_cooccurrence_profile)
)
probability_profile.df

Unnamed: 0_level_0,Unnamed: 1_level_0,value
feature1,feature2,Unnamed: 2_level_1
C=O,C=O,0.689240
C=O,Cc(c)c,0.302402
C=O,c-n(c)n,0.018186
C=O,c=O,0.058172
C=O,cC,0.132594
...,...,...
N=c([nH])n,cc(c)Cl,0.000068
CCS,cN=S,0.000072
cn(c)n,nc(-n)[nH],0.000273
CC(N)=O,cS(F)(F)(F)(F)F,0.000051


In [4]:
# Step 1: read the feature co-occurrence matrix CSV of the reference dataset.
reference_cooccurrence_df = pd.read_csv(REFERENCE_COOCCURRENCE_MX_LINK)

# Step 2: wrap it in a co-occurrence profile, using the size-adjusted
# reference cutoff and the reference vector count from the config cell.
reference_cooccurrence_profile = profiles.CooccurrenceProfile.from_dataframe(
    reference_cooccurrence_df,
    min_cutoff_value=REFERENCE_MINIMAL_COOCCURRENCE_CUTOFF,
    vector_count=REFERENCE_COOCCURRENCE_MX_VECTOR_COUNT,
)

# Step 3: derive the co-occurrence probability profile and show its frame.
reference_probability_profile = (
    profiles.CooccurrenceProbabilityProfile
    .from_cooccurrence_profile(reference_cooccurrence_profile)
)
reference_probability_profile.df

Unnamed: 0_level_0,Unnamed: 1_level_0,value
feature1,feature2,Unnamed: 2_level_1
CCN,CCN,0.499327
CCN,CCO,0.163191
CCN,COC,0.145833
CCN,Cn,0.027918
CCN,Cn(n)n,0.000510
...,...,...
CC(c)n,cSc,0.000302
CC(c)n,cc(o)S,0.000152
C=C(C)N,cc(-n)s,0.000053
CN=c,cc(-n)s,0.000058


### Making feature pointwise mutual information profiles

In [5]:
# Build the pointwise mutual information (PMI) profile of the primary dataset
# from its co-occurrence probability profile, then display the resulting frame.
pmi_profile = (
    profiles.PointwiseMutualInformationProfile
    .from_cooccurrence_probability_profile(probability_profile)
)
pmi_profile.df

Unnamed: 0_level_0,Unnamed: 1_level_0,value
feature1,feature2,Unnamed: 2_level_1
C=O,C=O,0.000000
C=O,Cc(c)c,0.336752
C=O,c-n(c)n,-0.021437
C=O,c=O,-0.350960
C=O,cC,-0.038451
...,...,...
N=c([nH])n,cc(c)Cl,1.755450
CCS,cN=S,2.678707
cn(c)n,nc(-n)[nH],4.683056
CC(N)=O,cS(F)(F)(F)(F)F,0.540092


In [6]:
# Build the pointwise mutual information (PMI) profile of the reference dataset
# from its co-occurrence probability profile, then display the resulting frame.
reference_pmi_profile = (
    profiles.PointwiseMutualInformationProfile
    .from_cooccurrence_probability_profile(reference_probability_profile)
)
reference_pmi_profile.df

Unnamed: 0_level_0,Unnamed: 1_level_0,value
feature1,feature2,Unnamed: 2_level_1
CCN,CCN,0.000000
CCN,CCO,0.139260
CCN,COC,0.186709
CCN,Cn,0.069478
CCN,Cn(n)n,-0.508853
...,...,...
CC(c)n,cSc,2.488432
CC(c)n,cc(o)S,3.156343
C=C(C)N,cc(-n)s,0.589271
CN=c,cc(-n)s,1.948431


### Making feature pointwise KL divergence profile between ChEMBL and ZINC

In [7]:
# Build the pointwise KL divergence profile from the two probability profiles.
# The primary dataset's profile is passed first and the reference profile second.
# NOTE(review): the direction of the divergence is defined inside `profiles`,
# so the argument order presumably matters -- confirm before swapping.
difference_pkld_profile = (
    profiles.PointwiseKLDivergenceProfile
    .from_cooccurrence_probability_profiles(
        probability_profile,
        reference_probability_profile,
    )
)
difference_pkld_profile.df

Unnamed: 0_level_0,Unnamed: 1_level_0,value
feature1,feature2,Unnamed: 2_level_1
C=O,C=O,-0.195532
C=O,Cc(c)c,-0.042577
C=O,c-n(c)n,-0.604856
C=O,c=O,-0.613486
C=O,cC,-1.073105
...,...,...
CC(c)n,cSc,-9.286288
CC(c)n,cc(o)S,-8.291935
C=C(C)N,cc(-n)s,-6.762255
CN=c,cc(-n)s,-6.906645


### Sampling structures from ChEMBL and ZINC datasets

In [8]:
# Draw DATASET_SAMPLE_SIZE random rows (structures) from the primary dataset.
# The fixed `random_state` makes the draw repeatable: without it every
# "Restart & Run All" picks a different sample, so everything computed from
# the sample downstream would change between runs.
chembl_samples = pd.read_csv(DATASET_LINK).sample(
    DATASET_SAMPLE_SIZE,
    random_state=42,  # arbitrary fixed seed; change it to draw another sample
)
chembl_samples

Unnamed: 0,molregno,canonical_smiles,chembl_activities,ec_fragments_r1,ec_fragments_r2,ec_fragments_r3,brics_fragments
1755385,2131503,NCC(=O)NCC1CCN(c2nc(N3CCOCC3)nc(-n3c(C(F)F)nc4...,PI3-kinase p110-beta subunit,CN | COC | Cc(n)n | nc(n)-n | CCN | CC(N)=O | ...,CCOCC | COCCN | cnc(nc)N(C)C | cN(C)CCC | NCC(...,CN(C)c1ncnc(N(C)C)n1 | Cc1nc2ccccc2n1-c(n)n | ...,[1*]C(=O)CN [5*]N1CCOCC1 [8*]C(F)F [5*]N1CCC([...
1796480,2179616,O=C(Cn1nnc(-c2ccncc2)c1-c1ccccc1)Nc1ccc(-c2ccc...,Probable protein-cysteine N-palmitoyltransfera...,cc(n)N | ccn | ccc | CCn | c-c(c)c | CC(N)=O |...,c-c(c)ccc | ccc(cn)-c(c)c | ccc(cc)-c(c)n | cc...,cc(c)-c1ccccc1 | c1ccccc1 | c-c1c(-c(c)c)nnn1C...,[14*]c1ccc([16*])cn1 [9*]n1nnc([14*])c1[14*] [...
631337,709821,COc1cc(C(=O)c2ccc(-c3csc(NC(=O)[C@@H](N)CO)n3)...,HL-60,cc(c)O | CN | ncn | cOC | c-c(c)c | CC(N)=O | ...,ncncn | cC(=O)c(cc)cc | CC(N)CO | c-c(c)ccc | ...,cNC(=O)C(N)CO | cnc(NC(=O)C(C)N)sc | cc(c)-c1c...,[16*]c1ccc([16*])c([16*])c1 [6*]C([6*])=O [1*]...
1376575,1632652,CC(C)(C)OC(=O)C[C@@]1(C)[C@H](C(=O)O)N2C(=O)CC...,,CC(N)S | COC | CC(C)(C)S | CN(C)C | CCC | CC(N...,O=C1CC(S)N1 | CC(C)N1C(=O)CC1S | CC(N)C(=O)O |...,CCC(=O)OC(C)(C)C | CN1C(C(=O)O)C(C)(CC(=O)O)S(...,[1*]C(=O)C[8*] [13*]C1N2C(=O)CC2S(=O)(=O)C1([1...
181733,209893,Nc1sc2c(c1C(=O)c1cccc(Cl)c1)CCCCC2,,cC(c)=O | cc(N)s | ccc | cc(c)C | cc(C)s | CCC...,cC(=O)c(cc)cc | Cc(c)c(N)sc | cC(=O)c(c(c)C)c(...,cC(=O)c1c(N)sc(CC)c1CCC | Cc1c(N)sc(CC)c1C | c...,[6*]C([6*])=O [16*]c1cccc(Cl)c1 [16*]c1c(N)sc2...
...,...,...,...,...,...,...,...
198871,230198,COC(C)CN(C(=O)N(CCCl)N=O)C1OC(CO)C(COCC2OC(CO)...,,CC | COC | NC(N)=O | CC(N)O | CCN | NN=O | CN(...,CCOCC | CC(O)CO | COCC(C)O | CC(C)OC(C)C | COC...,CN(C)CC(C)OC | COC(C(C)O)N(CC(C)O)C(=O)N(C)N |...,[5*]N([5*])[5*] [13*]C1OC([13*])C(O)C(O)C1[15*...
29441,34378,CCCCC(=O)N[C@@H]1CC(=O)NCCCC[C@@H](C(N)=O)NC(=...,Melanocortin receptor 3 | Melanocortin recepto...,CN | CCN | N=C(N)N | CC(N)=O | cc(c)[nH] | CNC...,c[nH]cc(c)C | cc[nH]cn | cnc(c[nH])CC | CC(C)N...,CNC(Cc(c[nH])nc)C(N)=O | c1ccccc1 | CCC(NC(=O)...,[14*]c1c[nH]cn1 [13*]C1CCCCNC(=O)CC([15*])C(=O...
1907303,2294892,[N-]=[N+]=NC[C@H]1NC[C@@H](O)[C@H]1O,,[N-]=[N+] | CCN | CC(C)N | CC(C)O | CO | CN=[N...,CN=[N+]=[N-] | CNC(CN)C(C)O | CC(N)CN=[N+] | C...,NCC1NCC(O)C1O | CC1NCC(O)C1O | [N+]=NCC1NCC(O)...,[13*]C1NCC(O)C1O [8*]CN=[N+]=[N-]
381805,431839,CCC(CC)C(=O)Nc1nc(-c2ccccc2)c(C#N)c(-c2ccccc2)n1,Adenosine A1 receptor | Adenosine A2a receptor,CC | C#N | ccc | CCC | CC(C)C | c-c(c)c | CC(N...,c-c(n)c(C#N)c(-c)n | c-c(c)ccc | Cc(c)c(nc)-c(...,c1ccccc1 | Cc(c)c(nc)-c1ccccc1 | cNC(=O)C(CC)C...,[1*]C(=O)C(CC)CC [5*]N[5*] [16*]c1ccccc1 [14*]...


In [9]:
# Draw DATASET_SAMPLE_SIZE random rows (structures) from the reference dataset.
# The fixed `random_state` makes the draw repeatable: without it every
# "Restart & Run All" picks a different sample, so everything computed from
# the sample downstream would change between runs.
zinc_samples = pd.read_csv(REFERENCE_DATASET_LINK).sample(
    DATASET_SAMPLE_SIZE,
    random_state=42,  # arbitrary fixed seed; change it to draw another sample
)
zinc_samples

Unnamed: 0,smiles,id,ec_fragments_r1,ec_fragments_r2,ec_fragments_r3,brics_fragments
3514131,O=C(COC(=O)c1sc2cc(F)ccc2c1Cl)Nc1ccc(OC(F)F)cc1,ZINC000003311976,C=O | cc(c)s | CF | cc(c)F | cC(=O)O | CC(N)=O...,ccc(sc)c(c)c | csc(C(=O)O)c(c)Cl | cOC(F)F | c...,cc(s)C(=O)OCC(N)=O | ccc(cc)OC(F)F | CC(=O)Nc1...,[3*]O[3*] [1*]C(=O)C[4*] [5*]N[5*] [16*]c1ccc(...
3105183,CCOc1ccc(CCNS(=O)(=O)c2cc3c(cc2C)n(C)c(=O)n3C)...,ZINC000012412201,cc(c)C | cC | cc(c)n | cCC | cc(c)O | cn(c)C |...,cc(c)n(C)c(n)=O | cn(C)c(=O)n(c)C | ccc(C)c(c)...,cn(C)c1cc(S(N)(=O)=O)c(C)cc1n | cn(C)c1cc(C)c(...,[3*]O[3*] [4*]CC [5*]N[5*] [16*]c1ccc([16*])c(...
2669153,CCO/C(=C\c1sc2ccc(OC)cc2[n+]1CCCS(=O)(=O)O)SC,ZINC000004754047,Cc([n+])s | cc(c)[n+] | C=C(O)S | cc(c)O | CC[...,c[n+](c)CCC | C=C(S)OCC | CCCS(=O)(=O)O | ccc(...,C=Cc1sc(c)c(cc)[n+]1CCC | c[n+](C)c1cc(OC)ccc1...,[3*]OC [3*]OC(=Cc1sc2ccc([16*])cc2[n+]1CCCS(=O...
2631728,CCN1CCN(C(=O)c2cc[nH]n2)C[C@@H](Cc2ccc(-c3cccs...,ZINC000019527428,C=O | ccs | CC(C)C | cc[nH] | cc(c)C | cn[nH] ...,c-c(s)ccc | CN(C)CCN | cc(n)C(=O)N(C)C | CN(C)...,cCC(CN(CC)C(c)=O)C(N)=O | ccc(n[nH])C(=O)N(CC)...,[8*]C[8*] [14*]c1cc[nH]n1 [14*]c1cccs1 [5*]N1C...
2571385,C[C@H](C(=O)Oc1ccc(Br)cc1)N1C(=O)[C@H]2[C@H]3C...,ZINC000239121216,C=O | CC(C)N | CC(C)C | cc(c)Br | CN(C)C | C=C...,CN(C)C(C)C(=O)O | cccc(c)Br | CN(C)C(=O)C(C)C ...,CC1CC2C=CC1C(C)C2 | CC1CC2C=CC1C1CC21 | CC1CC2...,[3*]O[3*] [1*]C(=O)C([8*])C [10*]N1C(=O)C2C3C=...
...,...,...,...,...,...,...
4219945,C=C[C@@H]1C[N+]2(Cc3ccc(C(F)(F)F)cc3)CC[C@H]1C...,ZINC000035189269,cc(c)C | CC(C)C | CF | cC[N+] | cc(c)n | C=CC ...,ccc(cc)C[N+] | C=CC(C[N+])C(C)C | cc(c)C[N+](C...,C=CC1C[N+]2(C)CCC1CC2 | ccc1c(C(O)C(C)[N+])ccn...,[5*][N+]12CCC(CC1[13*])C(C=C)C2 [8*]C(F)(F)F [...
3610912,C=C(C)COc1cc(C)cc2oc(=O)c(CC(=O)N(C)CC(=O)N3CC...,ZINC000020647100,C=O | cC | cc(c)C | cCC | CN | CC(N)=O | cc(c)...,CCOc(c)c | cOCC(=C)C | ccc(OC)c(c)c | CN(C)CCN...,CC(=O)N1CCN(C)CC1 | cc(C)c(CC(=O)N(C)C)c(=O)o ...,[3*]O[3*] [1*]C(=O)C[8*] [5*]N1CCN(C)CC1 [4*]C...
1618061,CCN(CC)C(=O)c1ccc(NCc2ccc(Br)o2)cc1,ZINC000170616014,C=O | cc(c)N | cc(o)Br | CN(C)C | cNC | ccc | ...,ccc(cc)C(N)=O | cc(c)C(=O)N(C)C | cccc(C)o | c...,ccc(cc)NCc(c)o | cc(c)C(=O)N(CC)CC | cNCc1ccc(...,[4*]CC [14*]c1ccc(Br)o1 [5*]N([5*])[5*] [5*]N[...
4631137,O=C(C[C@@H]1S/C(=N\c2ccc(Cl)cc2)N(C(c2ccccc2)c...,ZINC000102408465,C=O | cc(c)C | cN=C | CN(C)C | cCl | CC(N)=O |...,cccc(c)Cl | CN(C)C(c(c)c)c(c)c | cC(c)N(C(=N)S...,CN1C(=N)SC(CC(N)=O)C1=O | CC(=O)Nc1ccccc1 | Cc...,[8*]C([8*])[8*] [1*]C(=O)C[8*] [5*]N[5*] [10*]...


## Relative feature tightness against a PMI interrelation profile