In [2]:
import os
import pandas as pd
from rdkit import Chem
from npmine.fit_entity_model import *

In [3]:
# Tab-separated entity tables, listed in the same order as the names they are
# bound to below.
entity_table_paths = (
    'data/entities_img_dataframe_jnatprod.tsv',
    'data/entities_img_dataframe_metabo.tsv',
    'data/entities_img_dataframe_phytorev.tsv',
    'data/entities_img_dataframe_qn.tsv',
    'data/entities_img_dataframe_chroma_a.tsv',
    'data/entities_img_dataframe_chroma_b.tsv',
)
jnatprod, metabo, phytorev, qn, chrom_a, chrom_b = (
    pd.read_csv(path, sep='\t') for path in entity_table_paths
)

In [4]:
# Stack the identifier columns of every entity table into one frame with a
# fresh 0..n-1 index.
id_columns = ['smiles', 'standardInChIKey', 'standardInChI', 'pubchem']
entity_tables = [jnatprod, metabo, phytorev, qn, chrom_a, chrom_b]
comp = pd.concat([table[id_columns] for table in entity_tables],
                 ignore_index=True)

# Rows whose 'pubchem' value contains 'csid', and rows with no value at all,
# get the placeholder 0 so the whole column can be cast to int.
has_csid = comp['pubchem'].str.contains('csid', na=False)
has_no_value = comp['pubchem'].isnull()
comp.loc[has_csid | has_no_value, 'pubchem'] = 0
comp['pubchem'] = comp['pubchem'].astype(int)

In [5]:
# Assay whose Active/Inactive calls are used as labels.
ASSAY_NAME = 'qHTS profiling for inhibitors of Plasmodium falciparum proliferation'

acty = pd.read_csv('data/pubchem_bioactivity_results.tsv', sep='\t')
sel_acty = acty[acty['Assay Name'] == ASSAY_NAME]

# A bare expression in the middle of a cell is evaluated and then discarded,
# so these sanity checks are printed explicitly to make them visible.
print(acty['Assay Name'].value_counts().head(10))
print('Unique CIDs in selected assay:', len(sel_acty['CID'].unique()))

# Keep only rows with a decisive outcome, then the first row for each CID.
is_decisive = sel_acty['Bioactivity Outcome'].isin(['Inactive', 'Active'])
sel_pub = (sel_acty.loc[is_decisive, ['CID', 'Bioactivity Outcome']]
           .drop_duplicates(subset='CID'))

In [6]:
# Attach the structure identifiers from comp (first row per 'pubchem' value)
# to every assayed CID. Only the two assay columns are taken from sel_pub, so
# re-running this cell does not merge the structure columns a second time
# (which would rename them with _x/_y suffixes and break the lookups below).
sel_pub = pd.merge(sel_pub[['CID', 'Bioactivity Outcome']],
                   comp.drop_duplicates(subset='pubchem'),
                   left_on='CID', right_on='pubchem', how='left')

# Missing values in every column become ''; '' is the marker for a compound
# without an InChI, counted on the last line.
sel_pub = sel_pub.fillna('')
sum(sel_pub['standardInChI']=='')

280

In [7]:
def _inchi_from_smiles(smiles):
    """Return the InChI string RDKit generates for ``smiles``.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smiles`` (``Chem.MolFromSmiles`` returns
        ``None`` in that case, which ``Chem.MolToInchi`` cannot handle).
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        raise ValueError('RDKit could not parse SMILES %r' % (smiles,))
    return Chem.MolToInchi(mol)


# Compute an InChI from the SMILES for every row that has none ('' marker).
needs_inchi = sel_pub['standardInChI'] == ''
sel_pub.loc[needs_inchi, 'standardInChI'] = (
    sel_pub.loc[needs_inchi, 'smiles'].map(_inchi_from_smiles)
)

# Number of rows still lacking an InChI.
sum(sel_pub['standardInChI']=='')

0

In [9]:
# Structures (as InChI strings) and their Active/Inactive labels.
inchi_strings = sel_pub['standardInChI']
outcome_labels = sel_pub['Bioactivity Outcome']
fit_model(inchi_strings, outcome_labels, out='qHTS_inhibitors')

Counter({'Inactive': 950, 'Active': 138})
Fitting 5 folds for each of 12 candidates, totalling 60 fits


  self.best_estimator_.fit(X, y, **fit_params)


{'max_features': 20, 'n_estimators': 250}
0.879889271285351
Accuracy: 0.9080882352941176
Matthews_Corr: 0.3477882182681992
Cohen_Kappa: 0.31726907630522083


In [1]:
# Print the command-line options of the activity_network tool.
!activity_network --help

Usage: activity_network [OPTIONS]

Options:
  -n, --net_stats TEXT   Network statistics
  -e, --edge_list TEXT   Edge list
  -d, --entity_dir TEXT  Chemical entities directory
  -a, --acy TEXT         Bioactivity file
  -i, --aid TEXT         AID number
  -c, --conn TEXT        Connected component number
  -o, --out_dir TEXT     Output directory
  --sconn INTEGER        Selected connected component count
  --naid INTEGER         AID count
  --nconn INTEGER        Connected component count
  --help                 Show this message and exit.


In [3]:
if not os.path.exists('net_exploration'):
    os.mkdir('net_exploration')

!activity_network -n data/net_stats.tsv \
    -e data/edge_list_complement_filtered.txt \
    -d data/ \
    -a data/pubchem_bioactivity_results.tsv \
    -i 651838 -c 1 -o net_exploration/

  return __callback(*args, **kwargs)
  return __callback(*args, **kwargs)




















































































A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sel['CID'] = sel['CID'].astype(int)


Found 24 leads.


In [30]:
import networkx as nx

def predict_leads(aid, qsar_dir, out_dir):
    """Add model predictions for the lead compounds of one assay subnetwork.

    Finds the first file in ``out_dir`` whose name starts with
    ``subnet_<aid>_<digit>_<digit>_attr.txt``, calls ``model_predict`` on the
    ``standardInChI`` values of the rows where ``Leads == 1``, stores the
    result in the ``predicted_activity`` column of those rows and writes the
    table back to the same file (tab-separated, no index column).

    Parameters
    ----------
    aid : str
        Assay identifier as it appears in the attribute file name.
    qsar_dir : str
        Base path of the fitted models; ``os.path.join(qsar_dir, aid)`` is
        passed to ``model_predict`` as its first argument.
    out_dir : str
        Directory that contains the attribute table.

    Raises
    ------
    FileNotFoundError
        If ``out_dir`` holds no attribute table for ``aid``.
    """
    # Raw string: '\d' is an invalid escape in a normal literal. The dot is
    # escaped and aid is escaped so both are matched literally.
    pattern = re.compile(r'subnet_%s_\d_\d_attr\.txt' % re.escape(aid))
    candidates = [name for name in os.listdir(out_dir) if pattern.match(name)]
    if not candidates:
        raise FileNotFoundError(
            'no attribute table for AID %s found in %s' % (aid, out_dir))
    afl = candidates[0]

    attr_path = os.path.join(out_dir, afl)
    attr = pd.read_csv(attr_path, sep='\t')
    is_lead = attr['Leads'] == 1
    leads = attr.loc[is_lead, 'standardInChI'].tolist()

    attr.loc[is_lead, 'predicted_activity'] = model_predict(
        os.path.join(qsar_dir, aid), leads)
    attr.to_csv(attr_path, sep='\t', index=False)

def find_break_point(edge_list, toremove):
    """Tell whether removing one edge splits the graph further.

    Builds an undirected graph from ``edge_list``, removes the edge
    ``(toremove[0], toremove[1])`` and compares the number of connected
    components before and after the removal.

    Parameters
    ----------
    edge_list : iterable of 2-item sequences
        Node pairs describing the edges of the graph.
    toremove : sequence
        The two end nodes of the edge to test; the edge must be in the graph,
        otherwise ``Graph.remove_edge`` raises.

    Returns
    -------
    bool
        True if the removal increases the number of connected components,
        False otherwise.
    """
    g = nx.Graph()
    g.add_edges_from(edge_list)
    # Count first: a graph that is already disconnected must not make every
    # edge look like a break point.
    n_before = nx.number_connected_components(g)
    g.remove_edge(toremove[0], toremove[1])
    return nx.number_connected_components(g) > n_before

def create_sparce_graph(edge_list, attr, conn, out_dir):
    """Prune redundant edges from one connected component of a subnetwork.

    Reads the edge table and the attribute table from ``out_dir``, picks the
    ``conn``-th largest connected component, keeps the edges whose two end
    nodes both have a ``Bioactivity Outcome`` or a ``predicted_activity``
    value, and then drops every edge for which ``find_break_point`` returns
    False. The remaining edges are written to ``out_dir`` under the edge file
    name with ``_conn_<conn>`` inserted before the ``.txt`` extension.

    Parameters
    ----------
    edge_list : str
        File name of the tab-separated edge table; columns ``'1'`` and ``'2'``
        hold the two end nodes of each edge.
    attr : str
        File name of the tab-separated node attribute table.
    conn : int
        Index of the connected component to use, components being sorted from
        largest to smallest.
    out_dir : str
        Directory holding both input files; the output is written here too.

    Returns
    -------
    None
    """
    edges = pd.read_csv(os.path.join(out_dir, edge_list), sep='\t')
    attr_table = pd.read_csv(os.path.join(out_dir, attr), sep='\t')

    g = nx.Graph()
    g.add_edges_from(edges[['1', '2']].values.tolist())
    components = sorted(nx.connected_components(g), key=len, reverse=True)
    members = components[conn]

    # Nodes of the *selected* component (not a fixed one) that carry either a
    # measured or a predicted activity.
    has_activity = (attr_table['Bioactivity Outcome'].notnull()
                    | attr_table['predicted_activity'].notnull())
    nds = attr_table[attr_table['standardInChIKey'].isin(members) & has_activity]

    touches_component = edges['1'].isin(members) | edges['2'].isin(members)
    both_annotated = (edges['1'].isin(nds['standardInChIKey'])
                      & edges['2'].isin(nds['standardInChIKey']))
    tsub = edges[touches_component & both_annotated].copy()

    # NOTE(review): the nodes are read from the second column of the attribute
    # table by position; this assumes that column holds the same keys as edge
    # columns '1'/'2' -- confirm against the attribute file layout.
    for n in nds.iloc[:, 1].tolist():
        incident = tsub.index[(tsub['1'] == n) | (tsub['2'] == n)]
        for i in incident:
            e = tsub.loc[i, ['1', '2']].tolist()
            # The edge list reflects the edges that are still present, so each
            # test sees the effect of earlier removals.
            if not find_break_point(tsub[['1', '2']].values.tolist(), e):
                tsub.drop(i, inplace=True)

    # Escaped dot: only a literal '.txt' suffix is replaced.
    outname = re.sub(r'\.txt$', '_conn_%s.txt' % conn, edge_list)
    tsub.to_csv(os.path.join(out_dir, outname), sep='\t', index=False)


In [33]:
import re 

# Write model predictions for the leads of AID 651838 into its attribute table.
predict_leads(aid='651838',
              qsar_dir='qHTS_inhibitors',
              out_dir='net_exploration/')

In [34]:
edge_list = 'subnet_651838_1_1.txt'
attr_file = 'subnet_651838_1_1_attr.txt'
conn = 0
out_dir = 'net_exploration/'

sub = pd.read_csv(os.path.join(out_dir, edge_list), sep='\t')
# The file name lives in attr_file so that attr is only ever the table.
attr = pd.read_csv(os.path.join(out_dir, attr_file), sep='\t')

g = nx.Graph()
g.add_edges_from(sub[['1', '2']].values.tolist())
# Connected components, largest first.
c = sorted(nx.connected_components(g), key=len, reverse=True)

csub = sub[(sub['1'].isin(c[conn])) | (sub['2'].isin(c[conn]))]
# Nodes of the selected component (c[conn], so it follows the conn setting
# above) that have a measured or a predicted activity.
has_activity = (attr['Bioactivity Outcome'].notnull()
                | attr['predicted_activity'].notnull())
nds = attr[attr['standardInChIKey'].isin(c[conn]) & has_activity]
csub = csub[(csub['1'].isin(nds['standardInChIKey']))
            & (csub['2'].isin(nds['standardInChIKey']))]

tsub = csub.copy()
# NOTE(review): nodes are taken from the second column of the attribute table
# by position; assumes it holds the same keys as edge columns '1'/'2' -- confirm.
for n in nds.iloc[:, 1].tolist():
    incident = tsub.index[(tsub['1'] == n) | (tsub['2'] == n)]
    for i in incident:
        e = tsub.loc[i, ['1', '2']].tolist()
        # Drop the edge unless find_break_point reports that removing it
        # breaks the graph; the edge list always reflects earlier removals.
        if not find_break_point(tsub[['1', '2']].values.tolist(), e):
            tsub.drop(i, inplace=True)

# Escaped dot: only a literal '.txt' suffix is replaced.
outname = re.sub(r'\.txt$', '_conn_%s.txt' % conn, edge_list)
tsub.to_csv(os.path.join(out_dir, outname), sep='\t', index=False)

In [35]:
# (rows, columns) of the edge table left after pruning.
tsub.shape

(117, 4)