In [132]:
import pandas as pd
import numpy as np
import time
import requests
from io import StringIO
from rdkit import Chem
from rdkit.Chem import MACCSkeys, AllChem
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (classification_report, confusion_matrix, accuracy_score,
                             roc_auc_score, balanced_accuracy_score, make_scorer)
from sklearn.ensemble import RandomForestClassifier

In [2]:
# develop prediction model for small mol activity against aromatase
# train on tox21 bioassay data
url = 'https://pubchem.ncbi.nlm.nih.gov/assay/pcget.cgi?query=download&record_type=datatable&activity=all&response_type=save&aid=743139'
df_raw = pd.read_csv(url)

In [3]:
df_raw.head()

Unnamed: 0,PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,Activity Summary,Antagonist Activity,Antagonist Potency (uM),Antagonist Efficacy (%),Viability Activity,Viability Potency (uM),Viability Efficacy (%),Sample Source
0,RESULT_TYPE,,,,,,,STRING,STRING,FLOAT,FLOAT,STRING,FLOAT,FLOAT,STRING
1,RESULT_DESCR,,,,,,,Type of compound activity based on both the ar...,Type of compound activity in the aromatase ant...,The concentration of sample yielding half-maxi...,Percent inhibition of aromatase.,Type of compound activity in the cell viabilit...,The concentration of sample yielding half-maxi...,Percent inhibition of cell viability.,Where sample was obtained.
2,RESULT_UNIT,,,,,,,,,MICROMOLAR,PERCENT,,MICROMOLAR,PERCENT,
3,1,144203552.0,12850184.0,Inactive,0.0,,,inactive,inactive,,0,inactive,,0,NCI
4,2,144203553.0,89753.0,Inactive,0.0,,,inactive,inactive,,0,inactive,,0,NCI


In [4]:
# remove lines 0-2 
# descriptions for each column type
df_raw = df_raw[3:]
df_raw.head()

Unnamed: 0,PUBCHEM_RESULT_TAG,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,Activity Summary,Antagonist Activity,Antagonist Potency (uM),Antagonist Efficacy (%),Viability Activity,Viability Potency (uM),Viability Efficacy (%),Sample Source
3,1,144203552.0,12850184.0,Inactive,0.0,,,inactive,inactive,,0.0,inactive,,0.0,NCI
4,2,144203553.0,89753.0,Inactive,0.0,,,inactive,inactive,,0.0,inactive,,0.0,NCI
5,3,144203554.0,9403.0,Inactive,0.0,,,inactive,inactive,,0.0,inactive,,0.0,NCI
6,4,144203555.0,13218779.0,Inactive,0.0,,,inactive,inactive,,0.0,inactive,,0.0,NCI
7,5,144203556.0,142766.0,Inconclusive,25.0,,,inconclusive antagonist (cytotoxic),active antagonist,15.5454,-115.803,active antagonist,14.9601,-76.8218,NCI


In [5]:
df_raw.columns

Index(['PUBCHEM_RESULT_TAG', 'PUBCHEM_SID', 'PUBCHEM_CID',
       'PUBCHEM_ACTIVITY_OUTCOME', 'PUBCHEM_ACTIVITY_SCORE',
       'PUBCHEM_ACTIVITY_URL', 'PUBCHEM_ASSAYDATA_COMMENT', 'Activity Summary',
       'Antagonist Activity', 'Antagonist Potency (uM)',
       'Antagonist Efficacy (%)', 'Viability Activity',
       'Viability Potency (uM)', 'Viability Efficacy (%)', 'Sample Source'],
      dtype='object')

In [6]:
# rename columns

col_names_map = {'PUBCHEM_RESULT_TAG' : 'pc_result_tag',
                 'PUBCHEM_SID' : 'sid',
                 'PUBCHEM_CID' : 'cid',
                 'PUBCHEM_ACTIVITY_OUTCOME' : 'activity_outcome',
                 'PUBCHEM_ACTIVITY_SCORE' : 'activity_score',
                 'PUBCHEM_ACTIVITY_URL' : 'activity_url',
                 'PUBCHEM_ASSAYDATA_COMMENT' : 'assay_data_comment',
                 'Activity Summary' : 'activity_summary',
                 'Antagonist Potency (uM)' : 'antagonist_activity',
                 'Antagonist Efficacy (%)' : 'antagonist_efficacy',
                 'Viability Activity' : 'viability_potency',
                 'Viability Potency (uM)' : 'viability_potency',
                 'Viability Efficacy (%)' : 'viability_efficacy',
                 'Sample Source' : 'sample_source'}

In [7]:
df_raw = df_raw.rename(columns = col_names_map)
df_raw.columns

Index(['pc_result_tag', 'sid', 'cid', 'activity_outcome', 'activity_score',
       'activity_url', 'assay_data_comment', 'activity_summary',
       'Antagonist Activity', 'antagonist_activity', 'antagonist_efficacy',
       'viability_potency', 'viability_potency', 'viability_efficacy',
       'sample_source'],
      dtype='object')

In [8]:
# number of compounds for each activity group

df_raw.groupby('activity_outcome').count()

Unnamed: 0_level_0,pc_result_tag,sid,cid,activity_score,activity_url,assay_data_comment,activity_summary,Antagonist Activity,antagonist_activity,antagonist_efficacy,viability_potency,viability_potency,viability_efficacy,sample_source
activity_outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Active,379,379,378,379,0,0,379,379,378,379,379,115,359,379
Inactive,7562,7562,7466,7562,0,0,7562,7562,0,7562,7562,324,7449,7562
Inconclusive,2545,2545,2493,2545,0,0,2545,2545,2111,2136,2545,1206,2450,2545


In [9]:
df_raw.groupby(['activity_outcome', 'activity_summary']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,pc_result_tag,sid,cid,activity_score,activity_url,assay_data_comment,Antagonist Activity,antagonist_activity,antagonist_efficacy,viability_potency,viability_potency,viability_efficacy,sample_source
activity_outcome,activity_summary,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Active,active antagonist,379,379,378,379,0,0,379,378,379,379,115,359,379
Inactive,inactive,7562,7562,7466,7562,0,0,7562,0,7562,7562,324,7449,7562
Inconclusive,active agonist,612,612,571,612,0,0,612,612,612,612,60,590,612
Inconclusive,inconclusive,44,44,44,44,0,0,44,0,0,44,19,42,44
Inconclusive,inconclusive agonist,414,414,409,414,0,0,414,212,223,414,12,397,414
Inconclusive,inconclusive agonist (cytotoxic),59,59,59,59,0,0,59,41,45,59,59,59,59
Inconclusive,inconclusive antagonist,367,367,364,367,0,0,367,227,230,367,8,313,367
Inconclusive,inconclusive antagonist (cytotoxic),1049,1049,1046,1049,0,0,1049,1019,1026,1049,1048,1049,1049


In [10]:
# select only active (active agonist, active antagonist) and inactive 

df = df_raw[ (df_raw['activity_summary'] == 'active agonist') |
             (df_raw['activity_summary'] == 'active antagonist') | 
             (df_raw['activity_summary'] == 'inactive') ]
len(df)

8553

In [11]:
print(len(df['sid'].unique()))
print(len(df['cid'].unique()))

8553
6864


In [12]:
df.isna().sum().to_frame()

Unnamed: 0,0
pc_result_tag,0
sid,0
cid,138
activity_outcome,0
activity_score,0
activity_url,8553
assay_data_comment,8553
activity_summary,0
Antagonist Activity,0
antagonist_activity,7563


In [13]:
# drop substances without cids

df = df.dropna(subset=['cid'])
len(df)

8415

In [14]:
print(len(df['sid'].unique()))
print(len(df['cid'].unique()))

8415
6863


In [15]:
df.isna().sum()

pc_result_tag             0
sid                       0
cid                       0
activity_outcome          0
activity_score            0
activity_url           8415
assay_data_comment     8415
activity_summary          0
Antagonist Activity       0
antagonist_activity    7467
antagonist_efficacy       0
viability_potency         0
viability_potency      7919
viability_efficacy      154
sample_source             0
dtype: int64

In [16]:
# remove cids with conflicting activities

cid_conflict = []
idx_conflict = []

for cid in df['cid'].unique():
    
    outcomes = df.query("cid == @cid")['activity_summary'].unique()

    if len(outcomes) > 1:
        
        idx_tmp = df.query("cid == @cid").index
        idx_conflict.extend(idx_tmp)
        cid_conflict.append(cid)
        
print('# {} CIDs with conflicting activities [associated with {} rows (SIDs)]'.format(len(cid_conflict), len(idx_conflict)))
        

# 65 CIDs with conflicting activities [associated with 146 rows (SIDs)]


In [17]:
df.loc[idx_conflict, :].head(10)

Unnamed: 0,pc_result_tag,sid,cid,activity_outcome,activity_score,activity_url,assay_data_comment,activity_summary,Antagonist Activity,antagonist_activity,antagonist_efficacy,viability_potency,viability_potency.1,viability_efficacy,sample_source
8,6,144203557.0,16043.0,Inactive,0.0,,,inactive,inactive,,0.0,inactive,,0.0,NCI
5956,5954,144209507.0,16043.0,Active,43.0,,,active antagonist,active antagonist,54.4827,-73.4024,inconclusive antagonist,,,SigmaAldrich
6850,6848,144210401.0,16043.0,Inactive,0.0,,,inactive,inactive,,0.0,inactive,,0.0,SIGMA
52,50,144203601.0,443939.0,Inactive,0.0,,,inactive,inactive,,0.0,inactive,,0.0,NCI
6130,6128,144209681.0,443939.0,Active,61.0,,,active antagonist,active antagonist,1.65519,-115.932,active antagonist,12.1763,-120.598,Toronto Research
66,64,144203615.0,2170.0,Inactive,0.0,,,inactive,inactive,,0.0,inactive,,0.0,BIOMOL
9118,9116,144212669.0,2170.0,Active,50.0,,,active antagonist,active antagonist,16.5803,-115.202,inconclusive antagonist,61.1306,-80.7706,SIGMA
106,104,144203655.0,2554.0,Inconclusive,20.0,,,active agonist,active agonist,2.87255,73.7025,inactive,,0.0,SigmaAldrich
5920,5918,144209471.0,2554.0,Inactive,0.0,,,inactive,inactive,,0.0,inactive,,0.0,SIGMA
6964,6962,144210515.0,2554.0,Inactive,0.0,,,inactive,inactive,,0.0,inactive,,0.0,SIGMA


In [18]:
df = df.drop(idx_conflict)

In [19]:
df.groupby('activity_summary').count()

Unnamed: 0_level_0,pc_result_tag,sid,cid,activity_outcome,activity_score,activity_url,assay_data_comment,Antagonist Activity,antagonist_activity,antagonist_efficacy,viability_potency,viability_potency,viability_efficacy,sample_source
activity_summary,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
active agonist,537,537,537,537,537,0,0,537,537,537,537,58,517,537
active antagonist,343,343,343,343,343,0,0,343,342,343,343,108,326,343
inactive,7389,7389,7389,7389,7389,0,0,7389,0,7389,7389,318,7278,7389


In [20]:
print(len(df['sid'].unique()))
print(len(df['cid'].unique()))

8269
6798


In [21]:
# remove redundant data

df = df.drop_duplicates(subset='cid')
print(len(df['sid'].unique()))
print(len(df['cid'].unique()))

6798
6798


In [22]:
# label encode
#df['activity'] = [0 if x == 'inactive' else 1 for x in df['activity_summary']]
df['activity'] = df['activity_summary'].apply(lambda x: 0 if x == 'inactive' else 1)

In [23]:
df.head(3)

Unnamed: 0,pc_result_tag,sid,cid,activity_outcome,activity_score,activity_url,assay_data_comment,activity_summary,Antagonist Activity,antagonist_activity,antagonist_efficacy,viability_potency,viability_potency.1,viability_efficacy,sample_source,activity
3,1,144203552.0,12850184.0,Inactive,0.0,,,inactive,inactive,,0,inactive,,0,NCI,0
4,2,144203553.0,89753.0,Inactive,0.0,,,inactive,inactive,,0,inactive,,0,NCI,0
5,3,144203554.0,9403.0,Inactive,0.0,,,inactive,inactive,,0,inactive,,0,NCI,0


In [24]:
df.groupby('activity_summary').count()

Unnamed: 0_level_0,pc_result_tag,sid,cid,activity_outcome,activity_score,activity_url,assay_data_comment,Antagonist Activity,antagonist_activity,antagonist_efficacy,viability_potency,viability_potency,viability_efficacy,sample_source,activity
activity_summary,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
active agonist,451,451,451,451,451,0,0,451,451,451,451,44,432,451,451
active antagonist,291,291,291,291,291,0,0,291,290,291,291,88,275,291,291
inactive,6056,6056,6056,6056,6056,0,0,6056,0,6056,6056,269,5970,6056,6056


In [25]:
df.groupby('activity').count()

Unnamed: 0_level_0,pc_result_tag,sid,cid,activity_outcome,activity_score,activity_url,assay_data_comment,activity_summary,Antagonist Activity,antagonist_activity,antagonist_efficacy,viability_potency,viability_potency,viability_efficacy,sample_source
activity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,6056,6056,6056,6056,6056,0,0,6056,6056,0,6056,6056,269,5970,6056
1,742,742,742,742,742,0,0,742,742,741,742,742,132,707,742


In [26]:
# create smaller df with only cids and activities

df_activity = df[['cid', 'activity']]
df_activity.head()

Unnamed: 0,cid,activity
3,12850184.0,0
4,89753.0,0
5,9403.0,0
6,13218779.0,0
12,637566.0,0


In [27]:
cids = df['cid'].astype(int).tolist()

In [28]:
# download structure info

chunk_size = 200
num_cids = len(cids)

if num_cids % chunk_size == 0:
    num_chunks = num_cids // chunk_size
else:
    num_chunks = num_cids // chunk_size + 1
    
print('# cids = {}'.format(num_cids))
print('# cid chunks = {}, chunked by {}'.format(num_chunks, chunk_size))

# cids = 6798
# cid chunks = 34, chunked by 200


In [29]:
df['cid'].isna().sum()

0

In [30]:
df_smiles = pd.DataFrame()
list_dfs = []

for i in range(0, num_chunks):
    
    idx1 = chunk_size * i
    idx2 = chunk_size * (i+1)
    
    cidstr = ','.join(map(str, cids[idx1:idx2]))
    
    url = f'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{cidstr}/property/isomericsmiles/txt'
    res = requests.get(url)
    data = pd.read_csv(StringIO(res.text), header=None, names=['smiles'])
    
    list_dfs.append(data)
    
    time.sleep(0.2)
    
    if i % 5 == 0:
        print(f'processing chunk {i}')
        
df_smiles = pd.concat(list_dfs, ignore_index=True)
df_smiles['cid'] = cids
df_smiles.head(5)

processing chunk 0
processing chunk 5
processing chunk 10
processing chunk 15
processing chunk 20
processing chunk 25
processing chunk 30


Unnamed: 0,smiles,cid
0,C(C(=O)[C@H]([C@@H]([C@H](C(=O)[O-])O)O)O)O.C(...,12850184
1,C([C@H]([C@H]([C@@H]([C@H](C(=O)[O-])O)O)O)O)O...,89753
2,C[C@]12CC[C@H]3[C@H]([C@@H]1CC[C@@H]2OC(=O)CCC...,9403
3,C[C@@]12CC[C@@H](C1(C)C)C[C@H]2OC(=O)CSC#N,13218779
4,CC(=CCC/C(=C/CO)/C)C,637566


In [31]:
len(df_smiles)

6798

In [32]:
df_smiles = df_smiles[['cid', 'smiles']]
df_smiles.head(5)

Unnamed: 0,cid,smiles
0,12850184,C(C(=O)[C@H]([C@@H]([C@H](C(=O)[O-])O)O)O)O.C(...
1,89753,C([C@H]([C@H]([C@@H]([C@H](C(=O)[O-])O)O)O)O)O...
2,9403,C[C@]12CC[C@H]3[C@H]([C@@H]1CC[C@@H]2OC(=O)CCC...
3,13218779,C[C@@]12CC[C@@H](C1(C)C)C[C@H]2OC(=O)CSC#N
4,637566,CC(=CCC/C(=C/CO)/C)C


In [33]:
# generate maccs keys

fps = {}

for idx, row in df_smiles.iterrows():
    
    mol = Chem.MolFromSmiles(row['smiles'])
    if mol == None:
        print("can't generate mol object: CID {} {}".format(row['cid'], row['smiles']))
    else:
        fps[row['cid']] = [row['cid']] + list(MACCSkeys.GenMACCSKeys(mol).ToBitString())

RDKit ERROR: [20:11:34] Explicit valence for atom # 3 Si, 8, is greater than permitted


can't generate mol object: CID 28145 [NH4+].[NH4+].F[Si-2](F)(F)(F)(F)F


RDKit ERROR: [20:11:38] Explicit valence for atom # 1 Si, 8, is greater than permitted


can't generate mol object: CID 28127 F[Si-2](F)(F)(F)(F)F.[Na+].[Na+]


In [34]:
# generate column names

fpbitnames = []

fpbitnames.append('cid')

# from MACCS000 to MACCS166
for i in range(0, 167):
    fpbitnames.append(f'maccs{i:03}')
    
df_fps = pd.DataFrame.from_dict(fps, orient='index', columns=fpbitnames)

In [35]:
df_fps.head()

Unnamed: 0,cid,maccs000,maccs001,maccs002,maccs003,maccs004,maccs005,maccs006,maccs007,maccs008,...,maccs157,maccs158,maccs159,maccs160,maccs161,maccs162,maccs163,maccs164,maccs165,maccs166
12850184,12850184,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,0,0,1,0,1
89753,89753,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,0,0,1,0,1
9403,9403,0,0,0,0,0,0,0,0,0,...,1,0,1,1,0,1,1,1,1,0
13218779,13218779,0,0,0,0,0,0,0,0,0,...,1,0,1,1,1,0,1,1,1,0
637566,637566,0,0,0,0,0,0,0,0,0,...,1,0,0,1,0,0,0,1,0,0


In [36]:
df_activity.head(3)

Unnamed: 0,cid,activity
3,12850184.0,0
4,89753.0,0
5,9403.0,0


In [37]:
df_fps.head(3)

Unnamed: 0,cid,maccs000,maccs001,maccs002,maccs003,maccs004,maccs005,maccs006,maccs007,maccs008,...,maccs157,maccs158,maccs159,maccs160,maccs161,maccs162,maccs163,maccs164,maccs165,maccs166
12850184,12850184,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,0,0,1,0,1
89753,89753,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,0,0,1,0,1
9403,9403,0,0,0,0,0,0,0,0,0,...,1,0,1,1,0,1,1,1,1,0


In [38]:
df_data = df_activity.join(df_fps.set_index('cid'), on='cid')

In [39]:
df_data[df_data.isna().any(axis=1)]

Unnamed: 0,cid,activity,maccs000,maccs001,maccs002,maccs003,maccs004,maccs005,maccs006,maccs007,...,maccs157,maccs158,maccs159,maccs160,maccs161,maccs162,maccs163,maccs164,maccs165,maccs166
2293,28145.0,0,,,,,,,,,...,,,,,,,,,,
9077,28127.0,0,,,,,,,,,...,,,,,,,,,,


In [40]:
df_data = df_data.dropna()
len(df_data)

6796

In [41]:
df_data.to_csv('df_data.csv')

In [42]:
df_data.head(3)

Unnamed: 0,cid,activity,maccs000,maccs001,maccs002,maccs003,maccs004,maccs005,maccs006,maccs007,...,maccs157,maccs158,maccs159,maccs160,maccs161,maccs162,maccs163,maccs164,maccs165,maccs166
3,12850184.0,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,0,0,1,0,1
4,89753.0,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,0,0,1,0,1
5,9403.0,0,0,0,0,0,0,0,0,0,...,1,0,1,1,0,1,1,1,1,0


In [43]:
# load data into x and y
X = df_data.iloc[:, 2:]
y = df_data['activity'].values

In [44]:
X.head(3)

Unnamed: 0,maccs000,maccs001,maccs002,maccs003,maccs004,maccs005,maccs006,maccs007,maccs008,maccs009,...,maccs157,maccs158,maccs159,maccs160,maccs161,maccs162,maccs163,maccs164,maccs165,maccs166
3,0,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,0,0,1,0,1
4,0,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,0,0,1,0,1
5,0,0,0,0,0,0,0,0,0,0,...,1,0,1,1,0,1,1,1,1,0


In [45]:
print(len(y))
y.sum()

6796


742

In [46]:
# remove zero variance features
print(X.shape)

sel = VarianceThreshold()
X = sel.fit_transform(X)
print(X.shape)

(6796, 167)
(6796, 163)


In [47]:
# split data into train (90%) and test (10%)

X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, random_state=3100,
                                                    stratify=y, test_size=0.1)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print(y_train.sum(), y_test.sum())

(6116, 163) (680, 163) (6116,) (680,)
668 74


In [48]:
print(len(y_train))
print(len(X_train))
print(len(X_train[0]))

6116
6116
163


In [49]:
print('# inactives: {}'.format(len(y_train) - y_train.sum()))
print('# actives: {}'.format(y_train.sum()))

# inactives: 5448
# actives: 668


In [50]:
# downsample majority class (inactive compounds) to balance the dataset

# indices of each class' observations
idx_inactives = np.where(y_train == 0)[0]
idx_actives = np.where(y_train == 1)[0]

# number of observations in each class
num_inactives = len(idx_inactives)
num_actives = len(idx_actives)

# randomly sample from inactives without replacement
np.random.seed(0)
idx_inactives_downsampled = np.random.choice(idx_inactives, size=num_actives, replace=False)

# join together downsamples inactives with actives
# x_train is 2d
X_train = np.vstack((X_train[idx_inactives_downsampled], X_train[idx_actives]))

# y_train is 1d
y_train = np.hstack((y_train[idx_inactives_downsampled], y_train[idx_actives]))

In [51]:
print('# inactives: {}'.format(len(y_train) - y_train.sum()))
print('# actives: {}'.format(y_train.sum()))

# inactives: 668
# actives: 668


In [52]:
print(len(y_train))
print(len(X_train))
print(len(X_train[0]))

1336
1336
163


In [53]:
clf = BernoulliNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_train)

In [54]:
cmat = confusion_matrix(y_train, y_pred)
# tn fp
# fn tp
print(cmat)

[[462 206]
 [199 469]]


In [55]:
acc = accuracy_score(y_train, y_pred)

# tp / (fn + tp)
sens = cmat[1][1] / (cmat[1][0] + cmat[1][1])

# tn / (tn + fp)
spec = cmat[0][0] / (cmat[0][0] + cmat[0][1])

bacc = (sens + spec) / 2

y_score = clf.predict_proba(X_train)[:, 1]
auc = roc_auc_score(y_train, y_score)

In [56]:
print(f'accuracy: {acc}')
print(f'balanced accuracy: {bacc}')
print(f'sensitivity: {sens}')
print(f'specificity: {spec}')
print(f'auc roc: {auc}')

accuracy: 0.6968562874251497
balanced accuracy: 0.6968562874251496
sensitivity: 0.7020958083832335
specificity: 0.6916167664670658
auc roc: 0.7496985818781599


In [57]:
y_pred =clf.predict(X_test)
cmat = confusion_matrix(y_test, y_pred)
print(cmat)

[[412 194]
 [ 28  46]]


In [58]:
acc = accuracy_score(y_test, y_pred)
sens = cmat[1][1] / (cmat[1][0] + cmat[1][1])
spec = cmat[0][0] / (cmat[0][0] + cmat[0][1])
bacc = (sens + spec) / 2

y_score = clf.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, y_score)

print(f'accuracy: {acc}')
print(f'balanced accuracy: {bacc}')
print(f'sensitivity: {sens}')
print(f'specificity: {spec}')
print(f'auc roc: {auc}')

accuracy: 0.6735294117647059
balanced accuracy: 0.6507448042101507
sensitivity: 0.6216216216216216
specificity: 0.6798679867986799
auc roc: 0.724099099099099


In [59]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.68      0.79       606
           1       0.19      0.62      0.29        74

    accuracy                           0.67       680
   macro avg       0.56      0.65      0.54       680
weighted avg       0.86      0.67      0.73       680



In [60]:
# decision tree

clf = DecisionTreeClassifier(random_state = 0)
clf.fit(X_train, y_train)
y_true= y_train
y_pred = clf.predict(X_train)
cmat = confusion_matrix(y_true, y_pred)
print(cmat)

acc = accuracy_score(y_true, y_pred)
sens = cmat[1][1] / (cmat[1][0] + cmat[1][1])
spec = cmat[0][0] / (cmat[0][0] + cmat[0][1])
bacc = (sens + spec) / 2

y_score = clf.predict_proba(X_train)[:, 1]
auc = roc_auc_score(y_true, y_score)

print(f'accuracy: {acc}')
print(f'balanced accuracy: {bacc}')
print(f'sensitivity: {sens}')
print(f'specificity: {spec}')
print(f'auc roc: {auc}')

[[663   5]
 [  3 665]]
accuracy: 0.9940119760479041
balanced accuracy: 0.9940119760479043
sensitivity: 0.9955089820359282
specificity: 0.9925149700598802
auc roc: 0.9998890691670551


In [61]:
y_true= y_test
y_pred = clf.predict(X_test)
cmat = confusion_matrix(y_true, y_pred)
print(cmat)

acc = accuracy_score(y_true, y_pred)
sens = cmat[1][1] / (cmat[1][0] + cmat[1][1])
spec = cmat[0][0] / (cmat[0][0] + cmat[0][1])
bacc = (sens + spec) / 2

y_score = clf.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_true, y_score)

print(f'accuracy: {acc}')
print(f'balanced accuracy: {bacc}')
print(f'sensitivity: {sens}')
print(f'specificity: {spec}')
print(f'auc roc: {auc}')

[[422 184]
 [ 32  42]]
accuracy: 0.6823529411764706
balanced accuracy: 0.6319686022656319
sensitivity: 0.5675675675675675
specificity: 0.6963696369636964
auc roc: 0.6298724467041299


In [62]:
scores = ['roc_auc', 'balanced_accuracy']

In [63]:
ncvs = 10
max_depth_range = np.linspace(3, 7, num=5, dtype='int32')
min_samples_split_range = np.linspace(3, 7, num=5, dtype='int32')
min_samples_leaf_range = np.linspace(2, 6, num=5, dtype='int32')

param_grid = dict(max_depth = max_depth_range,
                  min_samples_split = min_samples_split_range,
                  min_samples_leaf = min_samples_leaf_range)

clf = GridSearchCV(DecisionTreeClassifier(random_state=0),
                   param_grid = param_grid, cv=ncvs, scoring=scores, refit='roc_auc',
                   return_train_score = True, iid=False)

In [64]:
clf.fit(X_train, y_train)
print('best parameter set: {}'.format(clf.best_params_))

best parameter set: {'max_depth': 4, 'min_samples_leaf': 2, 'min_samples_split': 3}




In [65]:
pd.DataFrame(clf.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_leaf,param_min_samples_split,params,split0_test_roc_auc,split1_test_roc_auc,...,split2_train_balanced_accuracy,split3_train_balanced_accuracy,split4_train_balanced_accuracy,split5_train_balanced_accuracy,split6_train_balanced_accuracy,split7_train_balanced_accuracy,split8_train_balanced_accuracy,split9_train_balanced_accuracy,mean_train_balanced_accuracy,std_train_balanced_accuracy
0,0.011544,0.000241,0.003056,0.000114,3,2,3,"{'max_depth': 3, 'min_samples_leaf': 2, 'min_s...",0.799955,0.760637,...,0.730449,0.727121,0.723794,0.717138,0.729888,0.722391,0.733111,0.733950,0.726377,0.005146
1,0.011313,0.000136,0.003039,0.000078,3,2,4,"{'max_depth': 3, 'min_samples_leaf': 2, 'min_s...",0.799955,0.760637,...,0.730449,0.727121,0.723794,0.717138,0.729888,0.722391,0.733111,0.733950,0.726377,0.005146
2,0.011348,0.000123,0.003065,0.000078,3,2,5,"{'max_depth': 3, 'min_samples_leaf': 2, 'min_s...",0.799955,0.760637,...,0.730449,0.727121,0.723794,0.717138,0.729888,0.722391,0.733111,0.733950,0.726377,0.005146
3,0.011338,0.000102,0.003052,0.000128,3,2,6,"{'max_depth': 3, 'min_samples_leaf': 2, 'min_s...",0.799955,0.760637,...,0.730449,0.727121,0.723794,0.717138,0.729888,0.722391,0.733111,0.733950,0.726377,0.005146
4,0.011340,0.000126,0.003015,0.000040,3,2,7,"{'max_depth': 3, 'min_samples_leaf': 2, 'min_s...",0.799955,0.760637,...,0.730449,0.727121,0.723794,0.717138,0.729888,0.722391,0.733111,0.733950,0.726377,0.005146
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120,0.014600,0.000164,0.003036,0.000055,7,6,3,"{'max_depth': 7, 'min_samples_leaf': 6, 'min_s...",0.781466,0.742370,...,0.802829,0.810316,0.798669,0.809484,0.808821,0.815471,0.806314,0.809602,0.806883,0.006651
121,0.014652,0.000124,0.003102,0.000122,7,6,4,"{'max_depth': 7, 'min_samples_leaf': 6, 'min_s...",0.781466,0.742370,...,0.802829,0.810316,0.798669,0.809484,0.808821,0.815471,0.806314,0.809602,0.806883,0.006651
122,0.014656,0.000127,0.003094,0.000089,7,6,5,"{'max_depth': 7, 'min_samples_leaf': 6, 'min_s...",0.781466,0.742370,...,0.802829,0.810316,0.798669,0.809484,0.808821,0.815471,0.806314,0.809602,0.806883,0.006651
123,0.014589,0.000111,0.003036,0.000080,7,6,6,"{'max_depth': 7, 'min_samples_leaf': 6, 'min_s...",0.781466,0.742370,...,0.802829,0.810316,0.798669,0.809484,0.808821,0.815471,0.806314,0.809602,0.806883,0.006651


In [66]:
y_true, y_pred = y_train, clf.predict(X_train)
cmat = confusion_matrix(y_true, y_pred)
print(cmat)

acc = accuracy_score(y_true, y_pred)
sens = cmat[1][1] / (cmat[1][0] + cmat[1][1])
spec = cmat[0][0] / (cmat[0][0] + cmat[0][1])
bacc = (sens + spec) / 2

y_score = clf.predict_proba(X_train)[:, 1]
auc = roc_auc_score(y_true, y_score)

print(f'accuracy: {acc}')
print(f'balanced accuracy: {bacc}')
print(f'sensitivity: {sens}')
print(f'specificity: {spec}')
print(f'auc roc: {auc}')

[[529 139]
 [194 474]]
accuracy: 0.750748502994012
balanced accuracy: 0.750748502994012
sensitivity: 0.7095808383233533
specificity: 0.7919161676646707
auc roc: 0.8049432571981785


In [67]:
y_true= y_test
y_pred = clf.predict(X_test)
cmat = confusion_matrix(y_true, y_pred)
print(cmat)

acc = accuracy_score(y_true, y_pred)
sens = cmat[1][1] / (cmat[1][0] + cmat[1][1])
spec = cmat[0][0] / (cmat[0][0] + cmat[0][1])
bacc = (sens + spec) / 2

y_score = clf.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_true, y_score)

print(f'accuracy: {acc}')
print(f'balanced accuracy: {bacc}')
print(f'sensitivity: {sens}')
print(f'specificity: {spec}')
print(f'auc roc: {auc}')

[[467 139]
 [ 24  50]]
accuracy: 0.7602941176470588
balanced accuracy: 0.7231513691909732
sensitivity: 0.6756756756756757
specificity: 0.7706270627062707
auc roc: 0.7465435732762465


In [68]:
# show first 5 lines of df_activity
# count inactive and active compounds

def ex_1():
    
    print(df_activity.head(5))
    print()
    
    print('actives: {}'.format(df_activity['activity'].sum()))
    print('inactives: {}'.format(len(df_activity) - df_activity['activity'].sum()))
    

In [69]:
ex_1()

           cid  activity
3   12850184.0         0
4      89753.0         0
5       9403.0         0
6   13218779.0         0
12    637566.0         0

actives: 742
inactives: 6056


In [70]:
# show first 5 lines of df_smiles
# number of rows

def ex_2():
    
    print(df_smiles.head(5))
    print()
    print('rows:', len(df_smiles))

In [71]:
ex_2()

        cid                                             smiles
0  12850184  C(C(=O)[C@H]([C@@H]([C@H](C(=O)[O-])O)O)O)O.C(...
1     89753  C([C@H]([C@H]([C@@H]([C@H](C(=O)[O-])O)O)O)O)O...
2      9403  C[C@]12CC[C@H]3[C@H]([C@@H]1CC[C@@H]2OC(=O)CCC...
3  13218779         C[C@@]12CC[C@@H](C1(C)C)C[C@H]2OC(=O)CSC#N
4    637566                               CC(=CCC/C(=C/CO)/C)C

rows: 6798


In [72]:
# morgan, 2, 1024 bit fp
# save fps and cids to dataframe
# print dim of df_dps
# first 5 lines of df_fps

# def ex_3():
    
#     # smile to mol
#     mols = [Chem.MolFromSmiles(smile) for smile in df_smiles['smiles']]
    
#     # mol to fp
#     fps = {}
#     for i, mol in enumerate(mols):
#         if mol is None:
#             fps[df_smiles['cid'][i]] = None
#         else:
#             fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024).ToBitString()
#             fps[df_smiles['cid'][i]] = list(fp)
#             col_len = len(fp)
    
    
#     cols = ['cid'] + [f'morgan_{i}' for i in range(col_len)]
#     df_fps = pd.DataFrame.from_dict(fps, orient='index')
    
#     return df_fps
    

In [73]:
def ex_3():
    
    fps = {}
    for i, row in df_smiles.iterrows():
        
        mol = Chem.MolFromSmiles(row['smiles'])
        if mol is not None:
            fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024).ToBitString()
            fps[df_smiles['cid'][i]] = list(fp)
            
    cols = [f'morgan_{i}' for i in range(1, 1024 + 1)]
    df_fps = pd.DataFrame.from_dict(fps, orient='index', columns=cols).reset_index()
    df_fps.rename(columns = {'index' : 'cid'}, inplace=True)
    
    print(df_fps.shape)
    print()
    print(df_fps.head(5))
    
    return df_fps

In [74]:
df_fps = ex_3()

RDKit ERROR: [20:12:22] Explicit valence for atom # 3 Si, 8, is greater than permitted
RDKit ERROR: [20:12:23] Explicit valence for atom # 1 Si, 8, is greater than permitted


(6796, 1025)

        cid morgan_1 morgan_2 morgan_3 morgan_4 morgan_5 morgan_6 morgan_7  \
0  12850184        0        1        0        0        0        0        0   
1     89753        0        1        0        0        0        0        0   
2      9403        0        0        0        1        1        0        0   
3  13218779        0        0        0        0        0        0        0   
4    637566        0        0        0        0        0        0        0   

  morgan_8 morgan_9  ... morgan_1015 morgan_1016 morgan_1017 morgan_1018  \
0        0        0  ...           0           0           0           0   
1        0        0  ...           0           0           0           0   
2        0        0  ...           0           0           0           0   
3        0        0  ...           0           0           0           0   
4        0        0  ...           0           0           0           1   

  morgan_1019 morgan_1020 morgan_1021 morgan_1022 morgan_102

In [75]:
# merge df_activity and df_fps into df_data
# join on cid 
# remove rows with nulls
# print dim
# print 4 lines

def ex_4():
    
    df_activity['cid'] = df_activity['cid'].astype(int)
    df_data = df_activity.merge(df_fps, on='cid')
    
    df_data.dropna()
    
    print(df_data.shape)
    print()
    print(df_data.head(5))
    
    return df_data


In [76]:
df_data = ex_4()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


(6796, 1026)

        cid  activity morgan_1 morgan_2 morgan_3 morgan_4 morgan_5 morgan_6  \
0  12850184         0        0        1        0        0        0        0   
1     89753         0        0        1        0        0        0        0   
2      9403         0        0        0        0        1        1        0   
3  13218779         0        0        0        0        0        0        0   
4    637566         0        0        0        0        0        0        0   

  morgan_7 morgan_8  ... morgan_1015 morgan_1016 morgan_1017 morgan_1018  \
0        0        0  ...           0           0           0           0   
1        0        0  ...           0           0           0           0   
2        0        0  ...           0           0           0           0   
3        0        0  ...           0           0           0           0   
4        0        0  ...           0           0           0           1   

  morgan_1019 morgan_1020 morgan_1021 morgan_1022 morg

In [95]:
# fingerprints to x
# activity to y
# print dims of x and y
# remove 0 variance features from X
# split data into 90 train, 10 test
# print dims
# downsample

def ex_5():
    
    # x and y
    X = df_data.iloc[:, 2:]
    y = df_data['activity'].values
    
    print('x shape: {}'.format(X.shape))
    print('y shape: {}'.format(y.shape))
    print()
    
    # remove 0 var feats
    sel = VarianceThreshold()
    X = sel.fit_transform(X)
    
    print('end x shape: {}'.format(X.shape)) 
    print()
    
    # split
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=3100, stratify=y, 
                                                        test_size=0.1)
    print('=========================')
    print('x train: {}'.format(X_train.shape))
    print('y train: {}'.format(y_train.shape))
    print()
    print('x test: {}'.format(X_test.shape))
    print('y test: {}'.format(y_test.shape))
    print()
    
    # downsample inactive compounds in train to balance
    idx_inactives = np.where(y_train == 0)[0]
    idx_actives = np.where(y_train == 1)[0]
    
    num_inactivate = len(idx_inactives)
    num_actives = len(idx_actives)
    
    print('=========================')
    print('starting inactives:', num_inactives)
    print('actives:', num_actives)
    
    # random sample without replacement
    idx_inactives_downsampled = np.random.choice(idx_inactives, size=num_actives, replace=False)
    
    # join
    X_train = np.vstack([X_train[idx_inactives_downsampled], X_train[idx_actives]])
    y_train = np.hstack([y_train[idx_inactives_downsampled], y_train[idx_actives]])
    
    print('downsampled inactives:', len(idx_inactives_downsampled))
    print()
    print('x train: {}'.format(X_train.shape))
    print('y train: {}'.format(y_train.shape))
    
    return X_train, X_test, y_train, y_test

In [96]:
X_train, X_test, y_train, y_test = ex_5()

x shape: (6796, 1024)
y shape: (6796,)

end x shape: (6796, 1024)

x train: (6116, 1024)
y train: (6116,)

x test: (680, 1024)
y test: (680,)

starting inactives: 5448
actives: 668
downsampled inactives: 668

x train: (1336, 1024)
y train: (1336,)


In [123]:
# random forest
# 10 fold gridsearchcvjjj
# print mean balanced accuracies for each param value

def ex_6(X_train, y_train):
    
    rf = RandomForestClassifier(max_depth = 4, min_samples_leaf = 2, min_samples_split=3)
    param_grid = {'n_estimators' : np.linspace(5, 200, 40, dtype=np.int64)}
    
    ba = make_scorer(balanced_accuracy_score)
    grid = GridSearchCV(rf, param_grid=param_grid, scoring=ba, cv=10, return_train_score=True)
    grid.fit(X_train, y_train)
                  
    return grid

In [124]:
grid = ex_6(X_train, y_train)

In [125]:
print(grid.best_estimator_)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=4, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=3,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)


In [128]:
df_grid = pd.DataFrame(grid.cv_results_)
df_grid

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,split2_train_score,split3_train_score,split4_train_score,split5_train_score,split6_train_score,split7_train_score,split8_train_score,split9_train_score,mean_train_score,std_train_score
0,0.057065,0.001255,0.006365,0.000237,5,{'n_estimators': 5},0.641791,0.671642,0.626866,0.679104,...,0.721298,0.737937,0.731281,0.68802,0.661008,0.677618,0.679081,0.677325,0.693796,0.02525
1,0.064602,0.001503,0.006733,0.000261,10,{'n_estimators': 10},0.69403,0.723881,0.701493,0.731343,...,0.738769,0.757072,0.74376,0.729617,0.725753,0.729888,0.715664,0.719919,0.729721,0.014037
2,0.072507,0.000995,0.007209,0.000222,15,{'n_estimators': 15},0.716418,0.701493,0.708955,0.701493,...,0.742097,0.752912,0.730449,0.737105,0.724786,0.744051,0.729002,0.72237,0.734866,0.008937
3,0.082109,0.00201,0.007423,0.000211,20,{'n_estimators': 20},0.686567,0.738806,0.708955,0.708955,...,0.741265,0.733777,0.744592,0.752912,0.748065,0.747301,0.742224,0.758958,0.746993,0.007118
4,0.090915,0.002387,0.007688,0.000253,25,{'n_estimators': 25},0.701493,0.768657,0.716418,0.738806,...,0.770383,0.755408,0.767055,0.763727,0.789694,0.765579,0.748933,0.743197,0.763642,0.012519
5,0.095909,0.000772,0.007973,0.000267,30,{'n_estimators': 30},0.723881,0.708955,0.746269,0.738806,...,0.741265,0.737937,0.751248,0.75208,0.768988,0.777269,0.762225,0.753133,0.754914,0.011419
6,0.102219,0.001176,0.008313,0.000266,35,{'n_estimators': 35},0.701493,0.701493,0.708955,0.776119,...,0.767887,0.74376,0.769551,0.768719,0.782248,0.773894,0.772181,0.743994,0.761142,0.014353
7,0.110256,0.003211,0.008636,0.000289,40,{'n_estimators': 40},0.679104,0.753731,0.708955,0.716418,...,0.749584,0.767887,0.77371,0.784526,0.790575,0.774765,0.750596,0.758835,0.765547,0.015033
8,0.119221,0.004898,0.00889,0.000163,45,{'n_estimators': 45},0.686567,0.679104,0.686567,0.686567,...,0.762063,0.733777,0.757903,0.75624,0.76564,0.766421,0.767253,0.759775,0.760318,0.011163
9,0.124669,0.00219,0.009263,0.000295,50,{'n_estimators': 50},0.679104,0.761194,0.716418,0.679104,...,0.772879,0.748752,0.778702,0.775374,0.783071,0.763929,0.766427,0.772203,0.769295,0.009283


In [129]:
df_grid.groupby(['param_n_estimators', ]).agg({'mean_test_score' : 'mean',
                                               'mean_train_score': 'mean'}).reset_index()

Unnamed: 0,param_n_estimators,mean_test_score,mean_train_score
0,5,0.637596,0.693796
1,10,0.692232,0.729721
2,15,0.696732,0.734866
3,20,0.691633,0.746993
4,25,0.702702,0.763642
5,30,0.716961,0.754914
6,35,0.702047,0.761142
7,40,0.704308,0.765547
8,45,0.687076,0.760318
9,50,0.698259,0.769295


In [137]:
# predict best grid search on training
# confusion matrix
# report acc, balanced acc, sensitivity, specificity, auc roc

def ex_7(grid, X_train, y_train):
    
    y_pred = grid.best_estimator_.predict(X_train)
    cmat = confusion_matrix(y_train, y_pred)
    
    print(cmat)
    
    acc = accuracy_score(y_train, y_pred)
    sens = cmat[1][1] / (cmat[1][0] + cmat[1][1])
    spec = cmat[0][0] / (cmat[0][0] + cmat[0][1])
    bacc = (sens + spec) / 2
    
    y_score = grid.best_estimator_.predict_proba(X_train)[:,1]
    auc = roc_auc_score(y_train, y_score)
    
    print()
    print('acc:', acc)
    print('sens:', sens)
    print('spec:', spec)
    print('bacc', bacc)
    print('auc roc:', auc)

In [138]:
ex_7(grid, X_train, y_train)

[[530 138]
 [153 515]]

acc: 0.782185628742515
sens: 0.7709580838323353
spec: 0.7934131736526946
bacc 0.782185628742515
auc roc: 0.8518008892394852


In [139]:
# pred on test
ex_7(grid, X_test, y_test)

[[424 182]
 [ 20  54]]

acc: 0.7029411764705882
sens: 0.7297297297297297
spec: 0.6996699669966997
bacc 0.7146998483632148
auc roc: 0.7544376059227544
