# Import modules

In [1]:
%matplotlib inline
#%matplotlib notebook

#Basic stuff
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings

# ML models, model selection, and evaluation metrics
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
from sklearn.metrics import precision_recall_curve, roc_curve, roc_auc_score
from sklearn.metrics import make_scorer

# Initialize tqdm progress bars.
# `tqdm.auto` is imported last on purpose: it rebinds the name `tqdm` to the
# variant that tqdm selects automatically for the current front end.
from tqdm import tqdm, tnrange, tqdm_notebook
from tqdm.auto import tqdm
# Register the `progress_apply` family on pandas objects using the bar class
# bound above. `tqdm.pandas` accepts only keyword options for the bar; passing a
# bar class positionally (as was done before) is not part of its API -- depending
# on the tqdm version it is either rejected or misread as the bar's iterable.
tqdm.pandas()

In [2]:
%reload_ext autoreload
%autoreload 2
# The averaged random forest model
from AverageRF import AverageRF
from utils import train_test_split_data, evaluate_model

# Data

In [6]:
# Load the three pickled tables that sit next to the notebook.
# NOTE(review): unpickling can execute arbitrary code -- only load files that
# come from a trusted source.
pickle_paths = ("./active.pkl.bz2", "./inactive.pkl.bz2", "./inconclusive.pkl.bz2")
active, inactive, inconclusive = map(pd.read_pickle, pickle_paths)

In [7]:
# Sanity check: print each table's shape so it can be compared by eye with the
# reference values below.
# Expected:
#   active       =  (829, 10)
#   inactive     =  (50000, 7)
#   Inconclusive =  (1526, 6)
shape_report = (
    ("active       = ", active),
    ("inactive     = ", inactive),
    ("Inconclusive = ", inconclusive),
)
for label, table in shape_report:
    print(label, table.shape)

active       =  (829, 10)
inactive     =  (50000, 7)
Inconclusive =  (1526, 6)


In [8]:
# Label the two tables used for classification: every row of `active` gets
# ActiveBit=True and every row of `inactive` gets ActiveBit=False.
# The column is written in place on the loaded frames.
for table, is_active in ((active, True), (inactive, False)):
    table['ActiveBit'] = is_active

In [9]:
# Build the modelling frame: actives restricted to the columns of interest,
# stacked on top of the inactives.
columns = ['PUBCHEM_SID', 'RDKit Fingerprints', 'Morgan Fingerprints', 'ActiveBit']
active_subset = active.filter(items=columns, axis=1)
# join='inner' keeps only the columns present in both frames, so any extra
# columns of `inactive` are dropped; ignore_index gives a fresh 0..n-1 index.
data = pd.concat(
    [active_subset, inactive],
    axis=0,
    join='inner',
    ignore_index=True,
)

In [13]:
# Features: collect the per-row RDKit fingerprints into one array and cast it
# to int (presumably one row per compound and one column per bit -- confirm).
rdkit_rows = list(data['RDKit Fingerprints'])
X_rdkit = np.array(rdkit_rows).astype(int)

# Targets: the ActiveBit labels as a NumPy array.
y_data = data['ActiveBit'].values

In [15]:
# Split features and labels with the project helper from `utils`.
# NOTE(review): judging by the recorded output, rate_inactives=0.90 yields
# splits in which roughly 90% of the samples are inactives -- confirm against
# the helper's definition.
(X_train, X_test, y_train, y_test) = train_test_split_data(
    X_rdkit,
    y_data,
    rate_inactives=0.90,
)

X_train:  7461 	y_train:  7461 	( 746 actives and 6715 inactives)
X_test:   829  	y_test:   829  	( 83 actives and 746 inactives)


# Fit to RDKit Fingerprints

In [19]:
# Configure the averaged random-forest model used for the RDKit fingerprints.
# NOTE(review): the meaning of each option is defined in AverageRF, which is
# not visible here; n_forests=5 presumably matches the five splits reported
# during fitting -- confirm.
rdkit_rf_options = dict(
    model_type="classifier",
    n_forests=5,
    n_estimators=10,
    verbosity=1,
)
avgRF_cls_rdkit = AverageRF(**rdkit_rf_options)

In [20]:
# Fit the model on the training split. Per the recorded output, the call prints
# a confusion matrix and metrics for each split, and its return value (an
# AverageRF object) is displayed as the cell result because it is the last
# expression in the cell.
avgRF_cls_rdkit.fit(X_train,y_train)

SPLIT: 0  [TRAIN: 5968 	TEST: 1493] 	 Training Time:0.282 seconds.
Confusion Matrix = 
 [[1336    7]
 [  87   63]]
	  Precision = 0.9000
	  Recall    = 0.4200
	  F1-score  = 0.5727
	  ROC AUC   = 0.8441
	  Score     = 0.9370 

SPLIT: 1  [TRAIN: 5969 	TEST: 1492] 	 Training Time:0.297 seconds.
Confusion Matrix = 
 [[1325   18]
 [  92   57]]
	  Precision = 0.7600
	  Recall    = 0.3826
	  F1-score  = 0.5089
	  ROC AUC   = 0.8401
	  Score     = 0.9263 

SPLIT: 2  [TRAIN: 5969 	TEST: 1492] 	 Training Time:0.301 seconds.
Confusion Matrix = 
 [[1327   16]
 [ 103   46]]
	  Precision = 0.7419
	  Recall    = 0.3087
	  F1-score  = 0.4360
	  ROC AUC   = 0.8492
	  Score     = 0.9202 

SPLIT: 3  [TRAIN: 5969 	TEST: 1492] 	 Training Time:0.304 seconds.
Confusion Matrix = 
 [[1322   21]
 [  91   58]]
	  Precision = 0.7342
	  Recall    = 0.3893
	  F1-score  = 0.5088
	  ROC AUC   = 0.8531
	  Score     = 0.9249 

SPLIT: 4  [TRAIN: 5969 	TEST: 1492] 	 Training Time:0.342 seconds.
Confusion Matrix = 
 [[13

<AverageRF.AverageRF at 0x1b44a1e3a58>