# Imports

In [2]:
%matplotlib inline
#%matplotlib notebook

# The usual stuff
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings

# Progress bar
from tqdm import tnrange, tqdm_notebook
from tqdm.auto import tqdm

# For treating molecules
import molvs
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem import PandasTools
from rdkit.Chem.rdmolops import RDKFingerprint
from rdkit.Chem.AllChem import GetMorganFingerprintAsBitVect

# ML models
## Model Selection
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.model_selection import GridSearchCV

## Classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
from sklearn.metrics import precision_recall_curve, roc_curve, roc_auc_score
from sklearn.metrics import make_scorer

## Regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import explained_variance_score, median_absolute_error
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Functions

In [None]:
# Add Fingerprints
def get_rdkfingerprints(mol):
    """
    Computes the RDKit fingerprint of 'mol' and returns it as a NumPy array.

    The fingerprint is obtained with RDKFingerprint and its bit string is
    converted so that the array holds one integer per character of that string.
    """
    bit_string = RDKFingerprint(mol).ToBitString()
    return np.array([int(bit) for bit in bit_string])

def get_morganfingerprints(mol, radius=2, n_bits=2048):
    """
    Computes the Morgan fingerprint of 'mol' and returns it as a NumPy array.

    Parameters:
        mol:    the molecule handed to GetMorganFingerprintAsBitVect.
        radius: Morgan radius. Defaults to 2, the value that used to be hard-coded.
        n_bits: length of the bit vector. Defaults to 2048, which is the length
                RDKit uses when 'nBits' is not given, so one-argument calls
                produce the same fingerprint as before.

    Returns:
        A NumPy array holding one integer per character of the fingerprint's
        bit string.
    """
    fp = GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
    return np.array(list(map(int, fp.ToBitString())))

def add_fingerprints(frame):
    """
    Adds a 'Morgan Fingerprints' column to a Pandas Dataframe, in place.

    Gets as input a Pandas Dataframe of one molecule per row, with a required 'mol' column.
    Each value of the 'mol' column is passed to get_morganfingerprints, so it must be
    something that function accepts (an RDKit mol object).

    The dataframe is modified in place and nothing is returned. A tqdm progress bar is
    shown while the fingerprints are generated.
    """

    # tqdm appends ": " after the description on its own, so the label must not
    # end with a colon (otherwise the bar reads "...Fingerprints:: ").
    tqdm.pandas(desc="Generating Morgan Fingerprints")
    frame['Morgan Fingerprints'] = frame['mol'].progress_apply(get_morganfingerprints)

# Data

In [3]:
# Load the three pickled dataframes (bz2-compressed) from the working directory.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
active, inactive, inconclusive = map(
    pd.read_pickle,
    ("./active.pkl.bz2", "./inactive.pkl.bz2", "./inconclusive.pkl.bz2"),
)

In [13]:
# Remove the fingerprint columns stored in the pickles from each dataframe.
active = active.drop(columns=['RDKit Fingerprints', 'Morgan Fingerprints'])
inactive = inactive.drop(columns=['RDKit Fingerprints', 'Morgan Fingerprints'])
inconclusive = inconclusive.drop(columns=['RDKit Fingerprints', 'Morgan Fingerprints'])

In [14]:
# Sanity check on the dataframe shapes. Expected output:
# active       =  (829, 8)
# inactive     =  (50000, 5)
# Inconclusive =  (1526, 4)

for label, frame in (
    ("active       = ", active),
    ("inactive     = ", inactive),
    ("Inconclusive = ", inconclusive),
):
    print(label, frame.shape)

active       =  (829, 8)
inactive     =  (50000, 5)
Inconclusive =  (1526, 4)
