In [1]:
# IIRC the installation instructions for the two most important libraries are
# conda install -c conda-forge rdkit
# conda install -c fastchan fastai anaconda

from tqdm import tqdm
import os
import pandas as pd
import numpy as np
import pdb
import multiprocessing as mp
from functools import partial
from tqdm import tqdm
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.DataStructs.cDataStructs import TanimotoSimilarity, BulkTanimotoSimilarity
from rdkit import RDLogger
from rdkit.Chem import rdMolDescriptors
import pickle
RDLogger.DisableLog('rdApp.*')
import random
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from fastai.tabular.all import *

In [2]:
# from https://github.com/wjm41/mpro-rank-gen

def Morgan_Fingerprint(smile, nbits = 512):
    """Return a feature-based Morgan fingerprint bit vector for a SMILES string.

    The SMILES is parsed with RDKit and fingerprinted with radius 3 and
    ``useFeatures=True``; the result has ``nbits`` bits (512 by default).
    """
    molecule = Chem.MolFromSmiles(smile)
    return AllChem.GetMorganFingerprintAsBitVect(
        molecule,
        3,
        nBits=nbits,
        useFeatures=True,
    )

def Atom_Pair(smile):
    """Return the 512-bit hashed atom-pair fingerprint bit vector for a SMILES string."""
    molecule = Chem.MolFromSmiles(smile)
    return rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect(molecule, nBits=512)

def TopologicalTorsion(smile):
    """Return the 512-bit hashed topological-torsion fingerprint bit vector for a SMILES string."""
    molecule = Chem.MolFromSmiles(smile)
    return rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect(molecule, nBits=512)

def concat_fingerprints(smile): # our default feature set throughout this model is a concatenation of 3 fingerprint sets
    """Build the default feature array for one compound.

    Combines the Morgan, atom-pair and topological-torsion fingerprints of
    ``smile`` (in that order) and returns the result as a NumPy array.
    """
    morgan_bits = Morgan_Fingerprint(smile)
    atom_pair_bits = Atom_Pair(smile)
    torsion_bits = TopologicalTorsion(smile)
    # `+` on the RDKit bit vectors is relied on to join them end to end.
    combined_bits = morgan_bits + atom_pair_bits + torsion_bits
    return np.array(combined_bits)

def get_similarity(ref, comps):
    """Return the Tanimoto similarities between ``ref`` and each fingerprint in ``comps``.

    Thin wrapper around RDKit's ``BulkTanimotoSimilarity``.
    """
    similarities = BulkTanimotoSimilarity(ref, comps)
    return similarities

def IC50_diff(compound_a, compound_b):
    """Return ``compound_a - compound_b`` for two scalar IC50 values.

    If either value is ``np.inf`` the difference is undefined and NaN is
    returned instead.
    """
    # Scalar inputs are expected, so a short-circuiting `or` is used.
    if compound_a == np.inf or compound_b == np.inf:
        # `np.nan` is the canonical spelling; the `np.NaN` alias no longer
        # exists in NumPy 2.0 and later.
        return np.nan
    return compound_a - compound_b

def duplicate_meta_data(meta_data): # needed when doing the 'Train-Test' split to account for the 'reverses'
    """Append the reversed (B, A) record for every (A, B) pair in ``meta_data``.

    ``meta_data`` is converted to a DataFrame and must provide the columns
    ``Data_Split``, ``Test_Style``, ``Compound_A``, ``Compound_B``,
    ``Compound_A_IC50``, ``Compound_B_IC50`` and ``IC50_diff``. In the
    reversed rows the A/B compound and IC50 columns are swapped and
    ``IC50_diff`` is negated; any other column is left unfilled.

    Returns a DataFrame with the original rows followed by the reversed
    rows, indexed 0..2n-1.
    """
    original_meta = pd.DataFrame(meta_data)

    # Start from an empty frame with the same columns so the column order matches.
    duplicate_meta = pd.DataFrame(columns = original_meta.columns)
    duplicate_meta['Data_Split'] = original_meta['Data_Split']
    duplicate_meta['Test_Style'] = original_meta['Test_Style']
    # Swap the roles of compound A and compound B.
    duplicate_meta['Compound_A'] = original_meta['Compound_B']
    duplicate_meta['Compound_B'] = original_meta['Compound_A']
    duplicate_meta['Compound_A_IC50'] = original_meta['Compound_B_IC50']
    duplicate_meta['Compound_B_IC50'] = original_meta['Compound_A_IC50']
    duplicate_meta['IC50_diff'] = -original_meta['IC50_diff']

    assert original_meta.shape == duplicate_meta.shape, "Shapes not matching"

    # DataFrame.append was removed in pandas 2.0; pd.concat gives the same result.
    return pd.concat([original_meta, duplicate_meta], ignore_index = True)

### Import Raw Data and Filter

In [3]:
# Load the raw activity table and keep only the rows flagged as neither
# acrylamide nor chloroacetamide, renumbering the index from 0.
df = (
    pd.read_csv('activity_data_21Aug2021.csv')
    .loc[lambda raw: (raw['acrylamide'] == False) & (raw['chloroacetamide'] == False)]
    .reset_index(drop=True)
)
df

Unnamed: 0,SMILES,CID,canonical_CID,r_inhibition_at_20_uM,r_inhibition_at_50_uM,r_avg_IC50,f_inhibition_at_20_uM,f_inhibition_at_50_uM,f_avg_IC50,f_avg_pIC50,relative_solubility_at_20_uM,relative_solubility_at_100_uM,trypsin_IC50,NMR_std_ratio,acrylamide,chloroacetamide,series,frag_id
0,Cc1nccn1Cc1ccc2c(NC(=O)C3CCOc4ccc(Cl)cc43)cncc2c1,MAT-POS-7ddaf7de-1,MAT-POS-7ddaf7de-1,,,,,,5.315361,,,,,,False,False,3-aminopyridine-like,
1,CS(=O)(=O)c1ccc2c(NC(=O)C3CCOc4ccc(Cl)cc43)cncc2c1,MAT-POS-7ddaf7de-2,MAT-POS-7ddaf7de-2,,,,,,0.427100,,,,,,False,False,3-aminopyridine-like,
2,O=C(Nc1cncc2cc(CN3CC4(CNC4)C3)ccc12)C1CCOc2ccc(Cl)cc21,MAT-POS-7ddaf7de-3,MAT-POS-7ddaf7de-3,,,,,,1.437438,,,,,,False,False,3-aminopyridine-like,
3,Cc1c(N)cncc1NC(=O)C1CCOc2ccc(Cl)cc21,MAT-POS-7ddaf7de-4,MAT-POS-7ddaf7de-4,,,,,,3.381693,,,,,,False,False,3-aminopyridine-like,
4,O=C1N(c2cncc3ccccc23)CC[C@@]1(O)c1cccc(Cl)c1,MAT-POS-8695a11f-1,MAT-POS-8695a11f-1,,,,,,90.674247,,,,,,False,False,3-aminopyridine-like,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1349,N#Cc1cc(Cl)cc(NC(=O)Nc2cccnc2)c1,WAR-XCH-eb7b662f-2,WAR-XCH-eb7b662f-2,,,,,,99.010000,,,,99.0,,False,False,3-aminopyridine-like,
1350,N#Cc1cccc(NC(=O)Nc2cncc(N)c2)c1,DAR-DIA-23aa0b97-8,DAR-DIA-23aa0b97-8,,,,,,54.272964,,,,99.0,,False,False,3-aminopyridine-like,
1351,N#Cc1cccc(NC(=O)Nc2c[nH]c3ncccc23)c1,DAR-DIA-23aa0b97-11,DAR-DIA-23aa0b97-11,,,,,,63.554925,,,,99.0,,False,False,3-aminopyridine-like,
1352,N#Cc1cccc(NC(=O)Cc2cncc3ccccc23)c1,DAR-DIA-23aa0b97-19,DAR-DIA-23aa0b97-19,,,14.641091,,,26.680129,,,,,,False,False,3-aminopyridine-like,


In [4]:
# Working table: structure, compound ID and the two IC50 measurement columns.
noncovalent = df[['SMILES', 'CID', 'r_avg_IC50', 'f_avg_IC50']].copy()
noncovalent

Unnamed: 0,SMILES,CID,r_avg_IC50,f_avg_IC50
0,Cc1nccn1Cc1ccc2c(NC(=O)C3CCOc4ccc(Cl)cc43)cncc2c1,MAT-POS-7ddaf7de-1,,5.315361
1,CS(=O)(=O)c1ccc2c(NC(=O)C3CCOc4ccc(Cl)cc43)cncc2c1,MAT-POS-7ddaf7de-2,,0.427100
2,O=C(Nc1cncc2cc(CN3CC4(CNC4)C3)ccc12)C1CCOc2ccc(Cl)cc21,MAT-POS-7ddaf7de-3,,1.437438
3,Cc1c(N)cncc1NC(=O)C1CCOc2ccc(Cl)cc21,MAT-POS-7ddaf7de-4,,3.381693
4,O=C1N(c2cncc3ccccc23)CC[C@@]1(O)c1cccc(Cl)c1,MAT-POS-8695a11f-1,,90.674247
...,...,...,...,...
1349,N#Cc1cc(Cl)cc(NC(=O)Nc2cccnc2)c1,WAR-XCH-eb7b662f-2,,99.010000
1350,N#Cc1cccc(NC(=O)Nc2cncc(N)c2)c1,DAR-DIA-23aa0b97-8,,54.272964
1351,N#Cc1cccc(NC(=O)Nc2c[nH]c3ncccc23)c1,DAR-DIA-23aa0b97-11,,63.554925
1352,N#Cc1cccc(NC(=O)Cc2cncc3ccccc23)c1,DAR-DIA-23aa0b97-19,14.641091,26.680129


### Preprocess IC50 values and molecular fingerprints

In [5]:
# IC50 columns used for comparisons, mapped to the upper cutoff above which a
# measurement is no longer treated as a valid finite value
IC50_max = dict(
    r_avg_IC50=98,
    f_avg_IC50=98,
)

In [6]:
# Replace every measurement above its column's cutoff with +inf.
for IC50col, cutoff in IC50_max.items():
    above_cutoff = noncovalent[IC50col] > cutoff
    noncovalent.loc[above_cutoff, IC50col] = np.inf
noncovalent

Unnamed: 0,SMILES,CID,r_avg_IC50,f_avg_IC50
0,Cc1nccn1Cc1ccc2c(NC(=O)C3CCOc4ccc(Cl)cc43)cncc2c1,MAT-POS-7ddaf7de-1,,5.315361
1,CS(=O)(=O)c1ccc2c(NC(=O)C3CCOc4ccc(Cl)cc43)cncc2c1,MAT-POS-7ddaf7de-2,,0.427100
2,O=C(Nc1cncc2cc(CN3CC4(CNC4)C3)ccc12)C1CCOc2ccc(Cl)cc21,MAT-POS-7ddaf7de-3,,1.437438
3,Cc1c(N)cncc1NC(=O)C1CCOc2ccc(Cl)cc21,MAT-POS-7ddaf7de-4,,3.381693
4,O=C1N(c2cncc3ccccc23)CC[C@@]1(O)c1cccc(Cl)c1,MAT-POS-8695a11f-1,,90.674247
...,...,...,...,...
1349,N#Cc1cc(Cl)cc(NC(=O)Nc2cccnc2)c1,WAR-XCH-eb7b662f-2,,inf
1350,N#Cc1cccc(NC(=O)Nc2cncc(N)c2)c1,DAR-DIA-23aa0b97-8,,54.272964
1351,N#Cc1cccc(NC(=O)Nc2c[nH]c3ncccc23)c1,DAR-DIA-23aa0b97-11,,63.554925
1352,N#Cc1cccc(NC(=O)Cc2cncc3ccccc23)c1,DAR-DIA-23aa0b97-19,14.641091,26.680129


In [7]:
# One concatenated fingerprint array per compound, computed from its SMILES string.
noncovalent['bits'] = noncovalent['SMILES'].map(concat_fingerprints)
noncovalent

Unnamed: 0,SMILES,CID,r_avg_IC50,f_avg_IC50,bits
0,Cc1nccn1Cc1ccc2c(NC(=O)C3CCOc4ccc(Cl)cc43)cncc2c1,MAT-POS-7ddaf7de-1,,5.315361,"[1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, ...]"
1,CS(=O)(=O)c1ccc2c(NC(=O)C3CCOc4ccc(Cl)cc43)cncc2c1,MAT-POS-7ddaf7de-2,,0.427100,"[1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...]"
2,O=C(Nc1cncc2cc(CN3CC4(CNC4)C3)ccc12)C1CCOc2ccc(Cl)cc21,MAT-POS-7ddaf7de-3,,1.437438,"[1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...]"
3,Cc1c(N)cncc1NC(=O)C1CCOc2ccc(Cl)cc21,MAT-POS-7ddaf7de-4,,3.381693,"[1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]"
4,O=C1N(c2cncc3ccccc23)CC[C@@]1(O)c1cccc(Cl)c1,MAT-POS-8695a11f-1,,90.674247,"[1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...]"
...,...,...,...,...,...
1349,N#Cc1cc(Cl)cc(NC(=O)Nc2cccnc2)c1,WAR-XCH-eb7b662f-2,,inf,"[1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]"
1350,N#Cc1cccc(NC(=O)Nc2cncc(N)c2)c1,DAR-DIA-23aa0b97-8,,54.272964,"[1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]"
1351,N#Cc1cccc(NC(=O)Nc2c[nH]c3ncccc23)c1,DAR-DIA-23aa0b97-11,,63.554925,"[1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]"
1352,N#Cc1cccc(NC(=O)Cc2cncc3ccccc23)c1,DAR-DIA-23aa0b97-19,14.641091,26.680129,"[1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, ...]"


### Generate Model Input Data (differences of fingerprints)

In [8]:
def IC50subcompare(row1,row2,col):
    """Compare the value of column ``col`` in two rows.

    Returns 1 if ``row1[col]`` is larger, -1 if it is smaller, 0 if the two
    are equal, and None when either value is missing (or the values cannot
    be ordered).
    """
    value1, value2 = row1[col], row2[col]
    # Both sides must be present; the earlier version tested row1 twice and
    # never looked at row2.
    if pd.isnull(value1) or pd.isnull(value2):
        return None
    if value1 > value2:
        return 1
    if value1 < value2:
        return -1
    if value1 == value2:
        return 0
    return None

In [9]:
def IC50compare(row1,row2):
    """Combine the per-column IC50 comparisons of two rows into one verdict.

    Every column listed in ``IC50_max`` is compared with ``IC50subcompare``;
    columns that yield None are ignored. Returns the shared result when all
    remaining comparisons agree, and None when there are none or they differ.
    """
    votes = []
    for col in IC50_max:
        outcome = IC50subcompare(row1, row2, col)
        if outcome is not None:
            votes.append(outcome)

    if not votes:
        return None

    first_vote = votes[0]
    if all(vote == first_vote for vote in votes):
        return first_vote
    return None

In [10]:
# Accumulators for the pairwise samples: feature differences (x) and labels (y).
trainxL, trainyL = [], []
testxL, testyL = [], []

In [11]:
# 80-20 split of train:test data at the compound level: rows whose index is a
# multiple of 5 are held out. Only pairs from the same side of the split are
# used; pairs mixing a training row with a held-out row are skipped.

for idx1,row1 in tqdm(noncovalent.iterrows()):
    held_out_1 = idx1 % 5 == 0
    for idx2,row2 in noncovalent.iterrows():
        # visit every unordered pair exactly once
        if idx2 <= idx1:
            continue
        if (idx2 % 5 == 0) != held_out_1:
            continue
        comp = IC50compare(row1,row2)
        # keep only pairs with a strict, unambiguous ordering
        if comp != -1 and comp != 1:
            continue
        if held_out_1:
            x_out, y_out = testxL, testyL
        else:
            x_out, y_out = trainxL, trainyL
        bitdiff = row1['bits'] - row2['bits']
        # store both orderings of the pair, with the label negated for the reverse
        x_out.append(bitdiff)
        y_out.append(comp)
        x_out.append(-bitdiff)
        y_out.append(-comp)

1354it [01:09, 19.49it/s]


In [12]:
# Number of training samples; every retained pair contributes two (A-B and B-A).
len(trainxL)

911056

In [13]:
# Number of held-out samples; every retained pair contributes two (A-B and B-A).
len(testxL)

55954

### Perform PCA to reduce fingerprint dimensionality

In [14]:
# Stack the per-pair lists into arrays.
trainx = np.array(trainxL)
testx = np.array(testxL)
trainy = np.array(trainyL)
testy = np.array(testyL)

# Fit the projection on the training samples only and reuse it for the test
# samples. The seed is fixed so that re-running the notebook reproduces the
# same components if scikit-learn selects a randomized SVD solver.
preprocess = PCA(n_components = 20, random_state = 0)
trainx = preprocess.fit_transform(trainx)
testx = preprocess.transform(testx)

trainx

array([[-1.42287733, -2.05956158,  2.66427625, ..., -1.50377958,
         0.19900011,  0.01413665],
       [ 1.42287733,  2.05956158, -2.66427625, ...,  1.50377958,
        -0.19900011, -0.01413665],
       [ 2.42597525,  3.03850404,  1.33186523, ..., -1.77801363,
         0.22971145,  0.99451075],
       ...,
       [-1.43355887, -0.06919312,  1.07238095, ..., -1.59877233,
         0.59134625, -0.6340201 ],
       [ 0.73247876,  1.05716899,  0.3503091 , ...,  0.52749996,
         0.4120776 ,  0.3123983 ],
       [-0.73247876, -1.05716899, -0.3503091 , ..., -0.52749996,
        -0.4120776 , -0.3123983 ]])

In [15]:
# (samples, components) of the PCA-projected training data
trainx.shape

(911056, 20)

In [16]:
# (samples, components) of the PCA-projected test data
testx.shape

(55954, 20)

### Turn into tabular structure for FastAI

In [17]:
def relabel(value):
    """Translate a numeric pair label into its class-name string.

    1 maps to 'Higher_Activity' and -1 to 'Lower_Activity'. Any other value
    is reported on stdout and None is returned.

    NOTE(review): the labels fed in here come from IC50compare, where 1 means
    the first compound has the larger IC50. Confirm that 'Higher_Activity' is
    the intended name for that case.
    """
    if value == 1:
        return 'Higher_Activity'
    if value == -1:
        return 'Lower_Activity'
    print('Unknown value: ', value)
    return None

In [18]:
# One named column per PCA component.
columns = ['Feature ' + str(i) for i in range(trainx.shape[1])]

df_train = pd.DataFrame(trainx, columns = columns).astype('float32')
df_train['Target'] = [relabel(value) for value in trainy]
df_train = df_train.astype({'Target': 'str'})

df_valid = pd.DataFrame(testx, columns = columns).astype('float32')
df_valid['Target'] = [relabel(value) for value in testy]
df_valid = df_valid.astype({'Target': 'str'})

# Training rows first, validation rows last (later cells rely on this order).
# DataFrame.append was removed in pandas 2.0, so pd.concat is used instead.
df = pd.concat([df_train, df_valid]).reset_index(drop = True)

df

Unnamed: 0,Feature 0,Feature 1,Feature 2,Feature 3,Feature 4,Feature 5,Feature 6,Feature 7,Feature 8,Feature 9,...,Feature 11,Feature 12,Feature 13,Feature 14,Feature 15,Feature 16,Feature 17,Feature 18,Feature 19,Target
0,-1.422877,-2.059561,2.664276,-1.989197,1.672933,-1.895161,-1.149874,-2.199205,-1.902317,0.494687,...,-0.302224,-0.520605,-0.769450,2.050705,0.326050,-0.182935,-1.503780,0.199000,0.014137,Lower_Activity
1,1.422877,2.059561,-2.664276,1.989197,-1.672933,1.895161,1.149874,2.199205,1.902317,-0.494687,...,0.302224,0.520605,0.769450,-2.050705,-0.326050,0.182935,1.503780,-0.199000,-0.014137,Higher_Activity
2,2.425975,3.038504,1.331865,-0.634069,-1.789419,-1.796271,0.574236,-0.469507,-1.226418,0.678929,...,-1.239058,-0.088928,0.739901,1.233633,0.188288,-0.359853,-1.778014,0.229711,0.994511,Lower_Activity
3,-2.425975,-3.038504,-1.331865,0.634069,1.789419,1.796271,-0.574236,0.469507,1.226418,-0.678929,...,1.239058,0.088928,-0.739901,-1.233633,-0.188288,0.359853,1.778014,-0.229711,-0.994511,Higher_Activity
4,1.527906,3.878920,2.436285,-4.664529,-0.111651,2.546142,-6.601839,-2.847409,-1.661969,-1.407384,...,-0.031071,-3.294752,-0.811415,-2.406623,-0.024293,-1.792319,-2.185110,1.275382,2.916352,Lower_Activity
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
967005,-3.779172,-2.575904,1.899461,2.426953,0.140488,-2.432817,-2.725577,-1.416294,-2.982276,-0.673467,...,0.636237,-2.211469,0.886746,-0.588064,-0.129850,0.364948,0.276904,0.142811,1.325463,Higher_Activity
967006,3.739154,2.540243,-1.048152,-1.221055,-1.234512,2.458906,2.177336,0.735746,2.982047,-0.422111,...,-1.544921,0.809506,-0.104953,1.524955,-1.027184,-1.722443,0.562411,1.085703,-0.621201,Lower_Activity
967007,-3.739154,-2.540243,1.048152,1.221055,1.234512,-2.458906,-2.177336,-0.735746,-2.982047,0.422111,...,1.544921,-0.809506,0.104953,-1.524955,1.027184,1.722443,-0.562411,-1.085703,0.621201,Higher_Activity
967008,-0.040019,-0.035660,0.851309,1.205898,-1.094024,0.026089,-0.548241,-0.680548,-0.000229,-1.095577,...,-0.908684,-1.401963,0.781793,0.936891,-1.157034,-1.357495,0.839315,1.228514,0.704262,Lower_Activity


### Create FastAI Tabular Model

In [19]:
# The validation rows are the last len(df_valid) rows of the combined frame.
valid_start = len(df) - len(df_valid)
valid_idx = range(valid_start, len(df))
dls = TabularDataLoaders.from_df(
    df,
    cont_names = columns,
    y_names = 'Target',
    valid_idx = valid_idx,
)

In [20]:
# dls.one_batch() # debug error if needed

In [21]:
# Tabular learner that reports accuracy; `ps` is passed through the model
# config (presumably the dropout probability - confirm against fastai docs).
learn = tabular_learner(
    dls,
    metrics=accuracy,
    config=dict(ps=0.2),
)

In [22]:
# Accuracy on the held-out pairs has been around 79%.

# Note: both compounds of every evaluation pair come from the held-out set,
# which is harder than ranking a held-out compound against a training compound.

learn.fit_one_cycle(n_epoch=3, lr_max=1e-3)

epoch,train_loss,valid_loss,accuracy,time
0,0.427571,0.457879,0.784573,01:08
1,0.422576,0.450368,0.787844,01:07
2,0.419812,0.448801,0.789649,01:05
