In [1]:
# imports
import numpy as np
import pandas as pd
import random
import os

from python_utilities.parallel import Parallelizer
from e3fp.config.params import default_params
from e3fp.pipeline import params_to_dicts
from e3fp.pipeline import fprints_from_smiles

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split


In [2]:
# functions
def multiple_labels(df, col_name):
    df[col_name] = df[col_name].str.split(', ')
    df = df.explode(col_name, ignore_index=True)
    return df

def class_balanced_split(X, Y, n_classes, train_split=0.8, val_split=None, numsplits=5):
    def divide(lst, n):
        p = len(lst) // n
        if len(lst)-p > 0:
            return [lst[:p]] + divide(lst[p:], n-1)
        else:
            return [lst]
    def seed():
        return 0.42

    n_classes = list(Y)[0].shape[0]
    Xtrains = []
    Ytrains = []
    Xtest = []
    Ytest = []
    Xval = []
    Yval = []
    balanced_dict = {}
    
    # making train/test/val splits within classes
    for i in range(n_classes):
        xlist = []
        ylist = []
        for x, y in zip(X,Y):
            if np.argmax(y) == i:
                xlist.append(x)
                ylist.append(y)
        X_train, X_test, y_train, y_test = train_test_split(xlist, ylist, train_size=train_split, random_state=0)
        if val_split:
            val_train_size = round(1-val_split/(1-train_split), 3)
            X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, train_size=val_train_size, random_state=0)
            balanced_dict[i] = (X_train, y_train, X_test, y_test, X_val, y_val)
        else:
            balanced_dict[i] = (X_train, y_train, X_test, y_test)
    # recombining train/test/val
    for i in range(n_classes):
        Xtrains.extend(balanced_dict[i][0])
        Ytrains.extend(balanced_dict[i][1])
        Xtest.extend(balanced_dict[i][2])
        Ytest.extend(balanced_dict[i][3])
        if val_split:
            Xval.extend(balanced_dict[i][4])
            Yval.extend(balanced_dict[i][5])

    # shuffling recombined train/test/val
    train = list(zip(Xtrains,Ytrains))
    test = list(zip(Xtest,Ytest))
    random.shuffle(train, seed)
    random.shuffle(test, seed)   
    # splitting into numsplits and shuffling 
    split_train = divide(train, numsplits)
    split_test = divide(test, numsplits)
    for split in split_train:
        random.shuffle(split, seed)
    for split in split_test:
        random.shuffle(split, seed)
    if val_split:
        val = list(zip(Xval,Yval))
        random.shuffle(val, seed)
        split_val = divide(val, numsplits)
        for split in split_val:
            random.shuffle(split, seed)

    # unzipping and making list of lists
    Xtrains = []
    Ytrains = []
    Xtests = []
    Ytests = []
    for i in range(numsplits):
        # you can use zip(*iterable) to unzip split_train[i]
        Xtrains.append([train[0] for train in split_train[i]])
        Ytrains.append([train[1] for train in split_train[i]])
        Xtests.append(np.array([test[0] for test in split_test[i]]))
        Ytests.append([test[1] for test in split_test[i]])
    if val_split:
        Xvals = []
        Yvals = []
        for i in range(numsplits):
            Xvals.append([val[0] for val in split_val[i]])
            Yvals.append([val[1] for val in split_val[i]])
        return Xtrains, Ytrains, Xtests, Ytests, Xvals, Yvals
    return Xtrains, Ytrains, Xtests, Ytests

def split_labels(df, col_name):
    df = df.copy()
    df[col_name] = df[col_name].str.split(', ')
    mlb = MultiLabelBinarizer()
    y = mlb.fit_transform(df[col_name].to_list())
    return y, mlb.classes_

def generate_chemprop_data(labeled_df, excluded_labels, dir, train_split=0.8, val_split=0.1, holdout_split=0.2):
    '''
    Class-balanced split of train, test, validation, held out, non-held out, and all .csv files from a dataset
    '''
    if not os.path.isdir(dir):
        os.makedirs(dir)
    multilabel_multiclass_drugs = labeled_df.copy()
    multilabel_multiclass_drugs.replace('DNA','nucleic acid', inplace=True)
    multilabel_multiclass_drugs.replace('protein','protein biosynthesis', inplace=True)
    mask = [label not in excluded_labels for label in multilabel_multiclass_drugs['labels']]
    masked_multilabel_df = multilabel_multiclass_drugs[mask]
    Y, classes = split_labels(masked_multilabel_df, 'labels')
    classes = [label.replace(' ','_') for label in classes]
    X = masked_multilabel_df['canon_SMILES'].to_list()
    Xcp, Ycp, Xhold, Yhold = class_balanced_split(X, Y, n_classes=len(classes), train_split=1-holdout_split, numsplits=1)
    Xtrains, Ytrains, Xtests, Ytests, Xvals, Yvals = class_balanced_split(Xcp[0], Ycp[0], n_classes=len(classes), train_split=train_split, val_split=val_split, numsplits=1)
    
    train_df = pd.DataFrame()
    test_df = pd.DataFrame()
    val_df = pd.DataFrame()
    hold_df = pd.DataFrame()
    train_df['smiles'] = Xtrains[0]
    test_df['smiles'] = Xtests[0]
    val_df['smiles'] = Xvals[0]
    hold_df['smiles'] = Xhold[0]
    for i in range(len(classes)):
        train_df[classes[i]] = np.vstack(Ytrains[0])[:,i]
        test_df[classes[i]] = np.vstack(Ytests[0])[:,i]
        val_df[classes[i]] = np.vstack(Yvals[0])[:,i]
        hold_df[classes[i]] = np.vstack(Yhold[0])[:,i]
    print(' '.join(classes))
    train_df.to_csv(dir+'train.csv',index=False)
    test_df.to_csv(dir+'test.csv',index=False)
    val_df.to_csv(dir+'val.csv',index=False)
    hold_df.to_csv(dir+'holdout.csv',index=False)
    hold_df['smiles'].to_csv(dir+'holdout_smiles_only.csv',index=False)
    all_df = pd.concat([train_df,test_df,val_df,hold_df])
    all_df.to_csv(dir+'all.csv',index=False)
    nonholdout_df = pd.concat([train_df,test_df,val_df])
    nonholdout_df.to_csv(dir+'nonholdout.csv',index=False)

# Part 1: Read in data

In [3]:
# get morgan fingerprints for all
data_dir = '../data/'
file_name = 'abx_MOA_fully_relabeled_v2.csv'
file_path = data_dir+file_name

drugs = pd.read_csv(file_path)
print('initial length:',len(drugs))
not_nans = [type(smi) != float for smi in list(drugs['canon_SMILES'])]
drugs = drugs[not_nans] # filter for having SMILE
print('filtered for SMILES:',len(drugs))

# multilabel multiclass
labeled = [moa != 'unknown' for moa in drugs['labels']]
labeled_drugs = drugs[labeled] # filter labeled drugs i.e. not unknown
multilabel_multiclass_drugs = labeled_drugs.copy()
multilabel_multiclass_drugs.replace('DNA','nucleic acid', inplace=True)
multilabel_multiclass_drugs.replace('protein','protein biosynthesis', inplace=True)
# single label multiclass
labeled_drugs = multiple_labels(labeled_drugs,'labels') # dealing with multiple MOA
labeled_drugs.replace('DNA','nucleic acid', inplace=True)
labeled_drugs.replace('protein','protein biosynthesis', inplace=True)

initial length: 2857
filtered for SMILES: 2856


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col_name] = df[col_name].str.split(', ')


In [4]:
# generating chemprop train, test, val, non-held out (used for hyperparameter optimization),
# held out (used for test set when hyperparamter optimizing) .csv files
dir = '../data/chemprop_nonhuman_multilabel_inputs/'
excluded_labels = ['human']
generate_chemprop_data(multilabel_multiclass_drugs,excluded_labels,dir)

CoA_synthesis PMF cell_division cell_wall membrane nucleic_acid oxidative_stress protein_biosynthesis


# Part 2: Generating E3FPs

In [5]:
# loading smiles, parameters
dir = '../data/chemprop_nonhuman_multilabel_inputs/'
test_df = pd.read_csv(dir+'all.csv')
smiles = test_df['smiles'].to_list()

confgen_params, fprint_params = params_to_dicts(default_params)
confgen_params = {'num_conf': -1, 'first': -1, 'pool_multiplier': 1, 'rmsd_cutoff': 0.5, 'max_energy_diff': None, 'forcefield': 'uff', 'out_dir': 'conformers', 'compress': 2, 'seed': -1, 'standardise': False}

In [None]:
# generating conformers in parallel
smiles_dict = {str(i): smiles[i] for i in range(len(smiles))}
smiles_iter = ((smiles, name) for name, smiles in smiles_dict.items())
kwargs = {"confgen_params": confgen_params, "fprint_params": fprint_params}
parallelizer = Parallelizer(parallel_mode="processes")
%time fprints_list = parallelizer.run(fprints_from_smiles, smiles_iter, kwargs=kwargs) 
len(fprints_list) 

In [None]:

fprints_list

In [None]:
from joblib import dump, load
import pickle
filehandler = open('../data/training_e3fps', 'w') 
pickle.dump(fprints_list, filehandler)
dump(fprints_list,'../data/training_e3fps.joblib')