<a href="https://colab.research.google.com/github/cybertraining-dsc/sp21-599-359/blob/develop/project/code/predicting_molecular_activity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Prerequisite Packages

In [24]:
# prerequisites
!pip3 install cloudmesh-installer
!pip3 install cloudmesh-common
!pip3 install kaggle



# Sign Up for Kaggle Account and Generate API Token

1. Sign Up for Kaggle account at https://www.kaggle.com. 

2. In Kaggle 'Profile'->'Account', generate api token by clicking 'Create New API Token'.

# Upload Token into Colab
3. Upload json file, with Kaggle api token and username, into Colab by running code below. Select downloaded 'kaggle.json' file when prompted and click 'Uplaod'.

NB third-party cookies should be enabled for upload to work. 

In [25]:
from google.colab import files

# upload json file with api token and username  
files.upload()

# create directory for token in Colab 
!mkdir ~/.kaggle

# move file to directory
!mv kaggle.json ~/.kaggle/

# modify permissions on directory
!chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json
mkdir: cannot create directory ‘/root/.kaggle’: File exists


# Download Merck Molecular Activity Challenge Dataset 

In [27]:
# download dataset
!kaggle competitions download -c MerckActivity

# make train and test directories for raw data
!mkdir -p /content/raw/train /content/raw/test

# unzip train and test datasets into respective folders; only overwrite exiting files if extracted files have newer timestamp, else skip 
!unzip /content/TrainingSet.zip -u -d /content/raw/train

!unzip /content/TestSet.zip -u -d /content/raw/test

# paths to raw data
path_to_raw_train_data = '/content/raw/train/TrainingSet/'
path_to_raw_test_data = '/content/raw/test/TestSet/'

TrainingSet.7z: Skipping, found more recently modified local copy (use --force to force download)
ntree20_benchmark.R: Skipping, found more recently modified local copy (use --force to force download)
TestSet.7z: Skipping, found more recently modified local copy (use --force to force download)
Rsquared.R: Skipping, found more recently modified local copy (use --force to force download)
ntree20_basicBenchmark.csv.zip: Skipping, found more recently modified local copy (use --force to force download)
TestSet.zip: Skipping, found more recently modified local copy (use --force to force download)
TrainingSet.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  /content/TrainingSet.zip
caution: filename not matched:  -u
Archive:  /content/TestSet.zip
caution: filename not matched:  -u


#Preprocess Dataset

In [28]:
import pandas as pd

# make train and test directories for preprocessed data
!mkdir -p /content/preprocessed/train /content/preprocessed/test 

# paths to processed data
path_to_preprocessed_train_data = '/content/preprocessed/train/'
path_to_preprocessed_test_data = '/content/preprocessed/test/'

# cycle through 15 data sets preprocessing them for learning
StopWatch.start("preprocessing")
dataset_file_no = 1

while dataset_file_no <= 15:

    dataset_train_file_name = 'ACT' + str(dataset_file_no) + '_competition_training.csv'
    dataset_test_file_name = 'ACT' + str(dataset_file_no) + '_competition_test.csv'

    train_filename = path_to_raw_train_data + dataset_train_file_name
    test_filename =  path_to_raw_test_data + dataset_test_file_name

    train_filename_processed = path_to_preprocessed_train_data + dataset_train_file_name
    test_filename_processed = path_to_preprocessed_test_data + dataset_test_file_name

    print ('Preprocessing dataset ', 'ACT' + str(dataset_file_no))

    train = pd.read_csv(train_filename)
    test = pd.read_csv(test_filename)

    print (len(train.columns.values))
    print (len(test.columns.values))

    train_inx_set = set(train.columns.values)
    test_inx_set = set(test.columns.values)

    # remove molecule label and columns that are not common to both training and test sets
    train_inx = [inx for inx in train.columns.values if inx in set.intersection(train_inx_set, test_inx_set)]
    test_inx = [inx for inx in test.columns.values if inx in set.intersection(train_inx_set, test_inx_set)]

    train_inx.insert(0,'Act')
    train_inx.remove('MOLECULE')
    test_inx.remove('MOLECULE')

    #print (train_inx)
    #print (test_inx)

    train = train[train_inx]
    test = test[test_inx]

    #print (train.shape)
    #print (test.shape) 

    # save data to csv
    train.to_csv(train_filename_processed, index=False)
    test.to_csv(test_filename_processed, index=False)
    
    print(train.head(5))
    print ('Preprocessing dataset ', 'ACT' + str(dataset_file_no), ' complete')
    
    dataset_file_no += 1
StopWatch.stop("preprocessing")

StopWatch.benchmark()

Preprocessing dataset  ACT1
9493
9492
      Act  D_3  D_4  D_5  D_6  ...  D_11068  D_11070  D_11074  D_11076  D_11078
0  6.0179    0    0    0    0  ...        0        0        0        0        0
1  4.3003    0    0    0    0  ...        0        0        0        0        0
2  5.2697    0    0    0    0  ...        0        0        0        0        0
3  6.1797    0    0    0    0  ...        0        0        0        0        0
4  4.3003    0    0    0    0  ...        0        0        0        0        0

[5 rows x 9492 columns]
Preprocessing dataset  ACT1  complete
Preprocessing dataset  ACT2
5879
5878
      Act  D_3  D_4  D_5  D_7  ...  D_11048  D_11049  D_11051  D_11056  D_11072
0  6.7153    0    0    0    0  ...        0        0        0        0        0
1  6.4912    0    0    0    0  ...        0        0        0        0        0
2  5.8528    0    0    0    0  ...        0        0        0        0        0
3  6.3854    0    0    0    0  ...        0        0        0

# Predicting Molecular Activity

In [29]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import pandas as pd
import keras.backend as K
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, InputLayer, Concatenate
from keras.utils import to_categorical, plot_model
from keras.utils.vis_utils import model_to_dot
from keras.optimizers import Adam
from cloudmesh.common.StopWatch import StopWatch

# make outputs directory
!mkdir -p /content/outputs/

path_to_train_data = '/content/preprocessed/train/'
path_to_test_data = '/content/preprocessed/test/'
path_to_outputs = '/content/outputs/'

# define parameters
hidden_units = 512
dropout = 0.45
BATCH_SIZE = 128
feature_dim = 128
opti = Adam(lr=0.0001, beta_1=0.5)

# define fully connected network/MLP
def fcn_model(input_shape=(feature_dim,)):
    model = Sequential()
    model.add(Activation('relu'))
    model.add(Dropout(dropout))
    model.add(Dense(hidden_units))
    model.add(Activation('relu'))
    model.add(Dropout(dropout))
    model.add(Dense(hidden_units))
    model.add(Activation('relu'))
    model.add(Dropout(dropout))
    model.add(Dense(hidden_units))
    model.add(Activation('relu'))
    model.add(Dropout(0.10))
    model.add(Dense(num_labels))
    model.add(Activation('softmax'))
    model.build(input_shape)
    model.summary()
    model.compile(loss='mean_squared_error', optimizer=opti, metrics=[Rsqured])
    print("\nTraining on dataset number", act_ds," of 15:\n")
    model.fit(x_train, y_train, epochs=15, batch_size=BATCH_SIZE)
    loss, R2 = model.evaluate(x_test,y_test,batch_size=BATCH_SIZE)
    print("\nCorrelation coefficient:", R2)
    return model

# define correlation coefficient (R^2) formula
def Rsqured(x,y):
    
    # approach adopted from RuwanT
    # URL: https://github.com/RuwanT/merck/blob/master/main.py

    x = K.batch_flatten(x)
    y = K.batch_flatten(y)

    avx = K.mean(x)
    avy = K.mean(y)

    num = K.sum((x-avx) * (y-avy))
    num = num * num

    denom = K.sum((x-avx)*(x-avx)) * K.sum((y-avy)*(y-avy))

    return num/denom

# iterate through 15 distinct datasets of high throughput screening (HTS) assays
StopWatch.start("train-evaluate-predict")    
act_ds = 1
while act_ds <= 15:
    print("\nReading from dataset", act_ds,"of 15:\n")
    data_train_main = pd.read_csv(path_to_train_data + 'ACT' + str(act_ds) + '_competition_training.csv')
    data_ac = pd.read_csv(path_to_test_data + 'ACT' + str(act_ds) + '_competition_test.csv')

    # split each of the datasets into set for training (80%), set for testing/evluation (10%), and 
    # set for use with validating prediction (10%)
    data_train = data_train_main.sample(frac = 0.8)
    data_test = data_train_main.drop(data_train.index)
    data_prediction = data_test.sample(frac = 0.5)
    data_test = data_test.drop(data_prediction.index)

    activity_inx = data_train.columns.get_loc('Act')
    feature_dim = data_train.shape[1] - (activity_inx+1)
    #print("no. of feature columns:", feature_dim)
    
    # identify molecular activity labels
    y_train = data_train['Act']
    #print("shape of y_train:", y_train.shape)
    num_labels = len(np.unique(y_train))
    #print("no. of unique labels:", num_labels)
    y_test = data_test['Act']

    # identify and filter for feature-set/molecular-substructure frequencies
    train_set_inx = set(data_train.columns.values)
    test_set_inx = set(data_ac.columns.values)

    train_inx = [inx for inx in data_train.columns.values if inx in set.intersection(train_set_inx, test_set_inx)]
    test_inx = [inx for inx in data_test.columns.values if inx in set.intersection(train_set_inx, test_set_inx)]
    predict_inx = [inx for inx in data_prediction.columns.values if inx in set.intersection(train_set_inx, test_set_inx)]

    data_train = data_train[train_inx]
    data_test = data_test[test_inx]
    data_prediction = data_prediction[predict_inx]
    #print(data_train.head(5))

    # format feature-set input data
    x_train = data_train[0:]
    x_train = np.asarray(x_train).astype('float32')
    x_test = data_test[0:]
    x_test = np.asarray(x_test).astype('float32')
    x_predict = data_prediction[0:]
    x_predict = np.asarray(x_predict).astype('float32')

    input_size = x_train.shape[0]

    
    # call fully connected network for learning and evaluation. Predict biological activity from finalized model 
    y_predict = fcn_model(input_shape=(input_size,feature_dim)).predict(x_predict,batch_size=BATCH_SIZE)
    
    # display inputs and outputs for the prediction step and write results to text file

    prediction_file = open(path_to_outputs +"molecular_activity_prediction.txt","a")

    for i in range(len(x_predict)):
      print("Substructure/feature frequencies=%s, Predicted biological activity=%s" % (x_predict[i], np.mean(y_predict[i])))
      prediction_file.write(str(x_predict[i])+","+str(np.mean(y_predict[i]))+"\n")
    act_ds +=1
    prediction_file.close()
StopWatch.stop("train-evaluate-predict")

StopWatch.benchmark()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Substructure/feature frequencies=[0. 0. 0. ... 0. 0. 0.], Predicted biological activity=0.0003244646
Substructure/feature frequencies=[0. 0. 0. ... 0. 0. 0.], Predicted biological activity=0.0003244646
Substructure/feature frequencies=[0. 0. 0. ... 0. 0. 0.], Predicted biological activity=0.00032446464
Substructure/feature frequencies=[0. 0. 0. ... 0. 0. 0.], Predicted biological activity=0.00032446464
Substructure/feature frequencies=[0. 0. 0. ... 0. 0. 0.], Predicted biological activity=0.00032446464
Substructure/feature frequencies=[0. 0. 0. ... 0. 0. 0.], Predicted biological activity=0.00032446464
Substructure/feature frequencies=[0. 0. 0. ... 0. 0. 0.], Predicted biological activity=0.00032446464
Substructure/feature frequencies=[0. 0. 0. ... 0. 0. 0.], Predicted biological activity=0.00032446464
Substructure/feature frequencies=[0. 0. 0. ... 0. 0. 0.], Predicted biological activity=0.00032446464
Substructure/featur