<a href="https://colab.research.google.com/github/cybertraining-dsc/sp21-599-359/blob/develop/project/code/predicting_molecular_activity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Prerequisite Packages

In [1]:
!pip3 install cloudmesh-installer
!pip3 install cloudmesh-common
!pip3 install kaggle

Collecting cloudmesh-installer
  Downloading https://files.pythonhosted.org/packages/70/9a/599cb93400ea614646e1577649a7355fbcc4b308becf9c55bc89b3c781fc/cloudmesh_installer-4.4.28-py2.py3-none-any.whl
Collecting bump2version==1.0.0
  Downloading https://files.pythonhosted.org/packages/61/10/560509d9bfe8300e03d268dadec74fac7ae04a430f62e2d06d11d9e4e704/bump2version-1.0.0-py2.py3-none-any.whl
Collecting cloudmesh-common
[?25l  Downloading https://files.pythonhosted.org/packages/08/b9/60e838cd76b05e1991ffed2d1387c461a2fefd1e0aa09b230bff0624ff69/cloudmesh_common-4.3.65-py2.py3-none-any.whl (80kB)
[K     |████████████████████████████████| 81kB 2.8MB/s 
Collecting pytest-cov
  Downloading https://files.pythonhosted.org/packages/e3/1a/6affecd2344efee7f2487fac82242474cbac09f9e04929da5944907baf11/pytest_cov-2.11.1-py2.py3-none-any.whl
Collecting pipdeptree
  Downloading https://files.pythonhosted.org/packages/fa/22/8f1350b55e4297670813815142425b58829036197f0b4a0fc8f543928717/pipdeptree-2.0.0-py

# Sign Up for Kaggle Account and Generate API Token

1. Sign Up for Kaggle account at https://www.kaggle.com. 

2. In Kaggle 'Profile'->'Account', generate api token by clicking 'Create New API Token'.

# Upload Token into Colab
3. Upload json file, with Kaggle api token and username, into Colab by running code below. Select downloaded 'kaggle.json' file when prompted and click 'Uplaod'.

NB third-party cookies should be enabled for upload to work. 

In [2]:
from google.colab import files

# upload json file with api token and username  
files.upload()

# create directory for token in Colab 
!mkdir ~/.kaggle

# move file to directory
!mv kaggle.json ~/.kaggle/

# modify permissions on directory
!chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json


# Download Merck Molecular Activity Challenge Dataset 

In [3]:
# download dataset
!kaggle competitions download -c MerckActivity

# make train and test directories for raw data
!mkdir -p /content/raw/train /content/raw/test

# unzip train and test datasets into respective folders
!unzip /content/TrainingSet.zip -d /content/raw/train

!unzip /content/TestSet.zip -d /content/raw/test

# paths to raw data
path_to_raw_train_data = '/content/raw/train/TrainingSet/'
path_to_raw_test_data = '/content/raw/test/TestSet/'

Downloading TrainingSet.7z to /content
 98% 33.0M/33.6M [00:00<00:00, 80.6MB/s]
100% 33.6M/33.6M [00:00<00:00, 164MB/s] 
Downloading ntree20_benchmark.R to /content
  0% 0.00/1.33k [00:00<?, ?B/s]
100% 1.33k/1.33k [00:00<00:00, 1.29MB/s]
Downloading TrainingSet.zip to /content
 94% 50.0M/53.1M [00:00<00:00, 70.6MB/s]
100% 53.1M/53.1M [00:00<00:00, 120MB/s] 
Downloading Rsquared.R to /content
  0% 0.00/523 [00:00<?, ?B/s]
100% 523/523 [00:00<00:00, 484kB/s]
Downloading TestSet.zip to /content
 28% 5.00M/18.0M [00:00<00:00, 50.3MB/s]
100% 18.0M/18.0M [00:00<00:00, 115MB/s] 
Downloading ntree20_basicBenchmark.csv.zip to /content
  0% 0.00/499k [00:00<?, ?B/s]
100% 499k/499k [00:00<00:00, 164MB/s]
Downloading TestSet.7z to /content
 78% 9.00M/11.5M [00:00<00:00, 42.8MB/s]
100% 11.5M/11.5M [00:00<00:00, 45.8MB/s]
Archive:  /content/TrainingSet.zip
   creating: /content/raw/train/TrainingSet/
  inflating: /content/raw/train/TrainingSet/ACT10_competition_training.csv  
  inflating: /content/r

#Preprocess Dataset

In [24]:
import pandas as pd

# make train and test directories for preprocessed data
!mkdir -p /content/preprocessed/train /content/preprocessed/test

# paths to processed data
path_to_preprocessed_train_data = '/content/preprocessed/train/'
path_to_preprocessed_test_data = '/content/preprocessed/test/'

# Remove molecule substructures (features) that are not common to both training and test sets.
dataset_file_no = 1

while dataset_file_no <= 15:

    dataset_train_file_name = 'ACT' + str(dataset_file_no) + '_competition_training.csv'
    dataset_test_file_name = 'ACT' + str(dataset_file_no) + '_competition_test.csv'

    train_filename = path_to_raw_train_data + dataset_train_file_name
    test_filename =  path_to_raw_test_data + dataset_test_file_name

    train_filename_processed = path_to_preprocessed_train_data + dataset_train_file_name
    test_filename_processed = path_to_preprocessed_test_data + dataset_test_file_name

    print ('Preprocessing dataset ', 'ACT' + str(dataset_file_no))

    train = pd.read_csv(train_filename)
    test = pd.read_csv(test_filename)

    print (len(train.columns.values))
    print (len(test.columns.values))

    train_inx_set = set(train.columns.values)
    test_inx_set = set(test.columns.values)

    # remove molecule label and columns that are not common to both training and test sets
    train_inx = [inx for inx in train.columns.values if inx in set.intersection(train_inx_set, test_inx_set)]
    test_inx = [inx for inx in test.columns.values if inx in set.intersection(train_inx_set, test_inx_set)]

    train_inx.insert(0,'Act')
    train_inx.remove('MOLECULE')
    test_inx.remove('MOLECULE')

    #print (train_inx)
    #print (test_inx)

    train = train[train_inx]
    test = test[test_inx]

    #print (train.shape)
    #print (test.shape) 

    # save data to csv
    train.to_csv(train_filename_processed, index=False)
    test.to_csv(test_filename_processed, index=False)
    
    print(train.head(5))
    print ('Preprocessing dataset ', 'ACT' + str(dataset_file_no), ' complete')
    
    dataset_file_no += 1



Preprocessing dataset  ACT1
9493
9492
      Act  D_3  D_4  D_5  D_6  ...  D_11068  D_11070  D_11074  D_11076  D_11078
0  6.0179    0    0    0    0  ...        0        0        0        0        0
1  4.3003    0    0    0    0  ...        0        0        0        0        0
2  5.2697    0    0    0    0  ...        0        0        0        0        0
3  6.1797    0    0    0    0  ...        0        0        0        0        0
4  4.3003    0    0    0    0  ...        0        0        0        0        0

[5 rows x 9492 columns]
Done dataset  ACT1
Preprocessing dataset  ACT2
5879
5878
      Act  D_3  D_4  D_5  D_7  ...  D_11048  D_11049  D_11051  D_11056  D_11072
0  6.7153    0    0    0    0  ...        0        0        0        0        0
1  6.4912    0    0    0    0  ...        0        0        0        0        0
2  5.8528    0    0    0    0  ...        0        0        0        0        0
3  6.3854    0    0    0    0  ...        0        0        0        0        0


# Predicting Molecular Activity

In [37]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import pandas as pd
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, InputLayer, Concatenate
from keras.utils import to_categorical, plot_model
from keras.utils.vis_utils import model_to_dot
from keras.optimizers import Adam
#from cloudmesh.common.StopWatch import StopWatch

path_to_train_data = '/content/preprocessed/train/'
path_to_test_data = '/content/preprocessed/test/'

#Define parameters
hidden_units = 512
dropout = 0.45
BATCH_SIZE = 128
feature_dim = 128
opti = Adam(lr=0.0001, beta_1=0.5)

# Define fully connected network/MLP
def fcn_model(input_shape=(feature_dim,)):
    model = Sequential()
    model.add(Activation('relu'))
    model.add(Dropout(dropout))
    model.add(Dense(hidden_units))
    model.add(Activation('relu'))
    model.add(Dropout(dropout))
    model.add(Dense(hidden_units))
    model.add(Activation('relu'))
    model.add(Dropout(dropout))
    model.add(Dense(hidden_units))
    model.add(Activation('relu'))
    model.add(Dropout(0.10))
    model.add(Dense(num_labels))
    model.add(Activation('softmax'))
    model.build(input_shape)
    model.summary()
    model.compile(loss='mean_squared_error', optimizer=opti, metrics=[Rsqured])
    print("\nTraining on dataset number", act_ds," of 15:\n")
    model.fit(x_train, y_train, epochs=5, batch_size=BATCH_SIZE)
    return model

def Rsqured(x,y):
    x = K.batch_flatten(x)
    y = K.batch_flatten(y)

    avx = K.mean(x)
    avy = K.mean(y)

    num = K.sum((x-avx) * (y-avy))
    num = num * num

    denom = K.sum((x-avx)*(x-avx)) * K.sum((y-avy)*(y-avy))

    return num/denom

act_ds = 1
while act_ds <= 15:
    print("\nReading from dataset number", act_ds," of 15:\n")
    data_train = pd.read_csv(path_to_train_data + 'ACT' + str(act_ds) + '_competition_training.csv')
    data_test = pd.read_csv(path_to_test_data + 'ACT' + str(act_ds) + '_competition_test.csv')

    activity_inx = data_train.columns.get_loc('Act')
    feature_dim = data_train.shape[1] - (activity_inx+1)
    #print("no. of feature columns:", feature_dim)
    
    y_train = data_train['Act']
    #print("shape of y_train:", y_train.shape)
    num_labels = len(np.unique(y_train))
    #print("no. of unique labels:", num_labels)


    train_set_inx = set(data_train.columns.values)
    test_set_inx = set(data_test.columns.values)

    train_inx = [inx for inx in data_train.columns.values if inx in set.intersection(train_set_inx, test_set_inx)]
    test_inx = [inx for inx in data_test.columns.values if inx in set.intersection(train_set_inx, test_set_inx)]

    data_train = data_train[train_inx]
    data_test = data_test[test_inx]
    #print(data_train.head(5))
    x_train = data_train[0:]
    input_size = x_train.shape[0]
    x_train = np.asarray(x_train).astype('float32')
    #print("shape of x_train:", x_train.shape)
    
    
    fcn_model(input_shape=(input_size,feature_dim))
    act_ds +=1


Reading from dataset number  1 :
shape of x_train: (37241, 9491)
Model: "sequential_23"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
activation_115 (Activation)  (37241, 9491)             0         
_________________________________________________________________
dropout_92 (Dropout)         (37241, 9491)             0         
_________________________________________________________________
dense_92 (Dense)             (37241, 512)              4859904   
_________________________________________________________________
activation_116 (Activation)  (37241, 512)              0         
_________________________________________________________________
dropout_93 (Dropout)         (37241, 512)              0         
_________________________________________________________________
dense_93 (Dense)             (37241, 512)              262656    
______________________________________________________