# Mechanisms of Action (MoA) Prediction

In [1]:
# Initial random imports
import random
import os
import copy
import warnings
# warnings.filterwarnings('ignore')

# Importing numpy
import numpy as np

# Importing pandas
import pandas as pd

# Importing matplotlib
import matplotlib.pyplot as plt

# Importing seaborn
import seaborn as sns

# Importing sklearn
from sklearn import preprocessing
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold

# Importing pytorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [2]:
# Seed every random number generator the notebook relies on, so that repeated
# runs start from the same state.

def set_seed_characteristics(seed=55):
    """Seed Python, NumPy and PyTorch and pin cuDNN to deterministic behaviour.

    Args:
        seed: Integer seed applied to every generator (defaults to 55).
    """
    # Python's built-in pseudo-random generator.
    random.seed(seed)

    # NumPy's legacy global generator.
    np.random.seed(seed)

    # PyTorch generators; the CUDA call is a silent no-op without a GPU.
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Ask cuDNN for deterministic kernels, and switch the autotuner off as
    # well: with `benchmark` enabled cuDNN may pick a different algorithm from
    # run to run, which would defeat the `deterministic` flag above.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Reading the CSV Files

In [3]:
# Load the three input tables from the local `input/` directory.
train_features = pd.read_csv('input/train_features.csv')
train_targets_scored = pd.read_csv('input/train_targets_scored.csv')
test_features = pd.read_csv('input/test_features.csv')

# Keep the first rows of each table so the next cells can display them:
# training features, scored training targets and test features respectively.
train_features_head = train_features.head()
train_targets_scored_head = train_targets_scored.head()
test_features_head = test_features.head()

In [4]:
# Show the first rows of the training features as this cell's output.
train_features_head

Unnamed: 0,sig_id,cp_type,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,...,c-90,c-91,c-92,c-93,c-94,c-95,c-96,c-97,c-98,c-99
0,id_000644bb2,trt_cp,24,D1,1.062,0.5577,-0.2479,-0.6208,-0.1944,-1.012,...,0.2862,0.2584,0.8076,0.5523,-0.1912,0.6584,-0.3981,0.2139,0.3801,0.4176
1,id_000779bfc,trt_cp,72,D1,0.0743,0.4087,0.2991,0.0604,1.019,0.5207,...,-0.4265,0.7543,0.4708,0.023,0.2957,0.4899,0.1522,0.1241,0.6077,0.7371
2,id_000a6266a,trt_cp,48,D1,0.628,0.5817,1.554,-0.0764,-0.0323,1.239,...,-0.725,-0.6297,0.6103,0.0223,-1.324,-0.3174,-0.6417,-0.2187,-1.408,0.6931
3,id_0015fd391,trt_cp,48,D1,-0.5138,-0.2491,-0.2656,0.5288,4.062,-0.8095,...,-2.099,-0.6441,-5.63,-1.378,-0.8632,-1.288,-1.621,-0.8784,-0.3876,-0.8154
4,id_001626bd3,trt_cp,72,D2,-0.3254,-0.4009,0.97,0.6919,1.418,-0.8244,...,0.0042,0.0048,0.667,1.069,0.5523,-0.3031,0.1094,0.2885,-0.3786,0.7125


In [5]:
# Show the first rows of the scored training targets as this cell's output.
train_targets_scored_head

Unnamed: 0,sig_id,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,id_000644bb2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,id_000779bfc,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,id_000a6266a,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,id_0015fd391,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,id_001626bd3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Show the first rows of the test features as this cell's output.
test_features_head

Unnamed: 0,sig_id,cp_type,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,...,c-90,c-91,c-92,c-93,c-94,c-95,c-96,c-97,c-98,c-99
0,id_0004d9e33,trt_cp,24,D1,-0.5458,0.1306,-0.5135,0.4408,1.55,-0.1644,...,0.0981,0.7978,-0.143,-0.2067,-0.2303,-0.1193,0.021,-0.0502,0.151,-0.775
1,id_001897cda,trt_cp,72,D1,-0.1829,0.232,1.208,-0.4522,-0.3652,-0.3319,...,-0.119,-0.1852,-1.031,-1.367,-0.369,-0.5382,0.0359,-0.4764,-1.381,-0.73
2,id_002429b5b,ctl_vehicle,24,D1,0.1852,-0.1404,-0.3911,0.131,-1.438,0.2455,...,-0.2261,0.337,-1.384,0.8604,-1.953,-1.014,0.8662,1.016,0.4924,-0.1942
3,id_00276f245,trt_cp,24,D2,0.4828,0.1955,0.3825,0.4244,-0.5855,-1.202,...,0.126,0.157,-0.1784,-1.12,-0.4325,-0.9005,0.8131,-0.1305,0.5645,-0.5809
4,id_0027f1083,trt_cp,48,D1,-0.3979,-1.268,1.913,0.2057,-0.5864,-0.0166,...,0.4965,0.7578,-0.158,1.051,0.5742,1.09,-0.2962,-0.5313,0.9931,1.838


In [7]:
# PyTorch map-style dataset pairing each feature row with its target row
class MoADataset(torch.utils.data.Dataset):
    """Map-style dataset yielding one feature row and its matching target row.

    Args:
        features: 2-D array indexable as ``features[idx, :]`` (e.g. a NumPy
            array), one row per sample.
        targets: 2-D array with one row per row of ``features``.

    Raises:
        ValueError: If ``features`` and ``targets`` have different row counts.
    """

    def __init__(self, features, targets):
        # Fail early: a longer `targets` would otherwise go unnoticed and a
        # shorter one would only raise part-way through an epoch.
        if features.shape[0] != targets.shape[0]:
            raise ValueError(
                'features and targets must have the same number of rows, got '
                f'{features.shape[0]} and {targets.shape[0]}'
            )
        self.features = features
        self.targets = targets

    def __len__(self):
        """Return the number of samples (rows of ``features``)."""
        return self.features.shape[0]

    def __getitem__(self, idx):
        """Return ``{'x': features[idx], 'y': targets[idx]}`` as float tensors."""
        return {
            'x': torch.tensor(self.features[idx, :], dtype=torch.float),
            'y': torch.tensor(self.targets[idx, :], dtype=torch.float),
        }

# PyTorch map-style dataset for unlabeled (test) feature rows
class TestDataset(torch.utils.data.Dataset):
    """Map-style dataset yielding feature rows only (no targets).

    Args:
        features: 2-D array indexable as ``features[idx, :]`` (e.g. a NumPy
            array), one row per sample.
    """

    def __init__(self, features):
        self.features = features

    def __len__(self):
        """Return the number of samples (rows of ``features``)."""
        return self.features.shape[0]

    def __getitem__(self, idx):
        """Return ``{'x': features[idx]}`` as a float tensor."""
        return {
            'x': torch.tensor(self.features[idx, :], dtype=torch.float),
        }

In [8]:
# Pytorch model for the MoA determination

class Model(nn.Module):
    """Feed-forward network built from three BatchNorm -> Dropout -> Linear stages.

    The first two stages are followed by a ReLU; the last stage returns its
    linear output unchanged (no activation is applied inside the model).
    """

    def __init__(self, num_features, num_targets, hidden_size):
        """Create the layers that ``forward`` chains together.

        Args:
            num_features: Width of the input vectors.
            num_targets: Width of the output vectors.
            hidden_size: Width of both hidden layers.
        """
        super().__init__()

        # Stage 1: num_features -> hidden_size.
        # BatchNorm1d standardises each feature over the mini-batch, which
        # tends to stabilise and speed up training. Dropout zeroes individual
        # activations at random (here with probability 0.2) while the model is
        # in training mode, as a regulariser against co-adapted units.
        # weight_norm re-parameterises the linear layer's weight into a
        # magnitude and a direction.
        self.batch_norm1 = nn.BatchNorm1d(num_features)
        self.dropout1 = nn.Dropout(0.2)
        first_linear = nn.Linear(num_features, hidden_size)
        self.dense1 = nn.utils.weight_norm(first_linear)

        # Stage 2: hidden_size -> hidden_size.
        self.batch_norm2 = nn.BatchNorm1d(hidden_size)
        self.dropout2 = nn.Dropout(0.2)
        second_linear = nn.Linear(hidden_size, hidden_size)
        self.dense2 = nn.utils.weight_norm(second_linear)

        # Stage 3: hidden_size -> num_targets, with a slightly higher dropout.
        self.batch_norm3 = nn.BatchNorm1d(hidden_size)
        self.dropout3 = nn.Dropout(0.25)
        output_linear = nn.Linear(hidden_size, num_targets)
        self.dense3 = nn.utils.weight_norm(output_linear)

    def forward(self, x):
        """Run a batch ``x`` through the three stages and return the raw outputs."""
        hidden = F.relu(self.dense1(self.dropout1(self.batch_norm1(x))))
        hidden = F.relu(self.dense2(self.dropout2(self.batch_norm2(hidden))))
        return self.dense3(self.dropout3(self.batch_norm3(hidden)))

In [9]:
# Function to train the model

def train_fn(model, optimizer, scheduler, loss_fn, dataloader, device):
    """Run one training pass over ``dataloader`` and return the mean batch loss.

    Args:
        model: Module to train; it is switched to training mode.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler, also stepped once per batch.
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar tensor.
        dataloader: Sized iterable of dicts with ``'x'`` and ``'y'`` tensors.
        device: Device the batch tensors are moved to.

    Returns:
        Sum of the per-batch losses divided by ``len(dataloader)``.
    """
    model.train()
    running_loss = 0

    for batch in dataloader:
        optimizer.zero_grad()

        features = batch['x'].to(device)
        labels = batch['y'].to(device)

        batch_loss = loss_fn(model(features), labels)
        batch_loss.backward()

        optimizer.step()
        # The schedule advances per batch, not per epoch.
        scheduler.step()

        running_loss += batch_loss.item()

    return running_loss / len(dataloader)

In [10]:
# Function to validate the model

def valid_fn(model, loss_fn, dataloader, device):
    """Evaluate ``model`` on ``dataloader`` without tracking gradients.

    Args:
        model: Module to evaluate; it is switched to evaluation mode.
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar tensor.
        dataloader: Sized iterable of dicts with ``'x'`` and ``'y'`` tensors.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(mean_loss, predictions)``: the sum of per-batch losses divided
        by ``len(dataloader)``, and a NumPy array holding the sigmoid of the
        model outputs for every batch, concatenated in iteration order.
    """
    model.eval()
    final_loss = 0
    valid_preds = []

    # Validation never back-propagates, so skip building the autograd graph;
    # this keeps memory use down and matches what inference_fn does.
    with torch.no_grad():
        for data in dataloader:
            inputs, targets = data['x'].to(device), data['y'].to(device)
            outputs = model(inputs)
            loss = loss_fn(outputs, targets)

            final_loss += loss.item()
            valid_preds.append(outputs.sigmoid().detach().cpu().numpy())

    final_loss /= len(dataloader)
    valid_preds = np.concatenate(valid_preds)

    return final_loss, valid_preds

In [11]:
# Adding the inference function

def inference_fn(model, dataloader, device):
    """Return sigmoid-activated model outputs for every batch in ``dataloader``.

    Args:
        model: Module to run; it is switched to evaluation mode.
        dataloader: Iterable of dicts with an ``'x'`` tensor.
        device: Device the batch tensors are moved to.

    Returns:
        NumPy array with the per-batch predictions concatenated in iteration
        order.
    """
    model.eval()
    batch_probs = []

    for batch in dataloader:
        features = batch['x'].to(device)

        # Forward pass only; no gradients are needed for prediction.
        with torch.no_grad():
            logits = model(features)

        probabilities = torch.sigmoid(logits).detach().cpu()
        batch_probs.append(probabilities.numpy())

    return np.concatenate(batch_probs)

In [12]:
# One-hot encode the categorical treatment columns cp_time and cp_dose

def process_data(data):
    """Return a copy of ``data`` with ``cp_time`` and ``cp_dose`` one-hot encoded.

    Each of the two columns is replaced by one indicator column per distinct
    value (named ``<column>_<value>``); all other columns are left as they are.
    """
    return pd.get_dummies(data, columns=['cp_time', 'cp_dose'])

In [13]:
# Apply the fixed seed (55) once, before the PCA and feature-selection cells below run.
set_seed_characteristics(seed=55)

In [14]:
# Collect the feature column names by prefix: 'g-' columns go to GENES and
# 'c-' columns go to CELLS (column order is preserved).

GENES, CELLS = (
    [name for name in train_features.columns if name.startswith(prefix)]
    for prefix in ('g-', 'c-')
)

In [15]:
# The gene block is high-dimensional, so a PCA projection of it is appended
# as a compact set of extra features; the original gene columns are kept.
# NOTE(review): the PCA is fitted on train and test rows together, so test-set
# statistics influence the training features. Fit on the training rows only
# if that leakage matters for your evaluation.

# PCA for the gene columns

# Number of principal components to keep (a tunable hyper-parameter).
n_comp_genes = 20
pca_gene_cols = [f'pca_G-{i}' for i in range(n_comp_genes)]

n_train_rows = train_features.shape[0]

# Stack the train and test gene columns so both are projected by the same PCA.
data = pd.concat([train_features[GENES], test_features[GENES]])

# Fit the PCA and project onto n_comp_genes components.
pca = PCA(n_components=n_comp_genes, random_state=55)
data2 = pca.fit_transform(data)

# Split back by position. The test part is taken as `[n_train_rows:]` rather
# than `[-n_test:]`, because `[-0:]` would return every row if the test frame
# were empty.
train2 = data2[:n_train_rows]
test2 = data2[n_train_rows:]

# Wrap the projections in DataFrames with named component columns.
train2 = pd.DataFrame(train2, columns=pca_gene_cols)
test2 = pd.DataFrame(test2, columns=pca_gene_cols)

# Append the components to the original features. Components left over from
# an earlier run of this cell are dropped first, so re-running the cell does
# not create duplicate columns.
train_features = pd.concat(
    (train_features.drop(columns=pca_gene_cols, errors='ignore'), train2), axis=1
)
test_features = pd.concat(
    (test_features.drop(columns=pca_gene_cols, errors='ignore'), test2), axis=1
)

In [16]:
# PCA for the cell columns
# NOTE(review): as with the gene columns, the PCA is fitted on train and test
# rows together, so test-set statistics influence the training features.

# Number of principal components to keep (a tunable hyper-parameter).
n_comp_cells = 32
pca_cell_cols = [f'pca_C-{i}' for i in range(n_comp_cells)]

n_train_rows = train_features.shape[0]

# Stack the train and test cell columns so both are projected by the same PCA.
data = pd.concat([train_features[CELLS], test_features[CELLS]])

# Fit the PCA and project onto n_comp_cells components.
pca = PCA(n_components=n_comp_cells, random_state=55)
data2 = pca.fit_transform(data)

# Split back by position. The test part is taken as `[n_train_rows:]` rather
# than `[-n_test:]`, because `[-0:]` would return every row if the test frame
# were empty.
train2 = data2[:n_train_rows]
test2 = data2[n_train_rows:]

# Wrap the projections in DataFrames with named component columns.
train2 = pd.DataFrame(train2, columns=pca_cell_cols)
test2 = pd.DataFrame(test2, columns=pca_cell_cols)

# Append the components to the original features. Components left over from
# an earlier run of this cell are dropped first, so re-running the cell does
# not create duplicate columns.
train_features = pd.concat(
    (train_features.drop(columns=pca_cell_cols, errors='ignore'), train2), axis=1
)
test_features = pd.concat(
    (test_features.drop(columns=pca_cell_cols, errors='ignore'), test2), axis=1
)

In [17]:
# Drop low-variance features: every column whose variance, measured over the
# combined train and test rows, does not exceed 0.7 is removed.
var_thresh = VarianceThreshold(threshold=0.7)

n_train_rows = train_features.shape[0]

# Stack train and test rows. pd.concat replaces DataFrame.append, which was
# deprecated in pandas 1.4 and removed in pandas 2.0.
data = pd.concat([train_features, test_features])

# Fit and apply the selector on everything after the first four columns
# (sig_id, cp_type, cp_time, cp_dose), which are not numeric features.
data_transformed = var_thresh.fit_transform(data.iloc[:, 4:])

# Split the filtered rows back by position. The test part is taken as
# `[n_train_rows:]` rather than `[-n_test:]`, because `[-0:]` would return
# every row if the test frame were empty.
train_features_transformed = data_transformed[:n_train_rows]
test_features_transformed = data_transformed[n_train_rows:]


In [18]:
# Rebuild the training frame: the four columns 'sig_id', 'cp_type', 'cp_time'
# and 'cp_dose' keep their names and are followed by the variance-filtered
# features. No column names are passed for the filtered block, so (presumably
# being a plain array) its columns get default integer labels.
# NOTE(review): the four named columns go through `.values`, so their original
# dtypes may not survive the round trip — confirm if downstream code relies on
# cp_time being numeric.
train_features = pd.concat(
    [
        pd.DataFrame(
            train_features[['sig_id', 'cp_type', 'cp_time', 'cp_dose']].values.reshape(-1, 4),
            columns=['sig_id', 'cp_type', 'cp_time', 'cp_dose'],
        ),
        pd.DataFrame(train_features_transformed),
    ],
    axis=1,
)

# Rebuild the test frame in exactly the same way.
test_features = pd.concat(
    [
        pd.DataFrame(
            test_features[['sig_id', 'cp_type', 'cp_time', 'cp_dose']].values.reshape(-1, 4),
            columns=['sig_id', 'cp_type', 'cp_time', 'cp_dose'],
        ),
        pd.DataFrame(test_features_transformed),
    ],
    axis=1,
)