In [1]:
import os

import numpy as np
import pandas as pd

import keras
from keras.layers import *
from keras import losses, optimizers, metrics
from keras.models import Sequential
from keras.utils import np_utils

Using TensorFlow backend.


In [2]:
class ProteinLigandDataGenerator(keras.utils.Sequence):
    '''
    Keras ``Sequence`` that serves batches of ``(X, y)`` built from a CSV index.

    The CSV is expected to provide the columns ``pro_id``, ``lig_id``,
    ``dests`` (a path passed to ``np.load``) and ``score`` (the target).
    Rows whose ``pro_id`` equals ``lig_id`` are oversampled when the CSV is
    loaded (see ``prepare_df``).

    Parameters
    ----------
    csv_path : str
        Path of the CSV file describing the samples.
    batch_size : int
        Number of samples per batch.
    shuffle : bool, default True
        Whether the sample order is reshuffled at the end of every epoch.
    dim : tuple of int, default (24, 24, 24)
        Spatial shape of one loaded sample, excluding the channel axis.
    n_channels : int, default 4
        Size of the trailing channel axis of one loaded sample.
    '''

    def __init__(self, csv_path, batch_size, shuffle=True, dim=(24,24,24), n_channels=4):
        self.df = self.prepare_df(csv_path)
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.indexes = None
        self.dim = dim
        self.n_channels = n_channels
        # Build the initial (optionally shuffled) sample order.
        self.on_epoch_end()

    def prepare_df(self, csv_path):
        '''
        Read the CSV and oversample the rows where ``pro_id == lig_id``.

        Each such row ends up ``random_count / match_count`` times in the
        returned frame (4 times with the current constants): once from the
        original data plus ``supplement`` appended copies.
        '''
        # NOTE(review): presumably each matching pair is accompanied by
        # `random_count` non-matching pairs in the CSV -- confirm against the
        # script that generates the CSV.
        match_count = 1
        random_count = 4
        supplement = random_count - match_count

        df = pd.read_csv(csv_path)
        matching_rows = df[df['pro_id'] == df['lig_id']]
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # equivalent that works on both old and new pandas versions.
        return pd.concat([df] + [matching_rows] * supplement, ignore_index=True)

    def __len__(self):
        '''
        Batches per epoch.

        Uses floor division, so a trailing partial batch is dropped.
        '''
        return len(self.df.index) // self.batch_size

    def __getitem__(self, index):
        '''
        Return the ``(X, y)`` pair for batch number ``index``.
        '''
        start = index * self.batch_size
        stop = start + self.batch_size
        return self.generate_data(self.indexes[start:stop])

    def on_epoch_end(self):
        '''
        Rebuild the sample order, reshuffling it when ``shuffle`` is set.
        '''
        self.indexes = np.arange(len(self.df.index))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def generate_data(self, indexes):
        '''
        Load the samples at the given positional ``indexes``.

        Returns
        -------
        X : ndarray of shape ``(len(indexes), *dim, n_channels)``
        y : ndarray of shape ``(len(indexes),)`` with integer dtype
        '''
        # Size the buffers by the number of requested rows rather than by
        # batch_size, so a short slice never leaves uninitialised np.empty
        # rows in the returned arrays.
        n_samples = len(indexes)
        X = np.empty((n_samples, *self.dim, self.n_channels))
        # NOTE(review): the integer dtype truncates fractional scores --
        # confirm that `score` is always an integer label.
        y = np.empty(n_samples, dtype=int)

        batch_df = self.df.iloc[indexes]
        # enumerate() gives the position inside the batch directly, without
        # mutating the intermediate frame via reset_index(inplace=True).
        for pos, row in enumerate(batch_df.itertuples(index=False)):
            X[pos] = np.load(row.dests)
            y[pos] = row.score

        return X, y

In [3]:
# Build the training and validation generators; both serve 12 samples per
# batch and otherwise rely on the generator's default arguments.
train_generator = ProteinLigandDataGenerator(
    csv_path='./data/csv/train_lig_2_pro_pairs.csv',
    batch_size=12,
)
test_generator = ProteinLigandDataGenerator(
    csv_path='./data/csv/test_lig_2_pro_pairs.csv',
    batch_size=12,
)

In [4]:
# 3D CNN over (24, 24, 24, 4) inputs: two Conv3D + BatchNormalization pairs,
# a max-pool, two more Conv3D + BatchNormalization pairs, global average
# pooling, and a single sigmoid output unit.
model = Sequential([
    Conv3D(96, kernel_size=3, activation='relu', input_shape=(24,24,24,4)),
    BatchNormalization(),
    Conv3D(128, kernel_size=3, activation='relu'),
    BatchNormalization(),
    MaxPool3D(),
    Conv3D(196, kernel_size=3, activation='relu'),
    BatchNormalization(),
    Conv3D(256, kernel_size=3, activation='relu'),
    BatchNormalization(),
    GlobalAveragePooling3D(),
    Dense(1, activation='sigmoid'),
])

# Plain SGD with learning-rate decay; MSE serves as both the loss and the
# reported metric.
sgd = optimizers.SGD(lr=1, decay=1e-5)
model.compile(
    optimizer=sgd,
    loss=losses.mean_squared_error,
    metrics=[metrics.mse],
)

In [5]:
# Train for 4 epochs, loading batches in 2 worker processes and validating
# against the test generator.
history = model.fit_generator(
    train_generator,
    validation_data=test_generator,
    epochs=4,
    workers=2,
    use_multiprocessing=True,
    verbose=1,
)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 3/4


In [6]:
# Persist the trained model. Create the target directory first so that a
# missing 'models/' folder cannot make the save fail after training finishes.
model_path = 'models/try_2_features.h5'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
model.save(model_path)