In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os.path

In [2]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, Flatten
from keras.layers.wrappers import TimeDistributed
from keras.callbacks import History
from keras.optimizers import Adam, SGD
from keras.layers.normalization import BatchNormalization

Using TensorFlow backend.


In [3]:
# Make sure the "Output" directory exists inside the current working directory.
# os.path.join keeps the path portable: a hard-coded '\\Output' suffix is only a
# sub-directory on Windows; elsewhere it would create a sibling literally named
# "<cwd>\Output". exist_ok=True makes the cell safe to re-run.
output_dir = os.path.join(os.getcwd(), 'Output')
os.makedirs(output_dir, exist_ok=True)

In [4]:
# Seed NumPy's global RNG so the shuffles/permutations in this notebook are reproducible.
# NOTE(review): only NumPy is seeded here; the Keras/TensorFlow backend RNG is presumably
# unaffected -- confirm if fully reproducible training is required.
np.random.seed(180221)
# Number of rounds run by the adversarial training loop (`for episode in range(episodes)`)
episodes = 50

In [5]:
def shuffle3D(arr):
    '''Shuffle every first-axis slice of ``arr`` in place, one slice at a time.

    Each ``arr[k]`` is handed to ``np.random.shuffle`` in order, so the global
    NumPy RNG is consumed once per slice. Nothing is returned.
    '''
    for position in range(len(arr)):
        np.random.shuffle(arr[position])
        
        
def dimX(x,ts):
    '''Add a time-step axis to the features by repeating each row ``ts`` times.

    Parameters
    ----------
    x : array-like with at least one dimension
        One entry per sample (e.g. shape ``(n_samples, n_features)``).
    ts : int
        Number of time steps to repeat every sample for.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_samples, ts, ...)`` in which ``out[i, j]`` equals
        ``x[i]`` for every step ``j``. Empty input keeps a well-formed shape,
        e.g. ``(0, ts, n_features)``.
    '''
    # np.asarray(a): convert the input a to an array (no copy if it already is one)
    x = np.asarray(x)
    # Insert the new axis right after the sample axis and tile along it;
    # this replaces the nested Python append loops with a single vectorised call.
    return np.repeat(x[:, np.newaxis], ts, axis=1)

    
def dimY(Y,ts,char_to_idx=None):
    '''One-hot encode padded character sequences into a (samples, ts, vocab) array.

    Parameters
    ----------
    Y : iterable of str
        Sequences to encode; each must be at most ``ts`` characters long
        (a longer one raises IndexError).
    ts : int
        Number of time steps (second axis of the result).
    char_to_idx : dict, optional
        Mapping character -> class index. When omitted, the notebook-level
        ``char_idx`` dictionary is used, which must already be defined.

    Returns
    -------
    numpy.ndarray of bool
        ``out[i, j, k]`` is True when character ``j`` of sequence ``i`` maps to
        class ``k``. Positions past the end of a sequence stay all-False.
    '''
    if char_to_idx is None:
        char_to_idx = char_idx
    sequences = list(Y)
    # Built-in bool instead of the np.bool alias, which newer NumPy releases removed.
    encoded = np.zeros((len(sequences), ts, len(char_to_idx)), dtype=bool)
    for row, sequence in enumerate(sequences):
        for step, symbol in enumerate(sequence):
            encoded[row, step, char_to_idx[symbol]] = True
    return encoded



def prediction(preds):
    '''Turn per-step score vectors into class indices by taking the argmax.

    ``preds`` is iterated as sequences of steps; for every step the index of
    its largest value is kept. The nested result is returned as a NumPy array.
    '''
    decoded = [
        [np.argmax(step_scores) for step_scores in sequence]
        for sequence in preds
    ]
    return np.array(decoded)


def seq_txt(y_pred):
    '''Map sequences of class indices back to characters.

    Every index is looked up in the notebook-level ``idx_char`` dictionary;
    the nested list of characters is returned as a NumPy array.
    '''
    text_rows = []
    for index_row in y_pred:
        text_rows.append([idx_char[index] for index in index_row])
    return np.array(text_rows)


def smiles_output(s):
    '''Join each sequence of characters into a single SMILES string.

    Parameters
    ----------
    s : iterable of iterables
        One inner iterable of characters (or anything ``str()`` accepts) per molecule.

    Returns
    -------
    numpy.ndarray
        1-D array with one joined string per input sequence; an empty input
        gives an empty array.
    '''
    # Collect all strings first and build the array once: np.append inside the
    # loop re-allocated the whole array on every iteration.
    joined = [''.join(str(token) for token in sequence) for sequence in s]
    return np.array(joined)

# Read data

In [53]:
## Read the CSV file, then shuffle the row order
data = pd.read_csv('stahl.csv')
# Reorder the rows by a random permutation of the index labels (draws from NumPy's global RNG)
data = data.reindex(np.random.permutation(data.index))
print('Number of data:',len(data))
data.head(10)

Number of data: 335


Unnamed: 0,SMILES,cox2,estrogen,gelatinase,neuramidase,kinase,thrombin,none
150,CCC(C1=CC=CC=C1)=C(C2=CC=CC=C2)C3=CC=C(OCC[NH]...,0,1,0,0,0,0,0
16,C[S](=O)(=O)C1=CC=C(C=C1)C2=C(C(=O)OC23CCCC3)C...,1,0,0,0,0,0,0
215,CC(C)C(NC(=O)CNC(=O)C(CS)NC(=O)C(CC[NH+]=C(N)N...,0,0,1,0,0,0,0
278,NC1=CC(=CC=C1)CNC(=O)C2CCCN2C(=O)C([NH3+])CC3=...,0,0,0,0,0,1,0
250,CCC(CC)NC1=NC(=CC=N1)C2=C(N=C[N]2C3CC[NH](C)CC...,0,0,0,0,1,0,0
206,CNC(=O)C(CC1=CC=CC=C1)NC(=O)C(CC(C)C)C(CSC2=CC...,0,0,1,0,0,0,0
313,NC(=[NH2+])C1=CC=C(CC(N2CCC(N[S](=O)(=O)C3=CC4...,0,0,0,0,0,1,0
258,CC[S](=O)C1=CC=C(C=C1)[N]2C=C(C3=CC=NC=C3)C(=N...,0,0,0,0,1,0,0
95,C[S](=O)(=O)C1=CC=C(C=C1)C2=C(SC(=C2)Br)C3=CC=...,1,0,0,0,0,0,0
48,COC1(CCOCC1)C2=C(F)C=C(OCC3=NC(=C(O3)C4=CC=C(C...,1,0,0,0,0,0,0


In [54]:
# Take the SMILES column as a pandas Series (these strings become the generator targets)
Y=data.SMILES
print(type(Y))
Y.head()

<class 'pandas.core.series.Series'>


150    CCC(C1=CC=CC=C1)=C(C2=CC=CC=C2)C3=CC=C(OCC[NH]...
16     C[S](=O)(=O)C1=CC=C(C=C1)C2=C(C(=O)OC23CCCC3)C...
215    CC(C)C(NC(=O)CNC(=O)C(CS)NC(=O)C(CC[NH+]=C(N)N...
278    NC1=CC(=CC=C1)CNC(=O)C2CCCN2C(=O)C([NH3+])CC3=...
250    CCC(CC)NC1=NC(=CC=N1)C2=C(N=C[N]2C3CC[NH](C)CC...
Name: SMILES, dtype: object

In [62]:
# Conditioning features: positional columns 1..6 (the slice end 7 is exclusive).
# NOTE(review): presumably these are the six activity flags cox2..thrombin, leaving the
# trailing 'none' column out -- confirm against the actual CSV column layout.
X = data.iloc[:,1:7] # in this case, take the data of columns 1:7
# DataFrame.iloc: Purely integer-location based indexing for selection by position.
X = X.values # underlying NumPy array
X = X.astype('int')
type(X)

numpy.ndarray

# Data preprocessing

In [63]:
## Pad every SMILES string on the right with "|" so that all of them have the same length
maxY = Y.str.len().max() # length of the longest SMILES string
y = Y.str.ljust(maxY, fillchar='|') # left-justify to width maxY, filling the remainder with '|'
ts = y.str.len().max() # number of timesteps = padded length

print(maxY, ts)
y[0] # label-based lookup: the row whose index label is 0, not the first row after shuffling
#  pandas Series.str
#  ljust(): returns the string left-justified in a string of the given width

105 105


'C[S](=O)(=O)NC1=C(OC2CCCCC2)C=C(C=C1)[N](=O)=O|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||'

In [64]:
# Character vocabulary plus the two lookup tables (char -> index and index -> char)
chars = sorted(set("".join(y)))
print('total chars:', len(chars))

char_idx = {symbol: position for position, symbol in enumerate(chars)}
idx_char = dict(enumerate(chars))

# Expand the features and the one-hot targets along the timestep axis
x_dash = dimX(X, ts)
y_dash = dimY(y, ts)

total chars: 25


# Building Model

In [67]:
def Gen():
    '''Build and compile the generator model.

    A per-timestep Dense layer followed by three 216-unit LSTM layers (dropout
    0.3 after the first two) and a per-timestep softmax output. Layer sizes are
    read from the notebook-level arrays: the input shape is
    (x_dash.shape[1], x_dash.shape[2]) and the softmax has y_dash.shape[2] units.

    NOTE(review): elsewhere in this notebook the model is fed x_dash / x_pred
    (feature vectors repeated over the timesteps), not sampled random noise.

    Returns the compiled keras Sequential model.
    '''
    G = Sequential() # keras.models.Sequential is a linear stack of layers
    G.add(TimeDistributed(Dense(x_dash.shape[2]), input_shape=(x_dash.shape[1], x_dash.shape[2])))
    # The input is a sequence of x_dash.shape[1] timesteps with x_dash.shape[2] features each.
    # TimeDistributed(Dense) applies the same Dense (fully-connected) operation to every timestep of a 3D tensor.
    
    # Hidden layer 1 with 216 LSTM cells
    G.add(LSTM(216, return_sequences=True)) 
    G.add(Dropout(0.3))
    # return_sequences=True: output at all steps. False: output at the last step only.
    
    # Hidden layer 2 with 216 LSTM cells
    G.add(LSTM(216, return_sequences=True))
    G.add(Dropout(0.3))
    
    # Hidden layer 3 with 216 LSTM cells
    G.add(LSTM(216, return_sequences=True))
    #G.add(BatchNormalization(momentum=0.9))
    
    # Output layer: one softmax over the y_dash.shape[2] classes per timestep (the generated sequence)
    G.add(TimeDistributed(Dense(y_dash.shape[2], activation='softmax')))
    G.compile(loss='categorical_crossentropy', optimizer=Adam(lr=2e-4))
    return G



def Dis():
    '''Build and compile the discriminator model.

    Takes sequences shaped (y_dash.shape[1], y_dash.shape[2]) -- sizes are read
    from the notebook-level y_dash array -- and outputs a single sigmoid value
    per sample, trained with binary cross-entropy and SGD.

    Returns the compiled keras Sequential model.
    '''
    D = Sequential()
    D.add(TimeDistributed(Dense(y_dash.shape[2]), input_shape=(y_dash.shape[1],y_dash.shape[2])))
    
    # Hidden layer 1 with 216 LSTM cells
    D.add(LSTM(216, return_sequences=True))
    D.add(Dropout(0.3))
    
    # Hidden layer 2 with 60 LSTM cells; all timestep outputs are flattened into one vector
    D.add(LSTM(60, return_sequences=True))
    D.add(Flatten())
    
    # Output layer with a single sigmoid neuron (real or fake)
    D.add(Dense(1, activation='sigmoid'))
    D.compile(loss='binary_crossentropy', optimizer=SGD(lr=0.001))
    return D



def Gan():
    '''Stack the generator and the discriminator into one compiled model.

    Uses the notebook-level models G and D, which must exist before this is
    called. D.trainable is set to False before the stacked model is compiled;
    the flag is set on the shared D object, so it also shows up in D.summary().

    NOTE(review): presumably the freeze is there so that fitting the stacked
    model only updates G's weights -- confirm against the Keras version in use.

    Returns the compiled keras Sequential model.
    '''
    GAN=Sequential()
    GAN.add(G) # generator model
    D.trainable=False # freeze the discriminator before the stacked model is compiled
    
    GAN.add(D) # Discriminator model
    GAN.compile(loss='binary_crossentropy', optimizer=Adam(lr=2e-4))
    return GAN

In [69]:
# Initialise the models; Gan() reads the globals G and D, so it has to come last
G=Gen()
D=Dis()
GAN=Gan()

In [None]:
# Disabled (kept as a bare string literal): reload previously saved weights when all
# three .h5 files exist. NOTE(review): it looks under "<cwd>/output", which differs from
# the '\\Output' directory created at the top of the notebook -- confirm before enabling.
'''
if os.path.exists(os.getcwd()+"/output/Gen.h5")==True and os.path.exists(os.getcwd()+"/output/Dis.h5")==True and os.path.exists(os.getcwd()+"/output/Gan.h5")==True:
        #loading weights if exits
        G.load_weights(os.getcwd()+"/output/Gen.h5")
        D.load_weights(os.getcwd()+"/output/Dis.h5")
        GAN.load_weights(os.getcwd()+"/output/Gan.h5")
'''     

In [74]:
# Report the input/output tensor shapes of the stacked model, the generator and the discriminator
print("GAN input  : ", GAN.input_shape, sep="")
print("GAN output : ", GAN.output_shape, sep="")
print('-----------------------------------')
print("Gen input  : ", G.input_shape, sep="")
print("Gen output : ", G.output_shape, sep="")
print('-----------------------------------')
print("Dis input  : ", D.input_shape, sep="")
print("Dis output : ", D.output_shape, sep="")

GAN input  : (None, 105, 6)
GAN output : (None, 1)
-----------------------------------
Gen input  : (None, 105, 6)
Gen output : (None, 105, 25)
-----------------------------------
Dis input  : (None, 105, 25)
Dis output : (None, 1)


In [75]:
# Discriminator layers; D.trainable was set to False inside Gan(), so every weight is listed as non-trainable
D.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
time_distributed_8 (TimeDist (None, 105, 25)           650       
_________________________________________________________________
lstm_12 (LSTM)               (None, 105, 216)          209088    
_________________________________________________________________
dropout_8 (Dropout)          (None, 105, 216)          0         
_________________________________________________________________
lstm_13 (LSTM)               (None, 105, 60)           66480     
_________________________________________________________________
flatten_2 (Flatten)          (None, 6300)              0         
_________________________________________________________________
dense_10 (Dense)             (None, 1)                 6301      
Total params: 282,519
Trainable params: 0
Non-trainable params: 282,519
_________________________________________________________________


In [76]:
# Layer-by-layer overview of the generator
G.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
time_distributed_6 (TimeDist (None, 105, 6)            42        
_________________________________________________________________
lstm_9 (LSTM)                (None, 105, 216)          192672    
_________________________________________________________________
dropout_6 (Dropout)          (None, 105, 216)          0         
_________________________________________________________________
lstm_10 (LSTM)               (None, 105, 216)          374112    
_________________________________________________________________
dropout_7 (Dropout)          (None, 105, 216)          0         
_________________________________________________________________
lstm_11 (LSTM)               (None, 105, 216)          374112    
_________________________________________________________________
time_distributed_7 (TimeDist (None, 105, 25)           5425      
Total para

In [78]:
# Stacked model: the generator and the discriminator each appear as one nested Sequential layer
GAN.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
sequential_5 (Sequential)    (None, 105, 25)           946363    
_________________________________________________________________
sequential_6 (Sequential)    (None, 1)                 282519    
Total params: 1,228,882
Trainable params: 946,363
Non-trainable params: 282,519
_________________________________________________________________


# Pre-training discriminator

In [77]:
def trainDis(data=None,mc=None):
    '''Fit the discriminator D for one epoch on a single kind of batch.

    Parameters
    ----------
    data : array-like, optional
        Real samples. When given they are fitted with target 1 and ``mc`` is ignored.
    mc : str, optional
        Only consulted when ``data`` is None:
        * None  -- fit on the generator's current output ``G.predict(x_dash)``, target 0;
        * "mc"  -- fit on a copy of ``y_dash`` whose timesteps were shuffled with
          ``shuffle3D`` (artificial noise against mode collapse), target 0.

    Returns
    -------
    float
        The loss of that single epoch.

    Raises
    ------
    ValueError
        If ``data`` is None and ``mc`` is neither None nor "mc" (there would be
        nothing to train on).

    Relies on the notebook-level G, D, x_dash, y_dash and shuffle3D.
    '''
    if data is not None:
        # Train on real data
        samples, label = data, 1
    elif mc is None:
        # Train on fake data produced by the generator
        samples, label = G.predict(x_dash), 0
    elif mc == "mc":
        # Artificial-noise batch: real sequences with shuffled timesteps
        samples = np.copy(y_dash)
        shuffle3D(samples)
        label = 0
    else:
        raise ValueError("mc must be None or 'mc' when no data is given, got %r" % (mc,))

    # One target per sample actually being fitted
    targets = np.full(len(samples), label, dtype=int)
    # `epochs` is the Keras 2 name of the former `nb_epoch` argument
    Dloss = D.fit(samples, targets, epochs=1)
    return Dloss.history['loss'][0]

In [None]:
D.trainable=True # undo the freeze applied inside Gan()

# Pre-training: 20 rounds, each one pass on generated samples followed by one pass on real samples
for i in range(20):
    shuffleData = np.random.permutation(y_dash)
    # np.random.permutation: returns a copy of the array shuffled along its first axis (sample order)
    trainDis() # no arguments: fit on the generator's current output with target 0
    dloss = trainDis(shuffleData) # real samples with target 1; only this loss is printed
    print("Pre Training Discrimator "+str(dloss)+"\n")

# Train GAN Model

In [None]:
def trainGAN():
    '''Fit the stacked GAN model for one epoch and return that epoch's loss.

    The stacked model is fed ``x_dash`` with an all-ones target, i.e. the
    generator is pushed towards output that the discriminator scores as real.
    Relies on the notebook-level GAN and x_dash.
    '''
    target = np.ones(x_dash.shape[0], dtype=int)
    # `epochs` is the Keras 2 name of the former `nb_epoch` argument
    gan_loss = GAN.fit(x_dash, target, epochs=1)

    return gan_loss.history['loss'][0]

In [None]:
# Adversarial training loop.
# The cadences are whole numbers of episodes (at least 1). The former float moduli
# (episodes/100, episodes/600) made `episode % x == 0` depend on floating-point
# rounding and would divide by zero under integer division.
save_every = max(1, episodes // 100)
predict_every = max(1, episodes // 600)

for episode in range(episodes):
    print("Epoch "+str(episode)+"/"+str(episodes))

    # One discriminator pass per batch kind; keep each loss instead of overwriting it
    fake_loss = trainDis()                       # generator output, target 0
    shuffleData = np.random.permutation(y_dash)  # real sequences in random sample order
    real_loss = trainDis(shuffleData)            # real data, target 1
    mc_loss = trainDis(mc="mc")                  # timestep-shuffled real data, target 0
    disloss = (fake_loss + real_loss + mc_loss) / 3.0  # mean of the three passes

    ganloss = trainGAN()
    print("D loss="+str(disloss)+" GAN loss="+str(ganloss))

    if episode % save_every == 0:
        #G.save(os.getcwd()+'/output/Gen.h5')
        #D.save(os.getcwd()+'/output/Dis.h5')
        #GAN.save(os.getcwd()+'/output/Gan.h5')

        # for saving files in the floydhub output directory (absolute path)
        G.save("/output/Gen_mc.h5")
        D.save("/output/Dis_mc.h5")
        GAN.save("/output/Gan_mc.h5")

    if episode % predict_every == 0:
        print("Predicting Molecule")
        # Three one-hot feature vectors, expanded over the timestep axis
        x_pred=[[0,0,0,1,0,0],[0,1,0,0,0,0],[0,0,0,0,0,1]]
        x_pred=dimX(x_pred,ts)
        preds=G.predict(x_pred)
        # argmax per step -> characters -> joined strings
        y_pred=prediction(preds)
        y_pred=seq_txt(y_pred)
        s=smiles_output(y_pred)
        print(s)

# Predict

In [None]:
## For prediction
# Disabled (kept as a bare string literal). Before enabling, note that the second
# x_pred assignment replaces the first, and that `print s` is Python 2 syntax.

'''
#start Prediction
Ghash=Gen()
Ghash.load_weights('Gen_mc.h5')
x_pred=[[0,0,0,1,0,0],
        [0,1,0,0,0,0],
        [0,0,0,0,0,1]]
x_pred=[[0.6,0,0,0,0,0],
        [.3,0,0,0,0,0],
        [0.7,0,0,0,0,0]]
	
x_pred=dimX(x_pred,ts)      
preds=Ghash.predict(x_pred)
y_pred=prediction(preds)
y_pred=seq_txt(y_pred)
s=smiles_output(y_pred)
print s
#end prediction
'''