In [1]:
import gensim
from time import time
import numpy as np
import os
from SznToolBox import SznToolBox
from Bio import SeqIO
from time import time
from pathlib import Path
from sklearn import metrics
import pandas as pd

In [2]:
posname = "usr60-5mc-pos.fasta"
negname = "usr60-5mc-neg.fasta"
pospath= posname
negpath= negname
toolbox = SznToolBox()

In [4]:
class Fasta_Char_Iter(object):
    def __init__(self, fname):
        self.fname = fname

    def __iter__(self):
        fasta_handle = open(self.fname,'r')
        for record in SeqIO.parse(fasta_handle,"fasta"):
            for mchar in str(record.seq):
                yield mchar
            #print (mchar)
            #yield line.split()    
        fasta_handle.close()

In [9]:
tmpfile= "temp.fasta"
tmp_iter = Fasta_Char_Iter(tmpfile)
tmp = list(tmp_iter)
print(len(tmp))
tmp2 = [*tmp_iter]
print(len(tmp2))

738
738


In [10]:
from time import time
pos_5mc_iter = Fasta_Char_Iter(pospath)
neg_5mc_iter = Fasta_Char_Iter(negpath)
workers = 8

In [12]:
tic = time()
NuAcid_embed_model = gensim.models.Word2Vec(pos_5mc_iter, size=20, window=20, workers=workers, sg=1)
toc=time()
print("Training took {} sec".format(toc-tic))

Training took 16.519062995910645 sec


In [13]:
tic = time()
NuAcid_embed_model.train(neg_5mc_iter,total_examples= 200000, epochs=1)
toc=time()
print("Training took {} sec".format(toc-tic))

Training took 27.33872151374817 sec


In [14]:
wfname = "gensim_5mc_NuAcid_embeddings.gen"
NuAcid_embed_model.save(wfname)

In [19]:
print(NuAcid_embed_model.wv.vocab)
print(len(NuAcid_embed_model.wv.vocab))
print(NuAcid_embed_model.wv.vectors.shape)

{'G': <gensim.models.keyedvectors.Vocab object at 0x0000016E397BF780>, 'A': <gensim.models.keyedvectors.Vocab object at 0x0000016E39D06278>, 'C': <gensim.models.keyedvectors.Vocab object at 0x0000016E39D063C8>, 'T': <gensim.models.keyedvectors.Vocab object at 0x0000016E39BB6908>}
4
(4, 20)


In [23]:
seq= 'GGAGCGGGCCCGGGCGGCGGCGGCAGCAGCGGCGACGGCTG'
print(NuAcid_embed_model.wv.vocab['C'].index)
tmp = [NuAcid_embed_model.wv.vocab[a].index for a in seq]
print(tmp)

1
[0, 0, 2, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 2, 0, 1, 2, 0, 1, 0, 0, 1, 0, 2, 1, 0, 0, 1, 3, 0]


In [3]:
def encode_sample_to_gensim_embedding_index(sample):
    #sample = sample.decode('UTF-8')
    n_arr = [NuAcid_embed_model.wv.vocab[a].index for a in sample]
    return np.array(n_arr)

##Loading Data

In [5]:
from sklearn.model_selection import train_test_split
my_n =1
pos_sample_list = list()
neg_sample_list = list()
with open (pospath, "r") as pos_handle:
    for record in SeqIO.parse(pos_handle,"fasta"):
        pos_sample_list.append(str(record.seq))
print(len(pos_sample_list))

with open (negpath, "r") as neg_handle:
    for i,record in enumerate(SeqIO.parse(neg_handle,"fasta")):
        neg_sample_list.append(str(record.seq))
        if (i==len(pos_sample_list)*my_n): # to read negative records equal to n time 
            #positive records
        #if (i==150000):
            break
print(len(neg_sample_list))

47976
47977


In [6]:
pos_sample_list = np.array([encode_sample_to_gensim_embedding_index(a) for a in pos_sample_list])
neg_sample_list = np.array([encode_sample_to_gensim_embedding_index(a) for a in neg_sample_list])
print(pos_sample_list.shape)
print(neg_sample_list.shape)
print(pos_sample_list[25])

(47976, 41)
(47977, 41)
[1 1 0 0 1 0 1 1 1 1 0 1 1 1 1 1 2 1 1 1 1 0 0 1 1 2 1 0 1 1 1 1 3 1 0 1 0
 1 3 3 1]


In [7]:
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
pos_list_size = len(pos_sample_list)
neg_list_size = len(neg_sample_list)
### Preparing labels for positive and negative data
tmp1 = np.ones((pos_list_size,), dtype= int)
tmp2 = np.zeros((neg_list_size,), dtype= int)
Y = np.hstack((tmp1,tmp2))
### Preparing and shuffling samples
X = np.concatenate((pos_sample_list, neg_sample_list), axis = 0)
print(X.shape)
print(X[0])
X,Y = shuffle(X,Y, random_state=25)
X,Y = shuffle(X,Y, random_state=31)
print(X[0])
print(np.unique(Y, return_counts=True))
print("After Shuffle \n X Shape: {}, Y shape: {}".format(X.shape, Y.shape))

X_train, X_test, y_train,y_test = train_test_split(X,Y, test_size=0.3,random_state=17)
print("X_train:{}, y_train:{}, X_test:{},y_test: {}".format(X_train.shape,\
                                                y_train.shape,X_test.shape,y_test.shape))

(95953, 41)
[0 0 2 0 1 0 0 0 1 1 1 0 0 0 1 0 0 1 0 0 1 0 0 1 2 0 1 2 0 1 0 0 1 0 2 1 0
 0 1 3 0]
[1 2 0 0 0 0 1 0 1 2 0 0 0 2 0 0 0 2 0 0 1 1 3 0 0 0 1 1 3 1 1 0 3 3 3 1 1
 1 0 3 3]
(array([0, 1]), array([47977, 47976], dtype=int64))
After Shuffle 
 X Shape: (95953, 41), Y shape: (95953,)
X_train:(67167, 41), y_train:(67167,), X_test:(28786, 41),y_test: (28786,)


In [31]:
#char_index = 
mychars=list(NuAcid_embed_model.wv.vocab)
print(type(NuAcid_embed_model.wv.vocab))
print(mychars)

<class 'dict'>
['G', 'A', 'C', 'T']


In [8]:
num_chars = 4
EMBED_DIM = 20
embedding_matrix = np.zeros((num_chars,EMBED_DIM))
for i,mchar in enumerate(NuAcid_embed_model.wv.vocab.keys()):
    #print(i,mchar)
    vector = NuAcid_embed_model.wv.get_vector(mchar)
    if vector is not None:
        embedding_matrix[i] = vector
print(embedding_matrix.shape)

(4, 20)


In [9]:
from keras.models import Sequential,Model
from keras.layers import Bidirectional, Dropout, Embedding, Dense, LSTM, SimpleRNN, GRU,Conv1D, MaxPool1D
from keras.layers import CuDNNLSTM,CuDNNGRU,GlobalAveragePooling1D
#from keras.layers.advanced_activations import LeakyReLU, PReLU

Using TensorFlow backend.


In [10]:
from keras.layers import Embedding
NA_embeddings = Embedding(input_dim=embedding_matrix.shape[0], output_dim=embedding_matrix.shape[1],
                      weights=[embedding_matrix], trainable= False)

W0314 22:29:16.694295 10360 deprecation_wrapper.py:119] From C:\Miniconda3\envs\py36tfkeras\lib\site-packages\keras\backend\tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.



In [43]:
model_cnn = Sequential()
#model_cnn.add(Embedding(vocabulary_size, embedding_dim))
model_cnn.add(NA_embeddings)
model_cnn.add(Conv1D(10,7,activation="relu"))#, input_shape = (None,input_size)))
model_cnn.add(Conv1D(20,5,activation="relu"))#, input_shape = (None,input_size)))
#model_cnn.add(Conv1D(7,3,activation="relu", input_shape = (None,input_size)))
model_cnn.add(MaxPool1D(2))
model_cnn.add(Dropout(0.50))
model_cnn.add(Conv1D(30,5,activation="relu"))
model_cnn.add(Conv1D(48,3,activation="relu"))
model_cnn.add(MaxPool1D(2))
model_cnn.add(Dropout(0.50))
model_cnn.add(Conv1D(64,3,activation="relu"))
#model_cnn.add(Conv1D(24,3,activation="relu"))
#model_cnn.add(Flatten())
model_cnn.add(GlobalAveragePooling1D())
#model_cnn.add(Dropout(0.50))
model_cnn.add(Dense(8, activation="relu"))
model_cnn.add(Dense(1, activation="sigmoid"))
model_cnn.compile(loss='binary_crossentropy',
              optimizer='adam',
metrics=['accuracy'])
print(model_cnn.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 20)          80        
_________________________________________________________________
conv1d_1 (Conv1D)            (None, None, 10)          1410      
_________________________________________________________________
conv1d_2 (Conv1D)            (None, None, 20)          1020      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, None, 20)          0         
_________________________________________________________________
dropout_6 (Dropout)          (None, None, 20)          0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, None, 30)          3030      
_________________________________________________________________
conv1d_4 (Conv1D)            (None, None, 48)          4368      
__________

In [44]:
from keras.callbacks import ModelCheckpoint,EarlyStopping, TerminateOnNaN, ReduceLROnPlateau
#epochs=100
checkpointer = ModelCheckpoint(filepath="5mcCNN2.h5", verbose=0, \
                               save_weights_only=False, save_best_only=True, period =5)
earlystop = EarlyStopping(patience=5)
TonNaN = TerminateOnNaN()
rLR= ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.001)

In [47]:
tic = time()
hist = model_cnn.fit(X_train, y_train,verbose=2,
          batch_size= 128,
          callbacks=[checkpointer,earlystop,TonNaN, rLR],
          epochs=30, validation_split=0.2)
          #validation_data= (X_test, y_test))
toc = time()
print("CNN Model training took {} Secs".format ((toc-tic)))

Train on 53733 samples, validate on 13434 samples
Epoch 1/30
 - 4s - loss: 0.1849 - acc: 0.9241 - val_loss: 0.1703 - val_acc: 0.9290
Epoch 2/30
 - 4s - loss: 0.1859 - acc: 0.9242 - val_loss: 0.1724 - val_acc: 0.9298
Epoch 3/30
 - 4s - loss: 0.1833 - acc: 0.9253 - val_loss: 0.1727 - val_acc: 0.9276
Epoch 4/30
 - 4s - loss: 0.1846 - acc: 0.9235 - val_loss: 0.1681 - val_acc: 0.9299
Epoch 5/30
 - 4s - loss: 0.1834 - acc: 0.9248 - val_loss: 0.1674 - val_acc: 0.9307
Epoch 6/30
 - 4s - loss: 0.1845 - acc: 0.9249 - val_loss: 0.1670 - val_acc: 0.9307
Epoch 7/30
 - 4s - loss: 0.1826 - acc: 0.9253 - val_loss: 0.1730 - val_acc: 0.9264
Epoch 8/30
 - 4s - loss: 0.1832 - acc: 0.9253 - val_loss: 0.1699 - val_acc: 0.9289
Epoch 9/30
 - 4s - loss: 0.1829 - acc: 0.9253 - val_loss: 0.1686 - val_acc: 0.9293
Epoch 10/30
 - 4s - loss: 0.1824 - acc: 0.9265 - val_loss: 0.1693 - val_acc: 0.9303
Epoch 11/30
 - 4s - loss: 0.1810 - acc: 0.9269 - val_loss: 0.1673 - val_acc: 0.9313
CNN Model training took 44.32042241

In [57]:
model_dict["CNN"]= model_cnn

In [53]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import metrics
ypred_dict =dict()
ypredprob_dict = dict()
model_dict = dict()

In [56]:
y_pred = model_cnn.predict(X_test)
y_pred = np.where (y_pred < 0.53, 0, 1)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(metrics.accuracy_score(y_test,y_pred))
print(metrics.matthews_corrcoef(y_test,y_pred))
ypred_dict["CNN"] = y_pred
ypredprob_dict["CNN"] = model_cnn.predict_proba(X_test)

[[12817  1592]
 [  393 13984]]
              precision    recall  f1-score   support

           0       0.97      0.89      0.93     14409
           1       0.90      0.97      0.93     14377

    accuracy                           0.93     28786
   macro avg       0.93      0.93      0.93     28786
weighted avg       0.93      0.93      0.93     28786

0.9310428680608629
0.8651047610017965


In [55]:
model_gru = Sequential()
model_gru.add(NA_embeddings)
#model.add(Dropout(dropout_rate))
#model_bd_gru.add(Bidirectional(GRU(rnn_hidden_size)))
model_gru.add(CuDNNGRU(30, return_sequences=True))
model_gru.add(CuDNNGRU(20, return_sequences=False))
model_gru.add(Dropout(0.5))
model_gru.add(Dense(10, activation='relu'))
#model_gru.add(Dropout(0.1))
model_gru.add(Dense(1, activation='sigmoid'))
model_gru.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model_gru.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 20)          80        
_________________________________________________________________
cu_dnngru_1 (CuDNNGRU)       (None, None, 30)          4680      
_________________________________________________________________
cu_dnngru_2 (CuDNNGRU)       (None, 20)                3120      
_________________________________________________________________
dropout_6 (Dropout)          (None, 20)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 10)                210       
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 11        
Total params: 8,101
Trainable params: 8,021
Non-trainable params: 80
_________________________________________________________________
None


In [56]:
from time import time
chkptr_gru = ModelCheckpoint(filepath="5mcGRU.h5", verbose=0, \
                               save_weights_only=False, save_best_only=True, period =5)
tic = time()
hist_srnn= model_gru.fit(X_train, y_train, verbose=2,
          batch_size=12,
          callbacks=[chkptr_gru,earlystop,TonNaN, rLR],
          epochs=60,validation_split= 0.2)
#validation_data=(X_test, y_test))
toc = time()
print("Model training took {} Secs".format ((toc-tic)))

Train on 53733 samples, validate on 13434 samples
Epoch 1/60
 - 49s - loss: 0.5280 - acc: 0.7093 - val_loss: 0.2559 - val_acc: 0.8956
Epoch 2/60
 - 48s - loss: 0.2540 - acc: 0.8991 - val_loss: 0.2552 - val_acc: 0.9078
Epoch 3/60
 - 48s - loss: 0.2381 - acc: 0.9041 - val_loss: 0.2207 - val_acc: 0.9093
Epoch 4/60
 - 47s - loss: 0.2326 - acc: 0.9034 - val_loss: 0.2089 - val_acc: 0.9120
Epoch 5/60
 - 48s - loss: 0.2349 - acc: 0.9008 - val_loss: 0.3398 - val_acc: 0.8365
Epoch 6/60
 - 49s - loss: 0.2304 - acc: 0.9029 - val_loss: 0.1981 - val_acc: 0.9170
Epoch 7/60
 - 48s - loss: 0.2155 - acc: 0.9093 - val_loss: 0.2260 - val_acc: 0.9114
Epoch 8/60
 - 47s - loss: 0.2139 - acc: 0.9091 - val_loss: 0.1917 - val_acc: 0.9183
Epoch 9/60
 - 47s - loss: 0.2101 - acc: 0.9108 - val_loss: 0.1925 - val_acc: 0.9173
Epoch 10/60
 - 47s - loss: 0.2039 - acc: 0.9140 - val_loss: 0.1828 - val_acc: 0.9238
Epoch 11/60
 - 50s - loss: 0.2007 - acc: 0.9143 - val_loss: 0.1800 - val_acc: 0.9241
Epoch 12/60
 - 48s - los

In [57]:
model_dict["CNN"] = model_cnn
model_dict["GRU"] = model_gru

In [59]:
import joblib
model_dict_path = Path("5mc-2models-dict21Mar05.jbl")
model_dict_path.touch(exist_ok=True)

with open(model_dict_path, "wb") as file_handle:
    joblib.dump(model_dict, file_handle)

##Reloading the Gensim and other Classification Models

In [4]:
wfname = "gensim_5mc_NuAcid_embeddings.gen"
NuAcid_embed_model = gensim.models.Word2Vec.load(wfname)

In [11]:
import joblib
model_store_name = "5mc-2models-dict21Mar05.jbl"
with open (model_store_name, "rb") as fp_handle:
    model_dict = joblib.load(fp_handle)

W0314 22:31:18.380050 10360 deprecation_wrapper.py:119] From C:\Miniconda3\envs\py36tfkeras\lib\site-packages\keras\backend\tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0314 22:31:18.389051 10360 deprecation_wrapper.py:119] From C:\Miniconda3\envs\py36tfkeras\lib\site-packages\keras\backend\tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0314 22:31:18.467048 10360 deprecation_wrapper.py:119] From C:\Miniconda3\envs\py36tfkeras\lib\site-packages\keras\backend\tensorflow_backend.py:3976: The name tf.nn.max_pool is deprecated. Please use tf.nn.max_pool2d instead.

W0314 22:31:18.471052 10360 deprecation_wrapper.py:119] From C:\Miniconda3\envs\py36tfkeras\lib\site-packages\keras\backend\tensorflow_backend.py:133: The name tf.placeholder_with_default is deprecated. Please use tf.compat.v1.placeholder_with_default instead.

W0314 22:31:18.482055 10360 deprecation.

In [13]:
model_lstm = Sequential()
model_lstm.add(NA_embeddings)
#model.add(Dropout(dropout_rate))
#model_bd_gru.add(Bidirectional(GRU(rnn_hidden_size)))
model_lstm.add(CuDNNLSTM(30, return_sequences=True))
model_lstm.add(CuDNNGRU(20, return_sequences=False))
model_lstm.add(Dropout(0.5))
model_lstm.add(Dense(10, activation='relu'))
#model_gru.add(Dropout(0.1))
model_lstm.add(Dense(1, activation='sigmoid'))
model_lstm.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model_lstm.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 20)          80        
_________________________________________________________________
cu_dnnlstm_1 (CuDNNLSTM)     (None, None, 30)          6240      
_________________________________________________________________
cu_dnngru_1 (CuDNNGRU)       (None, 20)                3120      
_________________________________________________________________
dropout_1 (Dropout)          (None, 20)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 10)                210       
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 11        
Total params: 9,661
Trainable params: 9,581
Non-trainable params: 80
_________________________________________________________________
None


In [23]:
from time import time
chkptr_lstm = ModelCheckpoint(filepath="5mcLSTM.h5", verbose=0, \
                               save_weights_only=False, save_best_only=True, period =5)
tic = time()
hist_srnn= model_lstm.fit(X_train, y_train, verbose=2,
          batch_size= 128,
          callbacks=[chkptr_lstm,earlystop,TonNaN, rLR],
          epochs=10,validation_split= 0.2)
#validation_data=(X_test, y_test))
toc = time()
print("Model training took {} Secs".format ((toc-tic)))
model_dict["LSTM"] = model_lstm

Train on 53733 samples, validate on 13434 samples
Epoch 1/10
 - 5s - loss: 0.2289 - acc: 0.9053 - val_loss: 0.2125 - val_acc: 0.9110
Epoch 2/10
 - 5s - loss: 0.2241 - acc: 0.9066 - val_loss: 0.2206 - val_acc: 0.9123
Epoch 3/10
 - 5s - loss: 0.2202 - acc: 0.9090 - val_loss: 0.2126 - val_acc: 0.9075
Epoch 4/10
 - 5s - loss: 0.2228 - acc: 0.9065 - val_loss: 0.2383 - val_acc: 0.9128
Epoch 5/10
 - 5s - loss: 0.2286 - acc: 0.9043 - val_loss: 0.2153 - val_acc: 0.9125
Epoch 6/10
 - 5s - loss: 0.2212 - acc: 0.9087 - val_loss: 0.2092 - val_acc: 0.9112
Epoch 7/10
 - 5s - loss: 0.2204 - acc: 0.9075 - val_loss: 0.2112 - val_acc: 0.9131
Epoch 8/10
 - 5s - loss: 0.2232 - acc: 0.9069 - val_loss: 0.2141 - val_acc: 0.9088
Epoch 9/10
 - 5s - loss: 0.2209 - acc: 0.9084 - val_loss: 0.2166 - val_acc: 0.9122
Epoch 10/10
 - 5s - loss: 0.2208 - acc: 0.9076 - val_loss: 0.2128 - val_acc: 0.9135
Model training took 50.09367656707764 Secs


In [15]:
ypred_dict = dict()
ypredprob_dict = dict()
ypred_dict["CNN"] = model_dict["CNN"].predict(X_test)
ypredprob_dict["CNN"] = model_dict["CNN"].predict(X_test)

ypred_dict["GRU"] = model_dict["GRU"].predict(X_test)
ypredprob_dict["GRU"] = model_dict["GRU"].predict(X_test)

In [37]:
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report, matthews_corrcoef
y_pred = model_lstm.predict(X_test)
y_pred = np.where (y_pred < 0.5, 0, 1)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(metrics.accuracy_score(y_test,y_pred))
print(metrics.matthews_corrcoef(y_test,y_pred))
ypred_dict["LSTM"] = y_pred
ypredprob_dict["LSTM"] = model_lstm.predict_proba(X_test)

[[12161  2248]
 [  308 14069]]
              precision    recall  f1-score   support

           0       0.98      0.84      0.90     14409
           1       0.86      0.98      0.92     14377

    accuracy                           0.91     28786
   macro avg       0.92      0.91      0.91     28786
weighted avg       0.92      0.91      0.91     28786

0.9112068366567081
0.8300120776794688


In [26]:
model_dict["LSTM"] = model_lstm

In [29]:
model_bd_gru = Sequential()
model_bd_gru.add(NA_embeddings)
#model.add(Dropout(dropout_rate))
#model_bd_gru.add(Bidirectional(GRU(rnn_hidden_size)))
model_bd_gru.add(Bidirectional(CuDNNGRU(23, return_sequences=True)))
model_bd_gru.add(Bidirectional(CuDNNGRU(12, return_sequences=False)))
model_bd_gru.add(Dropout(0.5))
model_bd_gru.add(Dense(10, activation='relu'))
model_bd_gru.add(Dense(1, activation='sigmoid'))
model_bd_gru.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model_bd_gru.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 20)          80        
_________________________________________________________________
bidirectional_5 (Bidirection (None, None, 46)          6210      
_________________________________________________________________
bidirectional_6 (Bidirection (None, 24)                4320      
_________________________________________________________________
dropout_4 (Dropout)          (None, 24)                0         
_________________________________________________________________
dense_7 (Dense)              (None, 10)                250       
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 11        
Total params: 10,871
Trainable params: 10,791
Non-trainable params: 80
_________________________________________________________________
None


In [32]:
from time import time
chkptr_bdgru = ModelCheckpoint(filepath="5mcBDGru.h5", verbose=0, \
                               save_weights_only=False, save_best_only=True, period =5)
tic = time()
hist_bdgr= model_bd_gru.fit(X_train, y_train, verbose=2,
          batch_size= 128,
          callbacks=[chkptr_bdgru,earlystop,TonNaN, rLR],
          epochs=10,validation_split= 0.2)
#validation_data=(X_test, y_test))
toc = time()
print("Model training took {} Secs".format ((toc-tic)))


Train on 53733 samples, validate on 13434 samples
Epoch 1/10
 - 8s - loss: 0.1968 - acc: 0.9240 - val_loss: 0.1738 - val_acc: 0.9296
Epoch 2/10
 - 8s - loss: 0.1946 - acc: 0.9247 - val_loss: 0.1712 - val_acc: 0.9310
Epoch 3/10
 - 8s - loss: 0.1915 - acc: 0.9259 - val_loss: 0.1718 - val_acc: 0.9316
Epoch 4/10
 - 8s - loss: 0.1888 - acc: 0.9256 - val_loss: 0.1696 - val_acc: 0.9323
Epoch 5/10
 - 8s - loss: 0.1883 - acc: 0.9267 - val_loss: 0.1717 - val_acc: 0.9317
Epoch 6/10
 - 8s - loss: 0.2109 - acc: 0.9163 - val_loss: 0.1727 - val_acc: 0.9297
Epoch 7/10
 - 8s - loss: 0.1873 - acc: 0.9273 - val_loss: 0.1694 - val_acc: 0.9309
Epoch 8/10
 - 8s - loss: 0.1830 - acc: 0.9291 - val_loss: 0.1704 - val_acc: 0.9318
Epoch 9/10
 - 8s - loss: 0.1831 - acc: 0.9283 - val_loss: 0.1821 - val_acc: 0.9257
Epoch 10/10
 - 8s - loss: 0.1813 - acc: 0.9289 - val_loss: 0.1663 - val_acc: 0.9335
Model training took 76.03562021255493 Secs


In [38]:
y_pred = model_bd_gru.predict(X_test)
y_pred = np.where (y_pred < 0.5, 0, 1)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(metrics.accuracy_score(y_test,y_pred))
print(metrics.matthews_corrcoef(y_test,y_pred))
ypred_dict["BiDirectionGRU"] = y_pred
ypredprob_dict["BiDirectionGRU"] = model_bd_gru.predict_proba(X_test)

[[12932  1477]
 [  513 13864]]
              precision    recall  f1-score   support

           0       0.96      0.90      0.93     14409
           1       0.90      0.96      0.93     14377

    accuracy                           0.93     28786
   macro avg       0.93      0.93      0.93     28786
weighted avg       0.93      0.93      0.93     28786

0.9308691725144167
0.8636875770657437


In [34]:
model_dict["BiDirection_GRU"] = model_bd_gru

In [35]:
model_bd_lstm = Sequential()
model_bd_lstm.add(NA_embeddings)
#model.add(Dropout(dropout_rate))
#model_bd_gru.add(Bidirectional(GRU(rnn_hidden_size)))
model_bd_lstm.add(Bidirectional(CuDNNLSTM(23, return_sequences=True)))
model_bd_lstm.add(Bidirectional(CuDNNLSTM(12, return_sequences=False)))
model_bd_lstm.add(Dropout(0.5))
model_bd_lstm.add(Dense(10, activation='relu'))
model_bd_lstm.add(Dense(1, activation='sigmoid'))
model_bd_lstm.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model_bd_lstm.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 20)          80        
_________________________________________________________________
bidirectional_7 (Bidirection (None, None, 46)          8280      
_________________________________________________________________
bidirectional_8 (Bidirection (None, 24)                5760      
_________________________________________________________________
dropout_5 (Dropout)          (None, 24)                0         
_________________________________________________________________
dense_9 (Dense)              (None, 10)                250       
_________________________________________________________________
dense_10 (Dense)             (None, 1)                 11        
Total params: 14,381
Trainable params: 14,301
Non-trainable params: 80
_________________________________________________________________
None


In [40]:
chkptr_bdlstm = ModelCheckpoint(filepath="5mcBDLSTM.h5", verbose=0, \
                               save_weights_only=False, save_best_only=True, period =5)
tic = time()
hist_bdls= model_bd_lstm.fit(X_train, y_train, verbose=2,
          batch_size= 128,
          callbacks=[chkptr_bdlstm, earlystop, TonNaN, rLR],
          epochs=20,validation_split= 0.2)
#validation_data=(X_test, y_test))
toc = time()
print("Model training took {} Secs".format ((toc-tic)))

Train on 53733 samples, validate on 13434 samples
Epoch 1/20
 - 8s - loss: 0.1728 - acc: 0.9281 - val_loss: 0.1677 - val_acc: 0.9317
Epoch 2/20
 - 8s - loss: 0.1760 - acc: 0.9262 - val_loss: 0.2176 - val_acc: 0.9104
Epoch 3/20
 - 8s - loss: 0.1733 - acc: 0.9278 - val_loss: 0.1657 - val_acc: 0.9336
Epoch 4/20
 - 8s - loss: 0.1714 - acc: 0.9291 - val_loss: 0.1652 - val_acc: 0.9330
Epoch 5/20
 - 8s - loss: 0.1752 - acc: 0.9277 - val_loss: 0.1659 - val_acc: 0.9332
Epoch 6/20
 - 8s - loss: 0.1809 - acc: 0.9254 - val_loss: 0.1678 - val_acc: 0.9320
Epoch 7/20
 - 8s - loss: 0.1696 - acc: 0.9307 - val_loss: 0.1657 - val_acc: 0.9331
Epoch 8/20
 - 8s - loss: 0.1681 - acc: 0.9313 - val_loss: 0.1633 - val_acc: 0.9338
Epoch 9/20
 - 8s - loss: 0.1685 - acc: 0.9306 - val_loss: 0.1640 - val_acc: 0.9335
Epoch 10/20
 - 8s - loss: 0.1865 - acc: 0.9238 - val_loss: 0.1674 - val_acc: 0.9323
Epoch 11/20
 - 8s - loss: 0.1704 - acc: 0.9311 - val_loss: 0.1626 - val_acc: 0.9343
Epoch 12/20
 - 8s - loss: 0.1669 - 

In [41]:
y_pred = model_bd_lstm.predict(X_test)
y_pred = np.where (y_pred < 0.5, 0, 1)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(metrics.accuracy_score(y_test,y_pred))
print(metrics.matthews_corrcoef(y_test,y_pred))
ypred_dict["BiDirectionLSTM"] = y_pred
ypredprob_dict["BiDirectionLSTM"] = model_bd_lstm.predict_proba(X_test)

[[12897  1512]
 [  429 13948]]
              precision    recall  f1-score   support

           0       0.97      0.90      0.93     14409
           1       0.90      0.97      0.93     14377

    accuracy                           0.93     28786
   macro avg       0.94      0.93      0.93     28786
weighted avg       0.94      0.93      0.93     28786

0.9325713888695893
0.8676130953612733


In [42]:
model_dict["BiDirection_LSTM"] = model_bd_lstm

In [59]:
ypred_dict["Ground_truth"] = y_test
ypredprob_dict["Ground_truth"] = y_test

model_dict_path = Path("5cm-model-dict21Mar14.jbl")
model_dict_path.touch(exist_ok=True)

with open(model_dict_path, "wb") as file_handle:
    joblib.dump(model_dict, file_handle)

pred_dict_path = Path("5cm-ypred-dict21Mar14.jbl")
pred_dict_path.touch(exist_ok=True)

with open(pred_dict_path, "wb") as file_handle:
    joblib.dump(ypred_dict,file_handle)

x_test_path = Path("5cm-xtest-21Mar14.jbl")
x_test_path.touch(exist_ok=True)
with open(x_test_path, "wb") as file_handle:
    joblib.dump(X_test,file_handle)

pred_dict_path = Path("5cm-ypredprob-dict21Mar14.jbl")
pred_dict_path.touch(exist_ok=True)
with open(pred_dict_path, "wb") as file_handle:
    joblib.dump(ypredprob_dict,file_handle)