# Bioinformatics Modeling

## Neural Network Builder - Tutorial 2 

### Contact: fjgreco@us.ibm.com

### Run the following if in CP4D

## Inline editing of code...

In [1]:
%%writefile new_neural_network.py
#%load /project_data/data_asset/new_neural_network8.py
import argparse
#import input_data
import os
import sys
import tensorflow 
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers 
from tensorflow.keras.layers import Conv1D, Dense, MaxPooling1D, Flatten,LSTM
from tensorflow.keras.optimizers import Adam


from tensorflow.keras.models import Sequential

def main():
    
    import os
    """
    cmdstring = 'pip install matplotlib'
    os.system(cmdstring)
    import matplotlib.pyplot as plt
    """
    parser = argparse.ArgumentParser()

    # environment variable when name starts with $
    parser.add_argument('--data_dir', type=str, default='$DATA_DIR',help='Directory with data')
    parser.add_argument('--result_dir', type=str, default='$RESULT_DIR',help='Directory with results')
    parser.add_argument('--sequences_file', type=str,default='sequences.txt',help='File name for sequences')
    parser.add_argument('--labels_file', type=str,default='labels.txt',help='File name for labels')
    parser.add_argument('--model_name', type=str,default='bioinformatics_model',help='neural model name')
    parser.add_argument('--lstm',type=bool,default=True,help='Include LSTM')
    parser.add_argument('--epochs',type=int,default=10,help='Number of epochs')
    parser.add_argument('--lr',type=float,default=0.01,help='Learning rate')
    parser.add_argument("--feature_shape",type=int,default=50,help='Feature shape')


    FLAGS, unparsed = parser.parse_known_args()

    print (FLAGS.result_dir)

    if (FLAGS.result_dir[0] == '$'):
        RESULT_DIR = os.environ[FLAGS.result_dir[1:]]
    else:
        RESULT_DIR = FLAGS.result_dir
        os.environ['RESULT_DIR']=FLAGS.result_dir

    #model_path = os.path.join(RESULT_DIR, 'model')
    #print(model_path)

    if (FLAGS.data_dir[0] == '$'):
        DATA_DIR = os.environ[FLAGS.data_dir[1:]]
    else:
        DATA_DIR = FLAGS.data_dir
        os.environ['DATA_DIR']=FLAGS.data_dir
        
    output_model_folder = os.environ["RESULT_DIR"]

    print("output model folder: ",output_model_folder)
    
    model_name=FLAGS.model_name
    
    history_filename  = model_name+"_history.p"
    print("history_filename: ",history_filename)
    
    cm_filename  = model_name+"_cm.p"
    print("cm_filename: ",cm_filename)
    
    h5_filename  = model_name+".h5"
    print("h5_filename: ",h5_filename)
    
    tar_filename = model_name+".tgz"
    print("tar_filename: ",tar_filename)
    
    model_weights = model_name + "_weights.h5"
    print("model_weights: ", model_weights)
    
    serialized_model = model_name + ".json"
    print("serialized_model: ", serialized_model)
   
    
    scoring_log = model_name + "_scoring.txt"
    
    loss_graph_pdf= model_name + "_loss.pdf"
    loss_graph_png = model_name + "_loss.png"
    print("loss_graph:",loss_graph_png)
    
    accuracy_graph_pdf = model_name + "_accuracy.pdf"
    accuracy_graph_png = model_name + "_accuracy.png"
    print("accuracy_graph:",accuracy_graph_png)
    
    
    #
    # Set training hyperparameters
    #
    
    epochs = FLAGS.epochs
    #epochs = 50
    lr     = FLAGS.lr
    #lr=  0.01
    lstm   = FLAGS.lstm
    feature_shape = FLAGS.feature_shape
    
    #
    # Print hyperparameters to stdout
    #
    
    print('\n')
    print("Number of epochs: ", epochs )
    print("Learning Rate:    ", lr)
    print("Include LSTM:     ", lstm )
    print("Feature Shape:    ", feature_shape )
   
    

    # Add data dir to file path
    sequences_file = os.path.join(DATA_DIR, FLAGS.sequences_file)
    
    labels_file = os.path.join(DATA_DIR, FLAGS.labels_file)
    
    #
    # One-hot encode feature data
    #
    
    with open(sequences_file,'r') as file: 
        raw_sequences=file.read()

    sequences=raw_sequences.split('\n')

    sequences = list(filter(None, sequences))  # Removes empty sequences.

    integer_encoder = LabelEncoder() 

    one_hot_encoder = OneHotEncoder(categories='auto')  
    
    input_features = []

    for sequence in sequences:
        integer_encoded = integer_encoder.fit_transform(list(sequence))
        integer_encoded = np.array(integer_encoded).reshape(-1, 1)
        one_hot_encoded = one_hot_encoder.fit_transform(integer_encoded)
        input_features.append(one_hot_encoded.toarray())


    np.set_printoptions(threshold=40)
    input_features = np.stack(input_features)
   
    print("Sequence 1\n-----------------------")
    print('DNA Sequence #1:\n',sequences[0][:10],'...',sequences[0][-10:])
    print('One hot encoding of Sequence #1:\n',input_features[0].T)

    #
    # One-hot encode labels
    #
    with open(labels_file,'r') as file: 
        raw_labels=file.read()

    labels=raw_labels.split('\n')

    labels = list(filter(None, labels))  # This removes empty sequences.

    one_hot_encoder = OneHotEncoder(categories='auto')
    labels = np.array(labels).reshape(-1, 1)
    input_labels = one_hot_encoder.fit_transform(labels).toarray()

    print('Labels:\n',labels.T)
    print('One-hot encoded labels:\n',input_labels.T)

    train_features, test_features, train_labels, test_labels = train_test_split(
        input_features, input_labels, test_size=0.25, random_state=42)
        
            
    #
    # Define the neural network model
    #  

    model = Sequential()
    model.add(Conv1D(filters=32, kernel_size=12, 
                 input_shape=(train_features.shape[1], 4)))
    model.add(MaxPooling1D(pool_size=4))
    if lstm == True:
            model.add(LSTM(feature_shape))
    model.add(Flatten())
    model.add(Dense(16, activation='relu'))
    model.add(Dense(2, activation='softmax'))
    
    opt = Adam(learning_rate=lr)

    model.compile(loss='binary_crossentropy', optimizer='adam', 
        metrics=['binary_accuracy'])
    model.summary()
    
    #
    # Train the model
    #
    
    history = model.fit(train_features, train_labels, 
            epochs=75,  verbose=0, validation_split=0.25)
    
    import pickle
    with open(history_filename, 'wb') as file_pi:
        pickle.dump(history.history, file_pi)
    
    cmdstring0 = 'cp ' + history_filename + ' '+  output_model_folder
    os.system(cmdstring0)
    
    #
    # Save model to the results storage
    #
    
    model.save( h5_filename ) 
    cmdstring1 = 'cp ' + h5_filename + ' '+  output_model_folder
    os.system(cmdstring1)

    cmdstring2 = 'tar -zcvf ' + tar_filename + ' ' + h5_filename
    os.system(cmdstring2)
    
    cmdstring22 = 'cp ' + tar_filename + ' '+  output_model_folder
    os.system(cmdstring22)
    
    
    #
    # Save the model definition to the results storage
    #
    model_json = model.to_json()
    with open(serialized_model, "w") as json_file:
        json_file.write(model_json)     
 
    cmdstring3 = 'cp ' + serialized_model + ' '+  output_model_folder
    os.system(cmdstring3)

    #
    # Save  trained model weights to the results storage
    #
    model.save_weights(model_weights)
    cmdstring4 = 'cp ' + model_weights + ' '+  output_model_folder
    os.system(cmdstring4)
    
    
    ## Produce and save a confusion matrix
    from sklearn.metrics import confusion_matrix
    #import itertools

    predicted_labels = model.predict(np.stack(test_features))
    cm = confusion_matrix(np.argmax(test_labels, axis=1), 
                          np.argmax(predicted_labels, axis=1))

    cm = cm.astype('float') / cm.sum(axis = 1)[:, np.newaxis]
    
    with open(cm_filename, 'wb') as file_pi:
        pickle.dump(cm, file_pi)
    
    cmdstringX = 'cp ' + cm_filename + ' '+  output_model_folder
    os.system(cmdstringX)
 
    scores = model.evaluate(test_features, test_labels, verbose=0)
    print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
 
if __name__ == '__main__':
    
    main()

Writing new_neural_network.py


### Test code locally (either on your desktop or Watson Studio)

In [2]:
!ls -al 

total 432
drwxr-xr-x  15 fjgreco  staff     480 Jun  6 21:26 [34m.[m[m
drwxr-xr-x@ 14 fjgreco  staff     448 Jun  6 21:21 [34m..[m[m
-rw-r--r--@  1 fjgreco  staff    6148 Jun  6 21:19 .DS_Store
drwxr-xr-x   3 fjgreco  staff      96 Jun  6 21:25 [34m.ipynb_checkpoints[m[m
drwxr-xr-x   5 fjgreco  staff     160 Jun  6 21:25 [34mDATA_DIR[m[m
-rw-r--r--   1 fjgreco  staff    8577 Jun  2 21:10 ICOS.py
drwxr-xr-x   3 fjgreco  staff      96 Jun  6 21:25 [34m__pycache__[m[m
-rw-r--r--   1 fjgreco  staff  100044 Jun  6 19:55 e2eai-bioinformatics-analysis(tutorial-4).ipynb
-rw-r--r--   1 fjgreco  staff   16326 Jun  6 21:25 e2eai-bioinformatics-assay(tutorial-1).ipynb
-rw-r--r--   1 fjgreco  staff   26032 Jun  6 21:15 e2eai-bioinformatics-neural_network_build(tutorial-2).ipynb
-rw-r--r--@  1 fjgreco  staff    1068 Jun  6 15:05 e2eai_credentials.json
-rw-r--r--   1 fjgreco  staff    7898 Jun  6 21:26 new_neural_network.py
drwxr-xr-x   4 fjgreco  staff     128 Jun  6 18:5

In [3]:
!mkdir DATA_DIR  # Local training directory copy

mkdir: DATA_DIR: File exists


In [4]:
!mkdir RESULT_DIR # Local results directory 

### Copy a subset of training data to TEST DATA DIRECTORY

In [5]:
!mkdir SUBSET_DATA_DIR # Local directory with subset of training data

In [6]:
max_rec=200
rec_count=0
with open('DATA_DIR/assay_data_full.lbl','r') as fi, open('SUBSET_DATA_DIR/assay_data_test.lbl', 'w') as fo:
    for line in fi:
        if rec_count==max_rec:
            break
        else:
            fo.write(line)
            rec_count += 1
rec_count=0           
with open('DATA_DIR/assay_data_full.seq','r') as fi, open('SUBSET_DATA_DIR/assay_data_test.seq', 'w') as fo:
    for line in fi:
        if rec_count==max_rec:
            break
        else:
            fo.write(line)
            rec_count += 1
                                                          

In [8]:
!ls SUBSET_DATA_DIR

assay_data_test.lbl assay_data_test.seq


#### Set environment variables that will be picked up by the payload program

In [9]:
import os
os.environ['RESULT_DIR']='RESULT_DIR'
os.environ['DATA_DIR']='SUBSET_DATA_DIR'   #Set DATA_DIR to TEST_DATA_DIR for local testing

In [10]:
!echo $DATA_DIR 

SUBSET_DATA_DIR


In [11]:
!echo $RESULT_DIR

RESULT_DIR


In [12]:
!ls DATA_DIR

assay_data_full.csv assay_data_full.lbl assay_data_full.seq


#### <font color=blue>Set parameters passed to the payload: </font>

<strong>labels_file</strong>and <strong>sequences_file</strong> must reference files in the DATA_DIR directory

<strong>feature_shape</strong> must patch the value coded in the payload

<strong>lstm</strong> determines whether the neural  network should include a LSTM layer

<strong>epochs</strong> determines the number of passes thru the training data

<strong>lr</strong> can be passed to set the learning rate.


In [13]:
!python3 new_neural_network.py --sequences_file assay_data_test.seq --labels_file assay_data_test.lbl --feature_shape=50 --epochs=10 --lstm=True

$RESULT_DIR
output model folder:  RESULT_DIR
history_filename:  bioinformatics_model_history.p
cm_filename:  bioinformatics_model_cm.p
h5_filename:  bioinformatics_model.h5
tar_filename:  bioinformatics_model.tgz
model_weights:  bioinformatics_model_weights.h5
serialized_model:  bioinformatics_model.json
loss_graph: bioinformatics_model_loss.png
accuracy_graph: bioinformatics_model_accuracy.png


Number of epochs:  10
Learning Rate:     0.01
Include LSTM:      True
Feature Shape:     50
Sequence 1
-----------------------
DNA Sequence #1:
 CGAGCCAATC ... TTGCGAGGAA
One hot encoding of Sequence #1:
 [[0. 0. 1. ... 0. 1. 1.]
 [1. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Labels:
 [['0' '0' '0' ... '0' '0' '1']]
One-hot encoded labels:
 [[1. 1. 1. ... 1. 1. 0.]
 [0. 0. 0. ... 0. 0. 1.]]
2021-06-06 21:27:45.020972: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7f9a0127eb80 initialized for platform Host (this does not guarantee that XLA will be 

## Create zip payload file

In [14]:
!cp new_neural_network.py neural_network_v8T.py

In [15]:
!mkdir tf_model_v8T

mkdir: tf_model_v8T: File exists


In [16]:
!cp neural_network_v8T.py tf_model_v8T/.

In [17]:
!zip -r tf_model_v8T.zip tf_model_v8T

updating: tf_model_v8T/ (stored 0%)
updating: tf_model_v8T/neural_network_v8T.py (deflated 69%)
updating: tf_model_v8T/.ipynb_checkpoints/ (stored 0%)


In [18]:
!ls -al

total 1640
drwxr-xr-x  24 fjgreco  staff     768 Jun  6 21:28 [34m.[m[m
drwxr-xr-x@ 14 fjgreco  staff     448 Jun  6 21:21 [34m..[m[m
-rw-r--r--@  1 fjgreco  staff    6148 Jun  6 21:19 .DS_Store
drwxr-xr-x   4 fjgreco  staff     128 Jun  6 21:27 [34m.ipynb_checkpoints[m[m
drwxr-xr-x   5 fjgreco  staff     160 Jun  6 21:25 [34mDATA_DIR[m[m
-rw-r--r--   1 fjgreco  staff    8577 Jun  2 21:10 ICOS.py
drwxr-xr-x   8 fjgreco  staff     256 Jun  6 21:27 [34mRESULT_DIR[m[m
drwxr-xr-x   4 fjgreco  staff     128 Jun  6 21:27 [34mSUBSET_DATA_DIR[m[m
drwxr-xr-x   3 fjgreco  staff      96 Jun  6 21:25 [34m__pycache__[m[m
-rw-r--r--   1 fjgreco  staff  270440 Jun  6 21:27 bioinformatics_model.h5
-rw-r--r--   1 fjgreco  staff    2703 Jun  6 21:27 bioinformatics_model.json
-rw-r--r--   1 fjgreco  staff  218334 Jun  6 21:27 bioinformatics_model.tgz
-rw-r--r--   1 fjgreco  staff     189 Jun  6 21:27 bioinformatics_model_cm.p
-rw-r--r--   1 fjgreco  staff    8229 Jun  6

##  <font color=green>Proceed to running wml-v4--bioinformatics-neural_network_train(tutorial).ipynb...</font>