In [None]:
# Quiet TensorFlow's native logging. The variable is exported in the first
# cell, ahead of the TensorFlow import that happens further down.
import os
import sys
os.environ.update({'TF_CPP_MIN_LOG_LEVEL': '3'})
# Display the Python interpreter version used by this kernel:
sys.version

In [None]:
# Check GPU usage (runs the `nvidia-smi` shell command through IPython's `!` escape):
!nvidia-smi

In [None]:
# Pick GPU:
# CUDA_VISIBLE_DEVICES is exported first, then TensorFlow is imported and
# asked how many GPUs it can see.
import os
GPU_id = '2'
os.environ["CUDA_VISIBLE_DEVICES"] = GPU_id
import tensorflow as tf
visible_gpus = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(visible_gpus))

In [None]:
# Limit GPU memory growth:
import tensorflow as tf
physical_devices = tf.config.list_physical_devices('GPU')
# Enable memory growth on every visible GPU. Looping (instead of indexing
# `[0]`) also keeps this cell from raising IndexError when no GPU is visible.
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, True)
if not physical_devices:
    print("No GPU is visible to TensorFlow; memory growth was not configured.")
# Check gpu:
physical_devices

In [None]:
# Check number of CPU cores:
import multiprocessing
# Last expression of the cell, so the notebook displays the returned count.
multiprocessing.cpu_count()

In [None]:
# Define inputs (one table per data split):
train_tsv = 'L5_log2expression_train_220528.tsv' # training data
val_tsv = 'L5_log2expression_val_220528.tsv' # validation data
test_tsv = 'L5_log2expression_test_220528.tsv' # test data

# Load data:
import numpy as np
import pandas as pd
# Read the three tables in train / validation / test order.
train_df, val_df, test_df = (
    pd.read_table(tsv_path) for tsv_path in (train_tsv, val_tsv, test_tsv)
)
# Preview the training table:
train_df

In [None]:
# Preview the validation table read from val_tsv:
val_df

In [None]:
# Preview the test table read from test_tsv:
test_df

In [None]:
# Convert letters to indexes:
vocab = ['pad','N','A','T','C','G'] # 'pad' has to be first (0)
# Forward lookup (letter -> index) and reverse lookup (index -> letter):
char2idx = dict(zip(vocab, range(len(vocab))))
idx2char = np.array(vocab)
def vectorize_string(string):
    """Return the list of vocabulary indexes for the characters of `string`.

    Every character is looked up in `char2idx`; a character that is not a
    key of that mapping raises KeyError. An empty string gives an empty list.
    """
    indexes = []
    for letter in string:
        indexes.append(char2idx[letter])
    return indexes

# Add an 'Nni' column to each split holding the index-encoded 'Nn' sequence:
for split_df in (train_df, val_df, test_df):
    split_df['Nni'] = split_df['Nn'].apply(vectorize_string)

# Array x, empty spaces will be assigned as 0 ('pad'):
from tensorflow.keras.preprocessing.sequence import pad_sequences
# N is the length of the longest encoded sequence in the training split.
# NOTE(review): N ignores the validation/test splits; with pad_sequences'
# default truncating='pre', a longer sequence there would lose its leading
# positions. Confirm no such sequences exist.
N = max(len(encoded) for encoded in train_df['Nni'])
# Specify x: every split is right-padded ('post') to the same width N.
train_x = pad_sequences(train_df.Nni, maxlen=N, padding='post')
val_x = pad_sequences(val_df.Nni, maxlen=N, padding='post')
test_x = pad_sequences(test_df.Nni, maxlen=N, padding='post')

train_x

In [None]:
# Padded index array for the validation split:
val_x

In [None]:
# Padded index array for the test split:
test_x

In [None]:
# Prepare y:
# Two target columns, in this order: column 0 = 'nuc.log2expression',
# column 1 = 'cyt.log2expression'.
train_y = train_df.loc[:, ['nuc.log2expression', 'cyt.log2expression']].to_numpy()
train_y

In [None]:
# Validation targets: column 0 = 'nuc.log2expression', column 1 = 'cyt.log2expression'.
val_y = val_df.loc[:, ['nuc.log2expression', 'cyt.log2expression']].to_numpy()
val_y

In [None]:
# Test targets: column 0 = 'nuc.log2expression', column 1 = 'cyt.log2expression'.
test_y = test_df.loc[:, ['nuc.log2expression', 'cyt.log2expression']].to_numpy()
test_y

In [None]:
# LSTM:
# NOTE: this cell and the CNN cell below both assign `Model` and
# `output_model`; the training cell uses whichever of the two ran last.
import keras
from keras.models import Sequential
from keras.layers import (
    Embedding,
    BatchNormalization,
    LSTM,
    Dropout,
    Dense,
    Flatten,
)

# Initiate the model:
Model = Sequential()
# Input layer: embeds each vocabulary index into `embed_dim` values.
embed_dim = 5
Model.add(Embedding(input_length=train_x.shape[1], input_dim=len(vocab), output_dim=embed_dim))
# Remaining layers, added in order:
#   two stacked LSTMs that both return the full sequence, dropout,
#   flatten + dense block with dropout, then a 2-unit linear output.
for layer_factory in (
    lambda: LSTM(64, return_sequences=True),
    lambda: LSTM(32, return_sequences=True),
    lambda: Dropout(0.5),
    lambda: Flatten(),
    lambda: Dense(64, activation='relu'),
    lambda: Dropout(0.5),
    lambda: Dense(2, activation='linear'),
):
    Model.add(layer_factory())

# Define optimizer (Adam, mean-absolute-error loss):
learning_rate = 0.5**11
optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
Model.compile(optimizer=optimizer, loss='mae')

# Name the model (path the best checkpoint is saved to):
output_model = 'models/L5-220528_em5>LSTM64x32*0.5>64*0.5-rep9.hdf5'

# Show architecture:
Model.summary()

In [None]:
# linear CNN:
# NOTE: running this cell replaces any `Model` / `output_model` defined by an
# earlier model cell; the training cell uses whichever ran last.
import keras
from keras.models import Sequential
from keras.layers import (
    Embedding,
    Dense,
    Flatten,
    Dropout,
    Conv1D,
    GlobalAveragePooling1D,
    MaxPooling1D,
)

# Initiate the model:
Model = Sequential()
# Input layer: embeds each vocabulary index into `embed_dim` values.
embed_dim = 4
Model.add(Embedding(input_dim=len(vocab), output_dim=embed_dim, input_length=train_x.shape[1]))
# Remaining layers, added in order:
#   256 convolution filters spanning 8 positions (8-mers), stride 1,
#   averaged over the sequence axis (feature counts), then a 2-unit linear output.
for layer_factory in (
    lambda: Conv1D(256, kernel_size=8, strides=1, activation='relu'),
    lambda: GlobalAveragePooling1D(),
    lambda: Dense(2, activation='linear'),
):
    Model.add(layer_factory())

# Define optimizer (Adam, mean-absolute-error loss):
learning_rate = 0.5**4
optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
Model.compile(optimizer=optimizer, loss='mae')

# Name the model (path the best checkpoint is saved to):
output_model = 'models/L5-220528_CNN8*256>GAP-rep4.hdf5'

# Show architecture:
Model.summary()

In [None]:
# Check GPU usage again now that a model has been built (shell command via `!`):
!nvidia-smi

In [None]:
# Set training:
batch_size = 65536
add_epoch = 4096
# Set recording:
Best_model_path = output_model
import os
from keras.callbacks import ModelCheckpoint
# Create the checkpoint's parent folder up front so the first save cannot
# fail on a missing directory (nothing else in the notebook creates it).
checkpoint_dir = os.path.dirname(Best_model_path)
if checkpoint_dir:
    os.makedirs(checkpoint_dir, exist_ok=True)
# Keep only the weights with the lowest validation loss seen so far.
checkpoint = ModelCheckpoint(Best_model_path, monitor='val_loss', verbose=1,
                             save_best_only=True, mode='min')
callbacks_list = [checkpoint]
# Perform training (trains whichever `Model` was defined most recently):
history = Model.fit(train_x, train_y, validation_data=(val_x, val_y), shuffle=True,
                    callbacks=callbacks_list, batch_size=batch_size, epochs=add_epoch)

In [None]:
# Plot training history:
import matplotlib.pyplot as plt
train_loss = history.history['loss']
val_loss = history.history['val_loss']
plt.figure(figsize=(8,8))
plt.plot(train_loss)
plt.plot(val_loss)
plt.title('Training history',fontsize=15)
plt.ylabel('Loss', fontsize=15)
plt.xlabel('Epoch', fontsize=15)
plt.legend(['Train', 'Val'], loc='upper right')
# Dashed guides through each curve's minimum: a horizontal line at the lowest
# loss value and a vertical line at the epoch index where it occurred.
for loss_curve, guide_color in ((val_loss, 'tab:orange'), (train_loss, 'tab:blue')):
    plt.axhline(y=min(loss_curve), color=guide_color, linestyle='--', linewidth=0.5)
    plt.axvline(x=np.argmin(loss_curve), color=guide_color, linestyle='--', linewidth=0.5)
plt.show()

In [None]:
# Path given to the ModelCheckpoint callback (read back by load_model below):
Best_model_path

In [None]:
# train mae:
from keras.models import load_model
from sklearn.metrics import mean_absolute_error
# Reload the checkpoint saved during training and score it on the training inputs.
BestModel = load_model(Best_model_path)
pred_y = BestModel.predict(train_x)
train_mae = mean_absolute_error(y_true=train_y, y_pred=pred_y)
train_mae

In [None]:
# validation mae:
from sklearn.metrics import mean_absolute_error
# `pred_y` is overwritten with the validation predictions here.
pred_y = BestModel.predict(val_x)
val_mae = mean_absolute_error(y_true=val_y, y_pred=pred_y)
val_mae

In [None]:
# validation mse:
# Uses the `pred_y` left by the validation-MAE cell, so run that cell first.
from sklearn.metrics import mean_squared_error
val_mse = mean_squared_error(y_true=val_y, y_pred=pred_y)
val_mse

In [None]:
# predict test data:
pred_y = BestModel.predict(test_x)
# Plot nuc.log2expression comparison:
import numpy
import matplotlib.pyplot as plt
# Column 0 of both arrays holds the nuc.log2expression values.
measured_nuc = test_y[:,0]
predicted_nuc = pred_y[:,0]
plt.figure(figsize=(8, 8))
a = plt.axes(aspect='equal')
plt.scatter(measured_nuc, predicted_nuc, s=0.7, c='black')
plt.xlabel('Measured', fontsize=17)
plt.ylabel('Predicted', fontsize=17)
# Use one common range for both axes, wide enough for measured and predicted values.
test_y_max, test_y_min = max(measured_nuc), min(measured_nuc)
pred_y_max, pred_y_min = float(max(predicted_nuc)), float(min(predicted_nuc))
the_max = max(test_y_max, pred_y_max)
the_min = min(test_y_min, pred_y_min)
lims = [the_min, the_max]
plt.xlim(lims)
plt.ylim(lims)
# Diagonal reference line where predicted equals measured:
_ = plt.plot(lims, lims)

In [None]:
# R2:
# Measured values are the reference (y_true), model output is y_pred.
from sklearn import metrics
metrics.r2_score(y_true=test_y[:,0], y_pred=pred_y[:,0])

In [None]:
# R2 with the roles swapped: predictions are passed as y_true and measurements
# as y_pred. R2 is not symmetric, so this value generally differs from the
# score computed with the measured values as y_true.
metrics.r2_score(y_true=pred_y[:,0], y_pred=test_y[:,0])

In [None]:
# Plot cyt.log2expression comparison:
import numpy
import matplotlib.pyplot as plt
# Column 1 of both arrays holds the cyt.log2expression values; `pred_y` is
# whatever the most recent predict cell produced.
measured_cyt = test_y[:,1]
predicted_cyt = pred_y[:,1]
plt.figure(figsize=(8, 8))
a = plt.axes(aspect='equal')
plt.scatter(measured_cyt, predicted_cyt, s=0.7, c='black')
plt.xlabel('Measured', fontsize=17)
plt.ylabel('Predicted', fontsize=17)
# Use one common range for both axes, wide enough for measured and predicted values.
test_y_max, test_y_min = max(measured_cyt), min(measured_cyt)
pred_y_max, pred_y_min = float(max(predicted_cyt)), float(min(predicted_cyt))
the_max = max(test_y_max, pred_y_max)
the_min = min(test_y_min, pred_y_min)
lims = [the_min, the_max]
plt.xlim(lims)
plt.ylim(lims)
# Diagonal reference line where predicted equals measured:
_ = plt.plot(lims, lims)

In [None]:
# R2:
# Measured values are the reference (y_true), model output is y_pred.
from sklearn import metrics
metrics.r2_score(y_true=test_y[:,1], y_pred=pred_y[:,1])

In [None]:
# R2 with the roles swapped: predictions are passed as y_true and measurements
# as y_pred. R2 is not symmetric, so this value generally differs from the
# score computed with the measured values as y_true.
metrics.r2_score(y_true=pred_y[:,1], y_pred=test_y[:,1])