In [None]:
import os
import sys
sys.path.append('../../Utils')
from metrics import compute_metrics

import sklearn.metrics as metrics
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from kerastuner.tuners import RandomSearch
import kerastuner

import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Locate the dataset relative to the notebook's working directory.
# All components go through os.path.join so no doubled separator ("Data//29Atlas")
# ends up in the resulting path.
current_dir = os.getcwd()
data_dir = os.path.join(current_dir, '../../../Data/')
data_path = os.path.join(data_dir, '29Atlas', 'protein_embeddings', 'nn_dataset.tsv')

Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.
Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.


In [None]:
# Restrict TensorFlow to a single GPU, selected by its index in the list of
# physical GPUs. If that index does not exist (e.g. a CPU-only machine) the
# visibility setting is skipped instead of failing with an IndexError, and
# TensorFlow keeps its default device placement.
device = 0
physical_devices = tf.config.experimental.list_physical_devices('GPU')
gpu_available = 0 <= device < len(physical_devices)
if gpu_available:
    tf.config.experimental.set_visible_devices(physical_devices[device], 'GPU')
print(f'TF eager execution: {tf.executing_eagerly()}')
if gpu_available:
    print(f'Using device {physical_devices[device]}')
else:
    print(f'GPU index {device} not available ({len(physical_devices)} GPU(s) detected); using default device')

### Dataset import and formatting

In [None]:
# The table stores protein and mrna as log10 values and protein_length as log2.
# Each column is exponentiated back with its own base, shifted by a pseudocount
# of 1 and re-expressed in log2, so all three share the same coordinates
# (per the original note: the ones used by the manifold kernel).
atlas_data = pd.read_csv(data_path, sep='\t', index_col=0).astype(np.float32)
for column, base in (('protein', 10), ('mrna', 10), ('protein_length', 2)):
    atlas_data[column] = np.log2(base**atlas_data[column] + 1)
print(atlas_data.shape)
atlas_data.head()

In [None]:
ZSCORE = True

# Three successive random splits, each with its own seed:
#   1. 80% train / 20% held out
#   2. the held-out part is halved into test and val
#   3. 5% of test is carved off as a tiny "overfit" set
SEED = 10
train, holdout = train_test_split(atlas_data, test_size=0.2, random_state=SEED)
SEED = 42
test, val = train_test_split(holdout, test_size=0.5, random_state=SEED)
SEED = 82
test, overfit = train_test_split(test, test_size=0.05, random_state=SEED)

def zscore(train_df):
    """Standardise every column of a DataFrame to zero mean and unit variance.

    Parameters
    ----------
    train_df : pd.DataFrame
        Frame whose column-wise statistics define the transformation.

    Returns
    -------
    zscored : pd.DataFrame
        ``(train_df - means) / stds``, column by column.
    means : pd.Series
        Per-column means of ``train_df``.
    stds : pd.Series
        Per-column standard deviations actually used for scaling. Columns whose
        standard deviation is zero or undefined (constant column, single row)
        are scaled by 1 instead, so they come out as 0 rather than NaN/inf and
        the same ``stds`` can safely be reused on other splits.
    """
    assert isinstance(train_df,pd.DataFrame)
    means = train_df.mean(axis=0)
    stds = train_df.std(axis=0)
    # A non-positive or NaN std would turn the whole column into NaN/inf.
    stds = stds.where(stds > 0, 1.0)
    zscored = (train_df-means)/stds
    return zscored, means, stds

if ZSCORE:
    print('Data is z-scored')
    # Statistics come from the training split only; every other split is
    # transformed with those same means and standard deviations.
    train, train_mean, train_std = zscore(train)
    val, test, overfit = (
        (split - train_mean) / train_std for split in (val, test, overfit)
    )

In [None]:
BATCH = 64


def _features_and_target(frame, target='protein'):
    """Return (features, target) arrays: every column but `target`, and `target` as a single-column 2-D array."""
    return frame.drop(columns=target).values, frame[[target]].values


x_train, y_train = _features_and_target(train)
x_val, y_val = _features_and_target(val)
x_test, y_test = _features_and_target(test)
x_overfit, y_overfit = _features_and_target(overfit)

print(f'train dataset size: {x_train.shape}')
print(f'validation dataset size: {x_val.shape}')
print(f'test dataset size: {x_test.shape}')
print(f'overfit dataset size: {x_overfit.shape}')

# Training-style datasets are shuffled with a buffer spanning the whole split
# and then batched explicitly with BATCH.
trn_dataset = (
    tf.data.Dataset.from_tensor_slices((x_train, y_train))
    .shuffle(buffer_size=len(x_train))
    .batch(BATCH)
)

overfit_dataset = (
    tf.data.Dataset.from_tensor_slices((x_overfit, y_overfit))
    .shuffle(buffer_size=len(x_overfit))
    .batch(BATCH)
)

# The validation dataset keeps its order; it is only batched.
val_dataset = tf.data.Dataset.from_tensor_slices((x_val, y_val)).batch(BATCH)

In [25]:
def build_model(hp):
    """Build and compile a one-hidden-layer regression MLP for a tuner trial.

    Parameters
    ----------
    hp
        Hyperparameter container supplied by the tuner. Two values are drawn
        from it: ``units`` (integer, 8..128 in steps of 4) for the hidden layer
        width and ``learning_rate`` (one of 1e-2, 1e-3, 1e-4) for Adam.

    Returns
    -------
    A compiled ``keras.Sequential`` model with a ReLU hidden layer and a single
    linear output, trained on MSE and reporting MSE as its metric.
    """
    hidden_units = hp.Int('units', min_value=8, max_value=128, step=4)
    model = keras.Sequential([
        layers.Dense(units=hidden_units, activation='relu'),
        layers.Dense(1, activation='linear'),
    ])

    learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate),
        loss='mse',
        metrics=['mse'],
    )
    return model

In [69]:
# Random search over build_model's hyperparameters, minimising validation MSE.
# Metric keys noted in the original cell (presumably from a training history):
# loss, mse, val_loss, val_mse.
# Tuner state is stored under my_dir/helloworld2.
val_mse_objective = kerastuner.Objective("val_mse", direction="min")
tuner = RandomSearch(
    build_model,
    objective=val_mse_objective,
    max_trials=3,
    executions_per_trial=3,
    directory='my_dir',
    project_name='helloworld2',
)

INFO:tensorflow:Reloading Oracle from existing project my_dir/helloworld2/oracle.json
INFO:tensorflow:Reloading Tuner from my_dir/helloworld2/tuner0.json


In [46]:
# Show the tuner's summary of the hyperparameter search space.
tuner.search_space_summary()

In [70]:
# Run the hyperparameter search: each trial trains for up to 5 epochs and stops
# early if val_loss has not improved for 3 epochs.
# NOTE(review): the models are fitted on the validation split and validated on
# the small "overfit" split, not on train/val. This looks like a quick smoke
# test; confirm before relying on the selected hyperparameters.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
tuner.search(
    x_val,
    y_val,
    epochs=5,
    validation_data=(x_overfit, y_overfit),
    callbacks=[early_stopping],
    verbose=0,
)

INFO:tensorflow:Oracle triggered exit


In [68]:
# Hyperparameter values of the single best trial recorded by the tuner's oracle.
best_trial = tuner.oracle.get_best_trials(num_trials=1)[0]
best_trial.hyperparameters.values

{'units': 40, 'learning_rate': 0.001}

In [6]:
class General_MLP(keras.Model):
    """Multi-layer perceptron: ReLU-activated hidden Dense layers followed by a
    final Dense layer without activation."""

    def __init__(self,hidden_nodes,input_shape=None):
        '''
        hidden_nodes - sizes of all layers after the input, in order; the last
            entry is the output size (that layer has no activation, all
            preceding ones use ReLU). Must contain at least one entry.
        input_shape - optional; when given, the model is built immediately for
            that shape. Otherwise the network takes its input shape from the
            first tensor passed to it.

        Raises ValueError if hidden_nodes is empty.
        '''
        super().__init__()
        if len(hidden_nodes) == 0:
            raise ValueError('hidden_nodes must contain at least the output layer size')

        # NOTE: layer order is kept exactly as given; weights saved for this
        # model elsewhere presumably rely on it.
        self.mlp_layers = []
        for nodes in hidden_nodes[:-1]:
            self.mlp_layers.append(keras.layers.Dense(nodes, activation='relu'))

        # Output layer: linear (no activation).
        self.mlp_layers.append(keras.layers.Dense(hidden_nodes[-1]))

        if input_shape is not None:
            self.build(input_shape)

    def call(self, inputs, training=False):
        """Pass `inputs` through every layer in order and return the result.

        `training` is accepted for Keras API compatibility; it is not forwarded
        because the Dense layers used here behave the same in both modes.
        """
        outputs = inputs
        for layer in self.mlp_layers:
            outputs = layer(outputs)
        return outputs

In [15]:
# Build the network for 66 input features: one 64-unit ReLU layer followed by a
# 16-unit linear layer, then print its layer/parameter summary.
# NOTE(review): the checkpoint loaded later is named "..._64_16_1", which hints
# at a third, 1-unit layer - confirm that [64, 16] is the intended architecture.
atlas_mlp = General_MLP(hidden_nodes=[64, 16], input_shape=(1, 66))
atlas_mlp.summary()

Model: "general_mlp_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              multiple                  4288      
_________________________________________________________________
dense_4 (Dense)              multiple                  1040      
Total params: 5,328
Trainable params: 5,328
Non-trainable params: 0
_________________________________________________________________


In [16]:
# Restore previously saved weights for the run identified by `title`.
title = 'training_170_epochs_64_16_1'
weights_path = f'results/{title}_modelweights'
atlas_mlp.load_weights(weights_path)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f43d0ae21d0>

In [19]:
# Display the model's weight variables (kernels and biases) to inspect the loaded values.
# NOTE(review): this prints every array in full; showing only names and shapes
# would keep the notebook output compact.
atlas_mlp.weights

[<tf.Variable 'dense_3/kernel:0' shape=(66, 64) dtype=float32, numpy=
 array([[ 0.20449115, -0.44187957, -0.11918718, ...,  0.05622132,
          0.1136036 , -0.11823564],
        [-0.9564396 ,  0.71712494, -0.93131465, ...,  0.09455172,
          0.9911818 ,  0.05199847],
        [ 0.15582177, -0.15699957, -0.77588505, ..., -0.47392476,
          0.9260752 , -0.65149885],
        ...,
        [ 0.05187403, -0.591617  ,  0.90841854, ...,  0.36526102,
          0.22045124,  0.00195558],
        [ 0.7492621 , -0.33667684, -0.29802823, ..., -0.37556186,
         -0.64347845,  0.9877293 ],
        [ 0.11848468, -0.4664278 , -0.33216605, ...,  0.6089894 ,
         -0.6277354 , -0.41025424]], dtype=float32)>,
 <tf.Variable 'dense_3/bias:0' shape=(64,) dtype=float32, numpy=
 array([-0.9943621 , -0.9248907 , -0.23892032, -0.3871424 ,  0.292954  ,
        -0.4845467 , -0.22052644,  0.5970853 , -0.7791493 , -1.3753949 ,
        -1.5173056 , -0.3782626 , -0.711524  , -0.6430455 , -0.84698635,
   