In [1]:
# Run configuration.
# MODEL selects the sub-directory the dataset splits are read from and also names the
# directory the trained model is saved to at the end of this cell.
MODEL = '6'
BENCHMARKS_DIR = './train_sets/full_valid/' + MODEL
# File-name prefix of the <BENCHMARK_NAME>.train.csv / .valid.csv / .test.csv files.
BENCHMARK_NAME = 'bass_pb40'
import os
# Hide every CUDA device from TensorFlow so the run stays on the CPU (the recorded output
# of this cell prints an empty GPU list). Keep this assignment above `import tensorflow`:
# the variable is presumably read when TensorFlow initialises CUDA -- confirm before reordering.
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
import tensorflow as tf
# Sanity output: the TensorFlow version in use and the GPUs it can see.
print(tf.__version__)
print(tf.config.list_physical_devices('GPU'))
# NOTE(review): pickle, train_test_split, RandomOverSampler and gc are not referenced in the
# visible part of this notebook; they may be needed by cells outside this view.
import pickle
import pandas as pd
from IPython.display import display
from tensorflow import keras
from sklearn.model_selection import train_test_split
from proteinbert import OutputType, OutputSpec, FinetuningModelGenerator, load_pretrained_model, finetune, evaluate_by_len
from proteinbert.conv_and_global_attention_model import get_model_with_hidden_layers_as_outputs
from imblearn.over_sampling import RandomOverSampler
import gc

# Output specification for the fine-tuning head: a binary target with the classes 0 and 1.
# NOTE(review): the first OutputType argument is False, which in proteinbert appears to be
# `is_seq` -- i.e. one global label per sequence rather than a local (per-residue) output.
# That is consistent with the evaluation in this cell, whose confusion matrix counts add up
# to the number of test records. Confirm against proteinbert's OutputType definition.
OUTPUT_TYPE = OutputType(False, 'binary')
UNIQUE_LABELS = [0, 1]
OUTPUT_SPEC = OutputSpec(OUTPUT_TYPE, UNIQUE_LABELS)


# Loading the dataset


def _read_clean_split(csv_path):
    """Read one dataset split from CSV, drop rows with any missing value, then drop duplicate rows."""
    frame = pd.read_csv(csv_path)
    frame = frame.dropna()
    return frame.drop_duplicates()


# One CSV per split, all stored under BENCHMARKS_DIR and prefixed with BENCHMARK_NAME.
train_set_file_path, valid_set_file_path, test_set_file_path = (
    os.path.join(BENCHMARKS_DIR, name_pattern % BENCHMARK_NAME)
    for name_pattern in ('%s.train.csv', '%s.valid.csv', '%s.test.csv')
)

train_set = _read_clean_split(train_set_file_path)
valid_set = _read_clean_split(valid_set_file_path)
test_set = _read_clean_split(test_set_file_path)

# Report the split sizes after cleaning.
print(f'{len(train_set)} training set records, {len(valid_set)} validation set records, {len(test_set)} test set records.')

# Loading the pre-trained model and fine-tuning it on the loaded dataset

# Seed the Python, NumPy and TensorFlow random generators before any model is built.
# Without this, weight initialisation, dropout and batch shuffling differ on every
# "Restart Kernel -> Run All", so the metrics reported below cannot be reproduced.
# (Repeatability still depends on the underlying ops being deterministic.)
RANDOM_SEED = 0
keras.utils.set_random_seed(RANDOM_SEED)

pretrained_model_generator, input_encoder = load_pretrained_model()

# get_model_with_hidden_layers_as_outputs gives the model output access to the hidden layers (on top of the output)
model_generator = FinetuningModelGenerator(
    pretrained_model_generator,
    OUTPUT_SPEC,
    pretraining_model_manipulation_function=get_model_with_hidden_layers_as_outputs,
    dropout_rate=0.5,
)

training_callbacks = [
    # Multiply the learning rate by 0.25 after one epoch without improvement, never going
    # below 1e-05 (Keras monitors `val_loss` by default).
    keras.callbacks.ReduceLROnPlateau(patience=1, factor=0.25, min_lr=1e-05, verbose=1),
    # Stop after three epochs without improvement and roll back to the best weights seen.
    keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True),
    # Write TensorBoard logs to ./logs: weight histograms every epoch, scalars every 100 batches.
    keras.callbacks.TensorBoard(log_dir='./logs', histogram_freq=1, update_freq=100),
]

# seq_len = 42: the run log reports filtering records longer than 40, i.e. seq_len - 2;
# presumably the two extra positions hold proteinbert's start/end tokens -- confirm.
# begin_with_frozen_pretrained_layers = False: the log goes straight to training the entire
# model, so lr_with_frozen_pretrained_layers is presumably unused here -- confirm.
# n_final_epochs = 0: final_seq_len and final_lr are presumably unused as well -- confirm
# against proteinbert.finetune.
finetune(
    model_generator,
    input_encoder,
    OUTPUT_SPEC,
    train_set['seq'],
    train_set['label'],
    valid_set['seq'],
    valid_set['label'],
    seq_len=42,
    batch_size=32,
    max_epochs_per_stage=40,
    lr=1e-04,
    begin_with_frozen_pretrained_layers=False,
    lr_with_frozen_pretrained_layers=1e-02,
    n_final_epochs=0,
    final_seq_len=1024,
    final_lr=1e-05,
    callbacks=training_callbacks,
)


# Evaluating the performance on the test-set

# Score the fine-tuned generator on the held-out split, using the same sequence length
# and batch size as the fine-tuning call.
results, confusion_matrix = evaluate_by_len(
    model_generator,
    input_encoder,
    OUTPUT_SPEC,
    test_set['seq'],
    test_set['label'],
    start_seq_len=42,
    start_batch_size=32,
)

# Show each report under its own heading, rendered with IPython's rich display.
for heading, report in (
    ('Test-set performance:', results),
    ('Confusion matrix:', confusion_matrix),
):
    print(heading)
    display(report)

# Build a Keras model from the generator at the sequence length used above
# (presumably carrying the fine-tuned weights held by model_generator -- confirm),
# then write it to a directory named after MODEL.
model = model_generator.create_model(seq_len=42)

model_save_path = "./proteinbert_models/proteinBERT_full/" + MODEL
model.save(model_save_path)

2.10.1
[]
365194 training set records, 73032 validation set records, 73032 test set records.
[2023_07_19-23:44:21] Training set: Filtered out 0 of 365194 (0.0%) records of lengths exceeding 40.
[2023_07_19-23:44:24] Validation set: Filtered out 0 of 73032 (0.0%) records of lengths exceeding 40.
Clearing gpu memory...
[2023_07_19-23:44:24] Training the entire fine-tuned model...


  super().__init__(name, **kwargs)


Layer GlobalAttention has arguments ['n_heads', 'd_key', 'd_value']
in `__init__` and therefore must override `get_config()`.

Example:

class CustomLayer(keras.layers.Layer):
    def __init__(self, arg1, arg2):
        super().__init__()
        self.arg1 = arg1
        self.arg2 = arg2

    def get_config(self):
        config = super().get_config()
        config.update({
            "arg1": self.arg1,
            "arg2": self.arg2,
        })
        return config
Epoch 1/40
Epoch 2/40
Epoch 3/40

Epoch 3: ReduceLROnPlateau reducing learning rate to 2.499999936844688e-05.
Epoch 4/40

Epoch 4: ReduceLROnPlateau reducing learning rate to 1e-05.
Epoch 5/40


  super().__init__(name, **kwargs)


Test-set performance:


Unnamed: 0_level_0,# records,AUC
Model seq len,Unnamed: 1_level_1,Unnamed: 2_level_1
42,73032,0.991017
All,73032,0.991017


Confusion matrix:


Unnamed: 0,0,1
0,72863,9
1,32,128


  super().__init__(name, **kwargs)


INFO:tensorflow:Assets written to: ./proteinbert_models/proteinBERT_full/6\assets


INFO:tensorflow:Assets written to: ./proteinbert_models/proteinBERT_full/6\assets
