# ¿Que más se podría hacer?

- Test con data augmentation y la LSTM
- Usar los tokenizadores usados en los transformers para ver como funcionan
- TSNE de embeddings y analizar si hay interpretación
- Probar con otras encimas o proteinas
- Usar los embeddings entrenados para analizar resultados de proteinas o encimas con menos data
- Entrenar una red neuronal con los features (fingerprints por ejemplo) y comparar los resultados con los embeddings


# Tome cualquiera de estas propuestas o alguna suya y desarrolle

# Elección: TSNE de embeddings y analizar si hay interpretación

## Preparacion e inicializacion

In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, LSTM, Embedding, Bidirectional, Activation, Input, Conv1D, MaxPool1D, Concatenate, Flatten
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras import backend as K
from datagen import smiles_dict, DataGenerator

In [2]:
df = pd.read_csv('data/acetylcholinesterase_02_bioactivity_data_preprocessed.csv')

In [3]:
max_len_idx = df['canonical_smiles'].apply(len).argmax()
min_len_idx = df['canonical_smiles'].apply(len).argmin()
max_sequence_len = len(df['canonical_smiles'].iloc[max_len_idx]) + 20
X = df['canonical_smiles'].values
y = df['pIC50'].values
vocab_size = len(smiles_dict)

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Genera

In [16]:
dgen_train = DataGenerator(X_train, y_train, seq_length=max_sequence_len, batch_size=128, data_augmentation=True)

dgen_test = DataGenerator(X_test, y_test, seq_length=max_sequence_len, batch_size=128, data_augmentation=False)

In [6]:
## Red

In [7]:
def R2(y_true, y_pred):
    SS_res =  K.sum(K.square( y_true - y_pred ))
    SS_tot = K.sum(K.square( y_true - K.mean(y_true) ) )
    return ( 1 - SS_res/(SS_tot + K.epsilon()) )

In [8]:
# Hyperparametros
learning_rate=0.01 #0.01   0.05    1e-3
epochs = 130     #50    30
batch_size=128     #64      32

lstm = 100 #64   20
dense_size = 512 #100 #50
drops = 0.8   #0.4
embeddings = 128

In [9]:
import mlflow.keras
mlflow.autolog()
#! mlflow ui

2023/02/05 10:19:31 INFO mlflow.tracking.fluent: Autologging successfully enabled for tensorflow.
2023/02/05 10:19:32 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


In [10]:
model = Sequential()
model.add(Embedding(vocab_size + 1, embeddings, input_length=max_sequence_len))
model.add(Bidirectional(LSTM(lstm, return_sequences=False, return_state=False, activation='tanh')))
model.add(Dense(dense_size))
model.add(BatchNormalization())
model.add(Dropout(drops))
model.add(Dense(dense_size))
model.add(BatchNormalization())
model.add(Dropout(drops))
model.add(Activation('relu'))
model.add(Dense(1, activation='linear'))
model.summary()

2023-02-05 10:19:32.175492: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 227, 128)          5504      
                                                                 
 bidirectional (Bidirectiona  (None, 200)              183200    
 l)                                                              
                                                                 
 dense (Dense)               (None, 512)               102912    
                                                                 
 batch_normalization (BatchN  (None, 512)              2048      
 ormalization)                                                   
                                                                 
 dropout (Dropout)           (None, 512)               0         
                                                                 
 dense_1 (Dense)             (None, 512)               2

In [11]:
model.compile(optimizer=RMSprop(learning_rate=learning_rate), loss='mse', metrics=[R2])

In [12]:
# callbacks
mcp = ModelCheckpoint('models/best_model_{epoch}', save_best_only=True, save_format="h5")
earlyStopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=15, verbose=1, restore_best_weights=True)
reduceLR = ReduceLROnPlateau(monitor ='val_loss', factor=0.2, patience=15, verbose=1, mode='auto', min_delta=0.01, cooldown=0, min_lr=1e-8)

In [13]:
%%time
history = model.fit(dgen_train, epochs=epochs, batch_size=batch_size, validation_data=augm_test, callbacks=[mcp, reduceLR])

2023/02/05 10:19:32 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'd3d62469faba470f85c890567e7aa8cb', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current tensorflow workflow


Epoch 1/130



INFO:tensorflow:Assets written to: models/best_model_1/assets


INFO:tensorflow:Assets written to: models/best_model_1/assets


Epoch 2/130
Epoch 3/130



INFO:tensorflow:Assets written to: models/best_model_3/assets


INFO:tensorflow:Assets written to: models/best_model_3/assets


Epoch 4/130



INFO:tensorflow:Assets written to: models/best_model_4/assets


INFO:tensorflow:Assets written to: models/best_model_4/assets


Epoch 5/130
Epoch 6/130
Epoch 7/130



INFO:tensorflow:Assets written to: models/best_model_7/assets


INFO:tensorflow:Assets written to: models/best_model_7/assets


Epoch 8/130



INFO:tensorflow:Assets written to: models/best_model_8/assets


INFO:tensorflow:Assets written to: models/best_model_8/assets


Epoch 9/130
Epoch 10/130
Epoch 11/130
Epoch 12/130
Epoch 13/130
Epoch 14/130
Epoch 15/130



INFO:tensorflow:Assets written to: models/best_model_15/assets


INFO:tensorflow:Assets written to: models/best_model_15/assets


Epoch 16/130
Epoch 17/130
Epoch 18/130



INFO:tensorflow:Assets written to: models/best_model_18/assets


INFO:tensorflow:Assets written to: models/best_model_18/assets


Epoch 19/130
Epoch 20/130
Epoch 21/130
Epoch 22/130
Epoch 23/130
Epoch 24/130



INFO:tensorflow:Assets written to: models/best_model_24/assets


INFO:tensorflow:Assets written to: models/best_model_24/assets


Epoch 25/130
Epoch 26/130
Epoch 27/130
Epoch 28/130
Epoch 29/130



INFO:tensorflow:Assets written to: models/best_model_29/assets


INFO:tensorflow:Assets written to: models/best_model_29/assets


Epoch 30/130
Epoch 31/130
Epoch 32/130
Epoch 33/130
Epoch 34/130



INFO:tensorflow:Assets written to: models/best_model_34/assets


INFO:tensorflow:Assets written to: models/best_model_34/assets


Epoch 35/130
Epoch 36/130
Epoch 37/130
Epoch 38/130
Epoch 39/130
Epoch 40/130



INFO:tensorflow:Assets written to: models/best_model_40/assets


INFO:tensorflow:Assets written to: models/best_model_40/assets


Epoch 41/130
Epoch 42/130
Epoch 43/130
Epoch 44/130
Epoch 45/130



INFO:tensorflow:Assets written to: models/best_model_45/assets


INFO:tensorflow:Assets written to: models/best_model_45/assets


Epoch 46/130
Epoch 47/130
Epoch 48/130
Epoch 49/130
Epoch 50/130
Epoch 51/130
Epoch 52/130
Epoch 53/130
Epoch 54/130
Epoch 55/130



INFO:tensorflow:Assets written to: models/best_model_55/assets


INFO:tensorflow:Assets written to: models/best_model_55/assets


Epoch 56/130
Epoch 57/130



INFO:tensorflow:Assets written to: models/best_model_57/assets


INFO:tensorflow:Assets written to: models/best_model_57/assets


Epoch 58/130
Epoch 59/130
Epoch 60/130
Epoch 61/130
Epoch 62/130
Epoch 63/130



INFO:tensorflow:Assets written to: models/best_model_63/assets


INFO:tensorflow:Assets written to: models/best_model_63/assets


Epoch 64/130
Epoch 65/130
Epoch 66/130
Epoch 67/130
Epoch 68/130
Epoch 69/130
Epoch 70/130



INFO:tensorflow:Assets written to: models/best_model_70/assets


INFO:tensorflow:Assets written to: models/best_model_70/assets


Epoch 71/130
Epoch 72/130
Epoch 73/130
Epoch 74/130



INFO:tensorflow:Assets written to: models/best_model_74/assets


INFO:tensorflow:Assets written to: models/best_model_74/assets


Epoch 75/130



INFO:tensorflow:Assets written to: models/best_model_75/assets


INFO:tensorflow:Assets written to: models/best_model_75/assets


Epoch 76/130
Epoch 77/130
Epoch 78/130



INFO:tensorflow:Assets written to: models/best_model_78/assets


INFO:tensorflow:Assets written to: models/best_model_78/assets


Epoch 79/130



INFO:tensorflow:Assets written to: models/best_model_79/assets


INFO:tensorflow:Assets written to: models/best_model_79/assets


Epoch 80/130
Epoch 81/130
Epoch 82/130



INFO:tensorflow:Assets written to: models/best_model_82/assets


INFO:tensorflow:Assets written to: models/best_model_82/assets


Epoch 83/130
Epoch 84/130
Epoch 85/130
Epoch 86/130
Epoch 87/130



INFO:tensorflow:Assets written to: models/best_model_87/assets


INFO:tensorflow:Assets written to: models/best_model_87/assets


Epoch 88/130
Epoch 89/130



INFO:tensorflow:Assets written to: models/best_model_89/assets


INFO:tensorflow:Assets written to: models/best_model_89/assets


Epoch 90/130
Epoch 91/130
Epoch 92/130
Epoch 93/130
Epoch 94/130
Epoch 95/130
Epoch 96/130
Epoch 97/130
Epoch 98/130



INFO:tensorflow:Assets written to: models/best_model_98/assets


INFO:tensorflow:Assets written to: models/best_model_98/assets


Epoch 99/130
Epoch 100/130
Epoch 101/130
Epoch 102/130
Epoch 103/130
Epoch 104/130
Epoch 105/130
Epoch 106/130
Epoch 107/130



INFO:tensorflow:Assets written to: models/best_model_107/assets


INFO:tensorflow:Assets written to: models/best_model_107/assets


Epoch 108/130
Epoch 109/130
Epoch 110/130
Epoch 111/130
Epoch 112/130
Epoch 113/130
Epoch 114/130
Epoch 115/130
Epoch 116/130
Epoch 117/130
Epoch 118/130
Epoch 119/130
Epoch 120/130
Epoch 121/130



INFO:tensorflow:Assets written to: models/best_model_121/assets


INFO:tensorflow:Assets written to: models/best_model_121/assets


Epoch 122/130
Epoch 122: ReduceLROnPlateau reducing learning rate to 0.0019999999552965165.
Epoch 123/130



INFO:tensorflow:Assets written to: models/best_model_123/assets


INFO:tensorflow:Assets written to: models/best_model_123/assets


Epoch 124/130



INFO:tensorflow:Assets written to: models/best_model_124/assets


INFO:tensorflow:Assets written to: models/best_model_124/assets


Epoch 125/130
Epoch 126/130



INFO:tensorflow:Assets written to: models/best_model_126/assets


INFO:tensorflow:Assets written to: models/best_model_126/assets


Epoch 127/130



INFO:tensorflow:Assets written to: models/best_model_127/assets


INFO:tensorflow:Assets written to: models/best_model_127/assets


Epoch 128/130



INFO:tensorflow:Assets written to: models/best_model_128/assets


INFO:tensorflow:Assets written to: models/best_model_128/assets


Epoch 129/130



INFO:tensorflow:Assets written to: models/best_model_129/assets


INFO:tensorflow:Assets written to: models/best_model_129/assets


Epoch 130/130



INFO:tensorflow:Assets written to: models/best_model_130/assets


INFO:tensorflow:Assets written to: models/best_model_130/assets






INFO:tensorflow:Assets written to: /var/folders/cc/hfx_w22x28j8f0l38h8ww1zc0000gn/T/tmp9x6p1hsg/model/data/model/assets


INFO:tensorflow:Assets written to: /var/folders/cc/hfx_w22x28j8f0l38h8ww1zc0000gn/T/tmp9x6p1hsg/model/data/model/assets


CPU times: user 3h 27min 40s, sys: 2h 23min 28s, total: 5h 51min 8s
Wall time: 1h 7min 34s


In [19]:
X_test_eval = []
y_t_eval = []
for X_t, y_t in dgen_test:
    X_test_eval = X_test_eval + [list(t) for t in X_t]
    y_t_eval = y_t_eval + list(y_t)
X_test_eval = np.array(X_test_eval)
y_test = np.array(y_t_eval)

y_pred = model.predict(X_test_eval)



In [21]:
r2 = 1 - ((y_test - y_pred.reshape(-1)) ** 2).sum() / ((y_test - y_test.mean()) ** 2).sum()
r2

0.5898812204959437

In [22]:
model.save('005/lstm_augment.h5', )

# Conclusion
No mejoró, le llevó mas iteraciones llegar a valores de R2 similares, la loss fue más alta.

In [23]:
from datetime import datetime

# datetime object containing current date and time
now = datetime.now()
# dd/mm/YY H:M:S
dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
print("date and time =", dt_string)

date and time = 05/02/2023 11:54:06
