In [1]:
%load_ext tensorboard

In [18]:
import tensorflow as tf
import pandas as pd
import numpy as np
from datetime import datetime

from tensorflow.keras import layers, models, callbacks
from sklearn import model_selection, preprocessing

In [3]:
venancio = pd.read_parquet('../../expresion/venancio.parquet')

annot = pd.read_csv('../../anotacion/gemini_annot.csv', index_col=0)
annot = annot[annot['tejido'].notna() & annot['estres'].notna()]
annot

Unnamed: 0_level_0,bioproject,tejido,estres,tratamiento
biosample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
SAMD00235524,PRJDB10183,seedling,control,control
SAMD00235525,PRJDB10183,seedling,control,control
SAMD00235526,PRJDB10183,seedling,control,control
SAMD00235527,PRJDB10183,seedling,control,control
SAMD00235528,PRJDB10183,seedling,control,control
...,...,...,...,...
SAMN36760719,PRJNA999924,seed,cold,treatment
SAMN36760720,PRJNA999924,seed,cold,treatment
SAMN36760721,PRJNA999924,seed,cold,treatment
SAMN36760722,PRJNA999924,seed,cold,treatment


In [4]:
venancio, annot = venancio.align(annot, join='inner', axis=0)

In [5]:
consamples = annot['bioproject'].map(annot['bioproject'].value_counts() > 2)
convarianza = venancio.var() > 0.01

filtrado = venancio.loc[consamples, convarianza]
X, y = filtrado.align(annot, join='inner', axis=0)

toda esta pipeline de preprocesamiento la pasaria a tf.data para escalarla en vez de sklearn pero es mas comodo implementarlo

In [7]:
y = y["estres"]
label_encoder = preprocessing.LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [8]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y_encoded,
    test_size=0.2,
)

In [9]:
scaler = preprocessing.StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

Las redes neuronales representan una aproximación fundamentalmente diferente al problema de expresión diferencial. Para nuestro análisis, implementamos una arquitectura feed-forward simple con dos capas ocultas, utilizando activación ReLU y dropout (0.3) para prevenir el sobreajuste e incentivar la exploración de genes. La capa de entrada tiene dimensión $p$ (número total de genes) y la de salida utiliza activación softmax para la clasificación multiclase.

Para manejar la alta dimensionalidad (característica de los datos de expresión génica), se intenta añadir una capa de reducción de dimensionalidad antes de la clasificación. Esto comprime las variables de entrada en un espacio latente de menor dimensionalidad. 

El entrenamiento se realizó utilizando el optimizador Adam con una tasa de aprendizaje inicial de 1e-4 y programación de tasa de aprendizaje cíclica para evitar mínimos locales. Para abordar el desbalance de clases, implementamos ponderación de clases inversamente proporcional a su frecuencia en los datos de entrenamiento.

In [None]:
# el dropout es bastante importante por la dimensionalidad del dataset, sino te overfitea bastante
model = models.Sequential([
    layers.Dense(128, activation="relu", input_shape=(X_train_scaled.shape[1],)),
    layers.Dropout(0.3),
    layers.Dense(64, activation="relu"),
    layers.Dropout(0.3),
    layers.Dense(32, activation="relu"),
    layers.Dense(len(np.unique(y_train)), activation="softmax")
])

# uso la tasa inicial default en lugar de 1e-4, no cambia mucho
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
2025-01-26 14:30:53.462344: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


In [11]:
# Tasa de aprendizaje ciclica
def lr_scheduler(epoch, lr):
    if epoch % 10 == 0 and epoch != 0: # este 10% conviene tunearlo
        lr = lr * 0.9 
    return lr 

lr_callback = callbacks.LearningRateScheduler(lr_scheduler)

In [14]:
log_dir = "logs/fit/" + datetime.now().strftime("%Y%m%d-%H%M%S")
tb_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

history = model.fit(
    X_train_scaled, y_train,
    epochs=100,
    batch_size=32,
    validation_split=0.2,
    callbacks=[lr_callback, tb_callback]
)

Epoch 1/100


2025-01-26 14:31:41.504763: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 510546264 exceeds 10% of free system memory.


[1m76/77[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 90ms/step - accuracy: 0.2589 - loss: 9.3675

2025-01-26 14:31:52.903704: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 127740420 exceeds 10% of free system memory.
2025-01-26 14:31:54.075264: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 53173248 exceeds 10% of free system memory.
2025-01-26 14:31:54.113678: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 53173248 exceeds 10% of free system memory.
2025-01-26 14:31:54.149749: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 53173248 exceeds 10% of free system memory.


[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 282ms/step - accuracy: 0.2612 - loss: 9.3274 - val_accuracy: 0.5593 - val_loss: 2.0629 - learning_rate: 0.0010
Epoch 2/100
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 56ms/step - accuracy: 0.4660 - loss: 4.0899 - val_accuracy: 0.5772 - val_loss: 1.7380 - learning_rate: 0.0010
Epoch 3/100
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 61ms/step - accuracy: 0.4808 - loss: 3.1441 - val_accuracy: 0.5854 - val_loss: 1.3931 - learning_rate: 0.0010
Epoch 4/100
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 72ms/step - accuracy: 0.5182 - loss: 2.1930 - val_accuracy: 0.6309 - val_loss: 1.2386 - learning_rate: 0.0010
Epoch 5/100
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 59ms/step - accuracy: 0.5234 - loss: 1.9058 - val_accuracy: 0.6228 - val_loss: 1.1628 - learning_rate: 0.0010
Epoch 6/100
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 62ms

In [None]:
model.evaluate(X_test_scaled, y_test)

[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 36ms/step - accuracy: 0.8302 - loss: 1.0443


[1.119916319847107, 0.8179453611373901]