In [1]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import rpy2.robjects as ro
import rpy2.robjects.pandas2ri as pandas2ri
pandas2ri.activate()
import pickle

# Create output directory
output_dir = "./data/processed/expression/adjusted_autoencoder"
os.makedirs(output_dir, exist_ok=True)

# Define data paths
data_path = "./data/processed/expression/readcounts_tmm_all/"
metadata_path = "./data/processed/attphe.pkl"

# Load metadata
with open(metadata_path, 'rb') as f:
    metadata = pickle.load(f)

# List all tissue files
tissue_files = [os.path.join(data_path, f) for f in os.listdir(data_path) if f.endswith(".pkl")]

# Define Autoencoder using Functional API
def build_autoencoder(input_dim):
    input_layer = keras.Input(shape=(input_dim,))
    x = layers.Dense(128, activation='relu')(input_layer)
    x = layers.Dense(64, activation='relu')(x)
    latent = layers.Dense(32, activation='relu')(x)  # Latent space
    x = layers.Dense(64, activation='relu')(latent)
    x = layers.Dense(128, activation='relu')(x)
    output = layers.Dense(input_dim, activation='sigmoid')(x)

    autoencoder = keras.Model(inputs=input_layer, outputs=output)
    autoencoder.compile(optimizer='adam', loss='mse')

    # Encoder model to extract latent space
    encoder = keras.Model(inputs=input_layer, outputs=latent)

    return autoencoder, encoder

# Function to process each tissue file
def process_tissue_autoencoder(tissue_file, metadata):
    tissue_name = os.path.basename(tissue_file).replace(".pkl", "")

    # Load normalized read counts
    with open(tissue_file, 'rb') as f:
        normalized_counts = pickle.load(f)

    sample_ids = normalized_counts.columns
    attr_filtered = metadata[metadata['samp_id'].isin(sample_ids)]

    if normalized_counts.shape[1] != attr_filtered.shape[0]:
        raise ValueError(f"Number of samples in {tissue_name} does not match metadata.")

    # Resave normalized counts as DataFrame (using pandas HDF5 as previously)
    resaved_tissue_file = os.path.join(data_path, f"{tissue_name}_resaved.h5")
    normalized_counts.to_hdf(resaved_tissue_file, key='normalized_counts')
    print(f"Resaved normalized counts for {tissue_name} to {resaved_tissue_file}")

    input_dim = normalized_counts.shape[1]
    autoencoder, encoder = build_autoencoder(input_dim)

    # Train the autoencoder
    autoencoder.fit(normalized_counts.to_numpy(), normalized_counts.to_numpy(), epochs=50, batch_size=32)

    # Encode latent features
    latent_features = encoder.predict(normalized_counts.to_numpy())


    # Load limma package in R
    ro.r('library(limma)')
    
    from rpy2.robjects import numpy2ri
    numpy2ri.activate()
    
    # Transpose expression matrix so rows = samples, columns = genes
    counts_T = normalized_counts.T  # shape: (samples, genes)
    r_counts = ro.conversion.py2rpy(counts_T.to_numpy())  # 
    # Convert latent features
    r_covariates = ro.conversion.py2rpy(np.array(latent_features))

    print("Expression shape:", normalized_counts.shape)
    print("Latent shape:", latent_features.shape)

    # Call R function
    adjusted_expression_data = ro.r['removeBatchEffect'](r_counts, covariates=r_covariates)

    # Convert adjusted data back to pandas DataFrame and transpose it to original shape
    adjusted_expression_df = pd.DataFrame(np.array(adjusted_expression_data),
                                          index=counts_T.index,
                                          columns=counts_T.columns).T  # 
    
    # Save adjusted expression data to pickle file
    result_file = os.path.join(output_dir, f"{tissue_name}.pkl")
    with open(result_file, 'wb') as f:
        pickle.dump(adjusted_expression_df, f)

    print(f"Processed tissue: {tissue_name}")
    print(f"Dimensions of adjusted data: {adjusted_expression_df.shape}")

# Loop through each tissue file and process them
for tissue_file in tissue_files:
    process_tissue_autoencoder(tissue_file, metadata)


2025-04-15 14:47:08.812302: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Resaved normalized counts for Lung to ./data/processed/expression/readcounts_tmm_all/Lung_resaved.h5
Epoch 1/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 8s/step - loss: 10359739392.0000
Epoch 2/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 335ms/step - loss: 10359734272.0000
Epoch 3/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 279ms/step - loss: 10359734272.0000
Epoch 4/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 279ms/step - loss: 10359732224.0000
Epoch 5/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 281ms/step - loss: 10359731200.0000
Epoch 6/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 288ms/step - loss: 10359731200.0000
Epoch 7/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 283ms/step - loss: 10359732224.0000
Epoch 8/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 206ms/step - loss: 10359731200.0000
Epoch 9/50
[1m1/1[0m [32m━━

R[write to console]: Zusätzlich: 
R[write to console]: Warnmeldung:

R[write to console]: Partial NA coefficients for 578 probe(s) 



Expression shape: (14, 226)
Latent shape: (14, 32)
Coefficients not estimable: 2 3 4 6 7 8 10 11 13 14 15 16 18 21 25 27 29 30 31 33 
Processed tissue: Liver
Dimensions of adjusted data: (14, 226)
Resaved normalized counts for Brain-Cortex to ./data/processed/expression/readcounts_tmm_all/Brain-Cortex_resaved.h5
Epoch 1/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 9s/step - loss: 20279996416.0000
Epoch 2/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 230ms/step - loss: 20279994368.0000
Epoch 3/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 259ms/step - loss: 20279990272.0000
Epoch 4/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 311ms/step - loss: 20279988224.0000
Epoch 5/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 310ms/step - loss: 20279988224.0000
Epoch 6/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 331ms/step - loss: 20279984128.0000
Epoch 7/50
[1m1/1[0m [32m━━━━━━━

R[write to console]: Zusätzlich: 
R[write to console]: Warnmeldung:

R[write to console]: Partial NA coefficients for 226 probe(s) 



Expression shape: (14, 255)
Latent shape: (14, 32)
Coefficients not estimable: 32 33 2 3 4 8 9 10 11 13 16 17 18 19 20 25 26 28 30 
Processed tissue: Brain-Cortex
Dimensions of adjusted data: (14, 255)
