In [12]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import rpy2.robjects as ro
import rpy2.robjects.pandas2ri as pandas2ri
pandas2ri.activate()
import pickle

# Create output directory
output_dir = "./data/processed/expression/adjusted_autoencoder"
os.makedirs(output_dir, exist_ok=True)

# Define data paths
data_path = "./data/processed/expression/readcounts_tmm_all/"
metadata_path = "./data/processed/attphe.pkl"

# Load metadata
with open(metadata_path, 'rb') as f:
    metadata = pickle.load(f)

# List all tissue files
tissue_files = [os.path.join(data_path, f) for f in os.listdir(data_path) if f.endswith(".pkl")]

# Define Standard Autoencoder
def build_autoencoder(input_dim):
    model = keras.Sequential([
        layers.Dense(128, activation='relu', input_shape=(input_dim,)),
        layers.Dense(64, activation='relu'),
        layers.Dense(32, activation='relu'),  # Latent space
        layers.Dense(64, activation='relu'),
        layers.Dense(128, activation='relu'),
        layers.Dense(input_dim, activation='sigmoid')  # Reconstruction layer
    ])
    model.compile(optimizer='adam', loss='mse')
    return model

# Function to process each tissue file
def process_tissue_autoencoder(tissue_file, metadata):
    tissue_name = os.path.basename(tissue_file).replace(".pkl", "")
    
    # Load normalized read counts
    with open(tissue_file, 'rb') as f:
        normalized_counts = pickle.load(f)
    
    sample_ids = normalized_counts.columns
    attr_filtered = metadata[metadata['sample_id'].isin(sample_ids)]
    
    if normalized_counts.shape[1] != attr_filtered.shape[0]:
        raise ValueError(f"Number of samples in {tissue_name} does not match metadata.")
    
    # Resave normalized counts as DataFrame (using pandas HDF5 as previously)
    resaved_tissue_file = os.path.join(data_path, f"{tissue_name}_resaved.h5")
    normalized_counts.to_hdf(resaved_tissue_file, key='normalized_counts')
    print(f"Resaved normalized counts for {tissue_name} to {resaved_tissue_file}")
    
    input_dim = normalized_counts.shape[1]
    autoencoder = build_autoencoder(input_dim)
    
    # Train the autoencoder
    autoencoder.fit(normalized_counts.to_numpy(), normalized_counts.to_numpy(), epochs=50, batch_size=32)
    
    encoder = keras.Model(inputs=autoencoder.input, outputs=autoencoder.layers[2].output)  # Extract latent space
    latent_features = encoder.predict(normalized_counts.to_numpy())
    
    # Convert DataFrame and latent features to Python-native lists (not Rpy2)
    normalized_counts_list = normalized_counts.values.tolist()  # Convert to list
    latent_features_list = latent_features.tolist()  # Convert to list

    # Load limma package in R
    ro.r('library(limma)')
    
    # Adjust expression data using removeBatchEffect
    adjusted_expression_data = ro.r['removeBatchEffect'](normalized_counts_list, covariates=latent_features_list)

    # Convert adjusted data back to pandas DataFrame
    adjusted_expression_df = pd.DataFrame(np.array(adjusted_expression_data), 
                                          index=normalized_counts.index, 
                                          columns=normalized_counts.columns)
    
    # Save adjusted expression data to pickle file
    result_file = os.path.join(output_dir, f"{tissue_name}.pkl")
    with open(result_file, 'wb') as f:
        pickle.dump(adjusted_expression_df, f)
    
    print(f"Processed tissue: {tissue_name}")
    print(f"Dimensions of adjusted data: {adjusted_expression_df.shape}")

# Loop through each tissue file and process them
for tissue_file in tissue_files:
    process_tissue_autoencoder(tissue_file, metadata)


ModuleNotFoundError: No module named 'numpy._core.numeric'