# In [2]:
import os
import pickle
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.decomposition import PCA

# Load every pickled per-tissue expression table found in `data_path`, tag each
# sample with the tissue it came from, and stack them into `combined_df`.

# Define the path to the folder containing pickled expression data
data_path = "./data/processed/expression/adjusted_autoencoder/"  # update if needed

# Fail early with a clear message. isdir() (rather than exists()) also rejects
# a regular file at this path, which os.listdir() below could not handle.
if not os.path.isdir(data_path):
    raise FileNotFoundError(f"Directory not found: {data_path}")

# List all .pkl files. os.listdir() returns entries in arbitrary order, so sort
# them to make the row order of the combined table reproducible across runs.
tissue_files = sorted(f for f in os.listdir(data_path) if f.endswith(".pkl"))

# Initialize lists to store data
combined_data = []
# NOTE(review): never filled in the visible code; kept in case later cells use it.
tissue_labels = []

# Loop through each .pkl file
for file in tissue_files:
    # Strip only the trailing ".pkl"; str.replace() would also remove a
    # ".pkl" occurring in the middle of the file name.
    tissue_name = file[: -len(".pkl")]
    full_path = os.path.join(data_path, file)

    # Deliberately best-effort: one unreadable file is reported and skipped
    # instead of aborting the whole run.
    try:
        # SECURITY: pickle.load() can execute arbitrary code while unpickling.
        # Only point `data_path` at files produced by a trusted pipeline.
        with open(full_path, "rb") as f:
            expr_df = pickle.load(f)

        # Ensure the loaded object is a DataFrame
        if not isinstance(expr_df, pd.DataFrame):
            print(f"Warning: {file} does not contain a DataFrame. Skipping.")
            continue

        # An empty table contributes no samples and would only distort the
        # column intersection computed when the tables are combined.
        if expr_df.empty:
            print(f"Warning: {file} contains an empty DataFrame. Skipping.")
            continue

        # Transpose so that samples are rows (required for PCA).
        # .T returns a new frame, so the label column is not written into the
        # unpickled object itself.
        expr_df = expr_df.T
        expr_df["tissue"] = tissue_name

        # Add to the combined data list
        combined_data.append(expr_df)

    except Exception as e:
        print(f"Failed to read '{tissue_name}' from {file}: {e}")

# Combine all tissue data
if not combined_data:
    raise ValueError("No valid expression data was loaded.")

# join="inner" keeps only the columns present in every tissue table. The
# default outer join would fill the gaps with NaN, which PCA cannot handle.
combined_df = pd.concat(combined_data, join="inner")

# Report when the intersection actually removed something, so a mismatch
# between tissue tables does not go unnoticed.
n_dropped = (
    len(set().union(*(frame.columns for frame in combined_data)))
    - combined_df.shape[1]
)
if n_dropped > 0:
    print(
        f"Warning: {n_dropped} column(s) not shared by all tissues were "
        "dropped before combining."
    )

# Split the combined table into the tissue labels and the feature matrix
y = combined_df["tissue"]  # one tissue name per row, used later for coloring
X = combined_df.drop(columns=["tissue"])  # every column except the label

# Fit a 3-component PCA and project the rows onto it in a single call
pca = PCA(n_components=3)
X_pca = pca.fit_transform(X)

# Collect the projected coordinates and attach the tissue labels by position
# (.values drops the index, so the labels are not re-aligned by row label)
pca_df = pd.DataFrame(X_pca, columns=["PC1", "PC2", "PC3"]).assign(
    tissue=y.values
)

# Express each component's share of the variance as a percentage
explained_var = 100 * pca.explained_variance_ratio_

# Figure title: the first three entries of `explained_var`, two decimals each
plot_title = f"3D PCA of Gene Expression (PC1: {explained_var[0]:.2f}%, PC2: {explained_var[1]:.2f}%, PC3: {explained_var[2]:.2f}%)"

# Build the interactive 3D scatter with Plotly, one color per tissue
fig = px.scatter_3d(
    data_frame=pca_df,
    x="PC1",
    y="PC2",
    z="PC3",
    color="tissue",
    opacity=0.7,
    title=plot_title,
)

# Make sure the output folder exists, then write the figure as an HTML file
output_dir = "results_after_readcounts_pca_all_tissues_autoencoder"
os.makedirs(output_dir, exist_ok=True)
html_path = os.path.join(output_dir, "3d_pca_plot_all_tissues.html")
fig.write_html(html_path)

# Display the figure
fig.show()
