In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, explained_variance_score
from itertools import product
from scipy.sparse import coo_matrix
from scipy.stats import spearmanr, pearsonr
from utils_plot import *

ModuleNotFoundError: No module named 'utils_plot'

In [None]:
# 1. Data Normalization
def preprocessing(alpha1, alpha2, N, ampl_threshold=0.2):
    """Align, filter and double-center two replicate matrices against a binding-site matrix.

    Steps performed:
      1. keep only the row labels shared by ``N``, ``alpha1`` and ``alpha2`` (sorted);
      2. keep only rows whose half peak-to-peak amplitude exceeds ``ampl_threshold``
         in *both* replicates;
      3. drop the columns of ``N`` whose sum over the kept rows is zero;
      4. double-center each replicate (subtract row means and column means, add back
         the grand mean).

    ``N`` itself is not centered so that its exact zeros (the sparsity pattern) survive.

    Parameters
    ----------
    alpha1, alpha2 : pandas.DataFrame
        Replicate matrices indexed by gene name.
    N : pandas.DataFrame
        Binding-site matrix indexed by gene name, one column per TF.
    ampl_threshold : float, optional
        Minimum half peak-to-peak amplitude a row must have in both replicates.

    Returns
    -------
    alpha1_norm, alpha2_norm : numpy.ndarray
        Filtered, double-centered replicates.
    N : numpy.ndarray
        Filtered binding-site matrix (rows = kept genes, columns = kept TFs).
    targetnames_filtered : numpy.ndarray
        Names of the kept genes, in row order.
    tf_names_filtered : numpy.ndarray
        Names of the kept TFs, in column order.
    """
    # Take the TF labels from N itself instead of relying on a notebook-level
    # `tf_names` global, so the function works regardless of cell execution order.
    tf_names = N.columns.to_numpy()

    targetnames = np.array(sorted(set(N.index) & set(alpha1.index) & set(alpha2.index)))
    print("Genes in common :", len(targetnames))

    alpha1, alpha2 = alpha1.loc[targetnames].to_numpy(), alpha2.loc[targetnames].to_numpy()

    # Half peak-to-peak amplitude of every row, per replicate.
    ampl1 = (alpha1.max(axis=1) - alpha1.min(axis=1)) / 2
    ampl2 = (alpha2.max(axis=1) - alpha2.min(axis=1)) / 2
    ind = (ampl1 > ampl_threshold) & (ampl2 > ampl_threshold)
    alpha1, alpha2 = alpha1[ind, :], alpha2[ind, :]
    targetnames_filtered = targetnames[ind]

    N = N.loc[targetnames_filtered].to_numpy()

    # Identify TFs that are not present in any of the kept genes.
    inactive_tfs = np.where(N.sum(axis=0) == 0)[0]
    print(f"Number of inactive TFs: {len(inactive_tfs)}")
    N = np.delete(N, inactive_tfs, axis=1)
    tf_names_filtered = np.delete(tf_names, inactive_tfs)

    print(f"Kept genes: {N.shape[0]} (ampl > {ampl_threshold})")
    alpha1_norm = alpha1 - np.mean(alpha1, axis=1, keepdims=True) - np.mean(alpha1, axis=0, keepdims=True) + np.mean(alpha1)
    alpha2_norm = alpha2 - np.mean(alpha2, axis=1, keepdims=True) - np.mean(alpha2, axis=0, keepdims=True) + np.mean(alpha2)

    return alpha1_norm, alpha2_norm, N, targetnames_filtered, tf_names_filtered

In [None]:
# 2 Define Ridge Regression Model with Trainable Sparse W
class TrainableModel(nn.Module):
    """Sparse factor model ``alpha ~ W @ sigmoid(x)``.

    ``W`` (genes x TFs) is only allowed to be non-zero where ``N`` is non-zero; one
    trainable scalar is stored per non-zero entry of ``N``. The values of ``N`` are
    not used, only its sparsity pattern. ``x`` (TFs x thetas) is unconstrained and is
    squashed through a sigmoid so the activities ``A`` lie in (0, 1).

    Parameters
    ----------
    N : array-like or torch.Tensor, shape (num_genes, num_tfs)
        Matrix whose non-zero pattern defines the support of ``W``.
    alpha : any
        Unused; kept so existing call sites keep working.
    num_tfs, num_thetas : int
        Shape of the activity parameter ``x``.
    lambda1 : float, optional
        Weight of the L1 penalty on ``W``.
    lambda2 : float, optional
        Weight of the L2 penalty on ``x`` (the pre-sigmoid activities).
    """

    def __init__(self, N, alpha, num_tfs, num_thetas, lambda1=0.01, lambda2=0.01):
        super().__init__()

        self.lambda1 = lambda1  # L1 regularization strength for W
        self.lambda2 = lambda2  # L2 regularization strength for x (pre-sigmoid A)

        # scipy needs a host array; convert explicitly so tensors on any device work.
        if torch.is_tensor(N):
            N = N.detach().cpu().numpy()

        # COO form gives the (row, col) coordinates of the non-zero entries of N.
        sparse_matrix = coo_matrix(N)

        # Register the coordinates as buffers so they follow `.to(device)` together
        # with the parameters. `persistent=False` keeps them out of `state_dict`,
        # so checkpoints still contain only `W` and `x`.
        self.register_buffer("i", torch.tensor(sparse_matrix.row, dtype=torch.long), persistent=False)
        self.register_buffer("j", torch.tensor(sparse_matrix.col, dtype=torch.long), persistent=False)

        # One trainable weight per non-zero element of N.
        self.W = nn.Parameter(torch.randn(sparse_matrix.nnz) * 0.05)

        # Unconstrained TF activity parameters.
        self.x = nn.Parameter(torch.randn(num_tfs, num_thetas) * 0.05)

        self.num_genes, self.num_tfs = N.shape

    def forward(self):
        """Return the reconstructed matrix ``W @ sigmoid(x)`` (genes x thetas)."""
        # Scatter the weight vector back into a (genes x TFs) matrix.
        W_sparse = torch.sparse_coo_tensor(
            indices=torch.stack([self.i, self.j]),
            values=self.W,
            size=(self.num_genes, self.num_tfs)
        )
        W_dense_tensor = W_sparse.to_dense()

        # Constrain A to (0, 1) with a sigmoid.
        A = torch.sigmoid(self.x)

        return torch.matmul(W_dense_tensor, A)

    def loss(self, alpha_true):
        """Sum of squared errors plus ``lambda1 * |W|_1 + lambda2 * |x|_2^2``."""
        alpha_pred = self.forward()
        main_loss = torch.sum((alpha_true - alpha_pred) ** 2)
        l1_loss = torch.sum(torch.abs(self.W))  # L1 on W
        l2_loss = torch.sum(self.x ** 2)        # L2 on the unconstrained x

        total_loss = main_loss + self.lambda1 * l1_loss + self.lambda2 * l2_loss
        return total_loss

In [None]:
def train_model(N, alpha, alpha_test, lambda1, lambda2, patience=20, num_epochs=1000, lr=0.005, min_delta=25):
    """Fit a ``TrainableModel`` on ``alpha`` with Adam and early stopping on ``alpha_test``.

    After every optimizer step the regularized loss is evaluated on ``alpha_test``.
    Training stops once that loss has failed to improve by more than ``min_delta``
    for ``patience`` consecutive epochs.

    Note that the returned parameters are those of the *last* epoch run, not of the
    epoch with the best test loss, and the returned loss is the training loss
    evaluated just before the last optimizer step.

    Parameters
    ----------
    N : array-like or torch.Tensor, shape (genes, TFs)
        Matrix defining the sparsity pattern of W.
    alpha : torch.Tensor, shape (genes, thetas)
        Training target.
    alpha_test : torch.Tensor, shape (genes, thetas)
        Held-out target used for early stopping (must support ``.numpy()``).
    lambda1, lambda2 : float
        Regularization strengths forwarded to ``TrainableModel``.
    patience : int, optional
        Number of consecutive non-improving epochs tolerated.
    num_epochs : int, optional
        Maximum number of epochs; must be at least 1.
    lr : float, optional
        Adam learning rate.
    min_delta : float, optional
        Minimum decrease of the test loss that counts as an improvement.

    Returns
    -------
    W_dense : numpy.ndarray, shape (genes, TFs)
    A : numpy.ndarray, shape (TFs, thetas)
        ``sigmoid(x)``.
    x : numpy.ndarray, shape (TFs, thetas)
    loss : float
        Last training loss.

    Raises
    ------
    ValueError
        If ``num_epochs`` is smaller than 1 (nothing would be trained or returned).
    """
    if num_epochs < 1:
        raise ValueError(f"num_epochs must be >= 1, got {num_epochs}")

    num_genes, num_tfs = N.shape
    num_thetas = alpha.shape[1]

    model = TrainableModel(N, alpha, num_tfs, num_thetas, lambda1, lambda2)
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Loop invariants: coordinates of W's non-zeros and the NumPy view of the test set.
    support = torch.stack([model.i, model.j])
    alpha_test_np = alpha_test.numpy()

    best_EV = -float("inf")
    best_test_loss = float("inf")
    patience_counter = 0

    for epoch in range(num_epochs):
        optimizer.zero_grad()
        loss = model.loss(alpha)
        loss.backward()
        optimizer.step()

        # Validation step: no gradients are needed here.
        with torch.no_grad():
            W_sparse_vector = model.W.detach()
            W_dense = torch.sparse_coo_tensor(
                indices=support,
                values=W_sparse_vector,
                size=(num_genes, num_tfs)
            ).to_dense()
            x = model.x.detach()
            A = torch.sigmoid(x)

            R_test = torch.matmul(W_dense, A)

            # sklearn works on NumPy arrays.
            EV_test = explained_variance_score(alpha_test_np, R_test.numpy())

            # Same objective as TrainableModel.loss, evaluated on the test set.
            main_loss = torch.sum((alpha_test - R_test) ** 2)
            l1_loss = torch.sum(torch.abs(W_sparse_vector))  # L1 on W
            l2_loss = torch.sum(x ** 2)                      # L2 on the unconstrained x
            total_test_loss = main_loss + lambda1 * l1_loss + lambda2 * l2_loss

        # Early-stopping bookkeeping: only a drop larger than min_delta resets patience.
        if total_test_loss < best_test_loss - min_delta:
            best_EV = EV_test
            best_test_loss = total_test_loss
            patience_counter = 0
        else:
            patience_counter += 1

        if epoch % 100 == 0:
            print(f"Epoch {epoch}, Loss: {loss.item():.0f}, Loss test: {total_test_loss:.0f}, EV_test: {EV_test*100:.2f}%")

        if patience_counter >= patience:
            print(f"Early stopping at epoch {epoch+1}. Best Loss test: {best_test_loss:.0f}. Best EV_test: {best_EV*100:.2f}%")
            break

    return W_dense.numpy(), A.numpy(), x.numpy(), loss.item()

In [None]:
# 4. Cross-Validation for Lambda Optimization
def cross_val_lambda(N, alpha1, alpha2, lambda1_values, lambda2_values):
    """Grid-search ``(lambda1, lambda2)`` by two-fold cross-validation on the replicates.

    For every pair the model is trained on ``alpha1`` and scored on ``alpha2``, then
    trained on ``alpha2`` and scored on ``alpha1``; the pair with the highest average
    test explained variance wins. Two figures are shown, each with one curve per
    ``lambda1`` plotted against ``lambda2``: the average test EV, and the final
    training loss of the first fold.

    Parameters
    ----------
    N : array-like or torch.Tensor
        Matrix forwarded to ``train_model``.
    alpha1, alpha2 : torch.Tensor
        The two replicates.
    lambda1_values, lambda2_values : iterable of float
        Candidate regularization strengths; both must be non-empty.

    Returns
    -------
    best_lambda1, best_lambda2 : float

    Raises
    ------
    ValueError
        If either grid is empty.
    """
    # Materialize the grids: they are iterated, measured and plotted several times.
    lambda1_values = list(lambda1_values)
    lambda2_values = list(lambda2_values)
    if not lambda1_values or not lambda2_values:
        raise ValueError("lambda1_values and lambda2_values must both be non-empty")

    # sklearn expects array-likes; convert tensors once, up front.
    alpha1_np = alpha1.detach().cpu().numpy() if torch.is_tensor(alpha1) else np.asarray(alpha1)
    alpha2_np = alpha2.detach().cpu().numpy() if torch.is_tensor(alpha2) else np.asarray(alpha2)

    best_lambda1, best_lambda2, best_EV = None, None, -np.inf
    losses1 = []
    EVs_avg = []

    for lambda1, lambda2 in product(lambda1_values, lambda2_values):
        print(f"Testing lambda1 = {lambda1:.2f}, lambda2 = {lambda2:.2f}")

        # Train on alpha1, test on alpha2
        W1, A1, _, loss1 = train_model(N, alpha1, alpha2, lambda1, lambda2)
        losses1.append(loss1)
        EV1 = explained_variance_score(alpha2_np, W1 @ A1)

        # Train on alpha2, test on alpha1
        W2, A2, _, _ = train_model(N, alpha2, alpha1, lambda1, lambda2)
        EV2 = explained_variance_score(alpha1_np, W2 @ A2)

        avg_EV = (EV1 + EV2) / 2
        EVs_avg.append(avg_EV)
        print(f"Lambda1={lambda1:.2f}, Lambda2={lambda2:.2f}, EV={avg_EV*100:.2f}%\n")

        if avg_EV > best_EV:
            best_lambda1, best_lambda2, best_EV = lambda1, lambda2, avg_EV

    # `product` iterates lambda2 fastest, so a row-major reshape gives
    # grid[a, b] = score for (lambda1_values[a], lambda2_values[b]).
    grid_shape = (len(lambda1_values), len(lambda2_values))
    panels = (
        (np.asarray(EVs_avg).reshape(grid_shape), "EV Test", "EV test vs Lambda"),
        (np.asarray(losses1).reshape(grid_shape), "Loss function", "Loss vs Lambda"),
    )
    for grid, ylabel, title in panels:
        plt.figure(figsize=(7, 5))
        for lambda1, curve in zip(lambda1_values, grid):
            plt.plot(lambda2_values, curve, marker="o", label=f"Lambda1 = {lambda1:g}")
        plt.xscale("log")
        plt.xlabel("Lambda2")
        plt.ylabel(ylabel)
        plt.grid(True)
        plt.legend()
        plt.title(title)
        plt.show()

    print(f"Best λ1 = {best_lambda1:.2f}, Best λ2={best_lambda2:.2f}, Best EV={best_EV*100:.2f}%\n")
    return best_lambda1, best_lambda2

In [None]:
# 5. Cross train for best model
def cross_train(N, alpha1, alpha2, best_lambda1, best_lambda2):
    """Train once in each direction (α1→α2 and α2→α1) and report explained variance.

    Both replicates must be tensors exposing ``.numpy()``. Prints the train/test
    explained variance averaged over the two directions and returns the fitted
    ``(W, A, x)`` of each direction as ``W1, A1, x1, W2, A2, x2``.
    """

    def fit_direction(train_set, held_out):
        # Fit on one replicate, then score the reconstruction against both replicates.
        W, A, x, _ = train_model(N, train_set, held_out, best_lambda1, best_lambda2)
        reconstruction = W @ A
        ev_train = explained_variance_score(train_set.numpy(), reconstruction)
        ev_test = explained_variance_score(held_out.numpy(), reconstruction)
        return W, A, x, ev_train, ev_test

    print("Training on α1, testing on α2...")
    W1, A1, x1, EV1_train, EV1_test = fit_direction(alpha1, alpha2)

    print("Training on α2, testing on α1...")
    W2, A2, x2, EV2_train, EV2_test = fit_direction(alpha2, alpha1)

    avg_EV_train = (EV1_train + EV2_train) / 2
    avg_EV_test = (EV1_test + EV2_test) / 2
    print(f"Average EV_train: {avg_EV_train*100:.2f}%")
    print(f"Average EV_test: {avg_EV_test*100:.2f}%")

    return W1, A1, x1, W2, A2, x2

In [None]:
def standardize_amplitudes(matrices, target_amp=0.2):
    """Rescale every row of every matrix to the same half peak-to-peak amplitude.

    Each row is multiplied by ``target_amp / amp`` where ``amp`` is half of the row's
    ``max - min``. Rows are only scaled, not shifted. A row whose amplitude is not
    strictly positive (a constant row) cannot be rescaled and is returned unchanged
    instead of being turned into inf/NaN by a division by zero.

    Parameters
    ----------
    matrices : iterable of numpy.ndarray
        2-D arrays; rows are rescaled independently.
    target_amp : float, optional
        Half peak-to-peak amplitude every non-flat row should end up with.

    Returns
    -------
    list of numpy.ndarray
        One rescaled array per input, in the same order.
    """
    standardized_matrices = []
    for matrix in matrices:
        amp = (np.max(matrix, axis=1) - np.min(matrix, axis=1)) / 2
        # Default scale of 1 for flat rows; divide only where it is well defined.
        scale = np.divide(
            target_amp,
            amp,
            out=np.ones_like(amp, dtype=float),
            where=amp > 0,
        )
        standardized_matrices.append(matrix * scale[:, np.newaxis])

    return standardized_matrices

In [None]:
# Load Data & Run
# NOTE(review): the three inputs are absolute paths on a shared filesystem, so this
# cell only runs where that mount is available.
fileAlpha1 = "/shared/space2/molina/suttyg/alpha_snrna_rep1_5000_1_2p75.csv"
fileAlpha2 = "/shared/space2/molina/suttyg/alpha_snrna_rep2_5000_1_2p75.csv"
fileBSM = '/shared/space2/molina/suttyg/data_binding_site_matrix.txt'
# Label pair handed to the plotting helpers in later cells ("\u03B1" is the Greek letter alpha).
process = ["transcription", "\u03B1"]
theta_smooth = np.round(np.linspace(0.01, 1.00, 100), 2)  # 100 bins from 0.01 to 1.00
# Fix the torch RNG so the random initialisation of W and x is repeatable.
torch.manual_seed(42)

# Define Lambda Values
# The grid search below is disabled; the regularization strengths are hard-coded instead.
#lambda1_values = np.logspace(-2, 1, 4)
#lambda2_values = np.logspace(-2, 1, 4)
#lambda1_values = [0.89]
best_lambda1, best_lambda2 = 0.89, 0.05

# Binding-site matrix: tab-separated, first column used as the row index.
N = pd.read_csv(fileBSM, sep="\t",index_col=0)
# `preprocessing` reads this module-level name, so it must be set before the call below.
tf_names = N.columns
# The two replicates: comma-separated, first column used as the row index.
alpha1 = pd.read_csv(fileAlpha1, sep=",",index_col=0)
alpha2 = pd.read_csv(fileAlpha2, sep=",",index_col=0)

#Select common genes and normalize
print(alpha1.shape, alpha2.shape, N.shape)
ampl_threshold=0.2
# Note: `tf_names` is rebound here to the filtered TF names returned by `preprocessing`.
# Despite its name, `N_norm` is filtered but not centered.
alpha1_norm, alpha2_norm, N_norm, targetnames, tf_names = preprocessing(alpha1, alpha2, N, ampl_threshold=ampl_threshold)
print(alpha1_norm.shape, alpha2_norm.shape, N_norm.shape, "\n")

# float32 tensors for the PyTorch model.
N_tensor = torch.tensor(N_norm, dtype=torch.float32)  # (genes, TFs)
alpha1_tensor = torch.tensor(alpha1_norm, dtype=torch.float32)  # (genes, thetas)
alpha2_tensor = torch.tensor(alpha2_norm, dtype=torch.float32)  # (genes, thetas)

# Optimize Lambda
#best_lambda1, best_lambda2 = cross_val_lambda(N_tensor, alpha1_tensor, alpha2_tensor, lambda1_values, lambda2_values)

# Train and Cross-Test
# (W1, A1, x1) come from training on replicate 1, (W2, A2, x2) from training on replicate 2.
W1, A1, x1, W2, A2, x2 = cross_train(N_tensor, alpha1_tensor, alpha2_tensor, best_lambda1, best_lambda2)
# Sorted pre-sigmoid activity parameters of each fit (NumPy abbreviates long arrays when printing).
print(np.sort(x1.flatten()))
print(np.sort(x2.flatten()))

# Save activities (TFs × θ)
# Export is currently disabled.
#np.save("activities_export/A_star_pytorch.npy", A1)
#np.save("activities_export/tf_names_pytorch.npy", np.array(tf_names))
#np.save("activities_export/targetnames_pytorch.npy", np.array(targetnames))

In [None]:
# Sorted values of the first fit: weights W1, raw activities x1, squashed activities A1.
# One array per line, exactly as three consecutive print calls would emit them.
print(*(np.sort(values.flatten()) for values in (W1, x1, A1)), sep="\n")

In [None]:
# Smooth both activity matrices with `fourier_fit`, then rebuild the reconstructions.
# Caution: A1 and A2 are overwritten, so re-running this cell smooths them again.
A1, A2 = (fourier_fit(activity, theta_smooth) for activity in (A1, A2))
R1, R2 = W1 @ A1, W2 @ A2

In [None]:
# Position of the TF of interest in `tf_names`; the [0][0] indexing raises IndexError
# when the name is not present.
BP_nb = np.where(tf_names == 'Hes1')[0][0]
#BP_nb = 30
# Plot helper is not defined in this notebook (presumably from utils_plot — verify).
plot_binding_protein_activity(tf_names, A1, process, theta_smooth, BP_nb=BP_nb)
# Among the non-zero entries of this TF's column of W1, report how many are positive.
print(f"Positive W1 among target genes of {tf_names[BP_nb]} : {np.sum(W1[:, BP_nb] > 0)}/{np.sum(W1[:, BP_nb] != 0)} ({np.sum(W1[:, BP_nb] > 0)/np.sum(W1[:, BP_nb] != 0)*100:.2f}%)")

In [None]:
# Row index of the gene of interest in `targetnames`; raises IndexError if it was filtered out.
n = np.where(targetnames == 'Nusap1')[0][0]
#n = 4972
# R1 was fitted on replicate 1, so comparing it with alpha1_norm is the training fit...
print("Train")
plot_rate_comparison(targetnames, alpha1_norm, R1, process, theta_smooth, target_nb=n)
# ...and comparing the same reconstruction with replicate 2 is the held-out fit.
print("Test")
plot_rate_comparison(targetnames, alpha2_norm, R1, process, theta_smooth, target_nb=n)

In [None]:
# Compare the two independent fits (trained on replicate 1 vs replicate 2) for three
# quantities: activities A, weights W and reconstructions R.
# `compute_reproducibility` is not defined in this notebook (presumably from utils_plot — verify
# what its two return values are). `corrs_E` is overwritten by every call, so only the value
# from the last call survives.
corrs_A, corrs_E =  compute_reproducibility(A1, A2, alpha1_norm, alpha2_norm, metric="TF activities")
corrs_W, corrs_E = compute_reproducibility(W1, W2, alpha1_norm, alpha2_norm, metric="W site counts")
corrs_R, corrs_E = compute_reproducibility(R1, R2, alpha1_norm, alpha2_norm, metric="Reconstruction")

In [None]:
#Heatmap of BPs activity along cell cycle (Export)
# TFs to show in the heatmap, using the names as they appear in `tf_names`
# (note the combined "E2f2_E2f5" entry).
key_tfs = ["Smad3", "Hbp1", "E2f1", "E2f2_E2f5", "E2f3", "E2f4", "E2f6", "E2f7", "E2f8", "Sp1", "Hes1", "Elf3", "Tfap4"]
# `plot_heatmap_list` is not defined in this notebook; its return value is printed below
# (presumably the subset of key_tfs actually drawn — verify).
tf_displayed = plot_heatmap_list(A1, tf_names, key_tfs, clip=True)
print(tf_displayed)

In [None]:
# Prior expectations for each key TF, keyed by TF name:
#   "ranges"     - list of (start, end) pairs handed to `compute_tf_activity_difference`
#                  together with `theta_smooth` (presumably theta intervals where the TF is
#                  expected to act — verify against that helper);
#   "inhibitory" - True flips the sign of the expression/activity correlation in the
#                  scoring cell and labels the TF as "inhibitor" instead of "activator".
expected_activity = {
    "Smad3": {"ranges": [(0.1, 0.4)], "inhibitory": True},
    "Hbp1": {"ranges": [(0.1, 0.4)], "inhibitory": True},
    "E2f1": {"ranges": [(0.1, 0.4)], "inhibitory": False},
    "E2f2_E2f5": {"ranges": [(0.1, 0.4)], "inhibitory": False},
    "E2f3": {"ranges": [(0.1, 0.4)], "inhibitory": False},
    "E2f4": {"ranges": [(0.01, 0.25)], "inhibitory": False}, #May be True
    "E2f6": {"ranges": [(0.01, 0.25), (0.63, 0.9)], "inhibitory": False}, #May be True
    "E2f7": {"ranges": [(0.4, 0.9)], "inhibitory": False}, #May be True
    "E2f8": {"ranges": [(0.4, 0.9)], "inhibitory": False}, #May be True
    "Sp1": {"ranges": [(0.1, 0.63)], "inhibitory": False},
    "Hes1": {"ranges": [(0.1, 0.4)], "inhibitory": True},
    # Disabled entries (no expectation is scored for these TFs):
    #"Elf3": {"ranges": [(0.25, 1)], "inhibitory": False},
    #"Tfap4": {"ranges": [(0.63, 0.9)], "inhibitory": False}
}

In [None]:
#### TF EXPRESSION AND BIOLOGICAL MEANING ####

In [None]:
# Rebinds `key_tfs` (defined earlier for the heatmap). Here E2f2 and E2f5 are listed
# separately, whereas `expected_activity` only has a combined "E2f2_E2f5" key.
key_tfs = ["Smad3", "Hbp1", "E2f1", "E2f2", "E2f3", "E2f4", "E2f5", "E2f6", "E2f7", "E2f8", "Sp1", "Hes1"]

# NOTE(review): these are the same two files as fileAlpha1 / fileAlpha2 in the loading cell.
fileAlphaTF_sn1 = "/shared/space2/molina/suttyg/alpha_snrna_rep1_5000_1_2p75.csv"
fileAlphaTF_sn2 = "/shared/space2/molina/suttyg/alpha_snrna_rep2_5000_1_2p75.csv"

#Get spliced RNA data to compare splicing and protein activity dynamics
# NOTE(review): the comment above mentions spliced RNA, but the file names say "alpha_snrna" — confirm.
alpha_sn1 = pd.read_csv(fileAlphaTF_sn1, sep=",",index_col=0)
alpha_sn2 = pd.read_csv(fileAlphaTF_sn2, sep=",",index_col=0)

# Keep only the key TFs present as rows in both replicates and as names in `tf_names`.
tf_names_filtered = np.array([tf for tf in key_tfs if tf in alpha_sn1.index and tf in alpha_sn2.index and tf in tf_names])
print("TFs in common :", str(len(tf_names_filtered))+"/"+str(len(key_tfs)))

alpha_sn1_f, alpha_sn2_f = alpha_sn1.loc[tf_names_filtered], alpha_sn2.loc[tf_names_filtered]
alpha_sn1_n, alpha_sn2_n = alpha_sn1_f.to_numpy(), alpha_sn2_f.to_numpy()

#Standardize amplitudes
#A_standard = A_standard - np.mean(A_standard, axis=1, keepdims=True)
# Average the two replicates, double-center the result (row means, column means, grand mean),
# then rescale the rows of both the expression matrix and A1 to a common amplitude.
# Row order differs: `alpha_sn_norm` follows `tf_names_filtered`, `A_standard` follows `tf_names`.
alpha_sn_n = (alpha_sn1_n + alpha_sn2_n) / 2
alpha_sn_norm = alpha_sn_n - np.mean(alpha_sn_n, axis=1, keepdims=True) - np.mean(alpha_sn_n, axis=0, keepdims=True) + np.mean(alpha_sn_n)
alpha_sn_norm, A_standard = standardize_amplitudes([alpha_sn_norm, A1])

In [None]:
# For every key TF: plot expression against inferred activity, then score the agreement
# with (a) the Spearman correlation between expression and activity, sign-flipped for
# TFs marked inhibitory, and (b) the z-score returned by `compute_tf_activity_difference`
# for the expected activity ranges.
corrs = []
z_vals = []
# `A_standard` rows follow `tf_names`; build the lookup list once instead of per use.
tf_name_list = list(tf_names)
for tf in range(len(tf_names_filtered)):
    plot_TF_exp_activity(theta_smooth, alpha_sn_norm, A_standard, tf_names, tf_names_filtered, tf)

    tf_name = tf_names_filtered[tf]
    activity = A_standard[tf_name_list.index(tf_name)]
    expected = expected_activity[tf_name]

    corr = spearmanr(alpha_sn_norm[tf], activity)[0]
    action = "activator" if not expected["inhibitory"] else "inhibitor"
    if action == "inhibitor":
        # An inhibitor is expected to be anti-correlated with its own expression.
        corr = -corr
    print(f"scRNA & A correlation : {corr:.3f} ({ action })\n")
    z_val = compute_tf_activity_difference(activity, theta_smooth, expected["ranges"], expected["inhibitory"])
    corrs.append(corr)
    z_vals.append(z_val)
    # Single quotes inside the f-string: reusing double quotes is a SyntaxError before Python 3.12.
    print(f"Expected activity range : {expected['ranges']}")
    print(f"TF activity biological z-score : {z_val:.2f} ({ action })")
print(f"Global correlation :{np.mean(corrs):.3f}")
print(f"Global z-score :{np.mean(z_vals):.2f}")