# Correlation analysis

This notebook demonstrates how to calculate the correlation between two RSP profiles.

In [124]:
import numpy as np
import pandas as pd
from itertools import combinations
from tqdm import tqdm  # For progress bar

from ipynb.fs.full.biorsp import (
    find_foreground_background_points,
    calculate_rsp_area,
    calculate_deviation_score,
)

from scipy.stats import spearmanr
from statsmodels.stats.multitest import multipletests
import matplotlib.pyplot as plt
import seaborn as sns

In [125]:
# Load your data
dge_matrix = pd.read_csv("data/MCA2_filtered.dge.txt", sep="\t", index_col=0)
tsne_results = pd.read_csv("embeddings/tsne_results.csv").to_numpy()
dbscan_results = pd.read_csv("embeddings/tsne_dbscan_results.csv")

In [126]:
threshold = 1
clusters = [1]
scanning_window = np.pi / 2
resolution = 1000
angle_range = np.array([0, 2 * np.pi])
mode = "relative"

In [127]:
gene_coverage = {}

for gene in tqdm(dge_matrix.index, desc="Calculating coverage"):
    try:
        # Find foreground and background points for the gene
        fg, bg = find_foreground_background_points(
            gene_name=gene,
            dge_matrix=dge_matrix,
            tsne_results=tsne_results,
            dbscan_df=dbscan_results,
            threshold=threshold,
            selected_clusters=clusters,
        )

        # Calculate coverage: (foreground + background) / total_cells * 100
        # Assuming foreground and background are mutually exclusive and collectively exhaustive
        num_foreground = fg.shape[0]
        num_background = bg.shape[0]
        coverage = (num_foreground / num_background) * 100
        gene_coverage[gene] = coverage
    except Exception as e:
        # If there's an error processing the gene, assign coverage as 0
        print(f"Error processing gene {gene}: {e}")
        gene_coverage[gene] = 0.0

gene_coverage_series = pd.Series(gene_coverage)

Calculating coverage: 100%|██████████| 16131/16131 [00:19<00:00, 839.71it/s]


In [128]:
filtered_genes = gene_coverage_series[
    (gene_coverage_series >= 45) & (gene_coverage_series <= 55)
].index.tolist()
gene_pairs = list(combinations(filtered_genes, 2))

print(f"Number of filtered genes: {len(filtered_genes)}")
print(f"Number of gene pairs to analyze: {len(gene_pairs)}")

Number of filtered genes: 21
Number of gene pairs to analyze: 210


In [129]:
rsp_metrics = {
    gene: {"rsp_area": None, "differences": None, "deviation_score": None}
    for gene in filtered_genes
}

for gene in tqdm(filtered_genes, desc="Calculating RSP metrics"):
    try:
        fg, bg = find_foreground_background_points(
            gene_name=gene,
            dge_matrix=dge_matrix,
            tsne_results=tsne_results,
            dbscan_df=dbscan_results,
            threshold=threshold,
            selected_clusters=clusters,
        )

        vantage_point = bg.mean(axis=0)

        rsp_area, differences, rsmd = calculate_rsp_area(
            foreground_points=fg,
            background_points=bg,
            vantage_point=vantage_point,
            scanning_window=scanning_window,
            resolution=resolution,
            angle_range=angle_range,
            mode=mode,
        )

        deviation_score = calculate_deviation_score(
            rsp_area=rsp_area, differences=differences, resolution=resolution
        )

        rsp_metrics[gene]["rsp_area"] = rsp_area
        rsp_metrics[gene]["differences"] = differences
        rsp_metrics[gene]["deviation_score"] = deviation_score
    except Exception as e:
        print(f"Error computing RSP for gene {gene}: {e}")
        rsp_metrics[gene]["rsp_area"] = np.nan
        rsp_metrics[gene]["differences"] = np.full(resolution, np.nan)
        rsp_metrics[gene]["deviation_score"] = np.nan

Calculating RSP metrics: 100%|██████████| 21/21 [00:03<00:00,  6.97it/s]


In [130]:
rsp_df = pd.DataFrame.from_dict(rsp_metrics, orient="index")
print(rsp_df.head())

         rsp_area                                        differences  \
Atp5d    0.014956  [-0.044708572281959374, -0.043865967044094636,...   
Atp5f1   0.015292  [-0.04692666245791246, -0.04941035546613011, -...   
Chchd10  0.019007  [-0.024748420479302834, -0.026824624274245083,...   
Cox6a1   0.013887  [-0.03868730382925299, -0.037126841508823054, ...   
Cryab    0.023483  [-0.039921456861485766, -0.04397707815520574, ...   

         deviation_score  
Atp5d           0.272960  
Atp5f1          0.174218  
Chchd10         0.124662  
Cox6a1          0.117388  
Cryab           0.087299  


In [131]:
def calculate_overlap(diffs_A, diffs_B):
    """
    Calculate the overlap between two difference arrays.

    Parameters:
        - diffs_A: Numpy array of differences for gene A.
        - diffs_B: Numpy array of differences for gene B.

    Returns:
        - dissimilarity: 1 - (overlap / total_area)
    """
    overlap = np.sum(np.minimum(diffs_A, diffs_B))
    total_area = np.sum(np.maximum(diffs_A, diffs_B))
    if total_area == 0:
        return np.nan
    dissimilarity = 1 - (overlap / total_area)
    return dissimilarity

In [132]:
dissimilarity_results = []

# Loop over gene pairs and calculate dissimilarity
for geneA, geneB in tqdm(gene_pairs, desc="Calculating dissimilarity"):
    diffs_A = rsp_df.loc[geneA, 'differences']
    diffs_B = rsp_df.loc[geneB, 'differences']
    
    # Handle NaN or infinite values
    if np.isnan(diffs_A).any() or np.isnan(diffs_B).any():
        dissimilarity = np.nan
    else:
        dissimilarity = calculate_overlap(diffs_A, diffs_B)
    
    dissimilarity_results.append({
        'GeneA': geneA,
        'GeneB': geneB,
        'Dissimilarity_Score': dissimilarity
    })

Calculating dissimilarity: 100%|██████████| 210/210 [00:00<00:00, 8910.69it/s]


In [133]:
dissimilarity_df = pd.DataFrame(dissimilarity_results)

print(dissimilarity_df.head())

   GeneA    GeneB  Dissimilarity_Score
0  Atp5d   Atp5f1             1.863207
1  Atp5d  Chchd10             1.979872
2  Atp5d   Cox6a1             1.924479
3  Atp5d    Cryab             1.893114
4  Atp5d    Gapdh             1.970975


In [134]:
dissimilarity_df.to_csv('results/dissimilarity.csv', index=False)