In [None]:
import os
from pathlib import Path
from typing import Annotated

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scanpy as sc
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import euclidean_distances
from scipy.stats import entropy, chi2_contingency

from scipy import sparse
from scipy.stats import gaussian_kde
from sklearn.preprocessing import StandardScaler
from scipy.stats import mannwhitneyu
from scipy.stats import wilcoxon  

import matplotlib as mpl
from matplotlib.backends.backend_pdf import PdfPages

# Font handling for vector exports, set in a single call.
plt.rcParams.update({
    'svg.fonttype': 'none',
    'pdf.fonttype': 42,  # make text editable in pdf
})

# Relative output paths used later in the notebook resolve against this directory.
os.chdir('/diskmnt/Projects/myeloma_scRNA_analysis/MMY_IRD/revision/merge/april_baff_signaling/')
os.getcwd()

In [None]:
# Load the AnnData object from its absolute on-disk location.
merged = sc.read_h5ad('/diskmnt/Projects/myeloma_scRNA_analysis/MMY_IRD/revision/merge/no_harmony/combined_cleaned.h5ad')

In [None]:
# Display the largest value stored in the 'counts' layer.
merged.layers['counts'].max()

In [None]:
# Display the largest value stored in the 'normalized' layer.
merged.layers['normalized'].max()

In [None]:
# Reset X from a copy of the 'counts' layer, log1p-transform it, and display
# the resulting maximum. Re-running the cell starts again from 'counts'.
# NOTE(review): log1p is applied to the 'counts' layer (presumably raw counts),
# not to the existing 'normalized' layer — confirm that this is intended.
merged.X = merged.layers['counts'].copy()
sc.pp.log1p(merged)
merged.X.max()

In [None]:
# Store a copy of the current X under the 'log' layer.
merged.layers['log']= merged.X.copy()

In [None]:
# One independent AnnData copy per collection, in the order NBM, NDMM, PT.
nbm, ndmm, pt = (
    merged[merged.obs['Collection'] == name].copy()
    for name in ('NBM', 'NDMM', 'PT')
)

In [None]:
# Print the largest 'counts'-layer value next to the largest X value of the NBM subset.
print(nbm.layers['counts'].max(), nbm.X.max())

In [None]:
# UMAP of the NDMM cells coloured by TNFRSF13B, with values read from the 'log' layer.
sc.pl.umap(ndmm, color='TNFRSF13B', layer='log', size=3, frameon=False)

In [None]:
# Side-by-side UMAPs of one gene for NBM / NDMM / PT on a single colour scale,
# written to a one-page PDF in the current working directory.
gene = "TNFSF13B"
adatas  = [nbm, ndmm, pt]
labels  = ["NBM", "NDMM", "PT"]

# Expression source used for BOTH the colour limits and the plotted values:
# None reads .X, a string reads that entry of .layers.
layer  = None
cmap   = "viridis"

# Collect the embedding and the per-cell expression of `gene` from each object.
umaps, exprs = [], []
for A in adatas:
    U = A.obsm["X_umap"]
    umaps.append(U)

    if layer is None:
        Xg = A[:, gene].X
    else:
        # use a specific layer
        gi = A.var_names.get_loc(gene)
        Xg = A.layers[layer][:, gi]
    # The matrix may be stored sparse or dense; flatten either form to 1-D.
    e = Xg.toarray().ravel() if sparse.issparse(Xg) else np.asarray(Xg).ravel()
    exprs.append(e)

# Common axis limits over all embeddings, with a 2 % margin on each side.
xmin = min(U[:,0].min() for U in umaps)
xmax = max(U[:,0].max() for U in umaps)
ymin = min(U[:,1].min() for U in umaps)
ymax = max(U[:,1].max() for U in umaps)
padx = 0.02 * (xmax - xmin)
pady = 0.02 * (ymax - ymin)
xlim = (xmin - padx, xmax + padx)
ylim = (ymin - pady, ymax + pady)

# Shared colour limits: 0 up to the 99th percentile of the non-zero values
# pooled over all objects; fall back to the overall max, then to 1.0.
all_expr = np.concatenate(exprs)
pos = all_expr[all_expr > 0]
vmax = np.percentile(pos, 99.0) if pos.size else (all_expr.max() if all_expr.size else 1.0)
if vmax <= 0:
    vmax = 1.0
vmin = 0.0

pdf_out = f"{gene}_umap_sharedscale.pdf"
with PdfPages(pdf_out) as pdf:
    fig, axes = plt.subplots(1, 3, figsize=(15, 5), constrained_layout=True)

    for ax, A, label in zip(axes, adatas, labels):
        sc.pl.umap(
            A,
            color=gene,
            # Plot the same values the colour limits were derived from;
            # use_raw=False keeps layer=None pointing at .X rather than .raw.
            layer=layer,
            use_raw=False,
            ax=ax,
            show=False,
            frameon=False,
            vmin=vmin,
            vmax=vmax,
            colorbar_loc=None,     # suppress individual colorbars
            size=3,                # dot size
            cmap=cmap
        )
        ax.set_title(label, fontsize=12, pad=6)
        ax.set_aspect('equal')
        ax.set_xlim(*xlim)
        ax.set_ylim(*ylim)
        # rasterize points for compact/crisp PDF
        for coll in ax.collections:
            coll.set_rasterized(True)

    # One shared colorbar
    norm = mpl.colors.Normalize(vmin=vmin, vmax=vmax)
    sm = mpl.cm.ScalarMappable(norm=norm, cmap=plt.get_cmap(cmap))
    sm.set_array([])
    cbar = fig.colorbar(sm, ax=axes.ravel().tolist(), shrink=0.85)
    cbar.set_label(f"{gene} expression")

    # No manual y offset: constrained_layout reserves room for the title, whereas
    # a title placed above the canvas would be cut off in the saved page.
    fig.suptitle(f"{gene} expression on UMAP — shared scale", fontsize=14)
    pdf.savefig(fig, dpi=300)
    plt.close(fig)

print(f"Saved: {pdf_out}")


In [None]:
# Number of cells per value of the 'subset' annotation.
merged.obs['subset'].value_counts()

In [None]:
# Marker genes to show in the dot plot.
pc_markers = ['SDC1', 'CD38', 'TNFRSF17', 'SLAMF7', 'FCRL5', 'GPRC5D']

# Independent copy of the cells whose 'lin' annotation is 'PC'.
pc = merged[merged.obs['lin'].eq('PC')].copy()

# Per-collection dot plot from the 'log' layer, each gene scaled across groups.
sc.pl.dotplot(pc, var_names=pc_markers, groupby="Collection", layer='log', standard_scale="var")

In [None]:
# Restrict the PC subset to the NDMM / NBM / PT collections (no copy is taken).
pcf = pc[pc.obs['Collection'].isin(['NDMM', 'NBM', 'PT'])]

# MKI67 from the 'log' layer per collection, without any scaling.
sc.pl.dotplot(pcf, var_names=["MKI67"], layer="log", groupby="Collection", standard_scale=None)


In [None]:
# Genes whose per-cell expression is extracted below.
genes = ["TNFRSF13B", "TNFRSF13C", "TNFRSF17", "TNFSF13", "TNFSF13B"]

# obs columns used for filtering and carried along as metadata.
group_key, group_order = "Collection", ['NBM', 'NDMM', 'PT']
sample_key, upn_key, annot_key = "Sample", "UPN", "subset"

# Cells from the listed collections, then the 'log' layer for the chosen genes.
A = merged[merged.obs[group_key].isin(group_order), :].copy()
xg = A[:, genes].layers["log"]

In [None]:
# Densify the sparse expression block into a plain 2-D ndarray.
xg_dense = xg.toarray()

In [None]:
# Preview the expression table: rows indexed by A.obs_names, one column per gene.
pd.DataFrame(xg_dense, index=A.obs_names, columns=genes)

In [None]:
# Per-cell expression with the subset / UPN / collection / sample columns attached.
df = (
    pd.DataFrame(xg_dense, index=A.obs_names, columns=genes)
    .join(A.obs.loc[:, [annot_key, upn_key, group_key, sample_key]])
)
df

In [None]:
# Write the per-cell table as TSV, keeping the row index as the first column.
df.to_csv("receptor_ligand_expr_percell.tsv", sep="\t", index=True)

In [None]:
# Receptor genes: z-score each gene over cells, then average per
# (gene, subset, UPN, collection). No lineage filter is applied in this cell;
# `subset` is carried along as a grouping column.
genes = ["TNFRSF13B", "TNFRSF13C", "TNFRSF17"]

group_key, group_order = "Collection", ['NBM', 'NDMM', 'PT']
sample_key, upn_key, annot_key = "Sample", "UPN", "subset"
adata = merged.copy()

# Keep cells from the listed collections and standardise every gene column
# over all retained cells.
A = adata[adata.obs[group_key].isin(group_order), :].copy()
xg = A[:, genes].layers['log'].toarray()
z = StandardScaler().fit_transform(xg)

# Long per-cell table: one row per (cell, gene), metadata repeated on each row.
df = pd.DataFrame(z, index=A.obs_names, columns=genes)
df = df.join(A.obs[[annot_key, upn_key, group_key, sample_key]])
df = df.melt(
    id_vars=[annot_key, upn_key, group_key, sample_key],
    var_name="gene",
    value_name="z_scaled",
)

# Mean z-score for every gene × subset × UPN × collection combination present.
avg_df = (
    df.groupby(["gene", annot_key, upn_key, group_key], observed=True)
    .agg(avg_scaled=("z_scaled", "mean"))
    .reset_index()
)

In [None]:
# Mean of the averaged values within each collection. A cell renders only its
# last expression automatically, so this summary is displayed explicitly.
display(avg_df.groupby("Collection", observed=True)["avg_scaled"].mean())
avg_df

In [None]:
# Save the averaged receptor table as TSV without the row index.
avg_df.to_csv("receptor_avg_scale_expr_byUPN.tsv", sep="\t", index=False)

In [None]:
# Ligand genes: z-score each gene over cells, then average per
# (gene, subset, UPN, collection). No lineage filter is applied in this cell;
# `subset` is carried along as a grouping column.
# Relies on `adata`, `group_key`, `group_order`, `annot_key`, `upn_key` and
# `sample_key` defined in an earlier cell.
genes = ["TNFSF13", "TNFSF13B"]

# subset and scale (z score scaling across cells)
A = adata[(adata.obs[group_key].isin(group_order)), :].copy()
# Read the named 'log' layer, as the receptor cell does, instead of .X, so the
# values do not depend on whatever .X currently holds.
xg = A[:, genes].layers['log'].copy()
xg = xg.toarray()
z = StandardScaler().fit_transform(xg)

# Build per-cell DataFrame with metadata
df = (pd.DataFrame(z, index=A.obs_names, columns=genes)
        .join(A.obs[[annot_key, upn_key, group_key, sample_key]])
        .melt(id_vars=[annot_key, upn_key, group_key, sample_key],
              var_name="gene", value_name="z_scaled"))

# aggregate to per-(celltype × UPN × group)
avg_df = (df.groupby(["gene", annot_key, upn_key, group_key], observed=True)["z_scaled"]
            .mean()
            .reset_index()
            .rename(columns={"z_scaled": "avg_scaled"}))

In [None]:
# Save the averaged ligand table as TSV without the row index.
avg_df.to_csv("ligand_avg_scale_expr_byUPN.tsv", sep="\t", index=False)

In [None]:
# Independent copy of the cells whose 'lin' annotation is "Mye".
mye = merged[merged.obs['lin']=="Mye"].copy()

In [None]:
# TNFSF13 / TNFSF13B in the `mye` subset per collection, read from the 'log'
# layer with each gene scaled across groups.
sc.pl.dotplot(mye, var_names=["TNFSF13", "TNFSF13B"], layer="log", groupby="Collection", standard_scale="var")


In [None]:
# Inspect the per-observation metadata table.
merged.obs

In [None]:
# Per-gene expression sums for every (UPN:Collection, subset) pair, divided by
# the total number of cells in that UPN:Collection, stacked over all genes.
upn_key = "UPN"
collection_key = "Collection"
subset_key = "subset"
genes = ["TNFRSF13B", "TNFRSF13C", "TNFRSF17", 'TNFSF13', 'TNFSF13B']

# Composite sample identifier "<UPN>:<Collection>"; this adds (or overwrites)
# a column on merged.obs.
merged.obs["UPN_Collection"] = merged.obs[upn_key].astype(str) + ":" + merged.obs[collection_key].astype(str)
sample_key="UPN_Collection"

# total cells per sample — independent of the gene, so computed once up front
n_total = (
    merged.obs.groupby(sample_key, observed=True)
      .size()
      .rename("n_total")
      .reset_index()
)

results = []

for gene in genes:
    # extract expression vector from .X (densified when it is not already an ndarray)
    x = merged[:, gene].X
    x = np.ravel(x.toarray()) if not isinstance(x, np.ndarray) else np.ravel(x)

    # make df for this gene
    df = merged.obs[[subset_key, sample_key]].copy()
    df["expr"] = x

    # per subset × sample AUC (sum of expr) and the number of cells in the subset
    auc_subset = (
        df.groupby([sample_key, subset_key], observed=True)
          .agg(AUC=("expr", "sum"), n_subset=("expr", "size"))
          .reset_index()
    )

    # merge and normalize by the sample's total cell count
    auc_subset = auc_subset.merge(n_total, on=sample_key, how="left")
    auc_subset["AUC_norm"] = auc_subset["AUC"] / auc_subset["n_total"]

    # separate back into UPN / Collection; split on the LAST colon only so a
    # UPN that itself contains ":" still yields exactly two columns
    auc_subset[["UPN", "Collection"]] = auc_subset[sample_key].str.rsplit(":", n=1, expand=True)

    # tag the gene
    auc_subset["gene"] = gene

    results.append(auc_subset)

# combine all genes
auc_all = pd.concat(results, ignore_index=True)
auc_all.head()

In [None]:
# Write the combined AUC table as TSV. No `index=` argument is passed, so the
# row index is written as the first column (unlike the index=False exports above).
auc_all.to_csv('auc_normalized.tsv', sep="\t")