In [None]:
import os
import re 
from pathlib import Path

import pandas as pd
import numpy as np
import scanpy as sc
import scanpy.external as sce
import anndata as ad
import matplotlib.pyplot as plt
import seaborn as sns

# Run the notebook from the analysis folder so that files written with a
# relative name (e.g. without a directory) end up there.
os.chdir('/diskmnt/Projects/myeloma_scRNA_analysis/MMY_IRD/Xenium/analysis/')
# A bare os.getcwd() in the middle of a cell is evaluated and discarded;
# print it so the active working directory is actually shown.
print(os.getcwd())

import matplotlib as mpl
mpl.rcParams['pdf.fonttype'] = 42 #make text editable in pdf
mpl.rcParams['svg.fonttype'] = 'none'

In [None]:
# panel information
# Map each panel id to its headerless TSV of SNV probe names; the first
# column of every file is read into a plain list.
snv_probe_files = {
    'BYGXJ6_hMulti': '/diskmnt/Projects/myeloma_scRNA_analysis/MMY_IRD/Xenium/panel_info/BYGXJ6_snv_probe_names.tsv',
    'W7JCJE_hMulti': '/diskmnt/Projects/myeloma_scRNA_analysis/MMY_IRD/Xenium/panel_info/W7JCJE_snv_probe_names.tsv',
}
snv_probes = {
    panel_name: pd.read_csv(tsv_path, sep="\t", header=None)[0].tolist()
    for panel_name, tsv_path in snv_probe_files.items()
}

In [None]:
# De-duplicate the SNV probe names across all panels, sort them, and put the
# extra probe name 'GNAS_p.R201H_WT' at the end of the list.
unique_snv_probes = set().union(*snv_probes.values())
snv_probes_list = [*sorted(unique_snv_probes), 'GNAS_p.R201H_WT']
snv_probes_list

In [None]:
# sample information
# Tab-separated sample table (first row is the header), indexed by the 'Sample' column.
sample_table_path = '/diskmnt/Projects/myeloma_scRNA_analysis/MMY_IRD/Xenium/sample_info/sample_table_DI_xenPaths_exclSN228.txt'
sample_info = pd.read_csv(sample_table_path, sep="\t", header=0).set_index('Sample')

In [None]:
# List the distinct values found in the 'Panel' column of the sample table.
sample_info['Panel'].unique()

In [None]:
# Display the sample table for a visual check.
sample_info

In [None]:
# For each sample: load the cell-by-feature matrix, drop the SNV probes,
# attach centroid / area columns from the 10X cells.parquet output, and
# filter out all the cells that have NaN nucleus area.
# Every filtered object is written to <out_dir>/<sample_id>.h5ad and kept in
# adata_dict under its (whitespace-stripped) sample id.

adata_dict = {}
out_dir = Path("/diskmnt/Projects/myeloma_scRNA_analysis/MMY_IRD/Xenium/individual_samples/h5ad")
# Make sure the output folder exists so the per-sample writes below cannot
# fail just because the directory has not been created yet.
out_dir.mkdir(parents=True, exist_ok=True)

keep_parq = ["cell_id", "x_centroid", "y_centroid", "cell_area", "nucleus_area"]

for sample_name, row in sample_info.iterrows():
    sample_id = str(sample_name).strip()

    base = Path(row["Output.file.path"])
    adata = sc.read_10x_mtx(base / "cell_feature_matrix")

    # first drop all SNV probes from the matrix
    adata = adata[:, ~adata.var_names.isin(snv_probes_list)].copy()

    # Only the columns that are used downstream are read from the parquet file.
    parq = pd.read_parquet(base / "cells.parquet", columns=keep_parq)

    # The matrix barcodes are strings. Normalise cell_id to str as well
    # (decoding raw bytes if that is how the column was stored) so the join
    # below matches on equal values instead of silently producing all-NaN rows.
    cell_ids = parq["cell_id"]
    if len(cell_ids) and isinstance(cell_ids.iloc[0], bytes):
        cell_ids = cell_ids.str.decode("utf-8")
    parq = parq.assign(cell_id=cell_ids.astype(str)).set_index("cell_id")

    # If no barcode is found in cells.parquet at all, the ids are incompatible;
    # fail loudly rather than writing an empty object for this sample.
    if adata.n_obs > 0 and not adata.obs_names.isin(parq.index).any():
        raise ValueError(
            f"{sample_id}: no cell ids of the feature matrix were found in {base / 'cells.parquet'}"
        )

    # filter out nucleus size = NaN
    adata.obs = adata.obs.join(parq, how="left")
    adata_nuc_filtered = adata[adata.obs["nucleus_area"].notna()].copy()
    print(sample_id, adata.shape, adata_nuc_filtered.shape)

    # Prefix cell names with the sample id so they stay unique after merging.
    adata_nuc_filtered.obs_names = [f"{sample_id}_{cell}" for cell in adata_nuc_filtered.obs_names]
    adata_nuc_filtered.obs['Sample'] = sample_id
    adata_nuc_filtered.write(out_dir / f"{sample_id}.h5ad")
    adata_dict[sample_id] = adata_nuc_filtered
    

In [None]:
# Peek at the per-cell metadata of the second sample stored in adata_dict.
second_sample_id = list(adata_dict)[1]
adata_dict[second_sample_id].obs.head()

In [None]:
# make a merged object including all samples
# Passing the dict itself lets concat take the labels for the 'Sample' column
# from the dict keys; join='outer' keeps the union of probes across samples.
merged = sc.concat(
    adata_dict,
    join='outer',
    label='Sample',
    index_unique=None,
)

In [None]:
# Preview the first rows of the merged per-cell metadata.
merged.obs.head()

In [None]:
# Same merge, but join='inner' keeps only the probes present in every sample.
merged_common_probes = sc.concat(
    adata_dict,
    join='inner',
    label='Sample',
)

In [None]:
# Print the summary of the outer-join and inner-join objects, in that order.
for merged_object in (merged, merged_common_probes):
    print(merged_object)

In [None]:
# Save shared probe names
# One probe name per line, written to shared_probes.txt in the working directory.
shared_probe_lines = [f"{gene}\n" for gene in merged_common_probes.var_names]
Path("shared_probes.txt").write_text("".join(shared_probe_lines))

In [None]:
# add metadata
# Take an explicit copy of the per-sample columns so that re-indexing below
# can never touch sample_info.
meta = sample_info[['UPN', 'DI_UPN', 'DI_Sample', 'Collection', 'Panel', 'MRD.Status', 'Months_PFS']].copy()
print(meta.head())
# The sample ids stored on the cells were whitespace-stripped when the
# per-sample objects were built, so strip the lookup index the same way;
# otherwise a sample name with stray whitespace would get NaN metadata.
meta.index = meta.index.astype(str).str.strip()
# Plain strings on both sides of the upcoming join on 'Sample'.
merged.obs['Sample'] = merged.obs['Sample'].astype(str)

In [None]:
# Attach the per-sample metadata to every cell by looking up its 'Sample'
# value in meta's index.
# Metadata columns left over from a previous run of this cell are dropped
# first: joining them a second time would raise on overlapping column names.
merged.obs = merged.obs.drop(columns=meta.columns, errors="ignore").join(meta, on="Sample")
merged.obs.head()

In [None]:
# check distribution of total counts per cell in V5 versus V6 panels
# Row sums of X give one total per cell; flatten the result to 1-D before
# storing it as an obs column.
per_cell_totals = merged.X.sum(axis=1)
merged.obs["nCounts"] = np.asarray(per_cell_totals).ravel()

In [None]:
# Preview the per-cell metadata again to check the newly added columns.
merged.obs.head()

In [None]:
# Largest value of the per-cell nCounts column.
merged.obs["nCounts"].max()

In [None]:
# Split the per-cell totals by panel id; this cell labels W7JCJE_hMulti as
# "v5" and BYGXJ6_hMulti as "v6".
cell_panel = merged.obs["Panel"]
cell_totals = merged.obs["nCounts"]
v5_counts = cell_totals[cell_panel == "W7JCJE_hMulti"]
v6_counts = cell_totals[cell_panel == "BYGXJ6_hMulti"]
print(f"v5 cells: {len(v5_counts)}, v6 cells: {len(v6_counts)}")

In [None]:
# Overlay semi-transparent histograms of total counts per cell for the two panels.
fig, ax = plt.subplots(figsize=(8, 5))
for panel_label, panel_counts in (('v5', v5_counts), ('v6', v6_counts)):
    ax.hist(panel_counts, bins=50, alpha=0.5, label=panel_label)
ax.set_xlabel("Total Counts per Cell")
ax.set_ylabel("Number of Cells")
ax.legend()
ax.set_title("Distribution of Total Counts per Cell: v5 vs v6")
plt.show()

In [None]:
# save raw probe counts
# Only stash X the first time this cell runs: once X has been normalised,
# re-running the cell would otherwise overwrite the raw counts with
# transformed values.
if "counts" not in merged.layers:
    merged.layers["counts"] = merged.X.copy()
print(merged.X.min(), merged.X.max())

In [None]:
# normalize
# Start from the stored raw counts when they are available, so re-running this
# cell never normalises / log-transforms values that were already transformed.
if "counts" in merged.layers:
    merged.X = merged.layers["counts"].copy()
# No target_sum is given, so scanpy's default scaling target is used.
sc.pp.normalize_total(merged)
print(merged.X.min(), merged.X.max())
# log-transform for variance stabilization
sc.pp.log1p(merged)
print(merged.X.min(), merged.X.max())
# Keep the normalised + log1p matrix as its own layer next to the raw counts.
merged.layers['norm_log'] = merged.X.copy()
# Written relative to the current working directory.
merged.write("merged.h5ad")