In [None]:
import os, re
from pathlib import Path
from typing import Annotated
import numpy as np
import pandas as pd
from IPython.display import display
import warnings
warnings.filterwarnings("ignore") 

import scipy.sparse as sparse
from scipy.io import mmread
from scipy.stats import pearsonr, pointbiserialr

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from matplotlib.lines import Line2D

import anndata as ad
import seaborn as sns
import scanpy as sc

from sklearn.cluster import KMeans
from sklearn.neighbors import radius_neighbors_graph
from sklearn.neighbors import NearestNeighbors

from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import euclidean_distances
from scipy.stats import entropy, chi2_contingency

import random
# Seed both the stdlib and the NumPy global generators with the same value.
seed = 1234
random.seed(seed)
np.random.seed(seed)

# Font handling for exported vector figures.
plt.rcParams.update({
    'svg.fonttype': 'none',
    'pdf.fonttype': 42,  # make text editable in pdf
})

In [None]:
# Create the Output folder (with any missing parents; no error if it already
# exists), then switch the working directory to the banksy analysis folder so
# that relative paths used later in the notebook resolve there.
output_dir = Path('/diskmnt/Projects/myeloma_scRNA_analysis/MMY_IRD/Xenium/analysis/banksy/Output')
output_dir.mkdir(parents=True, exist_ok=True)
os.chdir('/diskmnt/Projects/myeloma_scRNA_analysis/MMY_IRD/Xenium/analysis/banksy/')
os.getcwd()  # echoed as the cell output to confirm the working directory

In [None]:
# Load the AnnData object stored in merged.h5ad.
merged = sc.read_h5ad('/diskmnt/Projects/myeloma_scRNA_analysis/MMY_IRD/Xenium/analysis/merged.h5ad')

In [None]:
# get shared genes on both v5 and v6 samples
import json

# One sample per panel version (selected at random) supplies the gene_panel.json
# that is parsed for that version.
v5genes_j = json.loads(
    Path("/diskmnt/primary/Xenium/data/20240821__204457__20240821_SenNet_bone/output-XETG00122__0033739__S18-30740A1U3__20240821__204528/gene_panel.json").read_text(encoding="utf-8")
)
v6genes_j = json.loads(
    Path("/diskmnt/primary/Xenium/data/20250305__182959__20250305_477_Bones_V6/output-XETG00523__0033883__S17-32736-A1U1__20250305__183110/gene_panel.json").read_text(encoding="utf-8")
)

def find_names(x):
    """
    Walk a nested structure of dicts and lists and yield every value stored
    under a key equal to "name".

    Values are yielded in traversal order (dict insertion order, list order).
    A value found under "name" is yielded first and then searched itself, so
    nested "name" entries inside it are reported as well. Anything that is
    neither a dict nor a list yields nothing.
    """
    if isinstance(x, list):
        for element in x:
            yield from find_names(element)
        return

    if not isinstance(x, dict):
        return

    for key, value in x.items():
        if key == "name":
            yield value
        # Descend regardless of the key so deeper "name" entries are found.
        yield from find_names(value)

# Unique "name" values found anywhere in each panel file.
v5genes = set(find_names(v5genes_j))
v6genes = set(find_names(v6genes_j))

# Sorted partitions of the two panels.
shared_genes = sorted(v5genes.intersection(v6genes))   # on both panels
only_v5_genes = sorted(v5genes.difference(v6genes))    # genes only in v5
only_v6_genes = sorted(v6genes.difference(v5genes))    # genes only in v6

print(len(only_v5_genes), len(only_v6_genes), len(shared_genes))

In [None]:
# the following functions are from Simon / Evan's pyBanksy pipeline: https://github.com/jwweii/PyBanksy-Harmony/blob/simon-main/script/Xenium_anndata_merge_v2.py

def stagger_spatial_coordinates_grid(adatas, samples_per_row=4, grid_width=10000, grid_height=10000, spatial_key = 'spatial'):
    """
    Stagger spatial coordinates for multiple AnnData objects into a fixed grid layout.

    Sample ``i`` is assigned to grid block ``(row, col) = divmod(i, samples_per_row)``
    and centred inside that block using the midpoint of its bounding box. A
    sample whose x/y extent does not exceed the block size is therefore
    guaranteed to stay inside its block. Centring on the mean coordinate gives
    no such guarantee: with an uneven cell distribution one side of the sample
    is pushed past the block edge and overlaps the neighbouring sample.

    Parameters:
        adatas (list of AnnData): List of AnnData objects to stagger spatial coordinates.
        samples_per_row (int): Number of samples to arrange in a row.
        grid_width (float): Width of each grid block assigned to a sample.
        grid_height (float): Height of each grid block assigned to a sample.
        spatial_key (str): Key in `.obsm` holding the coordinates; column 0 is
            read as x and column 1 as y.

    Returns:
        list of AnnData: The objects returned by `stagger_spatial_coordinates`,
        one per input and in input order, with shifted coordinates.

    Raises:
        KeyError: If `spatial_key` is missing from a sample's `.obsm`.
    """
    staggered_adatas = []

    for i, adata in enumerate(adatas):
        # Determine the grid position (origin of this sample's block)
        row, col = divmod(i, samples_per_row)
        x_offset = col * grid_width
        y_offset = row * grid_height

        spatial = np.asarray(adata.obsm[spatial_key], dtype=float)

        if spatial.shape[0] == 0:
            # Nothing to centre; any shift is valid for an empty sample.
            x_shift, y_shift = x_offset, y_offset
        else:
            # NaN-aware bounding box so a few missing coordinates do not
            # invalidate the placement of the whole sample.
            x_min, x_max = np.nanmin(spatial[:, 0]), np.nanmax(spatial[:, 0])
            y_min, y_max = np.nanmin(spatial[:, 1]), np.nanmax(spatial[:, 1])

            x_extent = x_max - x_min
            y_extent = y_max - y_min
            if x_extent > grid_width or y_extent > grid_height:
                # Printed rather than warned: the status message below is
                # printed too, and warnings filters would hide a warning.
                print(
                    f"WARNING: sample {i} spans {x_extent:g} x {y_extent:g}, larger than its "
                    f"{grid_width} x {grid_height} grid block; it will overlap neighbouring samples."
                )

            # Shift that moves the bounding-box midpoint onto the block centre
            x_shift = x_offset + grid_width / 2 - (x_min + x_max) / 2
            y_shift = y_offset + grid_height / 2 - (y_min + y_max) / 2

        # Apply the calculated shifts
        staggered_adata = stagger_spatial_coordinates(adata, x_offset=x_shift, y_offset=y_shift, spatial_key=spatial_key)
        staggered_adatas.append(staggered_adata)

    print(f"Samples arranged in a grid with {samples_per_row} samples per row.")
    return staggered_adatas

def stagger_spatial_coordinates(adata, x_offset=0, y_offset=0, spatial_key='spatial'):
    """
    Return a copy of `adata` whose coordinates in `adata.obsm[spatial_key]`
    are shifted by a constant offset.

    Parameters:
        adata (AnnData): The AnnData object with spatial coordinates.
        x_offset (float): Offset to add to the x-coordinate (column 0).
        y_offset (float): Offset to add to the y-coordinate (column 1).
        spatial_key (str): Key in `.obsm` holding the coordinates.

    Returns:
        AnnData: A new AnnData object with staggered spatial coordinates; the
        input object is left unchanged.

    Raises:
        KeyError: If `spatial_key` is not present in `adata.obsm`.
    """
    # Check before copying so a bad key fails fast without duplicating the data.
    if spatial_key not in adata.obsm:
        raise KeyError(f"{spatial_key} not found in obsm. Ensure spatial data is available.")

    adata_copy = adata.copy()
    spatial = np.array(adata_copy.obsm[spatial_key])  # independent copy
    # In-place `+=` of a float offset onto an integer array raises a NumPy
    # casting error, so promote non-float coordinates first. Float arrays keep
    # their original precision.
    if not np.issubdtype(spatial.dtype, np.floating):
        spatial = spatial.astype(float)
    spatial[:, 0] += x_offset
    spatial[:, 1] += y_offset
    adata_copy.obsm[spatial_key] = spatial
    return adata_copy
    
def merge_anndata(adatas, spatial_key, join_type):
    """
    Concatenate AnnData objects and expose their coordinates as obs columns.

    Parameters:
        adatas: Collection of AnnData objects; forwarded unchanged to `ad.concat`.
        spatial_key: name of the .obsm entry containing (x,y) coordinates
        join_type: value passed to `ad.concat` as its `join` argument

    Returns:
        AnnData: Merged AnnData object with `x_coord` and `y_coord` added to `.obs`.

    Raises:
        KeyError: If `spatial_key` is absent from the merged object's `.obsm`.
    """
    combined = ad.concat(adatas, join=join_type)

    # Without coordinates there is nothing to copy into obs.
    if spatial_key not in combined.obsm:
        raise KeyError(f"{spatial_key} not found in obsm of merged AnnData.")

    # Column 0 holds x, column 1 holds y.
    xy = combined.obsm[spatial_key]
    for column, axis in (('x_coord', 0), ('y_coord', 1)):
        combined.obs[column] = xy[:, axis]

    return combined

In [None]:
# Work on a copy so the loaded `merged` object is not modified.
adata_allgenes = merged.copy()

# Cell centroids as a float array with one (x, y) row per cell.
xy = (adata_allgenes.obs[["x_centroid", "y_centroid"]]
      .apply(pd.to_numeric, errors="coerce")
      .to_numpy(dtype=float))

# `errors="coerce"` turns unparsable values into NaN without any message.
# Cells with NaN coordinates cannot be placed on the grid and would propagate
# silently into the staggered layout, so stop here instead.
n_bad = int(np.isnan(xy).any(axis=1).sum())
if n_bad:
    raise ValueError(
        f"{n_bad} cells have missing or non-numeric x_centroid/y_centroid values."
    )

# add to obsm
adata_allgenes.obsm["spatial"] = xy

# include only shared genes on both v5 and v6 panels
adata_sharedgenes = adata_allgenes[:, adata_allgenes.var_names.isin(shared_genes)].copy()

# split adata into a list of separate anndata objects, one per sample,
# ordered by sorted sample ID
sids = sorted(set(adata_sharedgenes.obs['Sample']))
adata_list = [
    adata_sharedgenes[adata_sharedgenes.obs['Sample'] == s].copy()
    for s in sids
]

In [None]:
# Largest x extent, y extent and bounding-box perimeter over all samples.
max_perimeter = 0
max_x = 0
max_y = 0
for a in adata_list:
    x, y = a.obs['x_centroid'], a.obs['y_centroid']
    # Series.max()/.min() are vectorised and skip NaN, unlike the builtins,
    # whose result depends on where a NaN sits in the column.
    xrange = float(x.max()) - float(x.min())
    yrange = float(y.max()) - float(y.min())
    perimeter = 2 * xrange + 2 * yrange
    max_x = max(max_x, xrange)
    max_y = max(max_y, yrange)
    # Previously initialised but never updated, so it always stayed 0.
    max_perimeter = max(max_perimeter, perimeter)
print(max_x, max_y)

In [None]:
# Number of per-sample AnnData objects to be arranged on the grid.
len(adata_list)

In [None]:
# Stagger spatial coordinates and merge AnnData objects
# Stagger spatial coordinates: each sample gets its own grid block, 8 per row.
# NOTE(review): grid_width / grid_height are hard-coded; presumably taken from
# the max_x / max_y extents printed earlier. Confirm whenever the sample set changes.
staggered_list = stagger_spatial_coordinates_grid(
    adata_list,
    samples_per_row= 8 ,
    grid_width= 11438,
    grid_height= 22106
)

In [None]:
# Per-sample maximum x and y after staggering, one line per sample.
for staggered in staggered_list:
    column_max = staggered.obsm['spatial'].max(axis=0)
    print(column_max)


In [None]:
# Concatenate the staggered samples (join='outer') and copy the coordinates
# into obs as x_coord / y_coord; the bare `adata` displays the result.
adata =  merge_anndata(staggered_list, spatial_key='spatial', join_type='outer')
adata

In [None]:
# Column-wise maximum of the merged coordinates (largest x and largest y).
adata.obsm['spatial'].max(0)

In [None]:
# Relative path: the file is written into the current working directory.
adata.write("merged_for_banksy.h5ad") # coordinates are staggered, only genes on both V5 and V6 panels included

In [None]:
# Plot every cell at its x_coord / y_coord position, coloured by the 'annot'
# obs column, without a legend.
sc.pl.scatter(adata, x="x_coord", y="y_coord", color="annot", legend_loc="none", show=True, size=2)

In [None]:
# Reload merged_for_banksy.h5ad from its absolute path, replacing the in-memory
# `adata`. Presumably a restart point so the cells below can run without
# repeating the merge; TODO confirm.
adata = sc.read_h5ad('/diskmnt/Projects/myeloma_scRNA_analysis/MMY_IRD/Xenium/analysis/banksy/merged_for_banksy.h5ad')

In [None]:
# Duplicate the 'Sample' and 'Original_Barcode' obs columns under the names
# 'dataset' and 'cell_id'.
# NOTE(review): presumably the column names expected by the downstream BANKSY
# pipeline; confirm.
adata.obs['dataset'] = adata.obs['Sample']
adata.obs['cell_id'] = adata.obs['Original_Barcode']

In [None]:
# Display the per-cell metadata table.
adata.obs

In [None]:
# Save again under the same relative file name (resolved against the current
# working directory), now including the 'dataset' and 'cell_id' obs columns.
adata.write("merged_for_banksy.h5ad")