# processing eae 5k

## load packages

In [None]:
import warnings
warnings.filterwarnings('ignore')
import os
import scanpy as sc
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import KMeans

## data retrieval

In [None]:
import os
import scanpy as sc
import pandas as pd

# Root folder that contains one subfolder per run.
base_dir = '/Volumes/Castelo_Branco/NGSDATA/[spatialOmics_TS]Mouse_RREAE_Xenium5k_SpinalCord#12Jun2025/RawData'

# Only keep subfolders matching "GoncaloTing".
# Entries are sorted because os.listdir returns them in arbitrary order, and
# plain files are dropped so a stray file with a matching name cannot break
# the os.listdir(run_path) call below.
runs = sorted(
    d for d in os.listdir(base_dir)
    if 'GoncaloTing' in d and os.path.isdir(os.path.join(base_dir, d))
)

# One AnnData per sample; concatenated in a later cell.
ad_list = []

for run in runs:
    run_path = os.path.join(base_dir, run)

    # Look for sample folders inside this run (sorted for a reproducible order)
    samples = sorted(
        s for s in os.listdir(run_path)
        if os.path.isdir(os.path.join(run_path, s))
    )

    for sample in samples:
        sample_path = os.path.join(run_path, sample)

        h5_path = os.path.join(sample_path, 'cell_feature_matrix.h5')
        cell_info_path = os.path.join(sample_path, 'cells.csv.gz')

        # Both the count matrix and the per-cell table are required.
        if not (os.path.exists(h5_path) and os.path.exists(cell_info_path)):
            print(f"Skipping {sample_path} (missing required files)")
            continue

        print(f"Loading sample: {sample_path}")
        ad_int = sc.read_10x_h5(h5_path)
        # Duplicate feature names would make the later concatenation ambiguous;
        # this is a no-op when the names are already unique.
        ad_int.var_names_make_unique()

        cell_info = pd.read_csv(cell_info_path, index_col=0)
        cell_info.index = cell_info.index.astype(str)

        # Assigning .obs replaces the obs names positionally, so make sure the
        # metadata rows are in the same order as the matrix barcodes first.
        barcodes = ad_int.obs_names
        if not cell_info.index.equals(barcodes):
            if cell_info.index.is_unique and barcodes.isin(cell_info.index).all():
                cell_info = cell_info.loc[barcodes]
            else:
                # Cannot align by ID; fall back to positional assignment.
                print(
                    f"WARNING: cell IDs in {cell_info_path} do not match the "
                    f"matrix barcodes; metadata is assigned by position"
                )

        ad_int.obs = cell_info
        ad_int.obs['run'] = run
        ad_int.obs['sample'] = sample

        ad_list.append(ad_int)

In [None]:
# Fail early with a readable message instead of an opaque concatenation error.
if not ad_list:
    raise ValueError("No samples were loaded; nothing to concatenate.")

# Stack all samples into one AnnData object.
ad = sc.concat(ad_list)

# Cell IDs may repeat across samples, and repeated obs names break any
# label-based alignment (e.g. assigning a pandas Series to ad.obs).
# When that happens, keep the original ID in a column and prefix the obs
# names with the sample name. Names are left untouched if already unique.
if not ad.obs_names.is_unique:
    ad.obs['original_cell_id'] = ad.obs_names.to_numpy()
    ad.obs_names = (
        ad.obs['sample'].astype(str) + '_' + ad.obs['original_cell_id'].astype(str)
    ).to_numpy()
    # Last-resort guard in case two samples share the same folder name.
    ad.obs_names_make_unique()

In [None]:
# The grid label is the third "__"-separated field of the sample folder name.
sample_name_parts = ad.obs["sample"].str.split("__")
ad.obs["grid_label"] = sample_name_parts.str.get(2)

In [None]:
# Display the derived grid label of every cell for a quick visual check.
ad.obs.loc[:, "grid_label"]

## write raw data

In [None]:
# Convert the count matrix to a dense array.
# Guarded so that re-running this cell is a no-op: once X is a NumPy array it
# no longer has a .toarray() method and the unguarded call would raise.
# NOTE(review): a dense cells x genes matrix can be very large in memory.
if hasattr(ad.X, 'toarray'):
    ad.X = ad.X.toarray()

In [None]:
# Save the concatenated, unfiltered object before any preprocessing.
raw_h5ad_path = '../data/RREAE_5k_raw.h5ad'
ad.write_h5ad(raw_h5ad_path)

## preprocessing

### calculate qc metrics and filter cells for counts and number of genes

In [None]:
# Number of cells per segmentation method.
segmentation_methods = ad.obs['segmentation_method']
segmentation_methods.value_counts()

In [None]:
# Per-cell / per-gene QC metrics are stored on the object itself.
sc.pp.calculate_qc_metrics(ad, inplace=True, log1p=False, percent_top=None)

# Drop low-quality cells: first by total counts, then by detected genes.
min_counts_per_cell = 40
min_genes_per_cell = 15
sc.pp.filter_cells(ad, min_counts=min_counts_per_cell)
sc.pp.filter_cells(ad, min_genes=min_genes_per_cell)

### normalizing and transforming

In [None]:
# Scale every cell to a total of 100 counts, then apply log(1 + x).
counts_per_cell_target = 100
sc.pp.normalize_total(ad, target_sum=counts_per_cell_target, inplace=True)
sc.pp.log1p(ad)
# Per-gene scaling is currently disabled:
#sc.pp.scale(ad, )#max_value=10)

### pca and neighbors

In [None]:
# Reset matplotlib settings so the variance plot uses default styling.
plt.rcdefaults()

# PCA with scanpy's default settings.
sc.tl.pca(ad)

# Variance ratio of the first 50 PCs on a log scale, to judge how many to keep.
sc.pl.pca_variance_ratio(ad, log=True, n_pcs=50)

# Neighborhood graph: 15 neighbors, computed on 30 PCs.
neighbor_params = dict(n_neighbors=15, n_pcs=30)
sc.pp.neighbors(ad, **neighbor_params)

### umap

In [None]:
# UMAP embedding computed from the neighborhood graph.
umap_min_dist = 0.1
sc.tl.umap(ad, min_dist=umap_min_dist)

### clustering

In [None]:
# Leiden clustering at several resolutions; each result is stored in
# ad.obs under "leiden_<resolution>".
resolutions = [0.5, 1, 1.5, 2, 2.5]
for resolution in resolutions:
    cluster_key = f"leiden_{resolution}"
    print(f"clustering at resolution {resolution}")
    sc.tl.leiden(ad, key_added=cluster_key, resolution=resolution)
    print("done")

### write clustered data

In [None]:
# Save the object after normalization, embedding and clustering.
clustered_h5ad_path = '../data/RREAE_5k_clustered.h5ad'
ad.write_h5ad(clustered_h5ad_path)

In [None]:
# UMAP colored by the resolution-2 Leiden clusters, labels drawn on the data.
plt.rcdefaults()
leiden_umap_style = dict(
    s=3,
    add_outline=True,
    legend_loc='on data',
    legend_fontsize=20,
    legend_fontoutline=2,
    ncols=1,
)
with plt.rc_context({'figure.figsize': (10, 7)}):
    sc.pl.umap(ad, color="leiden_2", **leiden_umap_style)

In [None]:
# UMAP colored by Serpina3n expression.
plt.rcdefaults()
gene_umap_style = dict(
    s=3,
    add_outline=True,
    legend_loc='on data',
    legend_fontsize=20,
    legend_fontoutline=2,
    ncols=1,
)
with plt.rc_context({'figure.figsize': (10, 7)}):
    sc.pl.umap(ad, color="Serpina3n", **gene_umap_style)

### add spatial information

In [None]:
# Store the cell centroids as an (n_cells, 2) array under obsm['spatial'],
# the key used by the sc.pl.spatial calls further down.
centroid_columns = ['x_centroid', 'y_centroid']
spatial = ad.obs[centroid_columns].to_numpy()
ad.obsm['spatial'] = spatial

### plot clusters on basis of coordinates

In [None]:
# Show the sample name of every cell.
# Bracket access is required here: `ad.obs.sample` resolves to the
# DataFrame.sample method, not to the "sample" column.
ad.obs['sample']

In [None]:
# One spatial plot per grid, colored by the resolution-2 Leiden clusters.
for grid in ad.obs['grid_label'].unique():
    print(grid)
    in_grid = ad.obs['grid_label'] == grid
    ad_int = ad[in_grid]
    with plt.rc_context({'figure.figsize': (20, 10)}):
        sc.pl.spatial(ad_int, color='leiden_2', spot_size=20)
    plt.show()
    

## generate sample-specific labels

In [None]:
# Split the cells of every grid into three spatial groups with k-means on the
# centroid coordinates and label each cell "<grid_label>_<kmeans cluster>".
# NOTE(review): presumably each grid holds three tissue sections - confirm.
#
# Labels are written back by position (boolean mask) rather than by index
# label, so the result stays correct even if obs names repeat across samples
# or the cells of one grid are not contiguous in `ad`.
sample_id = np.empty(ad.n_obs, dtype=object)
for grid in ad.obs['grid_label'].unique():
    print(grid)
    in_grid = (ad.obs['grid_label'] == grid).to_numpy()
    # Take a real copy: adding an obs column to a view would force AnnData to
    # copy implicitly (with a warning).
    ad_int = ad[in_grid].copy()
    coords = pd.DataFrame(ad_int.obsm["spatial"], columns=["x", "y"])
    kmeans = KMeans(n_clusters=3, random_state=0).fit(coords)
    kmeans_labels = [f"{i}" for i in kmeans.labels_]
    ad_int.obs["sample_kmeans"] = kmeans_labels
    with plt.rc_context({'figure.figsize': (20, 10)}):
        sc.pl.spatial(ad_int, spot_size=20, color = 'sample_kmeans')
    plt.show()
    sample_id[in_grid] = [f"{grid}_{label}" for label in kmeans_labels]
ad.obs['sample_id'] = sample_id

In [None]:
# Save the final object, including spatial coordinates and sample labels.
processed_h5ad_path = '../data/RREAE_5k_clustered_processed.h5ad'
ad.write_h5ad(processed_h5ad_path)