#  Goal

Here, we will apply our regularized regression model to various independent datasets. 

# Import

In [None]:
import scanpy as sc
import pandas as pd
import decoupler as dc
import numpy as np
from sklearn.preprocessing import StandardScaler

import sys
sys.path.insert(0, '/data1/rudenska/EYW/git_projects/SIG13/functions')
import scanpy_custom as scc

%load_ext autoreload
%autoreload 2

# Define Functions to Calculate sPCA score

Here, we will convert information on sPCA component genes and weights so that they can be used to calculate sPCA scores in different datasets.

In [None]:
# Load the sPCA loadings (one row per gene/component pair) and the table that
# lists which components are kept (its 'component' column is used below).
spca_components = pd.read_csv("/data1/rudenska/EYW/git_projects/SIG13/analysis_outs/spca/zscore_degs_allLigands_0.1_alpha1.0_sPCA_loadings.csv")
lm_scored = pd.read_csv('/data1/rudenska/EYW/git_projects/SIG13/analysis_outs/spca/lm_scored_zscore_degs_allLigands_0.1_alpha1.0_sPCA_clean.csv')

# filter for good components
good_comps = lm_scored['component'].unique().tolist()
spca_components = spca_components[spca_components['spca_component'].isin(good_comps)]

# format for waggr scoring (source / target / weight columns)
net = spca_components.rename(columns={'gene':'target',
                                      'spca_component':'source',
                                      'loading':'weight'})

# Keep the 50 largest-weight genes of every component.
# A stable descending sort followed by groupby().head(50) selects the same rows
# as groupby().apply(lambda x: x.nlargest(50, 'weight')) (ties keep their
# original order), but does not rely on apply() operating on the grouping
# column, which pandas has deprecated and which would drop 'source' from `net`
# once the deprecation is enforced. Rows come back sorted by weight instead of
# grouped by component; nothing below depends on the row order.
net = (
    net.sort_values('weight', ascending=False, kind='stable')
       .groupby('source')
       .head(50)
)

# define genes used in scoring
comp_genes = net['target'].unique().tolist()

  net = net.groupby('source', group_keys=False).apply(lambda x: x.nlargest(50, 'weight'))


In [4]:
# Build the human version of the scoring network: map the mouse 'target' genes
# with the project helper, let the returned 'human_gene' column take over the
# 'target' name, and keep a single row per (source, target) pair.
# NOTE(review): how the helper represents genes without a human ortholog is not
# visible here - confirm they do not end up as missing targets.
net_human = (
    scc.convert_mouse_genes_to_human(net, 'target')
    .drop(columns='target')
    .rename(columns={'human_gene': 'target'})
    .drop_duplicates(subset=['source', 'target'])
)

# human genes used in scoring
comp_genes_human = net_human['target'].unique().tolist()

In [None]:
def score_anndata(adata, net, scoring_genes, filter_low_expr=True, low_expr_min_cell_fraction=0.01):
    """Score every cell for each source in `net` with decoupler's waggr method.

    The input object is not modified; a scored copy is returned.

    Parameters
    ----------
    adata
        AnnData whose ``layers['log1p_norm']`` holds the expression values to score.
    net
        Network passed unchanged to ``dc.mt.waggr`` (in this notebook a DataFrame
        with 'source', 'target' and 'weight' columns).
    scoring_genes
        Var names to keep before scaling and scoring.
    filter_low_expr
        If True, drop genes detected in fewer than
        ``int(n_obs * low_expr_min_cell_fraction)`` cells before scaling.
    low_expr_min_cell_fraction
        Fraction of cells used to derive the ``min_cells`` threshold.

    Returns
    -------
    AnnData
        Copy of `adata` with the waggr scores in ``obsm['score_waggr']`` and as
        ``obs`` columns named ``waggr_<source>``.

    Raises
    ------
    KeyError
        If `adata` has no 'log1p_norm' layer.
    """
    # Fail early, before copying the whole object, with an explicit message.
    if 'log1p_norm' not in adata.layers:
        raise KeyError("score_anndata expects adata.layers['log1p_norm'] to be present")

    # subset to genes used in scoring (work on copies so the caller's object is untouched)
    adata = adata.copy()
    adata_sub = adata[:, adata.var_names.isin(scoring_genes)].copy()
    adata_sub.X = adata_sub.layers['log1p_norm']

    if filter_low_expr:
        # filter lowly expressed genes
        n_genes_before = adata_sub.n_vars
        # int() truncates, so with fewer than 1/fraction cells the threshold is 0
        min_cells = int(adata_sub.n_obs * low_expr_min_cell_fraction)
        sc.pp.filter_genes(adata_sub, min_cells=min_cells, inplace=True)
        n_genes_after = adata_sub.n_vars
        n_genes_filtered = n_genes_before - n_genes_after
        print(f"Filtered {n_genes_filtered} genes (kept {n_genes_after}/{n_genes_before})")

    # scale expression
    sc.pp.scale(adata_sub)

    # calculate waggr with no iterations (no pvalues)
    dc.mt.waggr(adata_sub, net, tmin=5, times=0)

    # add waggr scores to full anndata
    scores = adata_sub.obsm['score_waggr']
    adata.obsm['score_waggr'] = scores.copy()

    # add waggr scores to metadata; add_prefix returns a new frame, so the
    # obsm table keeps its un-prefixed column names
    score_df = scores.add_prefix('waggr_')

    # Drop obs columns left over from an earlier scoring run so that re-scoring
    # an already scored object replaces them instead of duplicating the names.
    obs_meta = adata_sub.obs.loc[:, ~adata_sub.obs.columns.isin(score_df.columns)]
    adata.obs = pd.concat([obs_meta, score_df], axis=1)

    return adata

# Ligand Activity Scoring

In [None]:
# Import the explanatory matrix and transpose it. The rows of `ligands_df` are
# the labels later used to select and order the rows of every y matrix.
ligand_scores = pd.read_csv(
    "/data1/rudenska/EYW/git_projects/SIG13/analysis_outs/cytosig/SIG13_waggr_scores_explanatory_mat.csv",
    index_col=0,
)
ligands_df = ligand_scores.T

# Standardize every column of the transposed matrix. `scaler` is reused
# (re-fitted) by the cells below to scale their y matrices.
scaler = StandardScaler()
X_values = scaler.fit(ligands_df).transform(ligands_df)
X_scaled = pd.DataFrame(X_values, index=ligands_df.index, columns=ligands_df.columns)

## SIG19

This data is from the agonist antibody experiment performed in the manuscript.

In [None]:
# Load the SIG19 object and score all cells with the mouse network.
rna = sc.read_h5ad('/data1/rudenska/EYW/SIG19/scvi_outs/SIG19_DTR_CD4T_iLN_scvi.h5ad')
rna_scored = score_anndata(
    rna, net, comp_genes,
    filter_low_expr=True, low_expr_min_cell_fraction=0.01,
)

# make and score anndata without Tregs (leiden_1.0 clusters "8" and "12")
treg_mask = rna.obs['leiden_1.0'].isin(["8","12"])
rna_activated = rna[~treg_mask].copy()
rna_scored_activated = score_anndata(
    rna_activated, net, comp_genes,
    filter_low_expr=True, low_expr_min_cell_fraction=0.01,
)

Filtered 1 genes (kept 1518/1519)


  return dispatch(args[0].__class__)(*args, **kw)


Filtered 1 genes (kept 1518/1519)


  return dispatch(args[0].__class__)(*args, **kw)


In [12]:
# process y matrix: mean waggr score per sample (treatment_mouse_cage_sex)
# NOTE(review): this averages `rna_scored` (all cells); the Treg-free
# `rna_scored_activated` is not used in this cell - confirm that is intended.
sample_cols = ['treatment', 'mouse', 'cage', 'sex']
cell_obs = rna_scored.obs
sample_id = cell_obs[sample_cols[0]].astype(str)
for col in sample_cols[1:]:
    sample_id = sample_id + "_" + cell_obs[col].astype(str)

score_cols = [c for c in cell_obs.columns if c.startswith('waggr_')]
y_df = cell_obs.assign(sample=sample_id).groupby('sample')[score_cols].mean().T
y_df.index = y_df.index.str.replace('waggr_', '')
y_df = y_df.loc[ligands_df.index]

# scale y matrix (re-fits the shared scaler on this y matrix)
y_values = scaler.fit_transform(y_df)
Y_scaled = pd.DataFrame(y_values, index=y_df.index, columns=y_df.columns)

# calculate ligand activity scores
activity_scores, r2_obs = scc.calculate_ligand_activity_parallel(
    X_scaled,
    Y_scaled,
    n_perms=1000,
    verbose=True,
    alpha_range=np.logspace(-1, 3, 100),
)
activity_scores = activity_scores.T.reset_index(names='sample')
activity_scores.to_csv('/data1/rudenska/EYW/git_projects/SIG13/analysis_outs/cytosig/activity_scores_SIG19_DTR_iLN.csv', index=False)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.


Condition: DTA1-BGo_mouse16_5135324_F | Chosen Alpha: 29.15053 | R2: 0.5340
Condition: DTA1-BGo_mouse20_5135320_M | Chosen Alpha: 29.15053 | R2: 0.7080
Condition: DTA1-BGo_mouse24_5135338_M | Chosen Alpha: 16.68101 | R2: 0.8625
Condition: DTA1-IL4_mouse12_5135335_F | Chosen Alpha: 7.92483 | R2: 0.8244
Condition: DTA1-IL4_mouse4_5135327_M | Chosen Alpha: 3.76494 | R2: 0.8898
Condition: DTA1-IL4_mouse8_5135316_F | Chosen Alpha: 42.29243 | R2: 0.6945
Condition: DTA1_mouse10_5135335_F | Chosen Alpha: 0.93260 | R2: 0.9431
Condition: DTA1_mouse14_5135324_F | Chosen Alpha: 129.15497 | R2: 0.6403
Condition: DTA1_mouse22_5135338_M | Chosen Alpha: 1.62975 | R2: 0.9232
Condition: DTA1_mouse2_5135327_M | Chosen Alpha: 7.92483 | R2: 0.7348
Condition: DTA1_mouse6_5135316_F | Chosen Alpha: 1.96304 | R2: 0.9320
Condition: GK1.5-BGo_mouse15_5135324_F | Chosen Alpha: 20.09233 | R2: 0.6713
Condition: GK1.5-BGo_mouse19_5135320_M | Chosen Alpha: 9.54548 | R2: 0.8857
Condition: GK1.5-BGo_mouse23_5135338_M |

[Parallel(n_jobs=-1)]: Done   8 out of  23 | elapsed:    0.5s remaining:    0.9s
[Parallel(n_jobs=-1)]: Done  23 out of  23 | elapsed:    0.5s finished


In [13]:
# process y matrix: mean waggr score per sample and leiden_1.0 cluster
# (treatment_mouse_cage_sex_cluster)
sample_cols = ['treatment', 'mouse', 'cage', 'sex', 'leiden_1.0']
cell_obs = rna_scored.obs
sample_id = cell_obs[sample_cols[0]].astype(str)
for col in sample_cols[1:]:
    sample_id = sample_id + "_" + cell_obs[col].astype(str)

score_cols = [c for c in cell_obs.columns if c.startswith('waggr_')]
y_df = cell_obs.assign(sample=sample_id).groupby('sample')[score_cols].mean().T
y_df.index = y_df.index.str.replace('waggr_', '')
y_df = y_df.loc[ligands_df.index]

# scale y matrix (re-fits the shared scaler on this y matrix)
y_values = scaler.fit_transform(y_df)
Y_scaled = pd.DataFrame(y_values, index=y_df.index, columns=y_df.columns)

# calculate ligand activity scores
activity_scores, r2_obs = scc.calculate_ligand_activity_parallel(
    X_scaled,
    Y_scaled,
    n_perms=1000,
    verbose=True,
    alpha_range=np.logspace(-1, 3, 100),
)
activity_scores = activity_scores.T.reset_index(names='sample')
activity_scores.to_csv('/data1/rudenska/EYW/git_projects/SIG13/analysis_outs/cytosig/activity_scores_SIG19_DTR_iLN_celltype.csv', index=False)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:    2.5s


Condition: DTA1-BGo_mouse16_5135324_F_0 | Chosen Alpha: 97.70100 | R2: 0.4261
Condition: DTA1-BGo_mouse16_5135324_F_1 | Chosen Alpha: 61.35907 | R2: 0.7445
Condition: DTA1-BGo_mouse16_5135324_F_10 | Chosen Alpha: 4.53488 | R2: 0.9321
Condition: DTA1-BGo_mouse16_5135324_F_11 | Chosen Alpha: 38.53529 | R2: 0.8050
Condition: DTA1-BGo_mouse16_5135324_F_12 | Chosen Alpha: 31.99267 | R2: 0.6006
Condition: DTA1-BGo_mouse16_5135324_F_2 | Chosen Alpha: 24.20128 | R2: 0.9299
Condition: DTA1-BGo_mouse16_5135324_F_3 | Chosen Alpha: 18.30738 | R2: 0.8783
Condition: DTA1-BGo_mouse16_5135324_F_4 | Chosen Alpha: 2.84804 | R2: 0.9440
Condition: DTA1-BGo_mouse16_5135324_F_5 | Chosen Alpha: 38.53529 | R2: 0.8566
Condition: DTA1-BGo_mouse16_5135324_F_6 | Chosen Alpha: 10.47616 | R2: 0.8253
Condition: DTA1-BGo_mouse16_5135324_F_7 | Chosen Alpha: 67.34151 | R2: 0.4541
Condition: DTA1-BGo_mouse16_5135324_F_8 | Chosen Alpha: 4.97702 | R2: 0.8173
Condition: DTA1-BGo_mouse16_5135324_F_9 | Chosen Alpha: 31.99267

[Parallel(n_jobs=-1)]: Done 299 out of 299 | elapsed:    4.9s finished


## Thomas IBD

This dataset contains CD4 T cells from intestinal biopsies of patients with Ulcerative Colitis, Crohn's Disease, and Healthy controls. It has been pre-filtered to CD4 T cells based on author provided annotations.

Thomas, T., Friedrich, M., Rich-Griffin, C. et al. A longitudinal single-cell atlas of anti-tumour necrosis factor treatment in inflammatory bowel disease. Nat Immunol 25, 2152–2165 (2024). https://doi.org/10.1038/s41590-024-01994-8

In [None]:
# Load the Thomas IBD CD4 T cell object and score it with the human network.
rna = sc.read_h5ad("/data1/rudenska/EYW/external/thomas_IBD_2024/thomas_IBD_2024_cd4tcells_processed.h5ad")
rna_scored = score_anndata(
    rna,
    net_human,
    comp_genes_human,
    filter_low_expr=True,
    low_expr_min_cell_fraction=0.01,
)

Filtered 136 genes (kept 1279/1415)


  return dispatch(args[0].__class__)(*args, **kw)


In [24]:
# process y matrix: mean waggr score per sample, Tregs excluded
y_df = (
    # remove Tregs from bulk calculations; na=False keeps cells without an
    # annotation instead of producing a non-boolean mask that `~` cannot invert
    rna_scored.obs[~rna_scored.obs['final_analysis'].str.contains('Treg', na=False)]
    .assign(
        # Fill missing treatments BEFORE casting to str: astype(str) turns NaN
        # into the literal 'nan', after which fillna("None") never fires.
        # astype(object) first so a categorical column accepts the new value.
        # Samples without a treatment are therefore labelled "None", not "nan".
        Treatment=lambda x: x['Treatment'].astype(object).fillna("None").astype(str),
        sample=lambda x: (
            x['Disease'].astype(str) + "__" + 
            x['Patient'].astype(str) + "__" + 
            x['Gender'].astype(str) + "__" +
            x['Remission_status'].astype(str) + "__" + 
            x['Site'].astype(str) + "__" +
            x['Inflammation'].astype(str) + "__" +
            x['Treatment'].astype(str)
        )
    )
    .filter(regex=r'^sample$|^waggr_comp')
    .groupby('sample')
    .mean()
    .T
)
y_df.index = y_df.index.str.replace('waggr_', '')
y_df = y_df.loc[ligands_df.index]

# scale y matrix
Y_scaled = pd.DataFrame(scaler.fit_transform(y_df), columns=y_df.columns, index=y_df.index)

# calculate activity scores
activity_scores, r2_obs = scc.calculate_ligand_activity_parallel(X_scaled, Y_scaled, n_perms=1000, verbose=True,
                                                               alpha_range=np.logspace(-1, 3, 100),
                                                               )
activity_scores = activity_scores.T.reset_index(names='sample')
activity_scores.to_csv('/data1/rudenska/EYW/git_projects/SIG13/analysis_outs/cytosig/activity_scores_thomas_IBD_disease.csv', index=False)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:    2.5s


Condition: CD__CD10__M__Remission__Ascending_Colon__Inflamed__Pre | Chosen Alpha: 1000.00000 | R2: 0.0952
Condition: CD__CD10__M__Remission__Ascending_Colon__Non_Inflamed__Post | Chosen Alpha: 1.48497 | R2: 0.7263
Condition: CD__CD10__M__Remission__Descending_Colon__Inflamed__Pre | Chosen Alpha: 170.73526 | R2: 0.4271
Condition: CD__CD10__M__Remission__Descending_Colon__Non_Inflamed__Post | Chosen Alpha: 0.70548 | R2: 0.8125
Condition: CD__CD10__M__Remission__Rectum__Inflamed__Pre | Chosen Alpha: 2.15443 | R2: 0.4460
Condition: CD__CD10__M__Remission__Rectum__Non_Inflamed__Post | Chosen Alpha: 155.56761 | R2: 0.5446
Condition: CD__CD10__M__Remission__Terminal_Ileum__Inflamed__Pre | Chosen Alpha: 359.38137 | R2: 0.1712
Condition: CD__CD10__M__Remission__Terminal_Ileum__Non_Inflamed__Post | Chosen Alpha: 475.08102 | R2: 0.2147
Condition: CD__CD11__F__Non_Remission__Ascending_Colon__Non_Inflamed__Post | Chosen Alpha: 327.45492 | R2: 0.2093
Condition: CD__CD11__F__Non_Remission__Ascending_

[Parallel(n_jobs=-1)]: Done 216 out of 216 | elapsed:    3.5s finished


In [25]:
# process y matrix: mean waggr score per sample and cell type (final_analysis)
y_df = (
    rna_scored.obs
    .assign(
        # Fill missing treatments BEFORE casting to str: astype(str) turns NaN
        # into the literal 'nan', after which fillna("None") never fires.
        # astype(object) first so a categorical column accepts the new value.
        # Samples without a treatment are therefore labelled "None", not "nan".
        Treatment=lambda x: x['Treatment'].astype(object).fillna("None").astype(str),
        sample=lambda x: (
            x['Disease'].astype(str) + "__" + 
            x['Patient'].astype(str) + "__" + 
            x['Gender'].astype(str) + "__" +
            x['Remission_status'].astype(str) + "__" + 
            x['Site'].astype(str) + "__" +
            x['Inflammation'].astype(str) + "__" +
            x['Treatment'].astype(str) + "__" +
            x['final_analysis'].astype(str)
        )
    )
    .filter(regex=r'^sample$|^waggr_comp')
    .groupby('sample')
    .mean()
    .T
)
y_df.index = y_df.index.str.replace('waggr_', '')
y_df = y_df.loc[ligands_df.index]

# scale y matrix
Y_scaled = pd.DataFrame(scaler.fit_transform(y_df), columns=y_df.columns, index=y_df.index)

# calculate activity scores
activity_scores, r2_obs = scc.calculate_ligand_activity_parallel(X_scaled, Y_scaled, n_perms=1000, verbose=True,
                                                               alpha_range=np.logspace(-1, 3, 100),
                                                               )
activity_scores = activity_scores.T.reset_index(names='sample')
activity_scores.to_csv('/data1/rudenska/EYW/git_projects/SIG13/analysis_outs/cytosig/activity_scores_thomas_IBD_disease_celltype.csv', index=False)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done 386 tasks      | elapsed:    6.4s
[Parallel(n_jobs=-1)]: Done 736 tasks      | elapsed:   11.7s
[Parallel(n_jobs=-1)]: Done 1186 tasks      | elapsed:   18.6s
[Parallel(n_jobs=-1)]: Done 1736 tasks      | elapsed:   27.0s
[Parallel(n_jobs=-1)]: Done 2386 tasks      | elapsed:   37.0s
[Parallel(n_jobs=-1)]: Done 3136 tasks      | elapsed:   48.5s
[Parallel(n_jobs=-1)]: Done 3986 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 4199 out of 4199 | elapsed:  1.1min finished


Condition: CD__CD10__M__Remission__Ascending_Colon__Inflamed__Pre__CD4 FOShi T | Chosen Alpha: 2.59502 | R2: 0.6021
Condition: CD__CD10__M__Remission__Ascending_Colon__Inflamed__Pre__CD4 FOSpos T | Chosen Alpha: 271.85882 | R2: 0.2656
Condition: CD__CD10__M__Remission__Ascending_Colon__Inflamed__Pre__CD4 HSPhi CD70pos Treg | Chosen Alpha: 1000.00000 | R2: 0.1281
Condition: CD__CD10__M__Remission__Ascending_Colon__Inflamed__Pre__CD4 HSPhi Treg | Chosen Alpha: 628.02914 | R2: 0.1527
Condition: CD__CD10__M__Remission__Ascending_Colon__Inflamed__Pre__CD4 IKZF2hi TNFRSF18hi Treg | Chosen Alpha: 475.08102 | R2: 0.2857
Condition: CD__CD10__M__Remission__Ascending_Colon__Inflamed__Pre__CD4 IKZF2hi TNFRSF18lo Treg | Chosen Alpha: 1000.00000 | R2: 0.2531
Condition: CD__CD10__M__Remission__Ascending_Colon__Inflamed__Pre__CD4 IKZF2lo LAG3pos Treg | Chosen Alpha: 1000.00000 | R2: 0.0926
Condition: CD__CD10__M__Remission__Ascending_Colon__Inflamed__Pre__CD4 KLF2hi T | Chosen Alpha: 1000.00000 | R2: 

## AMP2

This dataset contains CD4 T cells from synovial fluid of Rheumatoid Arthritis and Osteoarthritis patients. It has been pre-filtered to CD4 T cells based on author-provided annotations.

Zhang, F., Jonsson, A.H., Nathan, A. et al. Deconstruction of rheumatoid arthritis synovium defines inflammatory subtypes. Nature 623, 616–624 (2023). https://doi.org/10.1038/s41586-023-06708-y

In [26]:
# Load the AMP2 CD4 T cell object and score it with the human network.
rna = sc.read_h5ad("/data1/rudenska/EYW/external/AMP_2023/amp_2023_cd4_processed.h5ad")
rna_scored = score_anndata(
    rna,
    net_human,
    comp_genes_human,
    filter_low_expr=True,
    low_expr_min_cell_fraction=0.01,
)

Filtered 67 genes (kept 1353/1420)


  return dispatch(args[0].__class__)(*args, **kw)


In [None]:
# processing metadata to add cell-type abundance phenotypes (CTAPs) and binned indicators of disease severity
ctap_meta = pd.read_csv('/data1/rudenska/EYW/external/AMP_2023/2023_AMP2_CTAP.csv')

# make binned severity column based on das28_crp3
amp_obs = rna_scored.obs.copy()
# np.select takes the first condition that is True, so the control test comes
# first: a control that also carries a das28_crp3 value is labelled "control"
# rather than "high"/"low". Cells matching none of the conditions get None.
cond_das = [
    amp_obs['treatment'] == "osteoarthritis control",
    amp_obs['das28_crp3'] >= 3.2,
    amp_obs['das28_crp3'] < 3.2,
]
choices_das = ["control", "high", "low"]
amp_obs = amp_obs.assign(
    das28_crp3_binary=np.select(cond_das, choices_das, default=None)
)

# add CTAP info
# merge() returns a fresh 0..n-1 index, which would replace the cell barcodes
# when assigned back to .obs. validate='many_to_one' guarantees one CTAP row per
# (subject_id, biopsy_id), so a left merge keeps exactly the left rows in their
# original order and the original index can be put back.
obs_index = amp_obs.index
amp_obs = amp_obs.merge(ctap_meta, on=['subject_id','biopsy_id'],
                        how='left', validate='many_to_one')
amp_obs.index = obs_index

# assign new obs to rna_scored
# NOTE(review): re-running this cell merges the CTAP columns a second time
# (suffixed _x/_y); re-create rna_scored first if it has to be re-run.
rna_scored.obs = amp_obs.copy()

  return dispatch(args[0].__class__)(*args, **kw)


In [28]:
# process y matrix: mean waggr score per sample; the sample label joins the
# metadata columns below with "__"
sample_cols = ['subject_id', 'biopsy_id', 'joint', 'sex', 'treatment', 'CTAP', 'das28_crp3_binary']
cell_obs = rna_scored.obs
sample_id = cell_obs[sample_cols[0]].astype(str)
for col in sample_cols[1:]:
    sample_id = sample_id + "__" + cell_obs[col].astype(str)

y_df = (
    cell_obs.assign(sample=sample_id)
    .filter(regex=r'^sample$|^waggr_comp')
    .groupby('sample')
    .mean()
    .T
)
y_df.index = y_df.index.str.replace('waggr_', '')
y_df = y_df.loc[ligands_df.index]

# scale y matrix (re-fits the shared scaler on this y matrix)
y_values = scaler.fit_transform(y_df)
Y_scaled = pd.DataFrame(y_values, index=y_df.index, columns=y_df.columns)

# calculate activity scores
activity_scores, r2_obs = scc.calculate_ligand_activity_parallel(
    X_scaled,
    Y_scaled,
    n_perms=1000,
    verbose=True,
    alpha_range=np.logspace(-1, 3, 100),
)
activity_scores = activity_scores.T.reset_index(names='sample')
activity_scores.to_csv('/data1/rudenska/EYW/git_projects/SIG13/analysis_outs/cytosig/activity_scores_AMP2023_disease.csv', index=False)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.


Condition: 300-0143__BRI-413__Knee__Female__naive__F__high | Chosen Alpha: 73.90722 | R2: 0.6417
Condition: 300-0144__BRI-546__MCP__Female__methotrexate failure__T + B__low | Chosen Alpha: 1000.00000 | R2: 0.1256
Condition: 300-0145__BRI-589__MCP__Female__TNF failure__E + F + M__low | Chosen Alpha: 432.87613 | R2: 0.2859
Condition: 300-0149__BRI-560__Knee__Male__TNF failure__T + M__high | Chosen Alpha: 1000.00000 | R2: 0.1421
Condition: 300-0150__BRI-403__Wrist__Female__methotrexate failure__M__low | Chosen Alpha: 8.69749 | R2: 0.5473
Condition: 300-0151__BRI-611__MCP__Female__methotrexate failure__M__low | Chosen Alpha: 1000.00000 | R2: 0.0514
Condition: 300-0171__BRI-556__MCP__Female__methotrexate failure__M__low | Chosen Alpha: 26.56088 | R2: 0.6856
Condition: 300-0172__BRI-550__MTP__Female__naive__F__low | Chosen Alpha: 170.73526 | R2: 0.2645
Condition: 300-0173__BRI-460__Wrist__Female__naive__F__high | Chosen Alpha: 89.02151 | R2: 0.4377
Condition: 300-0174__BRI-479__Wrist__Female

[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:    1.5s finished


In [29]:
# process y matrix: mean waggr score per sample and cluster_name; the sample
# label joins the metadata columns below with "__"
sample_cols = ['subject_id', 'biopsy_id', 'joint', 'sex', 'treatment', 'CTAP', 'das28_crp3_binary', 'cluster_name']
cell_obs = rna_scored.obs
sample_id = cell_obs[sample_cols[0]].astype(str)
for col in sample_cols[1:]:
    sample_id = sample_id + "__" + cell_obs[col].astype(str)

y_df = (
    cell_obs.assign(sample=sample_id)
    .filter(regex=r'^sample$|^waggr_comp')
    .groupby('sample')
    .mean()
    .T
)
y_df.index = y_df.index.str.replace('waggr_', '')
y_df = y_df.loc[ligands_df.index]

# scale y matrix (re-fits the shared scaler on this y matrix)
y_values = scaler.fit_transform(y_df)
Y_scaled = pd.DataFrame(y_values, index=y_df.index, columns=y_df.columns)

# calculate activity scores
activity_scores, r2_obs = scc.calculate_ligand_activity_parallel(
    X_scaled,
    Y_scaled,
    n_perms=1000,
    verbose=True,
    alpha_range=np.logspace(-1, 3, 100),
)
activity_scores = activity_scores.T.reset_index(names='sample')
activity_scores.to_csv('/data1/rudenska/EYW/git_projects/SIG13/analysis_outs/cytosig/activity_scores_AMP2023_disease_celltype.csv', index=False)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done 386 tasks      | elapsed:    6.3s
[Parallel(n_jobs=-1)]: Done 736 tasks      | elapsed:   11.6s


Condition: 300-0143__BRI-413__Knee__Female__naive__F__high__T-0: CD4+ IL7R+ memory | Chosen Alpha: 187.38174 | R2: 0.5118
Condition: 300-0143__BRI-413__Knee__Female__naive__F__high__T-10: CD4+ OX40+NR3C1+ | Chosen Alpha: 298.36472 | R2: 0.2241
Condition: 300-0143__BRI-413__Knee__Female__naive__F__high__T-11: CD4+ CD146+ memory | Chosen Alpha: 1000.00000 | R2: 0.1302
Condition: 300-0143__BRI-413__Knee__Female__naive__F__high__T-12: CD4+ GNLY+ | Chosen Alpha: 170.73526 | R2: 0.3258
Condition: 300-0143__BRI-413__Knee__Female__naive__F__high__T-1: CD4+ CD161+ memory | Chosen Alpha: 129.15497 | R2: 0.4169
Condition: 300-0143__BRI-413__Knee__Female__naive__F__high__T-2: CD4+ IL7R+CCR5+ memory | Chosen Alpha: 117.68120 | R2: 0.4490
Condition: 300-0143__BRI-413__Knee__Female__naive__F__high__T-3: CD4+ IFNG- Tph/Tfh | Chosen Alpha: 1000.00000 | R2: 0.0991
Condition: 300-0143__BRI-413__Knee__Female__naive__F__high__T-4: CD4+ naive | Chosen Alpha: 394.42061 | R2: 0.3514
Condition: 300-0143__BRI-4

[Parallel(n_jobs=-1)]: Done 987 out of 987 | elapsed:   15.4s finished


## Inflammation Atlas

This dataset contains CD4 T cells from PBMCs from healthy donor samples and patients with inflammatory disease across five distinct groups: (1) immune-mediated inflammatory diseases (IMIDs, n = 7) (systemic lupus erythematosus (SLE), rheumatoid arthritis (RA), psoriatic arthritis (PsA), psoriasis (PS), ulcerative colitis (UC), Crohn's disease (CD) and multiple sclerosis (MS)); (2) acute (n = 1) (sepsis); (3) chronic inflammation (n = 3) (chronic obstructive pulmonary disease (COPD), asthma and cirrhosis); (4) infection (n = 4) (influenza virus (Flu), SARS-CoV-2 (COVID), hepatitis B virus (HBV) and human immunodeficiency virus (HIV)); and (5) solid tumors (n = 4) (breast cancer (BRCA), colorectal cancer (CRC), nasopharyngeal carcinoma (NPC) and head and neck squamous cell carcinoma (HNSCC)). It has been pre-filtered to CD4 T cells based on author-provided annotations.

Jiménez-Gracia, L., Maspero, D., Aguilar-Fernández, S. et al. Interpretable inflammation landscape of circulating immune cells. Nat Med (2026). https://doi.org/10.1038/s41591-025-04126-3

In [None]:
rna = sc.read_h5ad("/data1/rudenska/EYW/external/inflammation_atlas_2026/inflammation_atlas_cd4_subset.h5ad")
# Use the gene symbols as var names so they can be matched against the human
# scoring genes.
rna.var_names = rna.var['symbol'].astype(str)
# The symbols are not unique in this object (anndata warns about duplicated var
# names). Make them unique before scoring: repeats get a numeric suffix, so each
# symbol matches the scoring gene list exactly once instead of several columns
# sharing one gene name.
rna.var_names_make_unique()
rna_scored = score_anndata(rna, net_human, comp_genes_human, filter_low_expr=True, low_expr_min_cell_fraction=0.01)

  utils.warn_names_duplicates("var")


Filtered 131 genes (kept 1313/1444)


  return dispatch(args[0].__class__)(*args, **kw)


In [33]:
# process y matrix: mean waggr score per sample; the sample label joins the
# metadata columns below with "__"
sample_cols = ['studyID', 'sampleID', 'patientID', 'disease', 'sex', 'age', 'binned_age', 'chemistry', 'diseaseGroup']
cell_obs = rna_scored.obs
sample_id = cell_obs[sample_cols[0]].astype(str)
for col in sample_cols[1:]:
    sample_id = sample_id + "__" + cell_obs[col].astype(str)

y_df = (
    cell_obs.assign(sample=sample_id)
    .filter(regex=r'^sample$|^waggr_comp')
    .groupby('sample')
    .mean()
    .T
)
y_df.index = y_df.index.str.replace('waggr_', '')
y_df = y_df.loc[ligands_df.index]

# scale y matrix (re-fits the shared scaler on this y matrix)
y_values = scaler.fit_transform(y_df)
Y_scaled = pd.DataFrame(y_values, index=y_df.index, columns=y_df.columns)

# calculate activity scores
activity_scores, r2_obs = scc.calculate_ligand_activity_parallel(
    X_scaled,
    Y_scaled,
    n_perms=1000,
    verbose=True,
    alpha_range=np.logspace(-1, 3, 100),
)
activity_scores = activity_scores.T.reset_index(names='sample')
activity_scores.to_csv('/data1/rudenska/EYW/git_projects/SIG13/analysis_outs/cytosig/activity_scores_inflammation_atlas.csv', index=False)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done 386 tasks      | elapsed:    6.4s
[Parallel(n_jobs=-1)]: Done 736 tasks      | elapsed:   11.7s


Condition: COMBAT2022__COMBAT2022_G05061_T0__COMBAT2022_G05061__COVID__female__nan__51-60__5_GEX_V1__infection | Chosen Alpha: 46.41589 | R2: 0.5136
Condition: COMBAT2022__COMBAT2022_G05064_T0__COMBAT2022_G05064__COVID__female__nan__51-60__5_GEX_V1__infection | Chosen Alpha: 50.94138 | R2: 0.4316
Condition: COMBAT2022__COMBAT2022_G05077_T0__COMBAT2022_G05077__COVID__male__nan__41-50__5_GEX_V1__infection | Chosen Alpha: 187.38174 | R2: 0.2870
Condition: COMBAT2022__COMBAT2022_G05078_T0__COMBAT2022_G05078__COVID__male__nan__31-40__5_GEX_V1__infection | Chosen Alpha: 20.09233 | R2: 0.5692
Condition: COMBAT2022__COMBAT2022_G05105_T0__COMBAT2022_G05105__COVID__female__nan__51-60__5_GEX_V1__infection | Chosen Alpha: 81.11308 | R2: 0.4345
Condition: COMBAT2022__COMBAT2022_G05112_T0__COMBAT2022_G05112__COVID__male__nan__41-50__5_GEX_V1__infection | Chosen Alpha: 61.35907 | R2: 0.8647
Condition: COMBAT2022__COMBAT2022_G05153_T0__COMBAT2022_G05153__COVID__male__nan__41-50__5_GEX_V1__infection | 

[Parallel(n_jobs=-1)]: Done 816 out of 816 | elapsed:   12.8s finished


In [36]:
# process y matrix: mean waggr score per sample and Level2 annotation; the
# sample label joins the metadata columns below with "__"
sample_cols = ['studyID', 'sampleID', 'patientID', 'disease', 'sex', 'age', 'binned_age', 'chemistry', 'diseaseGroup', 'Level2']
cell_obs = rna_scored.obs
sample_id = cell_obs[sample_cols[0]].astype(str)
for col in sample_cols[1:]:
    sample_id = sample_id + "__" + cell_obs[col].astype(str)

y_df = (
    cell_obs.assign(sample=sample_id)
    .filter(regex=r'^sample$|^waggr_comp')
    .groupby('sample')
    .mean()
    .T
)
y_df.index = y_df.index.str.replace('waggr_', '')
y_df = y_df.loc[ligands_df.index]

# scale y matrix (re-fits the shared scaler on this y matrix)
y_values = scaler.fit_transform(y_df)
Y_scaled = pd.DataFrame(y_values, index=y_df.index, columns=y_df.columns)

# calculate activity scores
activity_scores, r2_obs = scc.calculate_ligand_activity_parallel(
    X_scaled,
    Y_scaled,
    n_perms=1000,
    verbose=True,
    alpha_range=np.logspace(-1, 3, 100),
)
activity_scores = activity_scores.T.reset_index(names='sample')
activity_scores.to_csv('/data1/rudenska/EYW/git_projects/SIG13/analysis_outs/cytosig/activity_scores_inflammation_atlas_celltypeLevel2.csv', index=False)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done 386 tasks      | elapsed:    6.3s
[Parallel(n_jobs=-1)]: Done 736 tasks      | elapsed:   11.6s
[Parallel(n_jobs=-1)]: Done 1186 tasks      | elapsed:   18.5s
[Parallel(n_jobs=-1)]: Done 1736 tasks      | elapsed:   26.9s
[Parallel(n_jobs=-1)]: Done 2386 tasks      | elapsed:   36.9s
[Parallel(n_jobs=-1)]: Done 3136 tasks      | elapsed:   48.4s
[Parallel(n_jobs=-1)]: Done 3986 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 4936 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 5986 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 7136 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 8386 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 8508 out of 8508 | elapsed:  2.2min finished


Condition: COMBAT2022__COMBAT2022_G05061_T0__COMBAT2022_G05061__COVID__female__nan__51-60__5_GEX_V1__infection__T_CD4_CM | Chosen Alpha: 1000.00000 | R2: 0.0872
Condition: COMBAT2022__COMBAT2022_G05061_T0__COMBAT2022_G05061__COVID__female__nan__51-60__5_GEX_V1__infection__T_CD4_CM_ribo | Chosen Alpha: 35.11192 | R2: 0.3319
Condition: COMBAT2022__COMBAT2022_G05061_T0__COMBAT2022_G05061__COVID__female__nan__51-60__5_GEX_V1__infection__T_CD4_EM | Chosen Alpha: 55.90810 | R2: 0.4525
Condition: COMBAT2022__COMBAT2022_G05061_T0__COMBAT2022_G05061__COVID__female__nan__51-60__5_GEX_V1__infection__T_CD4_EMRA | Chosen Alpha: 55.90810 | R2: 0.7180
Condition: COMBAT2022__COMBAT2022_G05061_T0__COMBAT2022_G05061__COVID__female__nan__51-60__5_GEX_V1__infection__T_CD4_Naive | Chosen Alpha: 81.11308 | R2: 0.4120
Condition: COMBAT2022__COMBAT2022_G05061_T0__COMBAT2022_G05061__COVID__female__nan__51-60__5_GEX_V1__infection__T_CD4_eff | Chosen Alpha: 38.53529 | R2: 0.5634
Condition: COMBAT2022__COMBAT2022