In [None]:
# NOTE(review): placeholder cell — it only emits a blank line; safe to delete if nothing relies on it.
print()

### setup packages

In [None]:
# import packages
from glob import glob
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import scanpy as sc
import scipy.stats as ss
import seaborn as sns
sc.settings.set_figure_params(dpi=100)

### compile spatial transcriptomic datasets

In [None]:
# GPCRdb mapping file: space-separated, no header, one "<uniprot> <name>_<species>" record per line
gpcr_mapping = pd.read_table('gpcrdb_mapping.txt', sep=' ', header=None)
gpcr_mapping.columns = ['uniprot', 'name']
# break the combined entry name into the receptor name and its species suffix
name_and_species = gpcr_mapping['name'].str.split('_', expand=True)
gpcr_mapping[['name', 'species']] = name_and_species
# the human entries define the set of GPCR uniprot IDs used for filtering later on
is_human = gpcr_mapping['species'] == 'human'
uniprots = gpcr_mapping.loc[is_human, 'uniprot'].tolist()

In [None]:
# NOTE(review): `glob` is already imported in the setup cell; this repeat is redundant but harmless.
from glob import glob
# compile the complete list of visiums to run
# (the loader cells below append to this list, so re-running a loader without
#  re-running this cell will add its samples a second time)
adatas = []

In [None]:
# process heart atlas
for fn in glob('/path/to/data/external/heartcellatlas/*.h5ad'):
    # read in the file
    adata = sc.read_h5ad(fn)
    # define the spatial coordinates from the array grid positions stored in obs
    adata.obsm['spatial'] = adata.obs[['array_row','array_col']].values
    # add on the file id (file name without its directory)
    adata.obs['file_id'] = os.path.basename(fn)
    # split by sanger id so that every entry of `adatas` holds a single sangerID;
    # the loop variable is deliberately not called `id`, which would shadow the
    # builtin for every later cell of the notebook
    for sanger_id in adata.obs['sangerID'].unique():
        adatas.append(adata[adata.obs['sangerID'] == sanger_id].copy())

In [None]:
# process fetal lung atlas
for fn in glob('/path/to/data/external/fetallung/*.h5ad'):
    # read in the file
    adata = sc.read_h5ad(fn)
    # each file must describe exactly one sample; validate BEFORE queueing it so a
    # violating file never ends up in `adatas`
    assert len(list(adata.uns['spatial'].keys())) == 1
    # define the spatial coordinates from the array grid positions stored in obs
    adata.obsm['spatial'] = adata.obs[['array_row','array_col']].values
    # add on the file id (file name without its directory)
    adata.obs['file_id'] = os.path.basename(fn)
    # each sample is a separate file, so no per-sample split is needed
    adatas.append(adata)

In [None]:
# process fetal immune atlas
for fn in glob('/path/to/data/external/fetalimmune/*.h5ad'):
    # read in the file
    adata = sc.read_h5ad(fn)
    # define the spatial coordinates from the array grid positions stored in obs
    adata.obsm['spatial'] = adata.obs[['array_row','array_col']].values
    # add on the file id (file name without its directory)
    adata.obs['file_id'] = os.path.basename(fn)
    # split by the `sample_id` column so that every entry of `adatas` holds one sample;
    # the loop variable is deliberately not called `id`, which would shadow the builtin
    for sample_id in adata.obs['sample_id'].unique():
        adatas.append(adata[adata.obs['sample_id'] == sample_id].copy())

In [None]:
# process kidney atlas
for fn in glob('/path/to/data/external/kidneycellatlas/*.h5ad'):
    # read in the file
    adata = sc.read_h5ad(fn)
    # define the spatial coordinates from the array grid positions stored in obs
    adata.obsm['spatial'] = adata.obs[['array_row','array_col']].values
    # add on the file id (file name without its directory)
    adata.obs['file_id'] = os.path.basename(fn)
    # split by the `sample` column so that every entry of `adatas` holds one sample;
    # the loop variable is deliberately not called `id`, which would shadow the builtin
    for sample_name in adata.obs['sample'].unique():
        adatas.append(adata[adata.obs['sample'] == sample_name].copy())

In [None]:
# process 10x genomics data: two directories, one sample per file
tenx_patterns = ['/path/to/data/external/10x_multiorgans/*.h5ad',
                 '/path/to/data/external/10x_visium_v1_cancer/*.h5ad']
fns = [path for pattern in tenx_patterns for path in glob(pattern)]
for fn in fns:
    # load the sample
    adata = sc.read_h5ad(fn)
    # these files keep the grid position in `row` / `col` rather than `array_row` / `array_col`
    grid_position = adata.obs[['row','col']].values
    adata.obsm['spatial'] = grid_position
    # tag every spot with the name of the file it came from
    file_name = fn.split('/')[-1]
    adata.obs['file_id'] = file_name
    # no per-sample split needed here
    adatas.append(adata)

The 10x Genomics–derived data were log-normalized with BayesSpace in R as follows:
```
library(SingleCellExperiment)
library(BayesSpace)
# read in dataset
dirname <- "<path_to_raw_10x_genomics_data>"
sce <- readVisium(paste0(dirname, "<name_of_dataset>"))
# compute PCA with HVGs
sce <- spatialPreprocess(sce, platform="Visium", n.PCs=15, n.HVGs=2000, log.normalize=TRUE)
# write the data
library(zellkonverter)
writeH5AD(sce, file="<path_to_normalized_h5ad>")
```

### calculate spatial interactions

In [None]:
import spatialdm as sdm
# define the method to derive the spatial correlations
def _detected_features(adata, names, n_spots):
    """Return the entries of `names` present in `adata.var` with X > 0 in at least `n_spots` observations."""
    present = adata.var.index.intersection(names)
    # summing a sparse matrix gives a 2-D np.matrix, summing a dense array gives a 1-D
    # ndarray; np.asarray(...).ravel() flattens both, so no try/except fallback is needed
    n_detected = np.asarray((adata[:, present].X > 0).sum(0)).ravel()
    return present[n_detected >= n_spots].tolist()


def _global_z_scores(adata, pairs, valid_complexes, n_spots):
    """Filter `pairs` to those with a detected ligand and receptor, then run the SpatialDM global z-score test."""
    ligand_names = pairs['ligand'].unique()
    receptor_names = pairs['receptor'].unique()
    # a partner is kept if it is a sufficiently detected gene or a sufficiently detected complex
    valid_ligands = _detected_features(adata, ligand_names, n_spots)
    valid_ligands = valid_ligands + [x for x in ligand_names if x in valid_complexes]
    valid_receptors = _detected_features(adata, receptor_names, n_spots)
    valid_receptors = valid_receptors + [x for x in receptor_names if x in valid_complexes]
    keep = pairs['ligand'].isin(valid_ligands) & pairs['receptor'].isin(valid_receptors)
    pairs_ = pairs.loc[keep].copy()
    # extract and compute Z-scores for the retained ligand-receptor pairs
    sdm.extract_lr(adata, 'human', min_cell=10, datahost='user', df_intrxn=pairs_, df_complex=df_complex.copy())
    sdm.spatialdm_global(adata, specified_ind=None, method='z-score', nproc=1)
    global_res = adata.uns['global_res']
    return {'z_pval': global_res['z_pval'], 'z': global_res['z']}


def calc_corr(adata):
    """Compute SpatialDM global z-scores for the random, known and orphan pair sets of one sample.

    Reads the notebook globals `df_complex`, `valid_rand_pairs`, `df_intrxn_gpcrvsall`
    and `df_intrxn_orphan`, which must exist when the function is called.
    `adata` is modified in place (gene filtering, unique var names, SpatialDM entries in `.uns`).

    Returns
    -------
    dict or None
        `{file_id: {'rand': {...}, 'known': {...}, 'orphan': {...}}}` where every inner
        dict holds the `'z_pval'` and `'z'` columns of `adata.uns['global_res']`.
        If any step raises, the error is printed with the sample's file id and
        `None` is returned so that one failing sample does not abort the whole batch.
    """
    try:
        # make sure everything is unique
        adata.var_names = adata.var_names.astype(str)
        sc.pp.filter_genes(adata, min_cells=10)
        adata.var_names_make_unique()
        # define the weight matrix for the kernel
        sdm.weight_matrix(adata, l=1.2, cutoff=0.2, single_cell=False)
        # define the min n-spots filter
        n_spots = 10

        # assemble expression values of complexes: the per-spot mean over the subunits
        # of each complex that are present in this sample
        subunits = df_complex.values.flatten().astype(str)
        genes = adata.var.index.intersection(subunits[subunits != 'nan'])
        df_gex = sc.get.obs_df(adata, keys=genes.tolist())
        expr_complexs = []
        for idx in df_complex.index:
            genes = adata.var.index.intersection(df_complex.loc[idx].dropna())
            if len(genes) == 0: continue
            expr_complex = df_gex[genes].mean(1)
            expr_complex.name = idx
            expr_complexs.append(expr_complex)
        expr_complex = pd.concat(expr_complexs, axis=1)
        valid_complexes = expr_complex.columns[(expr_complex > 0).sum(0) >= n_spots]

        # run the three pair sets in a fixed order; each run overwrites the SpatialDM
        # entries in adata.uns, so the scores are collected right after each run
        scores = {}
        for label, pairs in (('rand', valid_rand_pairs),
                             ('known', df_intrxn_gpcrvsall),
                             ('orphan', df_intrxn_orphan)):
            scores[label] = _global_z_scores(adata, pairs, valid_complexes, n_spots)

        # assemble and return result (.iloc[0]: position-based, independent of the obs index labels)
        result = {adata.obs['file_id'].iloc[0]: scores}
        return result
    except Exception as e:
        print('!!!' + adata.obs['file_id'].iloc[0] + str(e))
        return None

In [None]:
# CellPhoneDB v5.0.0 input tables; the directory is defined once so it is easy to repoint
cpdb_dir = '/home/dchen2/PACKAGES/cellphonedb-data-5.0.0/data'
# retrieve uniprot to gene name mapping
df_conv = pd.read_csv(os.path.join(cpdb_dir, 'gene_input.csv'))
# NOTE(review): value_counts() is used as a de-duplication step here and below; by default it
# also drops rows containing NaN and reorders the rows — confirm that is intended
df_conv = df_conv[['uniprot','gene_name']].value_counts().reset_index()
assert df_conv['uniprot'].value_counts().max() == 1
uniprot2genename = df_conv.set_index('uniprot')['gene_name']
# read in and process the complex (the file is read once; the `secreted` flag is taken
# from the raw table before it is reduced to the subunit columns)
complex_input = pd.read_csv(os.path.join(cpdb_dir, 'complex_input.csv'), index_col=0)
assert complex_input.index.value_counts().max() == 1
secreted_complexes = complex_input['secreted']
# .copy() so the column assignments below act on an independent frame, not on a slice
df_complex = complex_input.loc[:, complex_input.columns.str.startswith('uniprot')].copy()
df_complex.index.name = None
df_complex.columns = df_complex.columns.str.replace('uniprot','subunit')
for col in df_complex.columns: df_complex[col] = df_complex[col].map(uniprot2genename)
# read in and process the interaction pairings
df_intrxn = pd.read_csv(os.path.join(cpdb_dir, 'interaction_input.csv'))
for col, new_col in zip(['partner_a','partner_b'], ['ligand','receptor']):
    # partners that are not in the uniprot table (e.g. complex names) keep their original value
    df_intrxn[new_col] = df_intrxn[col].map(uniprot2genename).fillna(df_intrxn[col])
df_intrxn['interaction_name'] = df_intrxn[['ligand','receptor']].agg(':'.join, axis=1)
df_intrxn.index = df_intrxn['interaction_name']
df_intrxn = df_intrxn[['interaction_name','ligand','receptor','is_ppi','directionality']].copy()
# identify secreted proteins to differentiate
secreted_proteins = pd.read_csv(os.path.join(cpdb_dir, 'protein_input.csv'), index_col=0)['secreted']
secreted_proteins.index = secreted_proteins.index.map(uniprot2genename)
secreted = pd.concat([secreted_proteins, secreted_complexes], axis=0)
# keep only the names whose `secreted` flag is set
secreted = secreted.index[secreted]
df_intrxn['annotation'] = 'No Secretion'
df_intrxn.loc[df_intrxn['ligand'].isin(secreted) | df_intrxn['receptor'].isin(secreted), 'annotation'] = 'Secretion'
# reset name of the index
df_intrxn.index.name = None
# adjust to remove duplicates
df_intrxn = df_intrxn.value_counts().reset_index().iloc[:, :-1]
df_intrxn.index = df_intrxn['interaction_name'].values
# derive a GPCR only subset
# NOTE(review): at this point df_complex holds gene names (mapped above), while `uniprots`
# holds uniprot IDs, so this check can hardly ever fail. If the intent is "no complex
# contains a GPCR", it should compare against the GPCR gene names instead — confirm
# before changing, since a corrected check may start failing.
assert df_complex.index[df_complex.isin(uniprots).any(axis=1)].shape[0] == 0
df_conv_gpcrs = df_conv.loc[df_conv['uniprot'].isin(uniprots)]
# derive a list of the interactions with GPCRs
mask = df_intrxn['receptor'].isin(df_conv_gpcrs['gene_name'].unique())
df_intrxn_gpcrs = df_intrxn.loc[mask]

In [None]:
# kept although this cell no longer needs a progress bar: a later cell calls `tqdm`
from tqdm import tqdm
# get a list of the unique ligands from the entire database, we care about known for paralog
ligands = df_intrxn['ligand'].unique()
# receptors are the GPCRs that appear as receptor in a known interaction
receptors = df_intrxn_gpcrs['receptor'].unique()
# fill in the possible interactions beyond known interactions: build the full
# ligand x receptor grid in one go (receptor-major, same row order as nested loops)
# instead of appending to a DataFrame row by row, which is quadratic in the number of rows
df_intrxn_gpcrvsall = pd.DataFrame(
    [(ligand, receptor) for receptor in receptors for ligand in ligands],
    columns=['ligand','receptor'])
df_intrxn_gpcrvsall.insert(0, 'interaction_name', df_intrxn_gpcrvsall[['ligand','receptor']].agg(':'.join, axis=1))
df_intrxn_gpcrvsall.index = df_intrxn_gpcrvsall['interaction_name']
# only the ligand side decides the secretion annotation for this table
df_intrxn_gpcrvsall['annotation'] = 'No Secretion'
df_intrxn_gpcrvsall.loc[df_intrxn_gpcrvsall['ligand'].isin(secreted), 'annotation'] = 'Secretion'
# reset name of the index
df_intrxn_gpcrvsall.index.name = None
# adjust to remove duplicates
df_intrxn_gpcrvsall = df_intrxn_gpcrvsall.value_counts().reset_index().iloc[:, :-1]
df_intrxn_gpcrvsall.index = df_intrxn_gpcrvsall['interaction_name'].values

In [None]:
# repeat for orphans from GPCRdb (class A and class C); one receptor name per line,
# blank lines are skipped so they cannot turn into an empty receptor name
with open('/path/to/orphans.all.txt', 'rt') as f:
    receptors = [x.strip() for x in f.readlines() if x.strip()]
# take the ligands straight from the interaction table rather than from the global
# `ligands`, which a later cell reuses for random draws
orphan_ligands = df_intrxn['ligand'].unique()
# fill in the possible orphan interactions: build the full ligand x receptor grid in one
# go (receptor-major) instead of appending to a DataFrame row by row
df_intrxn_orphan = pd.DataFrame(
    [(ligand, receptor) for receptor in receptors for ligand in orphan_ligands],
    columns=['ligand','receptor'])
df_intrxn_orphan.insert(0, 'interaction_name', df_intrxn_orphan[['ligand','receptor']].agg(':'.join, axis=1))
df_intrxn_orphan.index = df_intrxn_orphan['interaction_name']
# only the ligand side decides the secretion annotation for this table
df_intrxn_orphan['annotation'] = 'No Secretion'
df_intrxn_orphan.loc[df_intrxn_orphan['ligand'].isin(secreted), 'annotation'] = 'Secretion'
# reset name of the index
df_intrxn_orphan.index.name = None
# adjust to remove duplicates
df_intrxn_orphan = df_intrxn_orphan.value_counts().reset_index().iloc[:, :-1]
df_intrxn_orphan.index = df_intrxn_orphan['interaction_name'].values

In [None]:
# generate random pairs for the GPCR subset of the known: as many random
# ligand/receptor combinations as there are known GPCR interactions, none of them known
n_needed = df_intrxn_gpcrs.shape[0]
ligand_pool = df_intrxn_gpcrs['ligand'].unique()
receptor_pool = df_intrxn_gpcrs['receptor'].unique()
# every known pair is a combination of the two pools, so this is the number of
# combinations left to draw from; without enough of them the loop below would never end
n_candidates = len(ligand_pool) * len(receptor_pool) - df_intrxn_gpcrs.index.nunique()
if n_needed > n_candidates:
    raise ValueError(f'cannot draw {n_needed} random pairs: only {n_candidates} unknown combinations exist')
np.random.seed(0)
valid_rand_pairs = []; running_pairs = []
while n_needed > 0:
    # draw candidates (separate names so the globals `ligands` / `receptors` are left alone)
    ligand_draw = np.random.choice(ligand_pool, size=n_needed, replace=True)
    receptor_draw = np.random.choice(receptor_pool, size=n_needed, replace=True)
    rand_pairs = pd.Series(np.unique(ligand_draw + '=X=' + receptor_draw))
    # derive the 'ligand:receptor' form from the same series so both stay aligned row by
    # row (two independently sorted np.unique results are only aligned by coincidence)
    rand_pairs_intrxn_format = rand_pairs.str.replace('=X=', ':', regex=False)
    # drop known interactions and pairs already drawn in an earlier round
    mask = (~rand_pairs_intrxn_format.isin(df_intrxn_gpcrs.index)) & (~rand_pairs.isin(running_pairs))
    rand_pairs = rand_pairs[mask]
    # create the dataframe and deduplicate
    rand_pairs_split = rand_pairs.str.split('=X=', expand=True)
    valid_rand_pairs.append(rand_pairs_split)
    running_pairs += rand_pairs.tolist()
    n_needed -= len(rand_pairs)
valid_rand_pairs = pd.concat(valid_rand_pairs, axis=0)
valid_rand_pairs.columns = ['ligand','receptor']

In [None]:
# add on the proper annotation
# drop=True discards the old index instead of inserting it as a column, so the cell also
# works when re-run (the index is then named `interaction_name`, which already exists as a column)
valid_rand_pairs = valid_rand_pairs.reset_index(drop=True)
valid_rand_pairs['annotation'] = 'No Secretion'
valid_rand_pairs.loc[valid_rand_pairs['ligand'].isin(secreted) | valid_rand_pairs['receptor'].isin(secreted), 'annotation'] = 'Secretion'
valid_rand_pairs['interaction_name'] = valid_rand_pairs[['ligand','receptor']].agg(':'.join, axis=1)
valid_rand_pairs.index = valid_rand_pairs['interaction_name']
# same column layout as the other interaction tables
valid_rand_pairs = valid_rand_pairs[['interaction_name','ligand','receptor','annotation']]

In [None]:
import multiprocessing as mp
from tqdm import tqdm
# run calc_corr for every sample in a pool of worker processes
with mp.Pool(processes=36) as pool:
    # imap hands results back lazily and in input order, so the progress bar tracks the
    # real progress; pool.map would only return once every sample had finished
    results = list(tqdm(pool.imap(calc_corr, adatas), total=len(adatas)))

### retrieve annotations for spatial transcriptomic datasets

In [None]:
# sample-level annotations, first sheet of the workbook
anno = pd.read_excel('/path/to/241111_visium_annotations.xlsx', sheet_name=0)
# file name -> sample id lookup, used for data sets that carry no sample column of their own
file_sample_pairs = anno[['file_id','sample_id']].value_counts().reset_index()
file2sample = file_sample_pairs.set_index('file_id')['sample_id']
# key that identifies one sample within one file
anno['tag'] = anno[['file_id','sample_id']].agg(':'.join, axis=1)
# bucket the free-form age labels; anything unmatched keeps the default label
ages_under_60 = ['55-60','45-50','40-45','50-55','20-25','51-60','41-50']
# NOTE(review): '71-70' sits next to '71-80' — presumably it mirrors a typo in the sheet; confirm
ages_60_plus = ['60-70','73','70-75','65-70','60-65','71-80','71-70']
is_fetal = anno['age'].astype(str).str.endswith('pcw')
anno['age_cat'] = 'Unknown (likely adult)'
anno.loc[is_fetal, 'age_cat'] = 'Fetal'
anno.loc[anno['age'].isin(ages_under_60), 'age_cat'] = 'Adult (<60)'
anno.loc[anno['age'].isin(ages_60_plus), 'age_cat'] = 'Adult (≥60)'

In [None]:
# compile the information
df_info = pd.DataFrame(columns=['file_id','sample_id'])
for adata in adatas:
    file_id = adata.obs['file_id'].iloc[0]
    sample_col = 'sangerID'
    if sample_col not in adata.obs: sample_col = 'sample_id'
    if sample_col not in adata.obs: sample_col = 'sample'
    if sample_col not in adata.obs:
        sample_id = file2sample[file_id]
    else:
        sample_id = adata.obs[sample_col].iloc[0]
    df_info.loc[df_info.shape[0]] = file_id, sample_id
df_info['tag'] = df_info[['file_id','sample_id']].agg(':'.join, axis=1)

### visualize CCC dynamics across major organ systems and developmental stages

In [None]:
# collect, per sample, the z-scores of the known pairs together with the random control pairs
df_zs = []
for sample_result, tag in zip(results, df_info['tag']):
    # samples whose computation failed are stored as None
    if sample_result is None:
        continue
    # every result dict has a single key, the file id
    file_id = next(iter(sample_result))
    z_scores = sample_result[file_id]
    known = z_scores['known']['z'].copy()
    rand = z_scores['rand']['z'].copy()
    # suffix the pair names so both sets can share one index and be told apart later
    known.index = known.index + '}{KNOWN'
    rand.index = rand.index + '}{RAND'
    combined = pd.concat([known, rand], axis=0).copy()
    combined.name = tag
    df_zs.append(combined)

In [None]:
# build the long table of z-scores: samples x pairs, pairs missing in a sample count as 0
df_z = pd.concat(df_zs, axis=1).T.fillna(0)
# label every sample as '<organ>:<age category>' through its tag
df_z['organ'] = df_z.index.map(anno[['tag','organ']].value_counts().reset_index().set_index('tag')['organ'])
df_z['organ'] += ':' + df_z.index.map(anno[['tag','age_cat']].value_counts().reset_index().set_index('tag')['age_cat'])
# one row per (sample, pair); rows without an organ label are dropped
df_z = df_z.melt(id_vars='organ').dropna().reset_index().iloc[:, 1:]
df_z.columns = ['organ','variable','value']
# split the '}{KNOWN' / '}{RAND' suffix back off the pair name
df_z[['variable','test']] = df_z['variable'].str.split('}{', expand=True, regex=False)
df_z['secreted_vs_not'] = df_z['variable'].map(df_intrxn['annotation'])
df_z['receptor_type'] = df_z['variable'].map(df_intrxn['directionality'])
df_z['protein_vs_metabolite'] = df_z['variable'].map(df_intrxn['is_ppi']).map({True:'Protein (Secreted)', False:'Metabolite (Secreted)'})
# the relabelling below assumes these subsets only contain protein-protein interactions
assert df_z.loc[df_z['secreted_vs_not'] == 'No Secretion', 'protein_vs_metabolite'].value_counts().index.tolist() == ['Protein (Secreted)']
assert df_z.loc[df_z['receptor_type'] == 'Adhesion-Adhesion', 'protein_vs_metabolite'].value_counts().index.tolist() == ['Protein (Secreted)']
# > gap junctions are not GPCRs so they are excluded from this analysis
# assert df_z.loc[df_z['receptor_type'] == 'Gap-Gap', 'protein_vs_metabolite'].value_counts().index.tolist() == ['Protein (Secreted)']
df_z.loc[df_z['secreted_vs_not'] == 'No Secretion', 'protein_vs_metabolite'] = 'Protein (Non-Secreted)'
df_z.loc[df_z['receptor_type'] == 'Adhesion-Adhesion', 'protein_vs_metabolite'] = 'Protein (Adhesion)'
df_z.loc[df_z['receptor_type'] == 'Gap-Gap', 'protein_vs_metabolite'] = 'Protein (Gap Junction)'
# pairs without a database entry are the random controls
df_z['protein_vs_metabolite'] = df_z['protein_vs_metabolite'].fillna('Random')
# collapse the age categories to adult / fetal; regex=False because the labels contain
# parentheses, which would be read as regex groups and never match where regex is the default
organ_simple = df_z['organ']
for old, new in [('Adult (≥60)', 'adult'), ('Adult (<60)', 'adult'), ('Unknown (likely adult)', 'adult'),
                 ('Fetal', 'fetal'), (':', ' – ')]:
    organ_simple = organ_simple.str.replace(old, new, regex=False)
df_z['organ_simple'] = organ_simple
# only look at known for now
df_z_plot = df_z.loc[df_z['variable'].isin(df_intrxn.index) | (df_z['test'] == 'RAND')]

In [None]:
def _plot_z_by_organ(organs, figsize):
    """Draw one grouped barplot of the z-scores in `df_z_plot` for the given organs.

    `organs` fixes which `organ_simple` values are shown and in which order;
    `figsize` is passed to `plt.subplots`. Returns the new `(fig, ax)`.
    """
    fig, ax = plt.subplots(figsize=figsize); ax.grid(False)
    sns.barplot(x='organ_simple', y='value', data=df_z_plot, hue='protein_vs_metabolite', order=organs,
                ci=68, errwidth=1.5, capsize=0.3, edgecolor='k', errcolor='k', linewidth=1.5, hue_order=hue_order,
                saturation=1, palette=['#cc78bc', '#56b4e9', '#de8f05', '#0173b2','lightgray'], ax=ax)
    ax.tick_params(axis='x', labelrotation=90); ax.set_xlim(-1, len(organs)); ax.legend(bbox_to_anchor=(1.05, .99), bbox_transform=ax.transAxes, frameon=False, loc='upper left')
    ax.set_ylim(0, 60)
    return fig, ax

# group the organ labels: cns, heart, everything else
order = sorted(df_z_plot['organ_simple'].unique())
order1 = [x for x in order if x.startswith('cns')]
order2 = [x for x in order if x.startswith('heart')]
order3 = [x for x in order if x not in order1 and x not in order2]
hue_order = ['Metabolite (Secreted)','Protein (Secreted)','Protein (Non-Secreted)','Protein (Adhesion)', 'Random']
# this is too long we have to separately plot
# > cns
fig, ax = _plot_z_by_organ(order1, figsize=[6, 4])
# > heart
fig, ax = _plot_z_by_organ(order2, figsize=[11, 4])
# > rest
fig, ax = _plot_z_by_organ(order3, figsize=[18, 4])

In [None]:
# average per organ in the random data to get a baseline to normalize by
df_z_tmp = df_z_plot.loc[df_z_plot['test'] == 'RAND'].copy()
# collapse the age categories to adult / fetal; regex=False because the labels contain
# parentheses, which would be read as regex groups and never match where regex is the default
organ_label = df_z_tmp['organ']
for old, new in [('Adult (≥60)', 'adult'), ('Adult (<60)', 'adult'), ('Unknown (likely adult)', 'adult'),
                 ('Fetal', 'fetal'), (':', ' – ')]:
    organ_label = organ_label.str.replace(old, new, regex=False)
df_z_tmp['organ'] = organ_label
df_z_tmp = df_z_tmp.groupby('organ').mean(numeric_only=True)
# derive a baseline based on random: one mean random z-score per organ label
baseline_random = df_z_tmp.mean(1)
baseline_random_organ = baseline_random.copy()

In [None]:
# utilize this baseline to normalize all relevant values
df_z_nrm = df_z_plot.loc[df_z_plot['test'] == 'KNOWN'].copy()
# collapse the age categories to adult / fetal; regex=False because the labels contain
# parentheses, which would be read as regex groups and never match where regex is the default
organ_label = df_z_nrm['organ']
for old, new in [('Adult (≥60)', 'adult'), ('Adult (<60)', 'adult'), ('Unknown (likely adult)', 'adult'),
                 ('Fetal', 'fetal'), (':', ' – ')]:
    organ_label = organ_label.str.replace(old, new, regex=False)
df_z_nrm['organ'] = organ_label
# express every known z-score relative to the mean random z-score of its organ
df_z_nrm['value'] /= df_z_nrm['organ'].map(baseline_random)
# pair names must split into exactly ligand and receptor; check before assigning the columns
ligand_receptor = df_z_nrm['variable'].str.split(':', expand=True)
assert ligand_receptor.shape[1] == 2
df_z_nrm[['ligand','receptor']] = ligand_receptor
# organs x pairs matrix of the normalized scores
df_z_nrm_plot = df_z_nrm.pivot_table(index='organ', columns='variable', values='value')

In [None]:
# hierarchical clustering of the organ x pair matrix with Ward linkage, once per axis
from scipy.cluster.hierarchy import linkage, fcluster
ward = dict(method='ward')
print('calculating linkages for rows')
Z_row = linkage(df_z_nrm_plot, **ward)
print('calculating linkages for columns')
Z_col = linkage(df_z_nrm_plot.transpose(), **ward)
# keep a separate copy of the column linkage under a name that later cells use
Z_col_organ_known = Z_col.copy()

In [None]:
from sklearn.metrics import silhouette_score
# cut the column dendrogram into a fixed number of clusters
n_clusters = 30
cluster_ids = fcluster(Z_col_organ_known, t=n_clusters, criterion='maxclust')
clusters = pd.Series(cluster_ids, index=df_z_nrm_plot.columns).astype(str)
# one hls colour per cluster label ('1' .. '30')
hex_colors = sns.color_palette('hls', n_clusters).as_hex()
pal = {str(rank): color for rank, color in enumerate(hex_colors, start=1)}
colors = clusters.map(pal)

In [None]:
# create clustermap
# `pal` is redefined here: cluster labels '25'..'30' map to red, every other
# label has no entry and is filled with light gray in the column colour bar below
pal = {str(idx):'r' for idx in range(25, 30+1)}
g = sns.clustermap(df_z_nrm_plot, method='ward', figsize=[25, 15], vmin=-15, vmax=15, cmap='RdBu_r',
                   col_colors=clusters.map(pal).fillna('lightgray'), colors_ratio=(.025),
                   cbar_pos=(0, 1, .01, .1), yticklabels=1, dendrogram_ratio=(.05, .1))
# switch off the grid lines on the heatmap and on the colour bar strip
g.ax_heatmap.grid(False); g.ax_col_colors.grid(False)

In [None]:
# create clustermap, this is the clean version
# (same data, linkage method and colour limits as the previous cell, without the cluster colour strip)
g = sns.clustermap(df_z_nrm_plot, method='ward', figsize=[25, 15], vmin=-15, vmax=15, cmap='RdBu_r',
                   cbar_pos=(0, 1, .01, .1), yticklabels=1, dendrogram_ratio=(.05, .1))
g.ax_heatmap.grid(False)

### derive statistical profiles for spatial interactions

In [None]:
# compare the CNS and non-CNS adult and non-adult
df_z_nrm_melt = df_z_nrm.copy()
df_z_nrm_melt['cat_simple'] = 'CNS'
df_z_nrm_melt.loc[~df_z_nrm_melt['organ'].str.startswith('cns'), 'cat_simple'] = 'Non-CNS'
df_z_nrm_melt.loc[df_z_nrm_melt['organ'].str.endswith('adult'), 'cat_simple'] += ' (Adult)'
df_z_nrm_melt.loc[df_z_nrm_melt['organ'].str.endswith('fetal'), 'cat_simple'] += ' (Fetal)'
# map on the interaction types
df_z_nrm_melt['secreted_vs_not'] = df_z_nrm_melt['variable'].map(df_intrxn['annotation'])
df_z_nrm_melt['receptor_type'] = df_z_nrm_melt['variable'].map(df_intrxn['directionality'])
df_z_nrm_melt['protein_vs_metabolite'] = df_z_nrm_melt['variable'].map(df_intrxn['is_ppi']).map({True:'Protein (Secreted)', False:'Metabolite (Secreted)'})
# the relabelling below assumes these subsets only contain protein-protein interactions
assert df_z_nrm_melt.loc[df_z_nrm_melt['secreted_vs_not'] == 'No Secretion', 'protein_vs_metabolite'].value_counts().index.tolist() == ['Protein (Secreted)']
assert df_z_nrm_melt.loc[df_z_nrm_melt['receptor_type'] == 'Adhesion-Adhesion', 'protein_vs_metabolite'].value_counts().index.tolist() == ['Protein (Secreted)']
df_z_nrm_melt.loc[df_z_nrm_melt['secreted_vs_not'] == 'No Secretion', 'protein_vs_metabolite'] = 'Protein (Non-Secreted)'
df_z_nrm_melt.loc[df_z_nrm_melt['receptor_type'] == 'Adhesion-Adhesion', 'protein_vs_metabolite'] = 'Protein (Adhesion)'
df_z_nrm_melt.loc[df_z_nrm_melt['receptor_type'] == 'Gap-Gap', 'protein_vs_metabolite'] = 'Protein (Gap Junction)'
# grouped barplot of the normalized scores per category and interaction class
fig, ax = plt.subplots(figsize=[8, 4]); ax.grid(False)
order = sorted(df_z_nrm_melt['cat_simple'].unique())
sns.barplot(x='cat_simple', y='value', data=df_z_nrm_melt, hue='protein_vs_metabolite', order=order,
            ci=68, errwidth=1.5, capsize=0.3, edgecolor='k', errcolor='k', linewidth=1.5,
            hue_order=['Metabolite (Secreted)','Protein (Secreted)','Protein (Non-Secreted)',
                       'Protein (Adhesion)'], saturation=1,
            palette=['#cc78bc', '#56b4e9', '#de8f05', '#0173b2'])
ax.tick_params(axis='x', labelrotation=90)
ax.set_xlim(-1, len(order))
ax.legend(bbox_to_anchor=(1.05, .99), bbox_transform=ax.transAxes, frameon=False, loc='upper left')
ax.set_yscale('log')
# a log axis cannot start at 0 (matplotlib ignores a non-positive limit with a warning),
# so only the upper limit is set and the lower bound is left as it is
ax.set_ylim(top=100)
# perform all p-value comparisons: each reference group against every other group
df_z_nrm_melt['tag'] = df_z_nrm_melt[['cat_simple', 'protein_vs_metabolite']].agg(':'.join, axis=1)
all_tags = sorted(df_z_nrm_melt['tag'].unique())
reference_tags = ['CNS (Adult):Metabolite (Secreted)',
                  'Non-CNS (Adult):Metabolite (Secreted)',
                  'Non-CNS (Fetal):Metabolite (Secreted)']
for reference_tag in reference_tags:
    reference_values = df_z_nrm_melt.loc[df_z_nrm_melt['tag'] == reference_tag, 'value']
    for tag in all_tags:
        if tag == reference_tag: continue
        # two-sample Mann-Whitney U test, p-value only
        p = ss.mannwhitneyu(reference_values,
                            df_z_nrm_melt.loc[df_z_nrm_melt['tag'] == tag, 'value'])[1]
        print(tag, p)

### derive tissue specificity scores from spatial interaction data

In [None]:
# work on a copy of the normalized scores
df_z_known = df_z_nrm.copy()
# summary statistics are taken per ligand-receptor pair (`variable`), across all rows of that pair
by_pair = df_z_known.groupby('variable')
df_z_stat = by_pair.mean(numeric_only=True)['value'].reset_index()
df_z_stat.columns = ['index', 'mean']
# retrieve other statistics
def custom(x):
    """Skewness of the `value` column of one group."""
    return ss.skew(x['value'])
# every statistic comes from the same grouping, so the rows line up with `df_z_stat`
df_z_stat['skew'] = by_pair.apply(custom).abs().values
df_z_stat['var'] = by_pair.var(numeric_only=True)['value'].fillna(0).values
df_z_stat['max'] = by_pair.max(numeric_only=True)['value'].fillna(0).values
df_z_stat_known = df_z_stat.copy()

In [None]:
import statsmodels.api as sm
# ordinary least squares of the per-pair max on the per-pair mean (with intercept)
design = sm.add_constant(df_z_stat['mean'])
model = sm.OLS(endog=df_z_stat['max'], exog=design).fit()
summary = model.summary()
# observed minus expected: how far each pair's max lies above what its mean predicts
exp = model.predict(design)
obsmexp = df_z_stat['max'] - exp
df_z_stat['max_obsmexp'] = obsmexp

In [None]:
# mean against max
fig, ax = plt.subplots(figsize=[6, 6]); ax.grid(False)
ax.scatter(df_z_stat['mean'], df_z_stat['max'], s=2.5, alpha=0.5, color='xkcd:bright lilac')
ax.set(xlabel='mean', ylabel='max')
# retrieve axes
# (limits are captured before the guide lines are drawn and restored at the end,
#  so the extra lines do not change the view)
xlim, ylim = ax.get_xlim(), ax.get_ylim()
# solid lines through x = 0 and y = 0
ax.plot([0]*2, ylim, color='k', linestyle='-', lw=1.5)
ax.plot(xlim, [0]*2, color='k', linestyle='-', lw=1.5)
# retrieve predictions
# (fitted line of the OLS model from the previous cell, evaluated over the visible x range)
xl = np.linspace(*xlim, 100)
yl = model.predict(sm.add_constant(xl))
ax.plot(xl, yl, color='k', linestyle='--', lw=1.5)
ax.set_xlim(*xlim); ax.set_ylim(*ylim)
# retrieve correlation
print(ss.pearsonr(df_z_stat['mean'], df_z_stat['max']))

In [None]:
# define the correlations of the statistics
# get_r / get_p: Pearson correlation coefficient and its p-value for two samples
def get_r(xs, ys): return ss.pearsonr(xs, ys)[0]
def get_p(xs, ys): return ss.pearsonr(xs, ys)[1]
# mask is True where the pairwise p-value is >= 0.05; those cells are hidden in the heatmap
# (all columns after the first are used, so they must all be numeric when this cell runs)
mask = df_z_stat.iloc[:, 1:].corr(method=get_p) >= 0.05
# never hide the diagonal
for idx in mask.index: mask.loc[idx, idx] = False
g = sns.clustermap(df_z_stat.iloc[:, 1:].corr(method=get_r), vmin=-1, vmax=1, cmap='RdBu_r', annot=True, method='ward',
                   figsize=[6, 6], fmt='.2f', cbar_pos=(0, .9, .015, .12), mask=mask)
# hidden cells show the light gray background
g.ax_heatmap.set_facecolor('lightgray'); g.ax_heatmap.grid(False); g.ax_cbar.grid(False)

### examine how tissue specificity and spatial interaction statistics behave across ligand-receptor classes

In [None]:
# Annotate each row of df_z_stat with interaction-class labels looked up in df_intrxn
# by the row's 'index' value.
df_z_stat['secreted_vs_not'] = df_z_stat['index'].map(df_intrxn['annotation'])
df_z_stat['receptor_type'] = df_z_stat['index'].map(df_intrxn['directionality'])
# start from a two-way protein/metabolite split based on the boolean 'is_ppi' flag
df_z_stat['protein_vs_metabolite'] = df_z_stat['index'].map(df_intrxn['is_ppi']).map({True:'Protein (Secreted)', False:'Metabolite (Secreted)'})
# sanity checks: non-secreted and adhesion rows must all be protein interactions before relabelling
assert df_z_stat.loc[df_z_stat['secreted_vs_not'] == 'No Secretion', 'protein_vs_metabolite'].value_counts().index.tolist() == ['Protein (Secreted)']
assert df_z_stat.loc[df_z_stat['receptor_type'] == 'Adhesion-Adhesion', 'protein_vs_metabolite'].value_counts().index.tolist() == ['Protein (Secreted)']
# refine the label; later assignments win, so adhesion and gap-junction override 'Non-Secreted'
df_z_stat.loc[df_z_stat['secreted_vs_not'] == 'No Secretion', 'protein_vs_metabolite'] = 'Protein (Non-Secreted)'
df_z_stat.loc[df_z_stat['receptor_type'] == 'Adhesion-Adhesion', 'protein_vs_metabolite'] = 'Protein (Adhesion)'
df_z_stat.loc[df_z_stat['receptor_type'] == 'Gap-Gap', 'protein_vs_metabolite'] = 'Protein (Gap Junction)'

In [None]:
# bar plot of the per-class 'mean' statistic (error bars: 68% interval)
col = 'mean'
fig, ax = plt.subplots(figsize=[3, 4]); ax.grid(False)
# classes shown on the x-axis and their colors (both are reused by the next cell)
order = ['Metabolite (Secreted)','Protein (Secreted)','Protein (Non-Secreted)',
         'Protein (Adhesion)']
palette = ['#cc78bc', '#56b4e9', '#de8f05', '#0173b2']
sns.barplot(data=df_z_stat, x='protein_vs_metabolite', y=col, order=order, palette=palette,
            ci=68, capsize=0.3, errwidth=1.5, errcolor='k',
            edgecolor='k', linewidth=1.5, saturation=1, ax=ax)
ax.tick_params(axis='x', labelrotation=90)
ylim = ax.get_ylim()
ax.set_xlim(-1, 4)
# symmetric-log y axis, clipped to [0, 100]
ax.set_yscale('symlog')
ax.set_ylim(0, 100)
# Mann-Whitney U p-value for every unordered pair of classes present in the column
tags = df_z_stat['protein_vs_metabolite'].unique()
for idx, tag1 in enumerate(tags[:-1]):
    values1 = df_z_stat.loc[df_z_stat['protein_vs_metabolite'] == tag1, col]
    for tag2 in tags[idx+1:]:
        values2 = df_z_stat.loc[df_z_stat['protein_vs_metabolite'] == tag2, col]
        p = ss.mannwhitneyu(values1, values2)[1]
        print(tag1, tag2, p)

In [None]:
# same bar plot for the observed-minus-expected max; reuses `palette` and `order` from the previous cell
col = 'max_obsmexp'
fig, ax = plt.subplots(figsize=[3, 4]); ax.grid(False)
sns.barplot(data=df_z_stat, x='protein_vs_metabolite', y=col, order=order, palette=palette,
            ci=68, capsize=0.3, errwidth=1.5, errcolor='k',
            edgecolor='k', linewidth=1.5, saturation=1, ax=ax)
ax.tick_params(axis='x', labelrotation=90)
ylim = ax.get_ylim()
ax.set_xlim(-1, 4)
# symmetric-log y axis so negative residuals remain visible
ax.set_yscale('symlog')
ax.set_ylim(-10, 100)
# Mann-Whitney U p-value for every unordered pair of classes present in the column
tags = df_z_stat['protein_vs_metabolite'].unique()
for idx, tag1 in enumerate(tags[:-1]):
    values1 = df_z_stat.loc[df_z_stat['protein_vs_metabolite'] == tag1, col]
    for tag2 in tags[idx+1:]:
        values2 = df_z_stat.loc[df_z_stat['protein_vs_metabolite'] == tag2, col]
        p = ss.mannwhitneyu(values1, values2)[1]
        print(tag1, tag2, p)

### examine average spatial interaction of CCC class and organ system and developmental stage

In [None]:
# average 'value' per organ (rows) and ligand-receptor class (columns);
# the string 'mean' is used instead of np.mean, which pandas deprecates as an aggfunc
# (it already dispatches to the same groupby mean, so the numbers are unchanged)
plot = df_z_nrm_melt.pivot_table(index='organ', columns='protein_vs_metabolite', values='value', aggfunc='mean')
# ward-clustered heatmap; standard_scale=0 rescales each row to the [0, 1] range
g = sns.clustermap(plot, method='ward', cmap='Reds', figsize=[6, 9], yticklabels=1,
                   dendrogram_ratio=(.2, .05), cbar_pos=(0, 1, .01, .1), standard_scale=0)
g.ax_heatmap.grid(False)

### quantify the deorphanization capacity of spatial transcriptomics for known ligand-receptors

In [None]:
# collect one z-score Series per dataset (known ligand-receptor pairs), named by its tag
df_zs = []
for result, tag in zip(results, df_info['tag']):
    # datasets without a result are skipped
    if result is not None:
        # each result dict is keyed by file id; only its first key is used
        file_id = list(result.keys())[0]
        result = result[file_id]
        # copy the z-scores so that naming them does not touch the stored result
        df_z = result['known']['z'].copy()
        df_z.name = tag
        df_zs.append(df_z)

In [None]:
# one row per dataset tag, one column per ligand-receptor pair; missing pairs count as 0
df_z = pd.concat(df_zs, axis=1).T.fillna(0)
# label every dataset as '<organ>:<age category>' by looking its tag up in `anno`
df_z['organ'] = df_z.index.map(anno[['tag','organ']].value_counts().reset_index().set_index('tag')['organ'])
df_z['organ'] += ':' + df_z.index.map(anno[['tag','age_cat']].value_counts().reset_index().set_index('tag')['age_cat'])
# collapse the age categories to adult/fetal. regex=False is explicit because the patterns
# contain parentheses: under a regex interpretation (the default in older pandas) they would
# be treated as groups and 'Adult (≥60)' would never match.
df_z['organ'] = (df_z['organ']
                 .str.replace('Adult (≥60)', 'adult', regex=False)
                 .str.replace('Adult (<60)', 'adult', regex=False)
                 .str.replace('Unknown (likely adult)', 'adult', regex=False)
                 .str.replace('Fetal', 'fetal', regex=False)
                 .str.replace(':', ' – ', regex=False))
# average the datasets of each organ/age group
df_z = df_z.groupby('organ').mean()
# divide every group (row) by its baseline; baseline_random_organ is defined in an earlier cell
df_z = (df_z.T / baseline_random_organ).T

In [None]:
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve, auc, precision_recall_curve
# Cross-validated logistic regression using the column-wise mean of df_z as the only feature.
# Label is 1 when the column name is present in df_intrxn.index, else 0.
xs = df_z.mean(0)
ys = 1 * df_z.columns.isin(df_intrxn.index)
# sklearn expects a 2D feature matrix: one row per column of df_z, a single feature
xs = np.array(xs).reshape(-1, 1); ys = np.array(ys)
# keep the feature vector under its own name
xs_mean = xs.copy()
# seed numpy's global RNG (drives the shuffles below) and build a seeded 10-fold stratified splitter
seed = 0; np.random.seed(seed)
sss = StratifiedKFold(random_state=0, n_splits=10, shuffle=True)
# one row of metrics is appended per fold; this (re)creates df_stat from scratch
df_stat = pd.DataFrame(columns=['auroc','auprc','auroc_rand','auprc_rand','method'])
# idxs = training indices, idys = held-out indices of the current fold
for idxs, idys in tqdm(sss.split(xs, ys), total=10):
    data = xs[idxs].copy(), ys[idxs].copy()
    # train the model
    clf = LogisticRegression(random_state=seed)
    clf.fit(*data)
    # predicted probability of class 1 on the held-out fold
    preds = clf.predict_proba(xs[idys])
    preds = preds[:, clf.classes_ == 1][:, 0]
    preds_bin = clf.predict(xs[idys])
    # area under the ROC curve: auc(fpr, tpr)
    auroc = auc(*roc_curve(1 * (ys[idys] == 1), preds)[:-1])
    # area under the precision-recall curve: auc(recall, precision)
    auprc = auc(*precision_recall_curve(1 * (ys[idys] == 1), preds)[:-1][::-1])
    # baseline: shuffle the predictions in place so they no longer line up with the labels.
    # preds_bin is not used afterwards, but shuffling it still advances the global RNG,
    # so removing that call would change the baseline of every later fold.
    np.random.shuffle(preds)
    np.random.shuffle(preds_bin)
    auroc_rand = auc(*roc_curve(1 * (ys[idys] == 1), preds)[:-1])
    auprc_rand = auc(*precision_recall_curve(1 * (ys[idys] == 1), preds)[:-1][::-1])
    df_stat.loc[df_stat.shape[0]] = auroc, auprc, auroc_rand, auprc_rand, 'mean'

In [None]:
# Same cross-validation as the 'mean' cell, using the column-wise max of df_z as the feature.
# Rows are appended to the existing df_stat with method='max'.
xs = df_z.max(0)
ys = 1 * df_z.columns.isin(df_intrxn.index)
# sklearn expects a 2D feature matrix: one row per column of df_z, a single feature
xs = np.array(xs).reshape(-1, 1); ys = np.array(ys)
# keep the feature vector under its own name
xs_max = xs.copy()
# re-seed numpy's global RNG and rebuild the seeded 10-fold stratified splitter
seed = 0; np.random.seed(seed)
sss = StratifiedKFold(random_state=0, n_splits=10, shuffle=True)
# idxs = training indices, idys = held-out indices of the current fold
for idxs, idys in tqdm(sss.split(xs, ys), total=10):
    data = xs[idxs].copy(), ys[idxs].copy()
    # train the model
    clf = LogisticRegression(random_state=seed)
    clf.fit(*data)
    # predicted probability of class 1 on the held-out fold
    preds = clf.predict_proba(xs[idys])
    preds = preds[:, clf.classes_ == 1][:, 0]
    preds_bin = clf.predict(xs[idys])
    # auc(fpr, tpr) and auc(recall, precision)
    auroc = auc(*roc_curve(1 * (ys[idys] == 1), preds)[:-1])
    auprc = auc(*precision_recall_curve(1 * (ys[idys] == 1), preds)[:-1][::-1])
    # baseline: in-place shuffle of the predictions; the preds_bin shuffle is otherwise unused
    # but keeps the global RNG stream in the same state as in the other CV cells
    np.random.shuffle(preds)
    np.random.shuffle(preds_bin)
    auroc_rand = auc(*roc_curve(1 * (ys[idys] == 1), preds)[:-1])
    auprc_rand = auc(*precision_recall_curve(1 * (ys[idys] == 1), preds)[:-1][::-1])
    df_stat.loc[df_stat.shape[0]] = auroc, auprc, auroc_rand, auprc_rand, 'max'

In [None]:
# per-method mean of the fold metrics, then their standard error (std / sqrt(10))
by_method = df_stat.groupby('method')
print(by_method.mean())
print(by_method.std() / np.sqrt(10))

In [None]:
import statsmodels.api as sm
# Feature = observed-minus-expected max: residual of an OLS fit (with intercept) of the
# column-wise max of df_z on the column-wise mean of df_z.
model = sm.OLS(exog=sm.add_constant(df_z.mean(0).reset_index().set_index('index')[0]),
               endog=df_z.max(0).reset_index().set_index('index')[0])
model = model.fit()
summary = model.summary()
# expected max given the mean, then the residual used as the feature
exp = model.predict(sm.add_constant(df_z.mean(0).reset_index().set_index('index')[0]))
xs = df_z.max(0).reset_index().set_index('index')[0] - exp
# label is 1 when the column name is present in df_intrxn.index
ys = 1 * df_z.columns.isin(df_intrxn.index)
xs = np.array(xs).reshape(-1, 1); ys = np.array(ys)
# keep the feature vector under its own name
xs_obsmexp = xs.copy()
# re-seed numpy's global RNG and rebuild the seeded 10-fold stratified splitter
seed = 0; np.random.seed(seed)
sss = StratifiedKFold(random_state=0, n_splits=10, shuffle=True)
# idxs = training indices, idys = held-out indices of the current fold
for idxs, idys in tqdm(sss.split(xs, ys), total=10):
    data = xs[idxs].copy(), ys[idxs].copy()
    # train the model
    clf = LogisticRegression(random_state=seed)
    clf.fit(*data)
    # predicted probability of class 1 on the held-out fold
    preds = clf.predict_proba(xs[idys])
    preds = preds[:, clf.classes_ == 1][:, 0]
    preds_bin = clf.predict(xs[idys])
    # auc(fpr, tpr) and auc(recall, precision)
    auroc = auc(*roc_curve(1 * (ys[idys] == 1), preds)[:-1])
    auprc = auc(*precision_recall_curve(1 * (ys[idys] == 1), preds)[:-1][::-1])
    # baseline: in-place shuffle of the predictions; the preds_bin shuffle is otherwise unused
    # but keeps the global RNG stream in the same state as in the other CV cells
    np.random.shuffle(preds)
    np.random.shuffle(preds_bin)
    auroc_rand = auc(*roc_curve(1 * (ys[idys] == 1), preds)[:-1])
    auprc_rand = auc(*precision_recall_curve(1 * (ys[idys] == 1), preds)[:-1][::-1])
    df_stat.loc[df_stat.shape[0]] = auroc, auprc, auroc_rand, auprc_rand, 'obsmexp'

In [None]:
# Box plot of per-fold AUPRC (observed vs. shuffled baseline) for the 'mean' and 'max' features.
# Rows with method 'obsmexp' are in df_stat but are excluded by `order`.
data = df_stat.melt(id_vars='method')
fig, ax = plt.subplots(figsize=[2, 4]); ax.grid(False)
sns.boxplot(x='method', y='value', hue='variable', data=data, hue_order=['auprc','auprc_rand'],
            order=['mean','max'], palette=['xkcd:bright lilac','lightgray'], saturation=1, linewidth=1.5, linecolor='k')
ax.legend(bbox_to_anchor=(.99, 1.05), bbox_transform=ax.transAxes, loc='upper left', frameon=False)
# add 10% head-room above the tallest box
ylim = ax.get_ylim(); yrange = ylim[1] - ylim[0]; ax.set_ylim(ylim[0], ylim[1]+yrange*0.1)
# Mann-Whitney U test of observed vs. shuffled folds, per method
for method in ['mean','max']:
    p = ss.mannwhitneyu(data.loc[(data['method'] == method)&(data['variable'] == 'auprc'), 'value'],
                        data.loc[(data['method'] == method)&(data['variable'] == 'auprc_rand'), 'value'])[1]
    print(method, p)
# hatch patches 2 and 3 — presumably the two shuffled-baseline boxes; this relies on
# seaborn's patch ordering (TODO confirm after any seaborn upgrade)
for n in range(2, 4):
    ax.patches[n].set_hatch('//')
ax.set_xticklabels(['Mean','Max'])
ax.set_xlabel('Metric utilized')
ax.set_ylabel('Area under precision-\nrecall curve (AUPRC)')
# relabel the legend entries and hatch the second handle by walking the legend's artist tree;
# NOTE(review): this depends on matplotlib's internal legend layout and is fragile
children = ax.legend_.get_children()[0].get_children()[-1].get_children()[0].get_children()
children[0].get_children()[-1].get_children()[0].set_text('Observed')
children[-1].get_children()[-1].get_children()[0].set_text('Randomly\nshuffled')
children[-1].get_children()[0].get_children()[0].set_hatch('//')

In [None]:
# Box plot of per-fold AUROC (observed vs. shuffled baseline) for the 'mean' and 'max' features.
# Rows with method 'obsmexp' are in df_stat but are excluded by `order`.
data = df_stat.melt(id_vars='method')
fig, ax = plt.subplots(figsize=[2, 4]); ax.grid(False)
sns.boxplot(x='method', y='value', hue='variable', data=data, hue_order=['auroc','auroc_rand'],
            order=['mean','max'], palette=['xkcd:bright lilac','lightgray'], saturation=1, linewidth=1.5, linecolor='k')
ax.legend(bbox_to_anchor=(.99, 1.05), bbox_transform=ax.transAxes, loc='upper left', frameon=False)
# add 10% head-room above the tallest box
ylim = ax.get_ylim(); yrange = ylim[1] - ylim[0]; ax.set_ylim(ylim[0], ylim[1]+yrange*0.1)
# Mann-Whitney U test of observed vs. shuffled folds, per method
for method in ['mean','max']:
    p = ss.mannwhitneyu(data.loc[(data['method'] == method)&(data['variable'] == 'auroc'), 'value'],
                        data.loc[(data['method'] == method)&(data['variable'] == 'auroc_rand'), 'value'])[1]
    print(method, p)
# hatch patches 2 and 3 — presumably the two shuffled-baseline boxes; this relies on
# seaborn's patch ordering (TODO confirm after any seaborn upgrade)
for n in range(2, 4):
    ax.patches[n].set_hatch('//')
ax.set_xticklabels(['Mean','Max'])
ax.set_xlabel('Metric utilized')
ax.set_ylabel('Area under receiver operating\ncharacteristic curve (AUROC)')
# relabel the legend entries and hatch the second handle by walking the legend's artist tree;
# NOTE(review): this depends on matplotlib's internal legend layout and is fragile
children = ax.legend_.get_children()[0].get_children()[-1].get_children()[0].get_children()
children[0].get_children()[-1].get_children()[0].set_text('Observed')
children[-1].get_children()[-1].get_children()[0].set_text('Randomly\nshuffled')
children[-1].get_children()[0].get_children()[0].set_hatch('//')

In [None]:
# keep a snapshot of the overall cross-validation statistics; df_stat is rebuilt from
# scratch below for the per-class analysis
df_stat_overall = df_stat.copy()

### stratify deorphanization capacity by ligand-receptor class

In [None]:
# Receptors taking part in each ligand-receptor class (a receptor may appear in several classes).
# Adhesion pairs are identified by the 'directionality' column, as for adhes_receptors below and
# for the 'Protein (Adhesion)' label earlier in the notebook. The previous filter compared the
# 'annotation' column with 'Adhesion-Adhesion', which tested the wrong column, so adhesion
# pairs were not actually excluded from the secreted / non-secreted classes.
gpcr_is_ppi = df_intrxn_gpcrs['is_ppi']
gpcr_is_secreted = df_intrxn_gpcrs['annotation'] == 'Secretion'
gpcr_is_adhesion = df_intrxn_gpcrs['directionality'] == 'Adhesion-Adhesion'
# metabolite ligands: every non-protein-protein interaction
metab_receptors = df_intrxn_gpcrs['receptor'][~gpcr_is_ppi].unique()
# protein ligands, split by secretion, with adhesion pairs kept as their own class
secr_receptors = df_intrxn_gpcrs['receptor'][gpcr_is_ppi & gpcr_is_secreted & ~gpcr_is_adhesion].unique()
nonsecr_receptors = df_intrxn_gpcrs['receptor'][gpcr_is_ppi & ~gpcr_is_secreted & ~gpcr_is_adhesion].unique()
adhes_receptors = df_intrxn_gpcrs['receptor'][gpcr_is_adhesion].unique()

In [None]:
# Repeat the 'mean'-feature cross-validation separately for each ligand-receptor class.
# Column names of df_z are split on ':'; part 1 is matched against the class's receptor list.
lrs = df_z.columns.to_series().str.split(':', expand=True)
# df_stat is rebuilt here with an extra 'receptor_type' column
df_stat = pd.DataFrame(columns=['auroc','auprc','auroc_rand','auprc_rand','method','receptor_type'])
params = [[metab_receptors, 'metab'], [secr_receptors, 'secr'],
          [nonsecr_receptors, 'nonsecr'], [adhes_receptors, 'adhes']]
for receptors, name in params:
    # restrict to the columns whose receptor belongs to this class; feature = column-wise mean
    mask = lrs[1].isin(receptors)
    xs = df_z.loc[:, mask].mean(0)
    # label is 1 when the column name is present in df_intrxn.index
    ys = 1 * df_z.columns[mask].isin(df_intrxn.index)
    xs = np.array(xs).reshape(-1, 1); ys = np.array(ys)
    # re-seed per class so every class sees the same RNG stream
    seed = 0; np.random.seed(seed)
    sss = StratifiedKFold(random_state=0, n_splits=10, shuffle=True)
    # idxs = training indices, idys = held-out indices of the current fold
    for idxs, idys in tqdm(sss.split(xs, ys), total=10):
        data = xs[idxs].copy(), ys[idxs].copy()
        # train the model
        clf = LogisticRegression(random_state=seed)
        clf.fit(*data)
        # predicted probability of class 1 on the held-out fold
        preds = clf.predict_proba(xs[idys])
        preds = preds[:, clf.classes_ == 1][:, 0]
        preds_bin = clf.predict(xs[idys])
        # auc(fpr, tpr) and auc(recall, precision)
        auroc = auc(*roc_curve(1 * (ys[idys] == 1), preds)[:-1])
        auprc = auc(*precision_recall_curve(1 * (ys[idys] == 1), preds)[:-1][::-1])
        # baseline: in-place shuffle of the predictions; the preds_bin shuffle is otherwise
        # unused but still advances the global RNG, so it must stay to reproduce the numbers
        np.random.shuffle(preds)
        np.random.shuffle(preds_bin)
        auroc_rand = auc(*roc_curve(1 * (ys[idys] == 1), preds)[:-1])
        auprc_rand = auc(*precision_recall_curve(1 * (ys[idys] == 1), preds)[:-1][::-1])
        df_stat.loc[df_stat.shape[0]] = auroc, auprc, auroc_rand, auprc_rand, 'mean', name

In [None]:
# Box plot of per-fold AUPRC (observed vs. shuffled baseline) for each ligand-receptor class.
data = df_stat.melt(id_vars=['receptor_type','method'])
fig, ax = plt.subplots(figsize=[4, 4]); ax.grid(False)
sns.boxplot(x='receptor_type', y='value', hue='variable', data=data, hue_order=['auprc','auprc_rand'],
            palette=['xkcd:bright lilac','lightgray'], saturation=1, linewidth=1.5, linecolor='k')
ax.legend(bbox_to_anchor=(.99, .5), bbox_transform=ax.transAxes, loc='center left', frameon=False)
# add 10% head-room above the tallest box
ylim = ax.get_ylim(); yrange = ylim[1] - ylim[0]; ax.set_ylim(ylim[0], ylim[1]+yrange*0.1)
# Mann-Whitney U test of observed vs. shuffled folds, per class
for name in ['metab','secr','nonsecr','adhes']:
    p = ss.mannwhitneyu(data.loc[(data['receptor_type'] == name)&(data['variable'] == 'auprc'), 'value'],
                        data.loc[(data['receptor_type'] == name)&(data['variable'] == 'auprc_rand'), 'value'])[1]
    print(name, p)
# recolor patches 0-3 with the class colors and hatch patches 4-7 — presumably the observed
# and shuffled boxes respectively; relies on seaborn's patch ordering (TODO confirm)
for n, c in zip(range(4), ['#cc78bc', '#56b4e9', '#de8f05', '#0173b2']):
    ax.patches[n].set_color(c)
    ax.patches[n].set_edgecolor('k')
for n in range(4, 8):
    ax.patches[n].set_hatch('//')
# tick labels are assigned by position, so they assume the classes are drawn in the order
# metab, secr, nonsecr, adhes (the order in which they were appended to df_stat)
ax.set_xticklabels(['Metabolite\n(Secreted)','Protein\n(Secreted)','Protein\n(Non-Secreted)', 'Protein\n(Adhesion)'])
ax.tick_params(axis='x', labelrotation=90)
ax.set(xlabel='Ligand-receptor class', ylabel='Area under precision-\nrecall curve (AUPRC)')

In [None]:
# Box plot of per-fold AUROC (observed vs. shuffled baseline) for each ligand-receptor class.
data = df_stat.melt(id_vars=['receptor_type','method'])
fig, ax = plt.subplots(figsize=[4, 4]); ax.grid(False)
sns.boxplot(x='receptor_type', y='value', hue='variable', data=data, hue_order=['auroc','auroc_rand'],
            palette=['xkcd:bright lilac','lightgray'], saturation=1, linewidth=1.5, linecolor='k')
ax.legend(bbox_to_anchor=(.99, .5), bbox_transform=ax.transAxes, loc='center left', frameon=False)
# add 10% head-room above the tallest box
ylim = ax.get_ylim(); yrange = ylim[1] - ylim[0]; ax.set_ylim(ylim[0], ylim[1]+yrange*0.1)
# Mann-Whitney U test of observed vs. shuffled folds, per class
for name in ['metab','secr','nonsecr','adhes']:
    p = ss.mannwhitneyu(data.loc[(data['receptor_type'] == name)&(data['variable'] == 'auroc'), 'value'],
                        data.loc[(data['receptor_type'] == name)&(data['variable'] == 'auroc_rand'), 'value'])[1]
    print(name, p)
# recolor patches 0-3 with the class colors and hatch patches 4-7 — presumably the observed
# and shuffled boxes respectively; relies on seaborn's patch ordering (TODO confirm)
for n, c in zip(range(4), ['#cc78bc', '#56b4e9', '#de8f05', '#0173b2']):
    ax.patches[n].set_color(c)
    ax.patches[n].set_edgecolor('k')
for n in range(4, 8):
    ax.patches[n].set_hatch('//')
# tick labels are assigned by position, so they assume the classes are drawn in the order
# metab, secr, nonsecr, adhes (the order in which they were appended to df_stat)
ax.set_xticklabels(['Metabolite\n(Secreted)','Protein\n(Secreted)','Protein\n(Non-Secreted)', 'Protein\n(Adhesion)'])
ax.tick_params(axis='x', labelrotation=90)
ax.set(xlabel='Ligand-receptor class', ylabel='Area under receiver operating\ncharacteristic curve (AUROC)')

In [None]:
# mean and standard error (std / sqrt(10)) of the fold metrics per method and class
by_class = df_stat.groupby(['method','receptor_type'])
print(by_class.mean())
print(by_class.std() / np.sqrt(10))
# keep a snapshot of the per-class statistics
df_stat_rtype = df_stat.copy()

### quantify deorphanization capacity by spatial transcriptomic dataset

In [None]:
# Cross-validate one logistic regression per row of df_z (one organ/age group), using that
# row's values as the single feature, and keep the held-out probabilities in preds_.
df_stat = pd.DataFrame(columns=['auroc','auprc','auroc_rand','auprc_rand','method','dataset']); preds_ = []
for dataset in tqdm(df_z.index):
    # feature = this row's value per column; label = 1 when the column name is in df_intrxn.index
    xs = df_z.loc[dataset]
    ys = 1 * df_z.columns.isin(df_intrxn.index)
    xs = np.array(xs).reshape(-1, 1); ys = np.array(ys)
    # re-seed per dataset so every dataset sees the same folds and the same RNG stream
    seed = 0; np.random.seed(seed)
    sss = StratifiedKFold(random_state=0, n_splits=10, shuffle=True)
    # idxs = training indices, idys = held-out indices of the current fold
    for idxs, idys in sss.split(xs, ys):
        data = xs[idxs].copy(), ys[idxs].copy()
        # train the model
        clf = LogisticRegression(random_state=seed)
        clf.fit(*data)
        # predicted probability of class 1 on the held-out fold
        preds = clf.predict_proba(xs[idys])
        preds = preds[:, clf.classes_ == 1][:, 0]
        # Store a COPY of the predictions: pd.Series can share memory with the array it wraps,
        # and `preds` is shuffled in place below, which would silently scramble the stored
        # values relative to their column labels.
        preds_.append(pd.Series(preds.copy(), index=df_z.columns[idys], name=dataset))
        preds_bin = clf.predict(xs[idys])
        # auc(fpr, tpr) and auc(recall, precision)
        auroc = auc(*roc_curve(1 * (ys[idys] == 1), preds)[:-1])
        auprc = auc(*precision_recall_curve(1 * (ys[idys] == 1), preds)[:-1][::-1])
        # baseline: in-place shuffle of the predictions; the preds_bin shuffle is otherwise
        # unused but still advances the global RNG, so it is kept to reproduce the numbers
        np.random.shuffle(preds)
        np.random.shuffle(preds_bin)
        auroc_rand = auc(*roc_curve(1 * (ys[idys] == 1), preds)[:-1])
        auprc_rand = auc(*precision_recall_curve(1 * (ys[idys] == 1), preds)[:-1][::-1])
        df_stat.loc[df_stat.shape[0]] = auroc, auprc, auroc_rand, auprc_rand, 'mean', dataset

In [None]:
from matplotlib.cm import get_cmap
from matplotlib.colors import to_hex
# Box plot of per-fold AUPRC for every dataset, ordered and colored by mean AUPRC.
# NOTE(review): matplotlib.cm.get_cmap is deprecated in recent matplotlib releases — confirm
# the pinned version, or switch to plt.get_cmap.
cmap = get_cmap('Purples')
# melt the data and plot accordingly
fig, ax = plt.subplots(figsize=[8, 4]); ax.grid(False)
# mean AUPRC per dataset, ascending
values = df_stat.groupby('dataset').mean(numeric_only=True)['auprc'].sort_values()
# derive colors and order
order = values.index; vmin = values.min(); vmax = values.max(); print(vmin, vmax)
# boxes are drawn from best to worst; colors are min-max scaled means mapped through `cmap`
# (assumes vmax > vmin, otherwise the scaling divides by zero)
sns.boxplot(x='dataset', y='auprc', data=df_stat, saturation=1, linewidth=1.5, linecolor='k',
            order=order[::-1], palette=[to_hex(cmap((x-vmin)/(vmax-vmin))) for x in values[::-1]])
ax.legend(bbox_to_anchor=(.99, .5), bbox_transform=ax.transAxes, loc='center left', frameon=False)
# pad the x-range by one slot on each side and rotate the tick labels
ax.set_xlim(-1, df_z.shape[0]); ax.tick_params(axis='x', labelrotation=90)
# add 10% head-room above the tallest box
ylim = ax.get_ylim(); yrange = ylim[1] - ylim[0]; ax.set_ylim(ylim[0], ylim[1]+yrange*0.1)
ax.set(xlabel='Tissue', ylabel='Area under precision-\nrecall curve (AUPRC)')
# human-readable tick labels, reversed to match the plotting order used above
labels = values.index.str.capitalize().str.replace(' – ', ' in ')\
.str.replace('Cns','CNS').str.replace('avn','atrioventricular node').str.replace('Prostate','Reproductive (prostate)')
ax.set_xticklabels(labels[::-1])

In [None]:
# Box plot of per-fold AUROC for every dataset, ordered and colored by mean AUROC.
# Reuses `cmap` and `to_hex` from the previous cell.
fig, ax = plt.subplots(figsize=[8, 4]); ax.grid(False)
# mean AUROC per dataset, ascending
values = df_stat.groupby('dataset').mean(numeric_only=True)['auroc'].sort_values()
# derive colors and order
order = values.index; vmin = values.min(); vmax = values.max(); print(vmin, vmax)
# boxes are drawn from best to worst; colors are min-max scaled means mapped through `cmap`
# (assumes vmax > vmin, otherwise the scaling divides by zero)
sns.boxplot(x='dataset', y='auroc', data=df_stat, saturation=1, linewidth=1.5, linecolor='k',
            order=order[::-1], palette=[to_hex(cmap((x-vmin)/(vmax-vmin))) for x in values[::-1]])
ax.legend(bbox_to_anchor=(.99, .5), bbox_transform=ax.transAxes, loc='center left', frameon=False)
# dashed reference line at AUROC = 0.5
ax.plot([-1, df_z.shape[0]], [0.5]*2, color='k', linestyle='--')
# pad the x-range by one slot on each side and rotate the tick labels
ax.set_xlim(-1, df_z.shape[0]); ax.tick_params(axis='x', labelrotation=90)
# add 10% head-room above the tallest box
ylim = ax.get_ylim(); yrange = ylim[1] - ylim[0]; ax.set_ylim(ylim[0], ylim[1]+yrange*0.1)

In [None]:
# stack the per-fold held-out probabilities, average the folds of each dataset, and end up
# with ligand-receptor pairs as rows and datasets as columns
preds = pd.concat(preds_, axis=1).T.reset_index().groupby('index').mean().T
# p-value of the Pearson correlation for every pair of datasets
df_p = preds.corr(method=get_p).astype(float)
# zero the diagonal so that self-correlations are never masked
for idx in df_p.index:
    df_p.loc[idx, idx] = 0
# hide dataset pairs that are not significant at 0.05
mask = df_p >= 0.05

In [None]:
# demonstrate organ-to-organ similarity: ward-clustered heatmap of the correlations between
# datasets' predictions; pairs flagged in `mask` are hidden and show the gray background
g = sns.clustermap(preds.corr(), cmap='magma', method='ward', cbar_pos=(0, 1, .01, .1), mask=mask,
                   figsize=[8, 8], dendrogram_ratio=.2, xticklabels=1, yticklabels=1, vmax=0.8, vmin=-0.2)
g.ax_heatmap.grid(False); g.ax_cbar.grid(False)
g.ax_heatmap.tick_params(labelsize=10); g.ax_heatmap.set_facecolor('lightgray')

In [None]:
# copy the annotation table and add a '<organ> – <adult|fetal>' label per row in 'tmp'
anno_tmp = anno.copy()
# tag -> organ / age-category lookups. They are applied with .map on each row's tag, the
# same way the per-organ averaging cell labels df_z. Assigning the lookup's `.values`
# positionally is not safe: value_counts() orders rows by count, not by anno's row order.
tag_to_organ = anno[['tag','organ']].value_counts().reset_index().set_index('tag')['organ']
tag_to_age = anno[['tag','age_cat']].value_counts().reset_index().set_index('tag')['age_cat']
anno_tmp['tmp'] = anno_tmp['tag'].map(tag_to_organ)
anno_tmp['tmp'] += ':' + anno_tmp['tag'].map(tag_to_age)
# collapse the age categories to adult/fetal; regex=False because the patterns contain
# parentheses that must be matched literally regardless of the pandas default
anno_tmp['tmp'] = (anno_tmp['tmp']
                   .str.replace('Adult (≥60)', 'adult', regex=False)
                   .str.replace('Adult (<60)', 'adult', regex=False)
                   .str.replace('Unknown (likely adult)', 'adult', regex=False)
                   .str.replace('Fetal', 'fetal', regex=False)
                   .str.replace(':', ' – ', regex=False))

In [None]:
# Per-dataset dispersion score stored as 'coef_of_var': the NaN-ignoring mean over genes of
# sqrt(per-gene mean) / per-gene variance.
# NOTE(review): this is not the textbook coefficient of variation (std / mean) — confirm the
# formula is intended before interpreting the column.
df_info['coef_of_var'] = np.nan
# assumes df_info rows are labelled 0..len(adatas)-1 in the same order as `adatas` — TODO confirm
for idx in tqdm(range(len(adatas)), total=len(adatas)):
    counts = adatas[idx].X
    # Pick the sparse or dense code path explicitly rather than through a bare `except`,
    # which would also hide unrelated failures (e.g. running out of memory while densifying).
    if hasattr(counts, 'toarray'):
        # sparse matrix: the mean is taken on the sparse data, the variance on a dense copy
        gene_mean = np.asarray(counts.mean(axis=0)).ravel()
        gene_var = counts.toarray().var(axis=0)
    else:
        gene_mean = np.asarray(counts.mean(axis=0)).ravel()
        gene_var = np.asarray(counts.var(axis=0)).ravel()
    coef_of_var = np.nanmean(np.sqrt(gene_mean) / gene_var)
    df_info.loc[idx, 'coef_of_var'] = coef_of_var
# carry the score over to the annotation table via the dataset tag
anno_tmp['coef_of_var'] = anno_tmp['tag'].map(df_info.set_index('tag')['coef_of_var'])

In [None]:
# sparsity of each dataset = fraction of entries in X that equal zero
df_info['sparsity'] = np.nan
for idx in tqdm(range(len(adatas)), total=len(adatas)):
    nonzero_fraction = np.mean(adatas[idx].X != 0)
    df_info.loc[idx, 'sparsity'] = 1 - nonzero_fraction
# carry the value over to the annotation table via the dataset tag
tag_to_sparsity = df_info.set_index('tag')['sparsity']
anno_tmp['sparsity'] = anno_tmp['tag'].map(tag_to_sparsity)

In [None]:
from scipy.spatial.distance import pdist
# connectivity of each dataset = fraction of spot pairs closer than 2.5 in 'spatial' coordinates
df_info['connectivity'] = np.nan
for idx in tqdm(range(len(adatas)), total=len(adatas)):
    pair_distances = pdist(adatas[idx].obsm['spatial'])
    df_info.loc[idx, 'connectivity'] = np.mean(pair_distances < 2.5)
# carry the value over to the annotation table via the dataset tag
tag_to_connectivity = df_info.set_index('tag')['connectivity']
anno_tmp['connectivity'] = anno_tmp['tag'].map(tag_to_connectivity)

In [None]:
# collect, per dataset, the z-scores of the known pairs together with the random-control pairs
df_zs = []
for result, tag in zip(results, df_info['tag']):
    # datasets without a result are skipped
    if result is not None:
        # each result dict is keyed by file id; only its first key is used
        file_id = list(result.keys())[0]
        result = result[file_id]
        known = result['known']['z'].copy()
        rand = result['rand']['z'].copy()
        # suffix the labels so the two sets stay distinguishable after concatenation
        known.index = known.index + '}{KNOWN'
        rand.index = rand.index + '}{RAND'
        df_z = pd.concat([known, rand], axis=0).copy()
        df_z.name = tag
        df_zs.append(df_z)

In [None]:
# one row per dataset, one column per suffixed pair; missing pairs count as 0
df_z = pd.concat(df_zs, axis=1).T.fillna(0)
# separate the known pairs from the random controls by their suffix
known_col_mask = df_z.columns.str.endswith('}{KNOWN')
rand_col_mask = df_z.columns.str.endswith('}{RAND')
df_z_known, df_z_test = df_z.loc[:, known_col_mask], df_z.loc[:, rand_col_mask]
# strip the suffixes again
df_z_known.columns = df_z_known.columns.str[:-len('}{KNOWN')]
df_z_test.columns = df_z_test.columns.str[:-len('}{RAND')]
# baseline of each dataset = mean z-score of its random controls
baseline_random = df_z_test.mean(1)
# express the known pairs relative to that per-dataset baseline
df_z_known = (df_z_known.T / baseline_random).T

### repeat this per dataset stratification at higher resolution

In [None]:
# Cross-validate one logistic regression per row of df_z_known (one individual dataset),
# using that row's values as the single feature; held-out probabilities go to preds_.
df_stat_hires = pd.DataFrame(columns=['auroc','auprc','auroc_rand','auprc_rand','method','dataset']); preds_ = []
for dataset in tqdm(df_z_known.index):
    # feature = this row's value per column; label = 1 when the column name is in df_intrxn.index
    xs = df_z_known.loc[dataset]
    ys = 1 * df_z_known.columns.isin(df_intrxn.index)
    xs = np.array(xs).reshape(-1, 1); ys = np.array(ys)
    # re-seed per dataset so every dataset sees the same folds and the same RNG stream
    seed = 0; np.random.seed(seed)
    sss = StratifiedKFold(random_state=0, n_splits=10, shuffle=True)
    # idxs = training indices, idys = held-out indices of the current fold
    for idxs, idys in sss.split(xs, ys):
        data = xs[idxs].copy(), ys[idxs].copy()
        # train the model
        clf = LogisticRegression(random_state=seed)
        clf.fit(*data)
        # predicted probability of class 1 on the held-out fold
        preds = clf.predict_proba(xs[idys])
        preds = preds[:, clf.classes_ == 1][:, 0]
        # Store a COPY of the predictions: pd.Series can share memory with the array it wraps,
        # and `preds` is shuffled in place below, which would silently scramble the stored
        # values relative to their column labels.
        preds_.append(pd.Series(preds.copy(), index=df_z_known.columns[idys], name=dataset))
        preds_bin = clf.predict(xs[idys])
        # auc(fpr, tpr) and auc(recall, precision)
        auroc = auc(*roc_curve(1 * (ys[idys] == 1), preds)[:-1])
        auprc = auc(*precision_recall_curve(1 * (ys[idys] == 1), preds)[:-1][::-1])
        # baseline: in-place shuffle of the predictions; the preds_bin shuffle is otherwise
        # unused but still advances the global RNG, so it is kept to reproduce the numbers
        np.random.shuffle(preds)
        np.random.shuffle(preds_bin)
        auroc_rand = auc(*roc_curve(1 * (ys[idys] == 1), preds)[:-1])
        auprc_rand = auc(*precision_recall_curve(1 * (ys[idys] == 1), preds)[:-1][::-1])
        df_stat_hires.loc[df_stat_hires.shape[0]] = auroc, auprc, auroc_rand, auprc_rand, 'mean', dataset

In [None]:
# annotation rows of the datasets that were evaluated, plus their fold-averaged metrics
anno_mean = anno_tmp[anno_tmp['tag'].isin(df_stat_hires['dataset'])].copy()
dataset_means = df_stat_hires.groupby('dataset').mean(numeric_only=True)
anno_mean['auroc'] = anno_mean['tag'].map(dataset_means['auroc']).values
anno_mean['auprc'] = anno_mean['tag'].map(dataset_means['auprc']).values
# ward-clustered heatmap of the Pearson correlations between dataset properties and performance
cols = ['n-spots', 'coef_of_var',  'sparsity', 'connectivity', 'auroc', 'auprc']
g = sns.clustermap(anno_mean[cols].corr(method='pearson'), cmap='magma', vmin=-0.5, vmax=1,
                   method='ward', figsize=[5, 5], dendrogram_ratio=.2, cbar_pos=(0, 1, .01, .1))
g.ax_heatmap.grid(False)

In [None]:
# mean AUROC of each dataset against its number of spots
fig, ax = plt.subplots(figsize=[4, 4]); ax.grid(False)
xs, ys = anno_mean['auroc'], anno_mean['n-spots']
ax.scatter(xs, ys, s=20, alpha=0.75, lw=1.5,
           color='xkcd:pale lavender', edgecolor='xkcd:bright lilac')
# least-squares straight line (degree 1) through the points
model = np.polynomial.Polynomial.fit(xs, ys, 1)
# sample the fit over the visible x-range, then restore the autoscaled limits
xlim, ylim = ax.get_xlim(), ax.get_ylim()
xl, yl = model.linspace(domain=xlim)
ax.plot(xl, yl, color='k')
ax.set_xlim(*xlim); ax.set_ylim(*ylim)
ax.set(xlabel='AUROC', ylabel='# of spots')
# cell output: Pearson correlation and its p-value
ss.pearsonr(xs, ys)

In [None]:
# mean AUPRC of each dataset against its number of spots
fig, ax = plt.subplots(figsize=[4, 4]); ax.grid(False)
xs, ys = anno_mean['auprc'], anno_mean['n-spots']
ax.scatter(xs, ys, s=20, alpha=0.75, lw=1.5,
           color='xkcd:pale lavender', edgecolor='xkcd:bright lilac')
# least-squares straight line (degree 1) through the points
model = np.polynomial.Polynomial.fit(xs, ys, 1)
# sample the fit over the visible x-range, then restore the autoscaled limits
xlim, ylim = ax.get_xlim(), ax.get_ylim()
xl, yl = model.linspace(domain=xlim)
ax.plot(xl, yl, color='k')
ax.set_xlim(*xlim); ax.set_ylim(*ylim)
ax.set(xlabel='Area under precision-\nrecall curve (AUPRC)', ylabel='# of observations / dataset')
# cell output: Pearson correlation and its p-value
ss.pearsonr(xs, ys)

In [None]:
# mean AUROC of each dataset against the sparsity of its counts
fig, ax = plt.subplots(figsize=[4, 4]); ax.grid(False)
xs, ys = anno_mean['auroc'], anno_mean['sparsity']
ax.scatter(xs, ys, s=20, alpha=0.75, lw=1.5,
           color='xkcd:pale lavender', edgecolor='xkcd:bright lilac')
# least-squares straight line (degree 1) through the points
model = np.polynomial.Polynomial.fit(xs, ys, 1)
# sample the fit over the visible x-range, then restore the autoscaled limits
xlim, ylim = ax.get_xlim(), ax.get_ylim()
xl, yl = model.linspace(domain=xlim)
ax.plot(xl, yl, color='k')
ax.set_xlim(*xlim); ax.set_ylim(*ylim)
ax.set(xlabel='AUROC', ylabel='sparsity')
# cell output: Pearson correlation and its p-value
ss.pearsonr(xs, ys)

In [None]:
# mean AUPRC of each dataset against the sparsity of its counts
fig, ax = plt.subplots(figsize=[4, 4]); ax.grid(False)
xs, ys = anno_mean['auprc'], anno_mean['sparsity']
ax.scatter(xs, ys, s=20, alpha=0.75, lw=1.5,
           color='xkcd:pale lavender', edgecolor='xkcd:bright lilac')
# least-squares straight line (degree 1) through the points
model = np.polynomial.Polynomial.fit(xs, ys, 1)
# sample the fit over the visible x-range, then restore the autoscaled limits
xlim, ylim = ax.get_xlim(), ax.get_ylim()
xl, yl = model.linspace(domain=xlim)
ax.plot(xl, yl, color='k')
ax.set_xlim(*xlim); ax.set_ylim(*ylim)
ax.set(xlabel='Area under precision-\nrecall curve (AUPRC)', ylabel='Sparsity of counts')
# cell output: Pearson correlation and its p-value
ss.pearsonr(xs, ys)

### examine spatial dynamics of promiscuous ligands and receptors

In [None]:
# Rebuild the per-organ table of known-pair z-scores (df_z was overwritten by earlier cells).
df_zs = []
for result, tag in zip(results, df_info['tag']):
    # datasets without a result are skipped
    if result is None: continue
    # each result dict is keyed by file id; only its first key is used
    file_id = list(result.keys())[0]
    # grab the result object
    result = result[file_id]
    # copy the z-scores of the known pairs and name them by the dataset tag
    df_z = result['known']['z'].copy()
    df_z.name = tag
    df_zs.append(df_z)
# one row per dataset tag, one column per ligand-receptor pair; missing pairs count as 0
df_z = pd.concat(df_zs, axis=1).T.fillna(0)
# label every dataset as '<organ>:<age category>' by looking its tag up in `anno`
df_z['organ'] = df_z.index.map(anno[['tag','organ']].value_counts().reset_index().set_index('tag')['organ'])
df_z['organ'] += ':' + df_z.index.map(anno[['tag','age_cat']].value_counts().reset_index().set_index('tag')['age_cat'])
# collapse the age categories to adult/fetal. regex=False is explicit because the patterns
# contain parentheses: under a regex interpretation (the default in older pandas) they would
# be treated as groups and 'Adult (≥60)' would never match.
df_z['organ'] = (df_z['organ']
                 .str.replace('Adult (≥60)', 'adult', regex=False)
                 .str.replace('Adult (<60)', 'adult', regex=False)
                 .str.replace('Unknown (likely adult)', 'adult', regex=False)
                 .str.replace('Fetal', 'fetal', regex=False)
                 .str.replace(':', ' – ', regex=False))
# average the datasets of each organ/age group
df_z = df_z.groupby('organ').mean()
# divide every group (row) by its baseline; baseline_random_organ is defined in an earlier cell
df_z = (df_z.T / baseline_random_organ).T

In [None]:
from scipy.spatial.distance import cdist, pdist, squareform
# retrieve ligands with multiple receptors
ligand_counts = df_intrxn_gpcrs['ligand'].value_counts()
ligands = ligand_counts.index[ligand_counts >= 2]
# columns of df_z that are listed in df_intrxn.index; identical for every ligand,
# so it is computed once outside the loop
is_listed = df_z.columns.isin(df_intrxn.index)
# compute the distance in spatial patterning
dist_rows = []
for ligand in ligands:
    # NOTE(review): this is a plain prefix match, so a ligand whose name is a
    # prefix of another ligand's name would also pick up that ligand's columns;
    # confirm the column naming scheme rules this out
    is_ligand = df_z.columns.str.startswith(ligand)
    cols_in = df_z.columns[is_ligand & is_listed]
    cols_out = df_z.columns[is_ligand & ~is_listed]
    # at least two columns are required on each side
    if len(cols_in) < 2 or len(cols_out) < 2:
        continue
    # one profile per column, taken across the rows of df_z
    prof_in, prof_out = df_z[cols_in].T, df_z[cols_out].T
    # mean pairwise distance among the listed columns
    d_in = pdist(prof_in).mean()
    # mean distance between listed and unlisted columns
    d_ou = cdist(prof_in, prof_out).mean()
    dist_rows.append((ligand, d_in, d_ou, len(cols_in)))
# build the table once instead of appending one row at a time
df_dist = pd.DataFrame(dist_rows, columns=['ligand','in','out','n'])

In [None]:
# derive the relevant ligands involved in certain interactions
is_ppi = df_intrxn_gpcrs['is_ppi']
is_secretion = df_intrxn_gpcrs['annotation'] == 'Secretion'
# adhesion is identified through the `directionality` column for every class, so the
# exclusions below use the same definition as adhes_ligands.
# NOTE(review): the exclusions used to compare `annotation` with 'Adhesion-Adhesion',
# which is redundant next to annotation == 'Secretion' and looked like a column
# mix-up; confirm that `directionality` is the intended column.
is_adhesion = df_intrxn_gpcrs['directionality'] == 'Adhesion-Adhesion'
# ligands of interactions not flagged as protein-protein
metab_ligands = df_intrxn_gpcrs.loc[~is_ppi, 'ligand'].unique()
# ligands of protein-protein interactions annotated as secretion, adhesion excluded
secr_ligands = df_intrxn_gpcrs.loc[is_ppi & is_secretion & ~is_adhesion, 'ligand'].unique()
# ligands of the remaining protein-protein interactions, adhesion excluded
nonsecr_ligands = df_intrxn_gpcrs.loc[is_ppi & ~is_secretion & ~is_adhesion, 'ligand'].unique()
# ligands of adhesion interactions
adhes_ligands = df_intrxn_gpcrs.loc[is_adhesion, 'ligand'].unique()

In [None]:
# adhesions do not have enough 1LMR
# each class: keep the df_dist rows whose ligand belongs to the class, reshape to
# long form (one row per ligand/variable) and tag the rows with the class label
df_metab = (
    df_dist.loc[df_dist['ligand'].isin(metab_ligands)]
    .melt(id_vars=['ligand'])
    .assign(annotation='Metabolite (Secreted)')
)
df_secr = (
    df_dist.loc[df_dist['ligand'].isin(secr_ligands)]
    .melt(id_vars=['ligand'])
    .assign(annotation='Protein (Secreted)')
)
df_nonsecr = (
    df_dist.loc[df_dist['ligand'].isin(nonsecr_ligands)]
    .melt(id_vars=['ligand'])
    .assign(annotation='Protein (Non-Secreted)')
)
df_adhes = (
    df_dist.loc[df_dist['ligand'].isin(adhes_ligands)]
    .melt(id_vars=['ligand'])
    .assign(annotation='Protein (Adhesion)')
)
# stack the four classes into a single long table
df = pd.concat([df_metab, df_secr, df_nonsecr, df_adhes], axis=0)

In [None]:
# save the values: keep independent copies, since df_dist and df are reassigned
# by the next comparison
df_dist_spatial = df_dist.copy()
df_spatial = df.copy()

In [None]:
# retrieve ligands with multiple receptors
ligand_counts = df_intrxn_gpcrs['ligand'].value_counts()
ligands = ligand_counts.index[ligand_counts >= 2]
# columns of df_z that are listed in df_intrxn.index; identical for every ligand,
# so it is computed once outside the loop
is_listed = df_z.columns.isin(df_intrxn.index)
# compare maximal interaction scores
dist_rows = []
for ligand in tqdm(ligands):
    # NOTE(review): this is a plain prefix match, so a ligand whose name is a
    # prefix of another ligand's name would also pick up that ligand's columns;
    # confirm the column naming scheme rules this out
    is_ligand = df_z.columns.str.startswith(ligand)
    cols_in = df_z.columns[is_ligand & is_listed]
    cols_out = df_z.columns[is_ligand & ~is_listed]
    # at least two columns are required on each side
    if len(cols_in) < 2 or len(cols_out) < 2:
        continue
    # per-column maximum over the rows of df_z, as an (n_columns, 1) array
    max_in = df_z[cols_in].max(0).values.reshape(-1, 1)
    max_out = df_z[cols_out].max(0).values.reshape(-1, 1)
    # mean pairwise distance between the maxima of the listed columns
    d_in = pdist(max_in).mean()
    # mean distance between the maxima of listed and unlisted columns
    d_ou = cdist(max_in, max_out).mean()
    dist_rows.append((ligand, d_in, d_ou, len(cols_in)))
# build the table once instead of appending one row at a time
df_dist = pd.DataFrame(dist_rows, columns=['ligand','in','out','n'])

In [None]:
# adhesions do not have enough 1LMR
# each class: keep the df_dist rows whose ligand belongs to the class, reshape to
# long form (one row per ligand/variable) and tag the rows with the class label
df_metab = (
    df_dist.loc[df_dist['ligand'].isin(metab_ligands)]
    .melt(id_vars=['ligand'])
    .assign(annotation='Metabolite (Secreted)')
)
df_secr = (
    df_dist.loc[df_dist['ligand'].isin(secr_ligands)]
    .melt(id_vars=['ligand'])
    .assign(annotation='Protein (Secreted)')
)
df_nonsecr = (
    df_dist.loc[df_dist['ligand'].isin(nonsecr_ligands)]
    .melt(id_vars=['ligand'])
    .assign(annotation='Protein (Non-Secreted)')
)
df_adhes = (
    df_dist.loc[df_dist['ligand'].isin(adhes_ligands)]
    .melt(id_vars=['ligand'])
    .assign(annotation='Protein (Adhesion)')
)
# stack the four classes into a single long table
df = pd.concat([df_metab, df_secr, df_nonsecr, df_adhes], axis=0)

In [None]:
# save the values: keep independent copies of the max-score tables under their own names
df_dist_max = df_dist.copy()
df_max = df.copy()

In [None]:
# look at the broad distribution
# log2 ratio of the 'in' to the 'out' distance per ligand; negative means 'in' < 'out'
log_ratio = np.log2(df_dist_spatial['in'] / df_dist_spatial['out'])
fig, ax = plt.subplots(figsize=[1, 4]); ax.grid(False)
sns.boxplot(log_ratio, color='xkcd:bright lilac', linecolor='k', linewidth=1.5, ax=ax)
# dashed reference line at a ratio of 1 (log2 = 0)
ax.plot([-1, 1], [0]*2, color='k', linestyle='--', zorder=0); ax.set_xlim(-1, 1)
ax.tick_params(axis='x', bottom=False, which='both')
ax.set_ylabel('Cross-organ interaction\npattern distance')
# calculate statistics
# counts of ligands below (1) and not below (0) zero; reindex keeps both entries
# even when one side is empty, where .loc[[1, 0]] would raise a KeyError
counts = (1 * (log_ratio < 0)).value_counts().reindex([1, 0], fill_value=0)
exp_avg = counts.sum() / 2
exp = [exp_avg] * 2
# NOTE(review): fisher_exact treats the expected 50/50 row as if it were observed
# counts, and presumably casts the table to integers (so an odd total would lose
# the .5); a one-sample binomial test may be closer to the intent -- confirm
p = ss.fisher_exact(np.array([exp, counts]))[1]
print(p)
# show the percent that are less than 0
fig, ax = plt.subplots(figsize=[4, 4]); ax.grid(False)
_ = ax.pie(counts, colors=['xkcd:bright lilac','lightgray'], hatch=['','//'])
print(np.mean(log_ratio < 0))
log_ratio.describe()

In [None]:
# look at the broad distribution
# log2 ratio of the 'in' to the 'out' distance per ligand; negative means 'in' < 'out'
log_ratio = np.log2(df_dist_max['in'] / df_dist_max['out'])
fig, ax = plt.subplots(figsize=[1, 4]); ax.grid(False)
sns.boxplot(log_ratio, color='xkcd:bright lilac', linecolor='k', linewidth=1.5, ax=ax)
# dashed reference line at a ratio of 1 (log2 = 0)
ax.plot([-1, 1], [0]*2, color='k', linestyle='--', zorder=0); ax.set_xlim(-1, 1)
ax.tick_params(axis='x', bottom=False, which='both')
ax.set_ylabel('Interaction intensity distance')
# calculate statistics
# counts of ligands below (1) and not below (0) zero; reindex keeps both entries
# even when one side is empty, where .loc[[1, 0]] would raise a KeyError
counts = (1 * (log_ratio < 0)).value_counts().reindex([1, 0], fill_value=0)
exp_avg = counts.sum() / 2
exp = [exp_avg] * 2
# NOTE(review): fisher_exact treats the expected 50/50 row as if it were observed
# counts, and presumably casts the table to integers (so an odd total would lose
# the .5); a one-sample binomial test may be closer to the intent -- confirm
p = ss.fisher_exact(np.array([exp, counts]))[1]
print(p)
# show the percent that are less than 0
fig, ax = plt.subplots(figsize=[4, 4]); ax.grid(False)
_ = ax.pie(counts, colors=['xkcd:bright lilac','lightgray'], hatch=['','//'])
print(np.mean(log_ratio < 0))
log_ratio.describe()

In [None]:
# compare intra and inter differences in terms of spatial patterning
# only the two distance variables are plotted (the 'n' rows are left out)
df_plot = df_spatial[df_spatial['variable'].isin(['in','out'])]
fig, ax = plt.subplots(figsize=[4, 4]); ax.grid(False)
sns.boxplot(x='annotation', y='value', hue='variable', data=df_plot, ax=ax,
            saturation=1, hue_order=['in','out'], linewidth=1.5, linecolor='k', palette=['xkcd:bright lilac','lightgray'])
ax.set_xlim(-1, 4); ax.tick_params(axis='x', labelrotation=90)
# Mann-Whitney U test of 'in' versus 'out' within each ligand-receptor class
for x in df_spatial['annotation'].unique():
    in_class = df_spatial['annotation'] == x
    p = ss.mannwhitneyu(df_spatial.loc[in_class & (df_spatial['variable'] == 'in'), 'value'],
                        df_spatial.loc[in_class & (df_spatial['variable'] == 'out'), 'value'])[1]
    # report the class being tested alongside its p-value
    print(x, p)
# hatch patches 4-7 (assumes these are the four 'out' boxes -- TODO confirm
# the patch order and that all four classes are present)
for n in range(4, 8):
    ax.patches[n].set_hatch('//')
# add 15% headroom above the data
ylim = ax.get_ylim(); yrange = ylim[1] - ylim[0]; ax.set_ylim(ylim[0], ylim[1]+yrange*0.15)
ax.set(xlabel='Ligand-receptor class', ylabel='Cross-organ interaction\npattern distance')
ax.legend(labels=['Binds the\nsame ligand','Binds any\nother ligands'],
          bbox_to_anchor=(.99, 1.05), bbox_transform=ax.transAxes, frameon=False, loc='upper left')

In [None]:
# compare intra and inter differences in terms of maximal interaction score
# only the two distance variables are plotted (the 'n' rows are left out)
df_plot = df_max[df_max['variable'].isin(['in','out'])]
fig, ax = plt.subplots(figsize=[4, 4]); ax.grid(False)
sns.boxplot(x='annotation', y='value', hue='variable', data=df_plot, ax=ax,
            saturation=1, hue_order=['in','out'], linewidth=1.5, linecolor='k', palette=['xkcd:bright lilac','lightgray'])
ax.set_xlim(-1, 4); ax.tick_params(axis='x', labelrotation=90)
# Mann-Whitney U test of 'in' versus 'out' within each ligand-receptor class
for x in df_max['annotation'].unique():
    in_class = df_max['annotation'] == x
    p = ss.mannwhitneyu(df_max.loc[in_class & (df_max['variable'] == 'in'), 'value'],
                        df_max.loc[in_class & (df_max['variable'] == 'out'), 'value'])[1]
    # report the class being tested alongside its p-value
    print(x, p)
# hatch patches 4-7 (assumes these are the four 'out' boxes -- TODO confirm
# the patch order and that all four classes are present)
for n in range(4, 8):
    ax.patches[n].set_hatch('//')
# add 15% headroom above the data
ylim = ax.get_ylim(); yrange = ylim[1] - ylim[0]; ax.set_ylim(ylim[0], ylim[1]+yrange*0.15)
ax.set(xlabel='Ligand-receptor class', ylabel='Interaction intensity distance')
ax.legend(labels=['Binds the\nsame ligand','Binds any\nother ligands'],
          bbox_to_anchor=(.99, 1.05), bbox_transform=ax.transAxes, frameon=False, loc='upper left')