In [1]:
import pickle
import sys
import os
import gc

import numpy as np
import bottleneck as bn
import pandas as pd

# Run TraSig on the example data

1. The inputs were prepared by following the `Prepare_input_from_dynverse_ti_methods.ipynb` tutorial.
2. The demo below uses the default multiprocessing setting (4 cores).

In [2]:
# Run TraSig's main.py through the shell and time the call.
# Flag meanings, judging by the Namespace this cell prints:
#   -i input dir, -o output dir, -d project, -g preprocess,
#   -b modelName, -n numPerms, -s startingTreatment
%time ! main.py -i ../example/input -o ../example/output -d oligodendrocyte-differentiation-clusters_marques -g None -b ti_slingshot -n 1000 -s smallerWindow

Namespace(input='../example/input', listType='ligand_receptor', metric='dot', modelName='ti_slingshot', multiProcess=True, nLap=20, nan2zero=True, ncores=4, numPerms='1000', output='../example/output', preprocess='None', project='oligodendrocyte-differentiation-clusters_marques', startingTreatment='smallerWindow')
Load:  oligodendrocyte-differentiation-clusters_marques_lr.txt
Permutations grouped to different cores: [range(1, 334), range(334, 667), range(667, 1000), range(1000, 1001)]
CPU times: user 592 ms, sys: 102 ms, total: 695 ms
Wall time: 40.1 s


# Analyze results

## Load inputs

In [3]:
# ---- Run configuration ------------------------------------------------------
# These values mirror the arguments of the main.py call earlier in the notebook;
# they are used to rebuild the names of the files that run read and wrote.
project = "oligodendrocyte-differentiation-clusters_marques"
preprocess = ""
model_name = "ti_slingshot"
others = ""
list_type = 'ligand_receptor'
startingTreatment = "smallerWindow"


def suffix_part(value):
    """Return the filename fragment contributed by an optional run setting.

    An unset setting contributes nothing; any other value contributes
    ``_<value>``.  Both ``""`` and ``"None"`` (as well as ``None``) count as
    unset: the command line passes ``-g None`` and main.py still loads
    ``<project>_lr.txt``, so the literal string "None" must not end up in a
    file name, and an empty string must not leave a dangling underscore.
    """
    if value is None or value in ("", "None"):
        return ""
    return f"_{value}"


_preprocess = suffix_part(preprocess)
_startingTreatment = suffix_part(startingTreatment)

n_lap = 20 # smoothing window
metrics = ['dot']
nan2zero = True
num_perms = 1e3

input_path = '../example/input'
output_path = '../example/output'

# Common prefix of the output files, e.g. "<project>_<list_type>_<model>_<treatment>_nlap_<n_lap>".
suffix = f"{project}_{list_type}{_preprocess}_{model_name}"
suffix = f"{suffix}{_startingTreatment}_nlap_{n_lap}{others}"
# Adds the first metric and the order of magnitude of the permutation count.
child_suffix = f"{suffix}_{metrics[0]}_{int(np.log10(num_perms))}"


# Read the pickled interaction file: a list of (ligand, receptor/target) entries.
# NOTE: pickle.load executes arbitrary code from the file; only open trusted inputs.
filename = f"{list_type}_{project}{_preprocess}.pickle"
interaction_file = os.path.join(input_path, filename)
with open(interaction_file, 'rb') as fh:
    interaction_list = pickle.load(fh)

    
# Read the expression table (first CSV column is used as the row index).
filename = f"{project}{_preprocess}_lr.txt"
print("Load: ", filename)

data_file = os.path.join(input_path, filename)
df = pd.read_csv(data_file, index_col=0)

# Raw matrix of values plus the column labels (gene names, assumed unique).
cell_exps = df.values
gene_names = [name for name in df.columns.values]


# (optional) load the correspondence between sampling time and path
filename = f"sampling_time_per_path_{project}{_preprocess}_{model_name}.pickle"
time2path_file = os.path.join(input_path, filename)
with open(time2path_file, 'rb') as fh:
    time2path = pickle.load(fh)

# Invert the mapping: every path listed under a sampling time points back to that time.
# If a path appears under several times, the one iterated last wins.
path2time = {
    path: sampling_time
    for sampling_time, paths in time2path.items()
    for path in paths
}

        
# Load the original path and time assignment of every cell.
hid_var_file = f"{project}{_preprocess}_{model_name}_it2_hid_var.pickle"
hid_var_path = os.path.join(input_path, hid_var_file)
with open(hid_var_path, 'rb') as fh:
    hid_var = pickle.load(fh, encoding="latin1")

cell_paths_o = hid_var["cell_path"]
cell_times_o = hid_var["cell_time"]
unique_paths = np.unique(cell_paths_o)

# all possible labels for cell time (0.01 grid, rounded to two decimals)
all_times = [round(step, 2) for step in np.arange(0, 1.01, 0.01)]


Load:  oligodendrocyte-differentiation-clusters_marques_lr.txt


## Load outputs

In [4]:
# Load the scores computed on the original (non-permuted) data into a long-format table:
# one row per (cluster pair, gene pair), one column per metric.
_n = 0

# One list per metric plus two bookkeeping columns.
_columns = {m: [] for m in metrics}
_columns.update({'pair': [], 'gene_pair_id': []})

# load results
filename = f"{suffix}_metrics_{_n}.pickle"
data_file = os.path.join(output_path, filename)

with open(data_file, 'rb') as handle:
    results = pickle.load(handle)

# The last metric is the reference for how many scores each pair carries.
# This is the value the previous code picked up implicitly through a leaked loop variable.
_ref_metric = metrics[-1]

# Number of scores per cluster pair; stays 0 when `results` is empty
# instead of failing on an undefined loop variable.
num_pairs = 0
for pair, mets in results.items():
    for m in metrics:
        _columns[m].extend(mets[m])

    num_pairs = len(mets[_ref_metric])
    _columns['pair'].extend([pair] * num_pairs)
    _columns['gene_pair_id'].extend(range(num_pairs))

df = pd.DataFrame(_columns)


# Load the permutation results: per cluster pair, a mapping metric -> counts.
filename = f"{suffix}_permutation_results.pickle"
data_file = os.path.join(output_path, filename)

with open(data_file, 'rb') as handle:
    pair2counts = pickle.load(handle)

# Turn the counts into p-values in place: (count + 1) / (number of permutations + 1).
# NOTE(review): presumably each count is the number of permutations scoring at least
# as high as the observed value; confirm against main.py.
_denominator = num_perms + 1
for per_metric in pair2counts.values():
    for m in metrics:
        per_metric[m] = (per_metric[m] + 1) / _denominator

            
# Add the p-values to the dataframe as "<metric>_p" columns.
# The rows of `df` were built by iterating `results`, so the p-values are gathered
# in that same order and looked up by pair key. Iterating `pair2counts` directly would
# attach p-values to the wrong rows if the two pickles enumerate the pairs differently.
# A pair missing from `pair2counts` raises KeyError.
_columns = {m: [] for m in metrics}

for pair in results:
    counts = pair2counts[pair]
    for m in metrics:
        _columns[m].extend(counts[m])

for m in metrics:
    df[f"{m}_p"] = _columns[m]

In [5]:
# Add ligand / target names by looking each row's gene pair up in the interaction list.
_gene_pairs = [interaction_list[int(idx)] for idx in df['gene_pair_id']]
df['ligand'] = [gene_pair[0] for gene_pair in _gene_pairs]
df['target'] = [gene_pair[1] for gene_pair in _gene_pairs]
ligand_list = np.unique(df['ligand'])

# Add cluster info: a pair label is "<sender>_<receiver>", both integer cluster ids.
_pair_parts = [label.split('_') for label in df['pair']]
for _column, _position in (('sender', 0), ('receiver', 1)):
    df[_column] = [parts[_position] for parts in _pair_parts]
    df[_column] = df[_column].astype('int')

# Sampling time of each cluster, via the path -> time mapping.
df['time-sender'] = [path2time[cluster] for cluster in df['sender']]
df['time-receiver'] = [path2time[cluster] for cluster in df['receiver']]

### Adjust p-values for multiple comparisons

In [6]:
# add adjusted p-values
# NOTE: both imports bind the name `sm`; the second one wins, so from here on `sm` is the
# top-level `statsmodels` package and the FDR routine is reached as
# `sm.stats.multitest.fdrcorrection`.
# NOTE(review): presumably the `statsmodels.api` import is kept only for its side effect of
# loading the `statsmodels.stats.multitest` submodule; confirm before removing it.
import statsmodels.api as sm
import statsmodels as sm

In [7]:
# FDR-correct the 'dot' p-values separately within each cluster pair
# (statsmodels `fdrcorrection` with its default settings).
_raw_p = df['dot_p'].values
# NOTE(review): `p_adjusted` is not read by any cell visible here; kept in case a later cell uses it.
p_adjusted = _raw_p.copy()
_p = _raw_p.copy()

for pair in results:
    # positional indices of the rows belonging to this pair
    rows = np.where(df['pair'] == pair)[0]
    # fdrcorrection returns a pair; its second element is the corrected p-values
    rejected, corrected = sm.stats.multitest.fdrcorrection(_raw_p[rows])
    _p[rows] = corrected

In [8]:
# Attach the FDR-adjusted p-values (corrected within each sender/receiver pair) to the table.
df['dot_p_adjusted'] = _p

In [9]:
# Columns to show, in display order: pair identity, sampling times, gene pair, score and p-values.
columns = ['pair','sender', 'receiver', 'time-sender', 'time-receiver', 'gene_pair_id', 
           'ligand', 'target', 'dot', 'dot_p', 'dot_p_adjusted']

In [10]:
# Preview the first rows of the annotated results table.
df[columns].head()

Unnamed: 0,pair,sender,receiver,time-sender,time-receiver,gene_pair_id,ligand,target,dot,dot_p,dot_p_adjusted
0,0_0,0,0,0,0,0,ADAM17,NOTCH1,0.242648,0.000999,0.002331
1,0_0,0,0,0,0,1,AGRN,ATP1A3,0.668552,0.000999,0.002331
2,0_0,0,0,0,0,2,AGRP,SDC3,0.008098,0.050949,0.113477
3,0_0,0,0,0,0,3,APOE,LRP1,1.583651,0.000999,0.002331
4,0_0,0,0,0,0,4,APOE,SORL1,0.200087,1.0,1.0
