This script tests the difference in SE between the response group and the non-response group at the SIG1/SIG2 clusters raised in the original study, and computes the S score for each sample.


In [1]:
# This script tests the difference in SE between the response group and the non-response group at the SIG1/SIG2 clusters raised in the original study, and computes the S score for each sample.
import sys
sys.path.append('..')
import abd_profile
import os
import pandas as pd
import GCN
import copy
import numpy as np
import tree_util
import se
from scipy.stats import mannwhitneyu
import seaborn as sns 
import matplotlib.pyplot as plt
import matplotlib as mpl
from statsmodels.stats.multitest import fdrcorrection as fdr
import json

In [2]:
# Directory layout: shared inputs live under `outer`, the cohort-specific
# inputs under `indir`, and every file this notebook writes goes to `outdir`.
outer = '../data'
indir = os.path.join(outer, 'immu')
outdir = '../result/immu/SE_sig'

# Cohort inputs: species abundance table and sample metadata.
abd_path = os.path.join(indir, 'merged_species.txt')
metadata_path = os.path.join(indir, 'metadata.txt')

# Reference tables: GCN, species distance matrix, and the table holding the
# 'SIG' / 'MGS' columns used below to build the SIG1 / SIG2 species lists.
ori_GCN = GCN.input_GCN('../data/gcn2008.tsv', transfer=True)
d_df = pd.read_csv('../data/sp_d.tsv', sep='\t', header=0, index_col=0)
related_abd_df = pd.read_csv('../data/immu/sig.txt', sep='\t', header=0, index_col=None)

# Abundance profile, restricted to the species that also appear in d_df.
raw_profile = abd_profile.input_profile(abd_path, transfer=True)
shared_species = set(d_df.index).intersection(set(raw_profile.columns))
raw_profile = raw_profile[list(shared_species)]

# Missing metadata values become the literal string 'NA'.
metadata = pd.read_csv(metadata_path, sep=',', header=0)
metadata.fillna('NA', inplace=True)


In [3]:
# Species lists of the SIG1 and SIG2 clusters; every other profiled species
# is collected under 'sig3'.
def _species_ids_for(sig_label):
    """Return the 'MGS' names whose 'SIG' equals `sig_label`, as profile column ids.

    Spaces are replaced by underscores and the result is prefixed with 's__'.
    """
    mgs_names = related_abd_df.loc[related_abd_df['SIG'] == sig_label, 'MGS']
    return ['s__{}'.format(name.replace(' ', '_')) for name in mgs_names]


sig_dict = {
    'sig1': _species_ids_for('SIG1'),
    'sig2': _species_ids_for('SIG2'),
}
assigned_species = set(sig_dict['sig1'] + sig_dict['sig2'])
sig_dict['sig3'] = list(set(raw_profile.columns).difference(assigned_species))


In [4]:
# Two-level tree: root -> {sig1, sig2, sig3} -> species.
# parent_dict maps every node (cluster or species) to its parent.
parent_dict = {'sig1': 'root', 'sig2': 'root', 'sig3': 'root'}
sig1_members = set(sig_dict['sig1'])
sig2_members = set(sig_dict['sig2'])
for sp in raw_profile.columns:
    if sp in sig1_members:
        parent_dict[sp] = 'sig1'
    elif sp in sig2_members:
        parent_dict[sp] = 'sig2'
    else:
        parent_dict[sp] = 'sig3'

node_leaves = copy.deepcopy(sig_dict)
leaf_list = list(raw_profile.columns)

# subtree_nodes[node] lists every node below `node`, followed by `node` itself.
subtree_nodes = {}
for leaf in leaf_list:
    subtree_nodes.setdefault(parent_dict[leaf], []).append(leaf)

# Propagate each cluster's members (and the cluster itself) up to its parent.
for cluster in node_leaves:
    above = parent_dict[cluster]
    subtree_nodes.setdefault(above, []).extend(subtree_nodes[cluster])
    subtree_nodes[above].append(cluster)

for node, members in subtree_nodes.items():
    members.append(node)

# direct_children_dict[parent] lists only the immediate children of `parent`.
direct_children_dict = {}
for child, above in parent_dict.items():
    direct_children_dict.setdefault(above, []).append(child)

node_leaves['root'] = leaf_list

In [5]:
# compute FR network
def fr(d_df, profile, sname):
    """Build the pairwise edge-weight matrix of one sample.

    Only species present both in `profile.columns` and in `d_df.index` are
    used. The weight between species i and j is
    ``abundance_i * abundance_j * (1 - d_df[i, j])``; the diagonal is zero.

    Parameters
    ----------
    d_df : pandas.DataFrame
        Square species-by-species table, indexed and labelled by species.
    profile : pandas.DataFrame
        Abundance table with samples as rows and species as columns.
    sname : label
        Row label of the sample in `profile`.

    Returns
    -------
    pandas.DataFrame
        Square table whose index and columns are the shared species.
    """
    shared = list(set(profile.columns).intersection(set(d_df.index)))
    abundance = np.array(profile.loc[sname, shared])
    # `1.0 - values` allocates a new array, so d_df itself is never modified.
    similarity = 1.0 - d_df.loc[shared, shared].values
    np.fill_diagonal(similarity, 0)
    weights = np.outer(abundance, abundance) * similarity
    return pd.DataFrame(weights, index=shared, columns=shared)

# compute SE for all sample
def multisample_se(profile, d_df, parent_dict, node_leaves, child_dict, direct_children_dict, param):
    """Compute the per-node SE values of every sample in `profile`.

    For each sample the edge-weight matrix is built with `fr` and handed to
    `se.subtree_se_adj` together with the tree description and `param`.

    Returns
    -------
    pandas.DataFrame
        Rows are the samples of `profile`; columns start out as the keys of
        `node_leaves`. Cells for which `se.subtree_se_adj` returned no value
        stay NaN.
    """
    result = pd.DataFrame(index=profile.index, columns=list(node_leaves.keys()))
    for sname in profile.index:
        node_values = se.subtree_se_adj(
            fr(d_df, profile, sname),
            parent_dict,
            node_leaves,
            child_dict,
            direct_children_dict,
            param,
        )
        for node, value in node_values.items():
            result.loc[sname, node] = value
    return result


In [6]:
cohort_list = ['Disc', 'Valid']
cohort_se_dict = {}
param = 0.5

# compute SE for all samples, one cohort at a time
for cohort in cohort_list:
    # metadata rows of this cohort that also have an abundance profile
    in_cohort = metadata['Cohort'] == cohort
    has_profile = metadata['Sample Name'].isin(list(raw_profile.index))
    metadata_cohort = metadata[in_cohort & has_profile]

    selected_raw_profile = raw_profile.loc[list(metadata_cohort['Sample Name']), :]
    crc_profile = abd_profile.check(selected_raw_profile, d_df)

    # Group sample names by their OS12 label; 'NA' labels are skipped.
    # A label gets an entry even when none of its samples survive the check.
    kept_samples = list(crc_profile.index)
    pheno_list = {}
    for sample_name, pheno in zip(metadata_cohort['Sample Name'], metadata_cohort['OS12']):
        if pheno == 'NA':
            continue
        members = pheno_list.setdefault(pheno, [])
        if sample_name in kept_samples:
            members.append(sample_name)

    pheno_profiles = {
        label: copy.deepcopy(crc_profile.loc[names, :])
        for label, names in pheno_list.items()
    }

    se_result = {
        label: multisample_se(
            group_profile, d_df, parent_dict, node_leaves, subtree_nodes, direct_children_dict, param
        )
        for label, group_profile in pheno_profiles.items()
    }

    cohort_se_dict[cohort] = copy.deepcopy(se_result)


In [7]:
# Write one SE table per cohort and phenotype: <outdir>/<cohort>/se_<pheno>.tsv
for cohort, se_by_pheno in cohort_se_dict.items():
    odir = os.path.join(outdir, cohort)
    if not os.path.exists(odir):
        os.makedirs(odir)

    for pheno, se_table in se_by_pheno.items():
        se_path = os.path.join(odir, 'se_{}.tsv'.format(pheno))
        se_table.to_csv(se_path, sep='\t')


In [8]:
if not os.path.exists(outdir):
    os.makedirs(outdir)
p_cutoff = 0.05

# Differential test: per cohort and per tree node, compare the SE values of
# the NR samples against the R samples with a Mann-Whitney U test.
p_df = pd.DataFrame()
mean_df = pd.DataFrame()
valid_dict = {}
show_node = []


def _mostly_zero(values, fraction=0.8):
    """Return True when strictly more than `fraction` of `values` equal 0."""
    values = list(values)
    return values.count(0) > len(values) * fraction


for cohort, se_by_pheno in cohort_se_dict.items():
    # Both groups are required for a comparison.
    if 'NR' not in se_by_pheno or 'R' not in se_by_pheno:
        continue
    valid_dict.setdefault(cohort, [])
    l_se1 = se_by_pheno['NR']
    l_se2 = se_by_pheno['R']
    for inode in l_se1.columns:
        nr_values = l_se1[inode]
        r_values = l_se2[inode]
        # Skip nodes that are almost always zero in both groups.
        if _mostly_zero(nr_values) and _mostly_zero(r_values):
            continue
        valid_dict[cohort].append(inode)
        # NOTE(review): relies on SciPy's default `alternative`; confirm it is
        # the intended sidedness for the installed SciPy version.
        _, p1 = mannwhitneyu(list(nr_values), list(r_values))
        p_df.loc[cohort, inode] = p1
        # Positive difference: mean SE is higher in NR than in R.
        mean_df.loc[cohort, inode] = np.mean(nr_values) - np.mean(r_values)
p_df.fillna(np.nan, inplace=True)

In [9]:
# Adjust the p-values of each cohort with statsmodels' `fdrcorrection`
# (default method), considering only the nodes that were actually tested,
# then overwrite the raw p-values in p_df with the adjusted ones.
for cohort in p_df.index:
    valid_nodes = valid_dict[cohort]
    raw_p = list(p_df.loc[cohort, valid_nodes])
    adjusted = fdr(raw_p, p_cutoff)[1]
    for node, adjusted_p in zip(valid_nodes, adjusted):
        p_df.loc[cohort, node] = adjusted_p
p_df.to_csv(os.path.join(outdir, 'p_all_cohorts.tsv'), sep='\t')

In [10]:
def _direction_call(p_value, mean_diff):
    """Encode one test result for the heatmap.

    Returns NaN when `p_value` is missing, 0 when it is not below `p_cutoff`,
    1 when it is below `p_cutoff` and `mean_diff` is positive, and -1 otherwise.
    """
    if pd.isna(p_value):
        return np.nan
    if p_value < p_cutoff:
        return 1 if mean_diff > 0 else -1
    return 0


# One call per (cohort, node), laid out like p_df.
node_plot = pd.DataFrame()
for cohort in p_df.index:
    for inode in p_df.columns:
        node_plot.loc[cohort, inode] = _direction_call(
            p_df.loc[cohort, inode], mean_df.loc[cohort, inode]
        )


In [11]:
# Plot the result: keep a node unless all of its calls are NaN or none of its
# calls differs from 0 (a NaN call counts as differing from 0).
part_df = node_plot
show_node = []
for inode in part_df.columns:
    calls = [node_plot.loc[cohort, inode] for cohort in part_df.index]
    all_nan = all(pd.isna(call) for call in calls)
    all_zero = not any(call != 0 for call in calls)
    if not all_zero and not all_nan:
        show_node.append(inode)

plt.figure(figsize=(5, 12))
part_df = part_df[show_node]

# Order the columns by their number of non-zero, non-missing calls.
non_zero_count = part_df.astype(bool).sum(axis=0) - part_df.isnull().sum(axis=0)
sorted_df = part_df.iloc[:, non_zero_count.argsort()]

sns.heatmap(
    sorted_df.T,
    vmax=1,
    vmin=-1,
    square=True,
    linecolor='black',
    cbar=False,
    xticklabels=True,
    yticklabels=True,
    linewidths=1,
    cmap="coolwarm",
)
plt.title('Different SE of NSCLC cohorts')
plt.xticks(rotation=90)

# Draw a visible frame around the heatmap.
ax = plt.gca()
for side in ('top', 'bottom', 'left', 'right'):
    ax.spines[side].set_visible(True)

opath = os.path.join(outdir, 'NSCLC.pdf')
plt.tight_layout()
plt.savefig(opath, dpi=300, format='pdf')
part_df[show_node].to_csv(os.path.join(outdir, 'NSCLC.tsv'), sep='\t')
plt.clf()

In [12]:
# Reload the abundance profile and metadata, then keep the discovery ('Disc')
# cohort samples that have a profile.
abd_path = os.path.join(indir, 'merged_species.txt')
raw_profile = abd_profile.input_profile(abd_path, transfer=True)
raw_profile = raw_profile[list(set(d_df.index).intersection(set(raw_profile.columns)))]
metadata_path = os.path.join(indir, 'metadata.txt')
metadata = pd.read_csv(metadata_path, sep=',', header=0)
metadata.fillna('NA', inplace=True)
metadata_cohort = metadata[(metadata['Cohort'] == 'Disc')]
metadata_cohort = metadata_cohort[metadata_cohort['Sample Name'].isin(list(raw_profile.index))]
selected_raw_profile = raw_profile.loc[list(metadata_cohort['Sample Name']), :]
disc_profile = abd_profile.check(selected_raw_profile, d_df)

# Prevalence filter: drop species whose count of samples with abundance > 0
# is below 2.5% of the samples (assumes abundances are non-negative).
binary_disc = copy.deepcopy(disc_profile)
binary_disc[binary_disc > 0] = 1
min_prevalence = len(disc_profile) * 2.5 / 100
delete_sp = [sp for sp in disc_profile.columns if binary_disc[sp].sum() < min_prevalence]
disc_profile.drop(columns=delete_sp, inplace=True)

# For each signature cluster keep the member species that survived the filter.
clusters = ['sig1', 'sig2']
interested_dict = {}
num_dict = {}
cluster_sp_dict = {}
for cluster in clusters:
    leaves = [leaf.replace('-', '_') for leaf in node_leaves[cluster]]
    # Sorted so that the exported species order does not depend on set
    # iteration order and is therefore the same on every run.
    common_sp = sorted(set(disc_profile.columns).intersection(leaves))
    interested_dict[cluster] = copy.deepcopy(disc_profile[common_sp])
    num_dict[cluster] = len(common_sp)
    cluster_sp_dict[cluster] = ','.join(common_sp)

# cluster.tsv: one row per cluster with its size and comma-separated species.
cluster_rows = ['cluster\tsize\tleaves\n']
cluster_rows.extend(
    '{}\t{}\t{}\n'.format(cluster, num_dict[cluster], ','.join(list(interested_dict[cluster].columns)))
    for cluster in clusters
)
with open(os.path.join(outdir, 'cluster.tsv'), 'w') as fp:
    fp.write(''.join(cluster_rows))

# Use a context manager so the JSON file is flushed and closed deterministically.
with open(os.path.join(outdir, 'cluster_sp.json'), 'w') as fp:
    json.dump(cluster_sp_dict, fp)

In [13]:
# compute FR S Score
# For each cluster, a species counts as "present" in a sample when its
# abundance is strictly above that species' median across samples. The cluster
# ratio of a sample is the fraction of the cluster's species that are present.
existed_sp = {}
cluster_ratio = {}
for cluster in clusters:
    cluster_p = interested_dict[cluster]
    column_medians = cluster_p.median()
    # Binarise in place: centre on the median, then map >0 to 1 and <0 to 0
    # (values equal to the median are already 0 after centring).
    for sp in cluster_p.columns:
        cluster_p.loc[:, sp] = cluster_p.loc[:, sp] - column_medians[sp]
    cluster_p[cluster_p > 0] = 1
    cluster_p[cluster_p < 0] = 0

    # Row sums count the present species of each sample.
    cluster_ratio[cluster] = cluster_p.sum(axis=1) / num_dict[cluster]

    # Comma-separated names of the present species, per sample.
    existed_sp[cluster] = {
        sample: ','.join(sp for sp in cluster_p.columns if cluster_p.loc[sample, sp] == 1)
        for sample in cluster_p.index
    }

# Use a context manager so the JSON file is flushed and closed deterministically.
with open(os.path.join(outdir, 'existed_sp.json'), 'w') as fp:
    json.dump(existed_sp, fp)

# Score = (ratio of clusters[1] - ratio of clusters[0] + 1) / 2, so it lies in
# [0, 1]. The Series is left unnamed on purpose: it is then written under the
# column header '0', which the next step reads back by that name.
first_ratio = cluster_ratio[clusters[0]]
second_ratio = cluster_ratio[clusters[1]]
result_score = (second_ratio - first_ratio + 1) / 2
result_score.to_csv(os.path.join(outdir, 'score_disc.tsv'), sep='\t')

In [14]:
# merge FR S score to metadata and binary classification TOPOB01
def _responder_fraction(table, predicted):
    """Among rows whose TOPOB01 equals `predicted`, return the fraction with OS12 == 'R'."""
    predicted_rows = table[table['TOPOB01'] == predicted]
    responders = predicted_rows[predicted_rows['OS12'] == 'R']
    return len(responders) / len(predicted_rows)


# Clinical table of the discovery cohort, indexed by sample id, without rows
# lacking an OS12 label; the id is also kept as a regular column.
DS1 = pd.read_csv('../data/immu/DS1_oncology_clinical_data.csv', header=0)
DS1 = DS1[DS1['Cohort'] == 'Disc'].set_index('Sample_id').dropna(subset=['OS12'])
DS1['Sample_id'] = DS1.index

# Attach the S score written earlier (stored under the column header '0').
pred_disc = pd.read_csv(os.path.join(outdir, 'score_disc.tsv'), sep='\t', header=0, index_col=0)
DS1['TOPOB01'] = pred_disc.loc[DS1.index, '0']
DS1.sort_values(by='TOPOB01').to_csv(os.path.join(outdir, 'pred_disc.tsv'), sep='\t', index=False)

# Binarise the score: below 0.499 -> 0, at or above 0.722 -> 1; scores in
# between are left untouched and excluded from pred_DS1.
DS1.loc[DS1['TOPOB01'] < 0.499, 'TOPOB01'] = 0
DS1.loc[DS1['TOPOB01'] >= 0.722, 'TOPOB01'] = 1
pred_DS1 = DS1[DS1['TOPOB01'].isin([0, 1])]
pred_DS1.to_csv(os.path.join(outdir, 'pred_binary_disc.tsv'), sep='\t', index=False)
print(_responder_fraction(pred_DS1, 0))
print(_responder_fraction(pred_DS1, 1))

# NR [56] < 0.50 < gray_zone [111] < 0.72 < R [63]
# auc = 0.7218449

# add akk: samples still strictly between 0 and 1 are assigned from
# AKK_TRICHO ('Low' -> 1, anything else -> 0).
gray_zone = (DS1['TOPOB01'] > 0) & (DS1['TOPOB01'] < 1)
akk_low = DS1['AKK_TRICHO'] == 'Low'
DS1.loc[gray_zone & akk_low, 'TOPOB01'] = 1
DS1.loc[gray_zone & ~akk_low, 'TOPOB01'] = 0
DS1.to_csv(os.path.join(outdir, 'pred_disc_akk.tsv'), sep='\t', index=False)
# auc = 0.62
print(_responder_fraction(DS1, 0))
print(_responder_fraction(DS1, 1))
print(len(DS1[DS1['TOPOB01'] == 0]))
print(len(DS1[DS1['TOPOB01'] == 1]))

0.2857142857142857
0.7301587301587301
0.4076923076923077
0.65
130
100
