In [1]:
# This script runs a multiple regression of the fraction of donor-specific strains on nFR and days after FMT, for each cluster/super-cluster.
# FMT dataset 2
import sys
sys.path.append('..')
import os
import pandas as pd
import copy
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import tree_util
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the three tab-separated FMT2 input tables.
# frac_df and deltat take their first column as the row index; triads keeps the
# default integer index.
_tsv_opts = dict(sep='\t', header=0)
frac_df = pd.read_csv('../data/FMT/FMT2/Eric.txt', index_col=0, **_tsv_opts)
deltat = pd.read_csv('../data/FMT/FMT2/deltat.txt', index_col=0, **_tsv_opts)
triads = pd.read_csv('../data/FMT/FMT2/triads.txt', index_col=None, **_tsv_opts)

In [3]:
# For every triad, translate its comma-separated post-FMT sample names into a
# comma-separated string of delta_t values and store it in the 'days' column.
# Sample names that are absent from deltat are printed and left out.
for row_id, post_field in triads['post_samples'].items():
    day_strings = []
    for post_name in post_field.strip().split(','):
        if post_name in deltat.index:
            day_strings.append(str(deltat.loc[post_name, 'delta_t']))
        else:
            print(post_name)
    triads.loc[row_id, 'days'] = ','.join(day_strings)

In [4]:
# Pair each frac_df row with a triad by comparing its 'Days postFMT' string with
# the triad 'days' string. Only unambiguous matches (exactly one triad) are used.
for alias in frac_df.index:
    day_key = frac_df.loc[alias, 'Days postFMT']
    matched = triads[triads['days'] == day_key]
    if len(matched) != 1:
        continue
    frac_df.loc[alias, 'sname'] = matched['pre_sample'].iloc[0]
    triads.loc[matched.index[0], 'alias'] = alias

# Manual assignment: triad row 10 is paired with 'FMT5' explicitly.
triads.loc[10, 'alias'] = 'FMT5'
s = triads.loc[10, 'pre_sample']
frac_df.loc['FMT5', 'sname'] = s

In [5]:
# compute nFR for a sample
def nfr_value(d_df, profile, sname):
    """Return the nFR value of one sample over the species in ``profile``.

    Every unordered pair of distinct species (i, j) gets the weight
    ``a_i * a_j`` (product of the sample's abundances) and the similarity
    ``1 - d_df[i, j]``. The result is the weighted mean similarity,
    ``sum(w * (1 - d)) / sum(w)``; self-pairs are excluded.

    Parameters
    ----------
    d_df : pandas.DataFrame
        Square pairwise table indexed and labelled by species name; it must
        contain every column of ``profile``.
        NOTE(review): values are presumably distances in [0, 1] -- confirm.
    profile : pandas.DataFrame
        Abundance table, samples as rows and species as columns.
    sname : hashable
        Row label of the sample in ``profile``.

    Returns
    -------
    float or int
        ``fr / td``, or ``0`` when the total pair weight ``td`` is zero
        (e.g. fewer than two species with non-zero abundance).
    """
    sp_list = list(profile.columns)

    # Similarity between species; the diagonal is zeroed to drop self-pairs.
    corr = 1.0 - d_df.loc[sp_list, sp_list].values
    np.fill_diagonal(corr, 0)

    # Pairwise abundance products a_i * a_j, again without self-pairs.
    a = np.array(profile.loc[sname, sp_list])
    inter_matrix = np.outer(a, a)
    np.fill_diagonal(inter_matrix, 0)

    # Both matrices are symmetric, so halving counts every pair once.
    td = np.sum(inter_matrix) / 2
    if td == 0:
        return 0
    fr = np.sum(np.multiply(inter_matrix, corr)) / 2
    return fr / td

# compute nFR for all sample
def multisample_nfr(profile, d_df, node_leaves):
    """Compute the nFR value of every sample for every tree node.

    Parameters
    ----------
    profile : pandas.DataFrame
        Abundance table, samples as rows and species as columns.
    d_df : pandas.DataFrame
        Square pairwise table indexed and labelled by species name.
    node_leaves : dict
        Maps a node name to the list of species (leaves) under that node.

    Returns
    -------
    pandas.DataFrame
        Rows are the samples of ``profile``; columns are the keys of
        ``node_leaves`` plus an extra ``'nroot'`` column. ``'nroot'`` is
        created for backward compatibility but never filled here, so it
        stays NaN.
    """
    result = pd.DataFrame(index=profile.index, columns=(list(node_leaves.keys()) + ['nroot']))
    for node, leaves in node_leaves.items():
        # The species subset and the matching slices depend only on the node,
        # so they are built once per node instead of once per (sample, node).
        sp_list = list(set(leaves).intersection(set(profile.columns)))
        selected_d = d_df.loc[sp_list, sp_list]
        selected_profile = profile.loc[:, sp_list]
        for sname in profile.index:
            result.loc[sname, node] = nfr_value(selected_d, selected_profile, sname)
    return result

In [6]:
# load tree
# Read the Newick string of the tree from disk.
with open('../result/GCN_fix_tree/renamed_GCN_tree.newick') as fp:
    newick_tree = fp.read()
    # newick_tree = newick_tree.replace('-', '_')
    
# construct tree with newick string
json_tree = tree_util.parse(newick_tree)
# `largest` is handed to recu_compute and read back afterwards, so the helper
# presumably records the deepest level in it in place -- verify in tree_util.
largest = {'largest': 0}
# leaf_list is used below as the collection of leaf names; the second return
# value `l` is not used before being overwritten by a later loop variable.
leaf_list, l = tree_util.recu_compute(json_tree, 0, largest)
largest_level = largest['largest']
nlayer = largest_level
layer_leaves_dict = tree_util.make_layer_dict(nlayer)

# Both helpers receive layer_leaves_dict and their return values are ignored,
# so they presumably fill/convert it in place -- verify in tree_util.
# It is later read as {level: {node: species list}}.
tree_util.recu_layer(json_tree, layer_leaves_dict)
tree_util.to_layer_leaves(layer_leaves_dict, nlayer)
# Independent deep copy of the layer dictionary (not used in the visible cells).
leaves_dict = copy.deepcopy(layer_leaves_dict)

# parent_dict is passed to tree_util.parents and read afterwards as
# child -> parent, so it is presumably filled in place -- verify in tree_util.
parent_dict = {}
tree_util.parents(json_tree, parent_dict)
# Flatten {level: {node: species list}} into node -> species list.
# When a node appears at several levels, the first occurrence wins.
node_leaves = {}
for level in layer_leaves_dict.keys():
    for node, sp_list in layer_leaves_dict[level].items():
        if node in node_leaves.keys():
            continue
        node_leaves[node] = copy.deepcopy(sp_list)
# subtree_nodes: node -> list of node names accumulated under it.
# Step 1: register every leaf under its direct parent.
subtree_nodes = {}
for l in leaf_list:
    parent = parent_dict[l]
    if parent not in subtree_nodes.keys():
        subtree_nodes[parent] = []
    subtree_nodes[parent].append(l)

# Step 2: append each node's accumulated list, then the node itself, to its parent.
# NOTE(review): this depends on the iteration order of node_leaves. A node is
# only complete if all of its children were processed before it, and
# subtree_nodes[node] raises KeyError for a node that has no entry yet.
# parent_dict[node] also raises KeyError for any node without a parent entry.
for node in node_leaves.keys():
    parent = parent_dict[node]
    if parent not in subtree_nodes.keys():
        subtree_nodes[parent] = []
    subtree_nodes[parent] += subtree_nodes[node]
    subtree_nodes[parent].append(node)

# Step 3: every node with an entry also lists itself.
for node in subtree_nodes.keys():
    subtree_nodes[node].append(node)

# Invert the child -> parent mapping into parent -> list of direct children.
direct_children_dict = {}
for child, its_parent in parent_dict.items():
    direct_children_dict.setdefault(its_parent, []).append(child)

# The root node covers every leaf of the tree.
node_leaves['root'] = copy.deepcopy(leaf_list)

In [7]:
# load distance, abundance profile and metadata
outdir = '../result/FMT/nFR/FMT2'
# exist_ok=True replaces the check-then-create pair, which can fail if the
# directory appears between the check and the creation.
os.makedirs(outdir, exist_ok=True)
param = 0.5
d_df = pd.read_csv('../data/sp_d.tsv', sep='\t', header=0, index_col=0)
abd = pd.read_csv('../data/FMT/FMT2/Eric_abd.tsv', sep='\t', header=0, index_col=0)

# Species names use '_' in the distance table; switch both tables to '-'.
name_dict = {sp: sp.replace('_', '-') for sp in d_df.columns}
d_df = d_df.rename(columns=name_dict, index=name_dict)
abd = abd.rename(columns=name_dict)

# Keep only species present in both tables. The abundance table's column order
# is preserved, so the written abd.tsv has the same layout on every run
# (a set-based intersection yields a run-dependent order for string labels).
known_species = set(d_df.columns)
shared_species = [sp for sp in dict.fromkeys(abd.columns) if sp in known_species]
profile = abd[shared_species]
# Normalise each sample (row) to relative abundances summing to 1.
profile = profile.div(profile.sum(axis=1), axis=0)

profile = profile.rename(index={'FMT5':'FMT05'})
# Select the pre-FMT samples in frac_df order; a missing or unmatched 'sname'
# raises KeyError here.
pre_slist = list(frac_df['sname'])
profile = profile.loc[pre_slist, :]
# Written copy is re-indexed by the frac_df alias; `profile` itself keeps the
# pre-sample names.
rename_dict = dict(zip(frac_df['sname'], frac_df.index))
profile.rename(index=rename_dict).to_csv(os.path.join(outdir, 'abd.tsv'), sep='\t')

In [8]:
# nFR of every selected sample at timepoint 0, transposed so that rows are tree
# nodes and columns are samples; the 'nroot' row is discarded.
nfr_df = multisample_nfr(profile, d_df, node_leaves).T.drop('nroot')

In [9]:
p_cutoff = 0.05
# prepare frac and t for regression
# Each frac_df row stores comma-separated series; flatten them into one entry
# per observation: t (days), frac (fraction) and sname_list (sample name of the row).
frac, t, sname_list = [], [], []
for idx in frac_df.index:
    pre_name = frac_df.loc[idx, 'sname']
    day_tokens = frac_df.loc[idx, 'Days postFMT'].split(',')
    frac_tokens = frac_df.loc[idx, 'Fraction of donor specific strains'].split(',')
    t.extend([int(tok) for tok in day_tokens])
    frac.extend([float(tok) for tok in frac_tokens])
    sname_list.extend([pre_name] * len(day_tokens))


In [10]:
# regression
# For each tree node fit OLS  frac ~ const + nfr + t  and store the overall
# F-test p-value together with every term's coefficient and p-value.
odf = pd.DataFrame(columns=['F-pvalue', 'nfr_co', 't_co', 'const_co', 'nfr_p', 't_p', 'const_p'])
for node in node_leaves.keys():
    nfr = [nfr_df.loc[node, sample_name] for sample_name in sname_list]
    if sum(nfr) == 0:
        # nodes whose nFR values sum to zero are skipped
        continue

    df = pd.DataFrame({'frac': frac, 'nfr': nfr, 't': t})
    y = df['frac']
    X = sm.add_constant(df[['nfr', 't']])
    model = sm.OLS(y, X)
    result = model.fit()

    odf.loc[node, 'F-pvalue'] = result.f_pvalue
    for var, term_p in result.pvalues.items():
        odf.loc[node, "{}_co".format(var)] = result.params[var]
        odf.loc[node, "{}_p".format(var)] = term_p

p_values_path = os.path.join(outdir, 'p_values.tsv')
odf.sort_values(by='F-pvalue').to_csv(p_values_path, index=True, header=True, sep='\t')

In [11]:
# nodes whose overall regression F-test p-value is below p_cutoff
odf.loc[odf['F-pvalue'] < p_cutoff]

Unnamed: 0,F-pvalue,nfr_co,t_co,const_co,nfr_p,t_p,const_p
cluster_S1-C3,0.015189,-0.442987,0.000550979,0.653932,0.0040558,0.57552,1.56364e-10
cluster_S1-C4,0.00413328,1.0492,0.000521183,-0.0943784,0.00099963,0.581792,0.55523
cluster_S1-C5,0.0252543,-0.535224,0.000609198,0.760055,0.0070705,0.543664,6.7932e-08
cluster_S1-C10,0.0140769,-0.470651,-9.36477e-05,0.516373,0.00373415,0.923037,1.93443e-14
cluster_S1-C15,0.00198524,-0.402447,0.00107787,0.577465,0.000458754,0.26274,2.12027e-14
cluster_S2-C5,0.0323182,-0.581848,0.00023281,0.787635,0.00928013,0.814373,3.18566e-07
cluster_C7,0.0442497,-0.296316,-1.8189e-05,0.507224,0.0131572,0.98547,7.0465e-14
supercluster_S1,0.000268792,1.86365,-0.00045584,-0.289235,5.62406e-05,0.604101,0.0925307


In [12]:
# plot regression results
# For every regressed node: draw the fitted plane
#   f_ds = nfr_co * nFR_pre + t_co * t_post + const   (shown only where 0 <= f_ds <= 1),
# overlay the observations, export them as '<node>.tsv' and save the figure as
# '<node>.pdf'. Each figure is closed after saving so memory does not grow with
# the number of nodes; call display(fig) before plt.close(fig) if inline
# rendering is wanted.
formula_tmp = r'$f_{{ds}} = {:.4f}*nFR_{{pre}} + {:.4f}*t_{{post}} + {:.4f}$' + '\np = {:.4}'

# Observation-level data that do not depend on the node are parsed once.
# Separate names are used so the regression inputs `t` and `frac` stay untouched.
obs_samples, obs_pre_names, obs_t, obs_frac, obs_colors = [], [], [], [], []
for sname in frac_df.index:
    day_tokens = frac_df.loc[sname, 'Days postFMT'].split(',')
    frac_tokens = frac_df.loc[sname, 'Fraction of donor specific strains'].split(',')
    n_t = len(day_tokens)
    obs_t += [int(ts) for ts in day_tokens]
    obs_frac += [float(fs) for fs in frac_tokens]
    # colour index = integer after the first three characters of the alias, minus one
    obs_colors += [int(sname[3:])-1] * n_t
    obs_pre_names += [frac_df.loc[sname, 'sname']] * n_t
    obs_samples += [sname] * n_t

for node in sorted(list(odf.index)):
    nfr_co = odf.loc[node, 'nfr_co']
    t_co = odf.loc[node, 't_co']
    const = odf.loc[node, 'const_co']
    p_value = odf.loc[node, 'F-pvalue']

    # Regression plane on a 1000 x 1000 grid; the nFR axis extends 10% beyond
    # the largest observed value, with a minimum upper bound of 0.05.
    nfr_grid = np.linspace(0, max(max(nfr_df.loc[node])*1.1, 0.05), 1000)
    t_grid = np.linspace(0, 150, 1000)
    X, Y = np.meshgrid(nfr_grid, t_grid)
    plane = nfr_co*X + t_co*Y + const
    # mask predictions outside the valid fraction range so they are not drawn
    plane[plane>1] = np.nan
    plane[plane<0] = np.nan
    formula = formula_tmp.format(nfr_co, t_co, const, p_value)

    fig = plt.figure(figsize=(5,5))
    ax = fig.add_subplot(111, projection='3d')
    surf = ax.plot_surface(X, Y, plane, cmap=plt.cm.YlGnBu_r, alpha=0.5)
    surf.set_clim(0, 1)
    ax.set_yticks(np.arange(0, 150, 50))
    ax.set_yticklabels(np.arange(0, 150, 50))

    ax.set_zlim((0, 1))
    ax.set_zticks(np.array([0, 0.2, 0.4, 0.6, 0.8, 1]))
    ax.set_zticklabels(np.array([0, 0.2, 0.4, 0.6, 0.8, 1]))

    ax.set_xlabel(r'$nFR_{pre}$', fontweight='bold', fontsize=12)
    ax.set_ylabel(r'$t_{post}$', fontweight='bold', fontsize=14)
    ax.zaxis.set_rotate_label(False)
    ax.set_zlabel(r'$f_{ds}$', fontweight='bold', fontsize=14)

    # Observations for this node: only the nFR coordinate depends on the node.
    node_nfr = [nfr_df.loc[node, pre_name] for pre_name in obs_pre_names]
    point_df = pd.DataFrame({
        'sample': obs_samples,
        'nFR_pre': node_nfr,
        't_post': obs_t,
        'f_ds': obs_frac,
    })
    point_df.to_csv(os.path.join(outdir, '{}.tsv'.format(node)), sep='\t', header=True, index=False)

    ax.scatter(node_nfr, obs_t, obs_frac, c = obs_colors, cmap='tab20')
    ax.view_init(elev=15, azim=245)
    ax.set_title(formula, y = 0.95)
    ax.xaxis.set_pane_color((1, 1, 1, 1))
    ax.yaxis.set_pane_color((1, 1, 1, 1))
    ax.zaxis.set_pane_color((1, 1, 1, 1))
    fig.suptitle(node.replace('_', ' '))
    # fig.colorbar(surf)
    fig.tight_layout()
    opath = os.path.join(outdir, '{}.pdf'.format(node))
    fig.savefig(opath, dpi=300, format='pdf')
    # release the figure; otherwise every node's figure stays alive in pyplot
    plt.close(fig)