In [1]:
import sys
sys.path.append('..')
import os
import pandas as pd
import copy
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import tree_util

In [2]:
def fr(d_df, profile, sname):
    """Build the pairwise weighted matrix for one sample.

    For species ``i`` and ``j`` (the columns of ``profile``) the entry is
    ``profile[sname, i] * profile[sname, j] * (1 - d_df[i, j])``; every
    diagonal entry is forced to 0.

    Parameters
    ----------
    d_df : pandas.DataFrame
        Square table labelled by species on both axes (presumably a pairwise
        distance table -- verify against the caller). Must contain every
        column of ``profile``.
    profile : pandas.DataFrame
        One row per sample, one column per species.
    sname : label
        Row of ``profile`` to evaluate.

    Returns
    -------
    pandas.DataFrame
        Square frame whose index and columns are ``profile.columns``.
    """
    species = list(profile.columns)
    pair_d = d_df.loc[species, species]
    weights = np.array(profile.loc[sname, species])
    size = len(weights)
    # Column vector times row vector: all pairwise products of the sample's values.
    pairwise = np.dot(weights.reshape(size, 1), weights.reshape(1, size))
    side = pairwise.shape[0]
    complement = np.ones(shape=(side, side)) - pair_d.values
    # A species is never paired with itself.
    np.fill_diagonal(complement, 0)
    weighted = np.multiply(pairwise, complement)
    return pd.DataFrame(data=weighted, index=species, columns=species)

def nfr(d_df, profile, sname, normalize=False):
    """Sum the weighted pairwise score of one sample over all species pairs.

    With ``a`` the row ``sname`` of ``profile``, the result is
    ``sum_{i < j} a_i * a_j * (1 - d_df[i, j])`` over the columns of
    ``profile``. Pairs of a species with itself are excluded.

    Parameters
    ----------
    d_df : pandas.DataFrame
        Square table labelled by species on both axes; must contain every
        column of ``profile``.
    profile : pandas.DataFrame
        One row per sample, one column per species.
    sname : label
        Row of ``profile`` to evaluate.
    normalize : bool, optional
        When True, divide the score by ``sum_{i < j} a_i * a_j``. Defaults to
        False, which returns the raw sum.

    Returns
    -------
    number
        ``0`` when the sum of pairwise products ``sum_{i < j} a_i * a_j`` is
        zero (e.g. fewer than two non-zero species), otherwise the score
        described above.
    """
    sp_list = list(profile.columns)
    n = len(sp_list)
    corr = np.ones(shape=(n, n)) - d_df.loc[sp_list, sp_list].values
    np.fill_diagonal(corr, 0)
    a = np.array(profile.loc[sname, sp_list])
    inter_matrix = np.dot(a.reshape(len(a), 1), a.reshape(1, len(a)))
    np.fill_diagonal(inter_matrix, 0)
    # The matrix is symmetric, so halving the full sum counts each pair once.
    td = np.sum(inter_matrix) / 2
    if td == 0:
        return 0
    score = np.sum(np.multiply(inter_matrix, corr)) / 2
    if normalize:
        return score / td
    return score

def multisample_nfr(profile, d_df, node_leaves):
    """Evaluate ``nfr`` for every sample on every node's species subset.

    Parameters
    ----------
    profile : pandas.DataFrame
        One row per sample, one column per species.
    d_df : pandas.DataFrame
        Square table labelled by species on both axes; must contain every
        species that is both in ``profile.columns`` and in some node's list.
    node_leaves : dict
        Maps a node name to the collection of species assigned to it.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``profile.index`` with one column per key of
        ``node_leaves`` plus a trailing ``'nroot'`` column. This function
        never writes to ``'nroot'``, so it stays empty (NaN).
    """
    result = pd.DataFrame(index=profile.index, columns=(list(node_leaves.keys()) + ['nroot']))
    for node, leaves in node_leaves.items():
        # The species subset and both slices depend only on the node, so they
        # are built once per node instead of once per (sample, node) pair.
        # Following profile's column order (rather than set order, which
        # depends on string hashing) keeps the summation order, and therefore
        # the floating-point result, identical from run to run.
        wanted = set(leaves)
        sp_list = [sp for sp in dict.fromkeys(profile.columns) if sp in wanted]
        selected_d = d_df.loc[sp_list, sp_list]
        selected_profile = profile.loc[:, sp_list]
        for sname in profile.index:
            result.loc[sname, node] = nfr(selected_d, selected_profile, sname)
    return result


In [3]:
# --- Output location and tree bookkeeping -----------------------------------
# Creates the output directory, reads the Newick tree and derives several
# lookup tables from it: node_leaves, parent_dict, subtree_nodes and
# direct_children_dict. Every name assigned here is notebook-global state.
outdir = '../result/FMT_FR/FMT1'
if not os.path.exists(outdir):
    os.makedirs(outdir)
with open('../result/GCN_fix_tree/renamed_GCN_tree.newick') as fp:
    newick_tree = fp.read()
    # Disabled name normalisation, kept for reference:
    # newick_tree = newick_tree.replace('-', '_')

# tree_util is a project-local module; its helpers are opaque from this cell.
# Presumably parse() turns the Newick text into a nested tree structure -- TODO confirm.
json_tree = tree_util.parse(newick_tree)
# `largest` is passed in and read back on the next lines, so recu_compute is
# presumably expected to update largest['largest'] in place -- TODO confirm.
largest = {'largest': 0}
leaf_list, l = tree_util.recu_compute(json_tree, 0, largest)
largest_level = largest['largest']
nlayer = largest_level
layer_leaves_dict = tree_util.make_layer_dict(nlayer)
# The return values of the next two calls are discarded; they are presumably
# called for their in-place effect on layer_leaves_dict -- TODO confirm.
tree_util.recu_layer(json_tree, layer_leaves_dict)
tree_util.to_layer_leaves(layer_leaves_dict, nlayer)
# Independent snapshot of the per-level table (not used again in this cell).
leaves_dict = copy.deepcopy(layer_leaves_dict)
parent_dict = {}
# parent_dict is handed over empty and indexed below, so parents() is
# presumably expected to fill it with child -> parent entries -- TODO confirm.
tree_util.parents(json_tree, parent_dict)
# Flatten the per-level table into node -> copy of its species list. When a
# node shows up under several levels, the first occurrence wins.
node_leaves = {}
for level in layer_leaves_dict.keys():
    for node, sp_list in layer_leaves_dict[level].items():
        if node in node_leaves.keys():
            continue
        node_leaves[node] = copy.deepcopy(sp_list)
# subtree_nodes: parent -> names collected beneath it. First pass: each leaf
# is appended to its parent's list.
# NOTE: the loop variable `l` overwrites the second value returned by
# recu_compute above.
subtree_nodes = {}
for l in leaf_list:
    parent = parent_dict[l]
    if parent not in subtree_nodes.keys():
        subtree_nodes[parent] = []
    subtree_nodes[parent].append(l)

# Second pass: each node in node_leaves hands its current list, plus itself,
# to its parent.
# NOTE(review): this raises KeyError if a node has no entry in parent_dict or
# no entry yet in subtree_nodes, and a node only passes on what it has
# collected at the moment it is visited -- so the result depends on the
# iteration order of node_leaves. Confirm that order is children-first.
for node in node_leaves.keys():
    parent = parent_dict[node]
    if parent not in subtree_nodes.keys():
        subtree_nodes[parent] = []
    subtree_nodes[parent] += subtree_nodes[node]
    subtree_nodes[parent].append(node)

# Every key of subtree_nodes also lists itself.
for node in subtree_nodes.keys():
    subtree_nodes[node].append(node)

# Invert parent_dict: parent -> list of its immediate children.
direct_children_dict = {}
for node, parent in parent_dict.items():
    if parent not in direct_children_dict:
        direct_children_dict[parent] = []
    direct_children_dict[parent].append(node)

# 'root' maps to a copy of the full leaf list (replacing any existing 'root' entry).
node_leaves['root'] = copy.deepcopy(leaf_list)

In [4]:
# --- Load the FMT1 inputs ----------------------------------------------------
# All four files are tab-separated with a header row; every table except
# `metadata` uses its first column as the index.
d_df = pd.read_csv('../data/sp_d.tsv', sep='\t', index_col=0, header=0)
metadata = pd.read_csv('../data/FMT/FMT1/metadata.tsv', sep='\t', header=0, index_col=None)
abd = pd.read_csv('../data/FMT/FMT1/fmt_abd.tsv', sep='\t', index_col=0, header=0)

# Species labels: every '_' in a d_df column label becomes '-'. The same
# mapping is applied to both axes of d_df and to the columns of abd, so only
# abd columns that also occur in d_df are renamed.
name_dict = {sp: sp.replace('_', '-') for sp in d_df.columns}
d_df = d_df.rename(index=name_dict, columns=name_dict)
abd = abd.rename(columns=name_dict)

# Subject identifiers and sampling days used to assemble the sample ids.
sub_ids = ['FAT_006', 'FAT_015', 'FAT_008', 'FAT_020', 'FAT_012']
days = [0, 2, 14, 42, 84]
frac_df = pd.read_csv('../data/FMT/FMT1/Li.txt', sep='\t', index_col=0, header=0)

In [5]:
# --- Build the per-sample profile and export the FR table ---------------------
# Keep only the species that also appear in d_df. Selecting with a boolean
# mask preserves abd's own column order, so the layout of the written abd.tsv
# is the same on every run (a set intersection orders columns by string hash).
# Values are divided by 100 (presumably percent -> fraction; TODO confirm the
# units of fmt_abd.tsv).
profile = abd.loc[:, abd.columns.isin(d_df.columns)] / 100

# Raw sample ids follow "<subject>-22-<day>-0"; each is relabelled to
# "FMT<k>_day<day>", where k is the 1-based position of the subject in sub_ids.
# The loop variables are deliberately not called `id`, which would shadow the
# builtin for the rest of the notebook.
rename_dict = {}
ids = []
for subject_no, sub_id in enumerate(sub_ids, start=1):
    for day in days:
        sid = "{}-22-{}-0".format(sub_id, day)
        ids.append(sid)
        rename_dict[sid] = 'FMT{}_day{}'.format(subject_no, day)

# Select the samples in subject/day order and apply the new labels.
profile = profile.loc[ids, :].rename(index=rename_dict)
profile.to_csv(os.path.join(outdir, 'abd.tsv'), sep='\t')

# Nodes as rows, samples as columns. The 'nroot' column produced by
# multisample_nfr is never filled, so it is dropped before writing.
se_df = multisample_nfr(profile, d_df, node_leaves).T.drop('nroot', axis=0)
se_df.to_csv(os.path.join(outdir, 'fr.tsv'), sep='\t')