## Imports

In [1]:
import pandas as pd
from pathlib import Path

## Input

* mapping file generated with Ensembl BioMart

In [2]:
# Project root: the directory three levels above the current working directory
# (parents[0] is the direct parent). Indexing raises IndexError when the
# working directory is nested fewer than three levels deep.
PROJECTDIR = Path('.').resolve().parents[2]

# Input files, given relative to the working directory.
htdb_path = '../data/DatabaseExtract_v_1.01.csv'  # CSV read by all_htdb_tfs (columns 'Is TF?' and 'HGNC symbol')
pp_path = '../data/all_human_across_mammalia.phyloprofile'  # long-format PhyloProfile, one row per ortholog hit
map_path = '../data/human_tfs_ensembl2uniprot.txt'  # Ensembl -> UniProt mapping table
phylostrata_path = '../data/htdb_tfs_uniprot2lca.json'  # JSON cache of the UniProt -> LCA-name dict (read if present, else written)

## Output

In [3]:
# Tab-separated per-TF overview table; the only output placed under PROJECTDIR.
phylostrata_overview = f'{PROJECTDIR}/publication/supplement_tables/TF_phylostrata.tsv'
# PhyloProfile subset containing only the rows of TFs that fail the conservation threshold.
non_conserved_phyloprofile = '../results/nonconserved_human_TFs.phyloprofile'

## Load data

In [4]:
from collections import defaultdict

def all_htdb_tfs(path):
    """Return the HGNC symbols of all entries flagged as transcription factors.

    Parameters
    ----------
    path : str or file-like
        Comma-separated table whose first column is used as the index and
        which contains the columns ``'Is TF?'`` and ``'HGNC symbol'``.

    Returns
    -------
    set
        The ``'HGNC symbol'`` values of every row where ``'Is TF?'`` equals
        ``'Yes'``.

    Side effects
    ------------
    Displays the filtered table via the notebook's ``display``.
    """
    table = pd.read_csv(path, sep=',', index_col=0)
    is_tf = table['Is TF?'] == 'Yes'
    tf_rows = table.loc[is_tf]
    display(tf_rows)
    return set(tf_rows['HGNC symbol'])


def load_ensembl2uniprot(path):
    """Collect every UniProt accession listed in an Ensembl/UniProt mapping file.

    The file is read line by line after skipping one header line. Each data
    line is split on whitespace and interpreted by its number of fields:

    * 3 fields -- two UniProt accessions followed by the Ensembl ID
    * 2 fields -- one UniProt accession followed by the Ensembl ID
    * 1 field  -- an Ensembl ID without any UniProt accession

    Parameters
    ----------
    path : str or path-like
        Location of the mapping file.

    Returns
    -------
    set of str
        All UniProt accessions found in the file (empty for a header-only or
        completely empty file).

    Raises
    ------
    ValueError
        If a data line has any other number of fields (including a blank
        line); the offending field list is the exception argument.

    Side effects
    ------------
    Prints a one-line summary of how many rows lacked a UniProt accession.
    """
    no_uniprot = set()
    all_uniprots = set()
    # Initialised up front so the summary below still works when the file
    # contains no data rows (the loop variable would otherwise be unbound).
    n_rows = 0
    with open(path) as fh:
        # Skip the header; the default keeps a zero-length file from raising.
        next(fh, None)
        for n_rows, line in enumerate(fh, 1):
            fields = line.strip().split()
            if len(fields) in (2, 3):
                # The last field is the Ensembl ID, everything before it is
                # a UniProt accession.
                all_uniprots.update(fields[:-1])
            elif len(fields) == 1:
                no_uniprot.add(fields[0])
            else:
                raise ValueError(fields)
    print(f'Warning: no Uniprot ID found for {len(no_uniprot)} out of {n_rows} Ensembl IDs')
    return all_uniprots

    

# Set of UniProt accessions taken from the mapping file; the loader also prints
# how many mapping rows carried no UniProt accession.
tf_uniprot_ids = load_ensembl2uniprot(map_path)
# all_tfs = all_htdb_tfs(htdb_path)
print(tf_uniprot_ids)

{'A0A024R4I3', 'G4XH65', 'A0A8Q3WLD2', 'H0YBB6', 'K7EQA0', 'O75364', 'C9JD04', 'Q92949', 'Q15583', 'Q99607', 'Q8N8L2', 'F5H771', 'A0A024R0P4', 'M0QXC7', 'A0A6E1WCP9', 'Q5VV16', 'F8W1Q1', 'E9PQ00', 'A0A804HJJ4', 'Q5T0B9', 'A6NJT0', 'A0A286YEX4', 'K7EJ04', 'B7WNT5', 'Q96GC6', 'A0A024QZG5', 'A0A6Q8PH19', 'A0A2P9DTZ8', 'A0A494C0H2', 'O75971', 'Q8IW36', 'M0QYP0', 'A0A994J813', 'K7EJ55', 'H0YNI2', 'Q9NX45', 'P32242', 'Q4W5G0', 'M0QXM7', 'A0A087WW29', 'Q6V3B1', 'A0A087WY37', 'Q9Y4X4', 'Q9ULX6', 'A0A1B0GVC6', 'C9IZC5', 'H3BPU3', 'Q15744', 'X6RBL6', 'D6R992', 'A0A3B3ISB7', 'E7EV44', 'Q86V93', 'A0A0D9SGE5', 'P09629', 'F5H8K3', 'A0A087X1H1', 'A0A024RBA3', 'A0A994J5Q4', 'A0A669KAX1', 'Q8WVJ9', 'C9JLQ8', 'A0A6Q8PHQ3', 'D6RC00', 'Q15319', 'Q99697', 'G3V370', 'Q9NVV9', 'B3KPE6', 'M0R091', 'O00358', 'A0A8I5KU66', 'E9PLT6', 'M0R2I1', 'A0A1B0GW91', 'H0YLL0', 'J3KSE0', 'A0A8I5KQG8', 'A0A8V8TL94', 'Q9NQV8', 'C9J9L1', 'B4DKD5', 'A0A024R4L3', 'Q8N587', 'A0A0D9SGE7', 'Q5VVR8', 'O75461', 'A0A1B0GXL9', 'Q9UIH9

## Filter PhyloProfile

* Keep only one entry per (TF, species) pair, i.e. record whether the TF is present in the species and ignore additional co-orthologs, so the long table can be transformed to wide format

In [5]:
def tfs_in_pp(path, uniprot_ids):
    """Build a presence/absence matrix of the selected proteins across taxa.

    Parameters
    ----------
    path : str or path-like
        Long-format profile file with one header line followed by rows of
        exactly five whitespace-separated fields; the first field is the
        protein ID and the second the taxon ID (optionally prefixed with
        ``ncbi``).
    uniprot_ids : container of str
        Protein IDs to keep; rows with any other ID are ignored.

    Returns
    -------
    pandas.DataFrame
        Cross-tabulation indexed by ``uniprotid`` with one column per integer
        ``taxid``. Duplicate (protein, taxon) pairs are collapsed first, so
        every cell is 0 or 1.

    Raises
    ------
    ValueError
        If a data row does not have exactly five fields.
    """
    pairs = []
    with open(path) as handle:
        next(handle)  # the header line carries no data
        for raw in handle:
            protein, taxon, _ortho, _fas_f, _fas_b = raw.strip().split()
            if protein not in uniprot_ids:
                continue
            pairs.append([protein, int(taxon.replace('ncbi', ''))])

    # Several co-orthologs in one species would otherwise be counted > 1.
    presence = pd.DataFrame(pairs, columns=['uniprotid', 'taxid']).drop_duplicates()
    return pd.crosstab(presence['uniprotid'], presence['taxid'])
    
# Wide presence/absence matrix: one row per UniProt ID, one column per taxon ID.
tfs_pp = tfs_in_pp(pp_path, tf_uniprot_ids)
display(tfs_pp)

taxid,9258,9261,9305,9337,9361,9365,9371,9402,9407,9417,...,591936,885580,1026970,1047088,1230840,1574408,1608482,1706337,1868482,2715852
uniprotid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A0A087WUV0,0,1,0,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
A0AVK6,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
A0PJY2,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
A1A519,0,0,0,0,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
A1YPR0,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q9Y6Q9,1,1,1,1,1,1,1,1,1,1,...,1,1,1,0,1,1,1,1,1,1
Q9Y6R6,1,1,1,1,1,0,1,0,1,0,...,1,0,0,1,0,1,1,1,1,1
Q9Y6X0,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
Q9Y6X8,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,0,1


In [6]:
from ete3 import NCBITaxa
from tqdm import tqdm
import json
import os


def phylostrata(df):
    """Assign each protein the name of the common ancestor of its taxa.

    Parameters
    ----------
    df : pandas.DataFrame
        Presence/absence matrix indexed by protein ID whose column labels are
        taxon IDs; a cell equal to 1 marks the protein as present.

    Returns
    -------
    dict
        Maps every row label of ``df`` to the name that the taxonomy database
        reports for the common ancestor of all taxa in which that row is 1.
    """
    taxonomy = NCBITaxa()

    def present_taxa(row):
        # Column labels (taxon IDs) where this protein is marked present.
        return [taxid for taxid in df.columns if row[taxid] == 1]

    taxa_per_protein = df.apply(present_taxa, axis=1)

    lca_by_protein = {}
    for protein, taxa in tqdm(taxa_per_protein.items()):
        # Node names in the topology are strings, hence the str() conversion.
        topology = taxonomy.get_topology(taxa)
        ancestor = topology.get_common_ancestor([str(taxid) for taxid in taxa])
        translated = taxonomy.get_taxid_translator([ancestor.name])
        lca_by_protein[protein] = list(translated.values())[0]

    return lca_by_protein
    
 
# The LCA assignment is cached as JSON: reuse the file when it exists,
# otherwise compute the mapping once and store it for later runs.
if os.path.isfile(phylostrata_path):
    with open(phylostrata_path) as cache_in:
        uniprot2lca = json.load(cache_in)
else:
    uniprot2lca = phylostrata(tfs_pp)
    with open(phylostrata_path, 'w') as cache_out:
        json.dump(uniprot2lca, cache_out)

print(uniprot2lca)

{'A0A087WUV0': 'Mammalia', 'A0AVK6': 'Mammalia', 'A0PJY2': 'Mammalia', 'A1A519': 'Theria', 'A1YPR0': 'Mammalia', 'A2RRD8': 'Mammalia', 'A2RU54': 'Mammalia', 'A4D1E1': 'Theria', 'A6NCS4': 'Mammalia', 'A6NFD8': 'Mammalia', 'A6NFI3': 'Mammalia', 'A6NFQ7': 'Theria', 'A6NGD5': 'Eutheria', 'A6NHJ4': 'Theria', 'A6NHT5': 'Mammalia', 'A6NI15': 'Mammalia', 'A6NJ46': 'Mammalia', 'A6NJG6': 'Eutheria', 'A6NJL1': 'Eutheria', 'A6NJT0': 'Mammalia', 'A6NK53': 'Eutheria', 'A6NK75': 'Mammalia', 'A6NKF2': 'Mammalia', 'A6NLW8': 'Mammalia', 'A6NM28': 'Mammalia', 'A6NMT0': 'Mammalia', 'A6NN14': 'Mammalia', 'A6NNA5': 'Mammalia', 'A6NNF4': 'Theria', 'A6NP11': 'Mammalia', 'A8K8V0': 'Theria', 'A8MQ14': 'Mammalia', 'A8MT65': 'Eutheria', 'A8MT69': 'Mammalia', 'A8MTJ6': 'Mammalia', 'A8MTQ0': 'Mammalia', 'A8MTY0': 'Mammalia', 'A8MUV8': 'Mammalia', 'A8MUZ8': 'Boreoeutheria', 'A8MXY4': 'Mammalia', 'A8MYZ6': 'Mammalia', 'A8MZ59': 'Eutheria', 'B2RXF5': 'Mammalia', 'B4DU55': 'Mammalia', 'B4DX44': 'Mammalia', 'B4DXR9': 'M

## Count LCAs

In [7]:
from collections import Counter

# Tally how many TFs were assigned to each LCA name.
c = Counter(uniprot2lca.values())
display(c)

Counter({'Mammalia': 1347,
         'Theria': 172,
         'Eutheria': 67,
         'Boreoeutheria': 11,
         'Euarchontoglires': 3,
         'Simiiformes': 1,
         'Haplorrhini': 1,
         'Primates': 1,
         'Homo sapiens': 1})

## Find conserved and non-conserved TFs

* For each LCA clade, what is the maximum possible number of orthologs, i.e. how many of the analysed taxa fall within that clade?

In [8]:
def maximum_orthologs_per_LCA(df, counter):
    """Determine the size of the subtree below every observed LCA.

    Parameters
    ----------
    df : pandas.DataFrame
        Matrix whose column labels are the taxon IDs spanning the full tree.
    counter : mapping
        Its keys are the LCA names to look up (the counts are not used).

    Returns
    -------
    dict
        Maps each LCA name returned by the name lookup to ``len()`` of the
        matching node in the topology built from all columns of ``df``
        (presumably the number of leaf taxa below that node -- confirm
        against the ete3 documentation).

    Raises
    ------
    IndexError
        If the first taxon ID found for a name has no node in the topology.
    """
    taxonomy = NCBITaxa()

    # One topology over every taxon in the matrix; each LCA is looked up in it.
    full_tree = taxonomy.get_topology(list(df.columns))
    name_to_taxids = taxonomy.get_name_translator(dict(counter).keys())

    # Only the first taxon ID reported for a name is considered.
    return {
        lca_name: len(full_tree.search_nodes(name=str(taxids[0]))[0])
        for lca_name, taxids in name_to_taxids.items()
    }


    
# Subtree size per LCA name; used downstream as the maximum ortholog count.
lca2members = maximum_orthologs_per_LCA(tfs_pp, c)
print(lca2members)

{'Boreoeutheria': 153, 'Euarchontoglires': 65, 'Eutheria': 161, 'Haplorrhini': 26, 'Homo sapiens': 1, 'Mammalia': 169, 'Primates': 29, 'Simiiformes': 25, 'Theria': 167}


## Put everything together

In [14]:
def join_results(tfs_pp, uniprot2lca, lca2members):
    """Combine LCA assignment, ortholog count and clade size per protein.

    Parameters
    ----------
    tfs_pp : pandas.DataFrame
        Presence/absence matrix indexed by protein ID; row sums give the
        number of taxa with an ortholog.
    uniprot2lca : dict
        Protein ID -> LCA name.
    lca2members : dict
        LCA name -> size of that clade.

    Returns
    -------
    pandas.DataFrame
        One row per protein of ``uniprot2lca`` whose LCA name occurs in
        ``lca2members``, with the columns ``gained_in``,
        ``number_orthologs`` and ``maximum_number``.
    """
    ortholog_counts = tfs_pp.sum(axis=1).to_frame('number_orthologs')
    gained = pd.DataFrame.from_dict(uniprot2lca, orient='index', columns=['gained_in'])
    clade_sizes = pd.DataFrame.from_dict(lca2members, orient='index', columns=['maximum_number'])

    # Index join adds the counts; the clade size is then looked up through
    # the LCA name stored in 'gained_in'.
    per_protein = gained.join(ortholog_counts)
    return per_protein.merge(clade_sizes, left_on='gained_in', right_index=True)


def find_conserved(df, min_fraction=0.95):
    """Flag proteins by the fraction of their clade in which they are present.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``number_orthologs`` and ``maximum_number``.
        The frame is not modified.
    min_fraction : float, default 0.95
        Threshold for the ``conserved_min_95percent`` flag. The column keeps
        that name whatever value is passed here.

    Returns
    -------
    pandas.DataFrame
        A new frame: ``df`` plus three columns --
        ``fraction`` (``number_orthologs / maximum_number``),
        ``strict_core`` (fraction equals 1.0) and
        ``conserved_min_95percent`` (fraction strictly greater than
        ``min_fraction``).
    """
    fraction = df['number_orthologs'] / df['maximum_number']
    # assign() works on a copy, so re-running the calling cell or passing a
    # slice of another frame never alters (or warns about) the caller's data.
    return df.assign(
        fraction=fraction,
        strict_core=fraction == 1.0,
        conserved_min_95percent=fraction > min_fraction,
    )

    
    
    
# Assemble the per-TF table and add the conservation flags.
summary = join_results(tfs_pp, uniprot2lca, lca2members)
overview = find_conserved(summary)
display(overview)
# The UniProt accessions are the frame's index, so the index has to be written
# (under an explicit column name); otherwise the exported rows cannot be
# attributed to any protein.
overview.to_csv(phylostrata_overview, sep='\t', index=True, index_label='uniprotid')

Unnamed: 0,gained_in,number_orthologs,maximum_number,fraction,strict_core,conserved_min_95percent
A0A087WUV0,Mammalia,142,169,0.840237,False,False
A0AVK6,Mammalia,169,169,1.000000,True,True
A0PJY2,Mammalia,168,169,0.994083,False,True
A1A519,Theria,163,167,0.976048,False,True
A1YPR0,Mammalia,168,169,0.994083,False,True
...,...,...,...,...,...,...
Q9Y6Q9,Mammalia,168,169,0.994083,False,True
Q9Y6R6,Mammalia,84,169,0.497041,False,False
Q9Y6X0,Mammalia,169,169,1.000000,True,True
Q9Y6X8,Mammalia,168,169,0.994083,False,True


## Filter PhyloProfile

In [13]:
# TFs that do not reach the conservation threshold.
non_conserved = overview.loc[~overview['conserved_min_95percent']]
print(Counter(non_conserved['gained_in']))
display(non_conserved)

# Keep the header plus every profile row whose first field is one of the
# non-conserved TFs.
with open(pp_path) as profile_in:
    kept_lines = [next(profile_in)]
    kept_lines.extend(
        row for row in profile_in
        if row.split()[0] in non_conserved.index
    )

# Write the reduced profile to non_conserved_phyloprofile.
with open(non_conserved_phyloprofile, 'w') as profile_out:
    profile_out.writelines(kept_lines)
        


Counter({'Mammalia': 378, 'Theria': 136, 'Eutheria': 54, 'Boreoeutheria': 11, 'Euarchontoglires': 3, 'Simiiformes': 1, 'Haplorrhini': 1, 'Primates': 1})


Unnamed: 0,gained_in,number_orthologs,maximum_number,fraction,conserved_min_95percent
A0A087WUV0,Mammalia,142,169,0.840237,False
A2RRD8,Mammalia,67,169,0.396450,False
A6NCS4,Mammalia,156,169,0.923077,False
A6NFQ7,Theria,104,167,0.622754,False
A6NGD5,Eutheria,139,161,0.863354,False
...,...,...,...,...,...
Q9Y5W3,Mammalia,150,169,0.887574,False
Q9Y603,Theria,104,167,0.622754,False
Q9Y651,Mammalia,145,169,0.857988,False
Q9Y6Q3,Mammalia,153,169,0.905325,False


In [12]:
# TFs whose ortholog fraction exceeds the conservation threshold.
conserved = overview.loc[overview['conserved_min_95percent']]
display(conserved)

# How the conserved TFs distribute over the LCA clades.
print(Counter(conserved['gained_in']))

Unnamed: 0,gained_in,number_orthologs,maximum_number,fraction,conserved_min_95percent
A0AVK6,Mammalia,169,169,1.000000,True
A0PJY2,Mammalia,168,169,0.994083,True
A1A519,Theria,163,167,0.976048,True
A1YPR0,Mammalia,168,169,0.994083,True
A2RU54,Mammalia,163,169,0.964497,True
...,...,...,...,...,...
Q9Y692,Mammalia,169,169,1.000000,True
Q9Y6Q9,Mammalia,168,169,0.994083,True
Q9Y6X0,Mammalia,169,169,1.000000,True
Q9Y6X8,Mammalia,168,169,0.994083,True


Counter({'Mammalia': 969, 'Theria': 36, 'Eutheria': 13, 'Homo sapiens': 1})


In [16]:
# TFs with an ortholog in every taxon of their clade (fraction == 1.0).
strict_core = overview.loc[overview['strict_core']]
print(Counter(strict_core['gained_in']))
display(strict_core)

Counter({'Mammalia': 330, 'Theria': 7, 'Eutheria': 2, 'Homo sapiens': 1})


Unnamed: 0,gained_in,number_orthologs,maximum_number,fraction,strict_core,conserved_min_95percent
A0AVK6,Mammalia,169,169,1.0,True,True
A6NI15,Mammalia,169,169,1.0,True,True
A6NNA5,Mammalia,169,169,1.0,True,True
O00321,Mammalia,169,169,1.0,True,True
O00409,Mammalia,169,169,1.0,True,True
...,...,...,...,...,...,...
Q9Y5R5,Mammalia,169,169,1.0,True,True
Q9Y5X4,Mammalia,169,169,1.0,True,True
Q9Y692,Mammalia,169,169,1.0,True,True
Q9Y6X0,Mammalia,169,169,1.0,True,True
