In [1]:
from rodent_loss_src.rnaseq import split_up_down_regulated, parse_rnaseq, read_oma
from pathlib import Path
import pandas as pd
from collections import defaultdict

# Project root: the third ancestor of the current working directory, resolved
# at run time. The value therefore depends on where the kernel is started, and
# indexing parents[2] raises IndexError if the directory has fewer ancestors.
PROJECTDIR = Path('.').resolve().parents[2]

# Conditions analysed below: every organism is combined with every miRNA.
ORGANISMS = ["human", "mouse"]
MIRNAS = ["mir197", "mir769"]
# CSV whose 'HGNC symbol' column is read by load_htdb_tf_symbols.
htdb_path = f'{PROJECTDIR}/analyses/protein_loss/data/DatabaseExtract_v_1.01.csv'
# Template with three slots, filled below with (PROJECTDIR, organism, mirna).
dge_path = '{}/milestones/data/rnaseq/{}_{}_differential_genes.tsv'
# Template with two slots, both filled below with the organism name.
# NOTE(review): absolute path - only resolvable where this filesystem is mounted.
regnet_path = '/share/project/felixl/regnet/regNetwork/{}/{}.source'
# Passed to read_oma by find_expressed_genes.
omapath = f'{PROJECTDIR}/external_data/human_mouse_omapairwise.txt'
# Template with two slots, filled with (organism, mirna) in find_expressed_genes.
# Relative to the current working directory, unlike the PROJECTDIR-based paths.
rnaseq_path = '../data/{}_results_Neg_vs_{}.tsv'


def load_regnetwork(path):
    """Read a whitespace-separated regulator/target table into a mapping.

    Every non-blank line must hold exactly four whitespace-separated fields.
    The first field (regulator) and the third field (target) are upper-cased
    and stored; the second and fourth fields are ignored (presumably the
    identifiers belonging to the two names - TODO confirm against the file
    format). Blank and whitespace-only lines are skipped.

    Parameters
    ----------
    path : str or path-like
        Location of the file to read.

    Returns
    -------
    collections.defaultdict
        Maps each upper-cased regulator name to the set of its upper-cased
        target names. Because it is a defaultdict, indexing a missing key
        inserts an empty set; use ``in`` to test for membership.

    Raises
    ------
    ValueError
        If a non-blank line does not contain exactly four fields.
    """
    tf2target = defaultdict(set)
    with open(path) as fh:
        for lineno, line in enumerate(fh, start=1):
            fields = line.split()
            if not fields:
                # A blank line (e.g. a trailing empty line) carries no edge.
                continue
            if len(fields) != 4:
                raise ValueError(
                    f'{path}:{lineno}: expected 4 whitespace-separated fields, '
                    f'got {len(fields)}'
                )
            tf, _tf_id, target, _target_id = fields
            tf2target[tf.upper()].add(target.upper())
    return tf2target


def load_htdb_tf_symbols(path):
    """Collect the gene symbols of a CSV table in two spellings.

    Reads the ``HGNC symbol`` column of the CSV at ``path`` and returns every
    distinct symbol as written, plus its ``str.capitalize()`` form (first
    character upper-case, the rest lower-case). Missing values in the column
    are ignored.

    NOTE(review): every row of the file contributes a symbol. If the table
    also lists entries that are not transcription factors, they are included
    here as well - confirm against the file's schema.

    Parameters
    ----------
    path : str or path-like
        CSV file containing an ``HGNC symbol`` column.

    Returns
    -------
    set of str
        Union of the symbols as written and their capitalized variants.
    """
    df = pd.read_csv(path)
    # Empty cells are parsed as NaN (a float), which has no .capitalize();
    # drop them so one incomplete row cannot abort the whole load.
    human_symbols = set(df['HGNC symbol'].dropna().unique())
    mouse_symbols = {symbol.capitalize() for symbol in human_symbols}
    return human_symbols.union(mouse_symbols)


def find_expressed_genes(path, omapath, organism, mirna, min_baseMean=100):
    """Return the upper-cased gene names whose baseMean reaches a cutoff.

    ``path`` is a template that is formatted with ``organism`` and ``mirna``.
    The formatted path is handed to ``parse_rnaseq`` together with the result
    of ``read_oma(omapath)``. Rows whose ``baseMean`` is below
    ``min_baseMean`` are discarded, and the ``gene`` column of the remaining
    rows is returned upper-cased.
    """
    ortholog_pairs = read_oma(omapath)
    rnaseq_file = path.format(organism, mirna)
    table = parse_rnaseq(rnaseq_file, ortholog_pairs, organism, mirna)
    passes_cutoff = table['baseMean'] >= min_baseMean
    return table[passes_cutoff].gene.str.upper()


def correlate_targets_with_upregulation(down_tf2targets, up, down, expressed_genes):
    """Print overlap counts between the pooled TF targets and three gene sets.

    The target collections of all entries in ``down_tf2targets`` are merged
    into one set. The function then prints the number of TFs, the number of
    pooled targets, and how many of those targets also occur in
    ``expressed_genes``, in ``up`` and in ``down``, followed by an empty line.
    Nothing is returned.
    """
    # Merge the targets of every TF; with no TFs this is simply an empty set.
    all_targets = set().union(*down_tf2targets.values())

    expressed_targets = all_targets & set(expressed_genes)
    upregulated_targets = all_targets & set(up)
    secondary_targets = all_targets & set(down)

    print(f'Downregulated TFs with known targets: {len(down_tf2targets)}')
    print(f'Total known targets: {len(all_targets)}')
    print(f'Total expressed targets: {len(expressed_targets)}')
    print(f'Up-regulated known targets: {len(upregulated_targets)}')
    print(f'Down-regulated known targets: {len(secondary_targets)}')
    print()
    
    

# Symbol set used to decide which differentially expressed genes count as TFs.
htdb_tf_symbols = load_htdb_tf_symbols(htdb_path)
for organism in ORGANISMS:
    # The network file depends only on the organism, so it is read once per
    # organism and reused for both miRNAs.
    tf2target = load_regnetwork(regnet_path.format(organism, organism))
    for mirna in MIRNAS:
        condition = f'{organism}|{mirna}'
        print(condition)
        # Gene names passing the default baseMean cutoff for this condition.
        expressed_genes = find_expressed_genes(rnaseq_path, omapath, organism, mirna)

        # Down-regulated genes whose symbol occurs in the TF symbol set.
        up, down = split_up_down_regulated(dge_path.format(PROJECTDIR, organism, mirna))
        down_regulated_tfs = set(down).intersection(htdb_tf_symbols)

        # Keep only TFs present in the network. The explicit `in` test matters:
        # tf2target is a defaultdict, so indexing a missing key would insert it.
        # NOTE(review): network keys are upper-cased, while the TF symbol set
        # also holds capitalized forms; a TF only matches here if `down`
        # carries it in upper case - verify what split_up_down_regulated returns.
        down_tf2targets = {tf: tf2target[tf] for tf in down_regulated_tfs if tf in tf2target}

        # Print summary counts of how the targets of those TFs behave.
        correlate_targets_with_upregulation(down_tf2targets, up, down, expressed_genes)




        

human|mir197
Downregulated TFs with known targets: 8
Total known targets: 75
Total expressed targets: 54
Up-regulated known targets: 1
Down-regulated known targets: 4

human|mir769
Downregulated TFs with known targets: 7
Total known targets: 1012
Total expressed targets: 481
Up-regulated known targets: 4
Down-regulated known targets: 6

mouse|mir197
Downregulated TFs with known targets: 4
Total known targets: 43
Total expressed targets: 26
Up-regulated known targets: 1
Down-regulated known targets: 0

mouse|mir769
Downregulated TFs with known targets: 5
Total known targets: 35
Total expressed targets: 21
Up-regulated known targets: 0
Down-regulated known targets: 0



In [2]:
from collections import defaultdict

# Path of the human network file, without format slots.
# NOTE(review): this rebinds `regnet_path`, which the first cell defines as a
# template with two `{}` slots; the path is absolute and therefore only
# resolvable where this filesystem is mounted.
regnet_path = '/share/project/felixl/regnet/regNetwork/human/human.source'

def load_regnetwork(path):
    """Read a whitespace-separated regulator/target table into a mapping.

    Every non-blank line must hold exactly four whitespace-separated fields.
    The first field (regulator) and the third field (target) are upper-cased
    and stored; the second and fourth fields are ignored (presumably the
    identifiers belonging to the two names - TODO confirm against the file
    format). Blank and whitespace-only lines are skipped.

    Parameters
    ----------
    path : str or path-like
        Location of the file to read.

    Returns
    -------
    collections.defaultdict
        Maps each upper-cased regulator name to the set of its upper-cased
        target names. Because it is a defaultdict, indexing a missing key
        inserts an empty set; use ``in`` to test for membership.

    Raises
    ------
    ValueError
        If a non-blank line does not contain exactly four fields.
    """
    tf2target = defaultdict(set)
    with open(path) as fh:
        for lineno, line in enumerate(fh, start=1):
            fields = line.split()
            if not fields:
                # A blank line (e.g. a trailing empty line) carries no edge.
                continue
            if len(fields) != 4:
                raise ValueError(
                    f'{path}:{lineno}: expected 4 whitespace-separated fields, '
                    f'got {len(fields)}'
                )
            tf, _tf_id, target, _target_id = fields
            tf2target[tf.upper()].add(target.upper())
    return tf2target


# Load the human network and show the name of every regulator it contains.
# NOTE(review): all keys are printed at once, which produces one very long
# output line.
tf2target = load_regnetwork(regnet_path)
print(tf2target.keys())

dict_keys(['USF1', 'TP53', 'TFAP2A', 'STAT5A', 'STAT3', 'STAT1', 'SRF', 'SP1', 'RELA', 'PPARG', 'NFKB1', 'MYC', 'MAX', 'JUN', 'FOS', 'ELK1', 'EGR1', 'E2F1', 'CREB1', 'YY1', 'TFAP2C', 'STAT6', 'STAT5B', 'STAT4', 'STAT2', 'SPI1', 'SMAD4', 'SMAD3', 'SMAD1', 'RXRA', 'RORA', 'REST', 'RELB', 'REL', 'PPARD', 'PPARA', 'POU2F2', 'POU2F1', 'PAX6', 'PAX5', 'NFYA', 'NFKB2', 'NFIC', 'MYB', 'LEF1', 'HNF4A', 'HNF1A', 'HIF1A', 'GLI2', 'GATA2', 'GATA1', 'ETS1', 'ESR1', 'EPAS1', 'ELK4', 'EGR3', 'EGR2', 'E2F4', 'E2F2', 'CEBPB', 'CEBPA', 'ATF4', 'ATF2', 'ATF1', 'AR', 'ZIC2', 'ZEB1', 'ZBTB6', 'ZBTB17', 'ZBTB16', 'XBP1', 'WT1', 'VSX2', 'USF2', 'U2AF1', 'TP73', 'TP63', 'TOPORS', 'TLX2', 'TLX1', 'TGIF1', 'TFDP2', 'TFDP1', 'TFAP4', 'TFAP2B', 'TCF7L2', 'TCF7L1', 'TCF7', 'TCF4', 'TCF3', 'TBP', 'TAL1', '2623', 'SRY', 'SREBF2', 'SREBF1', 'SP4', 'SP3', 'SP2', 'SOX9', 'SOS2', 'SOS1', 'SMAD9', 'SMAD7', 'SMAD6', 'SMAD5', 'SMAD2', 'RXRG', 'RXRB', '5915', '7421', 'RUNX1T1', 'RUNX1', 'RREB1', 'RORC', 'RORB', 'RFX1', 'RBP