# Sample solution

In [3]:
# Build up the miR-to-genes & gene-to-miRs dicts from the two-column TSV:
#   miR2genes: miR family -> set of target genes
#   gene2miRs: gene       -> set of miR families targeting it
miR2genes = {}
gene2miRs = {}
with open('hw4.tsv') as tsv:  # `with` closes the file handle once the loop is done
    next(tsv, None)  # skip the first line (presumably a header row); no error on an empty file
    for line in tsv:  # stream line by line instead of loading the whole file
        line = line.rstrip()
        if not line:
            continue  # tolerate blank lines (e.g. at EOF) instead of failing the unpack below
        miR, gene = line.split('\t')
        miR2genes.setdefault(miR, set()).add(gene)
        gene2miRs.setdefault(gene, set()).add(miR)

`miR2genes` is a dict whose keys are miR families and whose values are the corresponding sets of target genes. E.g.,

In [5]:
# Display the set of target genes stored for one miR family (bare last expression -> cell output).
miR2genes['miR-124/124ab/506']

{'SSPN',
 'PGM2L1',
 'AGL',
 'NIPA1',
 'SSFA2',
 'SLC50A1',
 'PIK3CA',
 'EN2',
 'ZNF704',
 'RNF114',
 'SPTLC2',
 'SPTLC1',
 'ABCD1',
 'GRIN1',
 'FOXQ1',
 'FAM59A',
 'SP1',
 'SP2',
 'SP3',
 'SP7',
 'POGLUT1',
 'PARP16',
 'KLLN',
 'P4HA1',
 'SPPL2A',
 'NTHL1',
 'ITGA3',
 'ITGA6',
 'ITGA7',
 'DENND4C',
 'CYP2U1',
 'C15orf17',
 'CSDE1',
 'BACH2',
 'DGAT2',
 'HRH1',
 'FAM116A',
 'SDK1',
 'THSD4',
 'LIN28B',
 'COL4A1',
 'CHST1',
 'ANAPC7',
 'RPS6KA1',
 'RPS6KA4',
 'SFT2D3',
 'SFT2D2',
 'ATAD1',
 'GIT2',
 'TMEM198',
 'HLF',
 'FAM134B',
 'ZNF292',
 'SMAD4',
 'SMAD5',
 'SMAD2',
 'ZNF706',
 'AMMECR1L',
 'SHROOM4',
 'SNTB2',
 'GDAP2',
 'ILDR2',
 'GLTP',
 'MYLK4',
 'MYLK2',
 'BARX2',
 'SH2B3',
 'NOL4',
 'SAR1B',
 'PQLC3',
 'SRPK3',
 'SMCR8',
 'RAB11FIP1',
 'RAB11FIP5',
 'RAB11FIP4',
 'DCAKD',
 'ATP1A1',
 'SPHK1',
 'CPEB1',
 'CHP2',
 'ARL10',
 'EEA1',
 'CLIP3',
 'CDCA7',
 'ETNK1',
 'FGFR1OP',
 'SASH1',
 'THAP2',
 'THAP3',
 'GPT2',
 'ANKFY1',
 'CYB5A',
 'FAR1',
 'CALCOCO1',
 'ATP6V0E1',
 'IGDCC4',
 

In [1]:
# get the miR w/ max out-degree (out-degree = number of target genes of a miR family)
miR2outDeg = {k: len(v) for k, v in miR2genes.items()}  # this is called "dict comprehension", a cool way to build up a dict
# Sort the keys by their values and take the last one; sorted() is stable,
# so on a tie the key that comes last in dict iteration order wins.
miR = sorted(miR2outDeg, key=miR2outDeg.get)[-1]

# Single-argument print(...) is valid both as a Python 2 statement and as a
# Python 3 function call, and prints the same text in either.
print(miR)
print(miR2outDeg[miR])


# get average out-degree
def avg(L):
    """Return the arithmetic mean of the numbers in L as a float."""
    return float(sum(L)) / len(L)  # float() keeps Python 2 from doing integer division


print('%.2f' % avg(miR2outDeg.values()))

miR-124/124ab/506
1791
585.56


In [7]:
# get the gene w/ max in-degree (in-degree = number of miR families targeting a gene)
gene2inDeg = {k: len(v) for k, v in gene2miRs.items()}
# sorted() is stable, so on a tie the key that comes last in dict iteration order wins
gene = sorted(gene2inDeg, key=gene2inDeg.get)[-1]

# Single-argument print(...) runs unchanged on both Python 2 and Python 3.
print(gene)
print(gene2inDeg[gene])

# get average in-degree (avg is defined in the out-degree cell)
print('%.2f' % avg(gene2inDeg.values()))

TNRC6B
80
7.34


In [10]:
# get the miR-pair w/ max shared targets
# Every ordered pair of distinct miR families is scored, so (a, b) and (b, a)
# always tie; which orientation is reported depends on dict iteration order.
# No backslashes are needed: a comprehension may span lines inside its braces.
pair2size = {(k1, k2): len(v1 & v2)
             for k1, v1 in miR2genes.items()
             for k2, v2 in miR2genes.items()
             if k1 != k2}
pair = sorted(pair2size, key=pair2size.get)[-1]

# '%s %s' reproduces the output of `print a, b` and works on Python 2 and 3 alike.
print('%s %s' % (pair[0], pair[1]))
print(pair2size[pair])

miR-182 miR-96/507/1271
881


# Alternative ways

In [11]:
# use defaultdict to build the dicts
from collections import defaultdict

# defaultdict(set) creates the empty set on first access, so no setdefault() is needed.
# Note: merely looking up a missing key later also inserts an empty set for it.
miR2genes = defaultdict(set)
gene2miRs = defaultdict(set)
with open('hw4.tsv') as tsv:  # `with` closes the file handle once the loop is done
    next(tsv, None)  # skip the first line (presumably a header row); no error on an empty file
    for line in tsv:
        line = line.strip()
        if not line:
            continue  # tolerate blank lines instead of failing the unpack below
        miR, gene = line.split('\t')
        miR2genes[miR].add(gene)
        gene2miRs[gene].add(miR)

In [12]:
# use combinations to get all possible pairs
from itertools import combinations

# combinations(miR2genes, 2) yields each unordered pair of keys exactly once,
# so every intersection is computed a single time.
pair2size = {(x, y): len(miR2genes[x] & miR2genes[y])
             for x, y in combinations(miR2genes, 2)}
pair = sorted(pair2size, key=pair2size.get)[-1]

# '%s %s' reproduces the output of `print a, b` and works on Python 2 and 3 alike.
print('%s %s' % (pair[0], pair[1]))
print(pair2size[pair])

miR-96/507/1271 miR-182
881


# Wrap everything into a function

In [22]:
def print_network_info(taxid='9606', path='Predicted_Targets_Info.txt'):
    """Print the miR regulatory network info for the given tax ID.

    Reads a tab-separated file in which column 0 is the miR family, column 2
    the target gene and column 4 the tax ID. Rows whose tax ID equals `taxid`
    form the network; lines with fewer than five fields are ignored.

    Prints, one value per line:
        1. the miR family with the max out-degree (most target genes)
        2. that out-degree
        3. the average out-degree, with two decimals
        4. the gene with the max in-degree (targeted by most miR families)
        5. that in-degree
        6. the average in-degree, with two decimals
        7. the two miR families sharing the most targets, space-separated
        8. the number of targets they share
    Lines 7-8 are omitted when the network has fewer than two miR families.
    Ties are resolved by dict iteration order (the last maximal key wins).

    Args:
        taxid: tax ID as a string; compared verbatim with column 4.
        path: file to read; defaults to the file name used originally.

    Raises:
        ValueError: if no row of the file matches `taxid`.
    """
    # build up the miR-to-genes & gene-to-miRs dicts
    miR2genes = {}
    gene2miRs = {}
    with open(path) as table:  # `with` guarantees the file handle is closed
        for line in table:
            fields = line.rstrip().split('\t')
            # Short/blank lines have no tax ID column; skip them rather than raise IndexError.
            if len(fields) < 5 or fields[4] != taxid:
                continue
            miR, gene = fields[0], fields[2]
            miR2genes.setdefault(miR, set()).add(gene)
            gene2miRs.setdefault(gene, set()).add(miR)

    if not miR2genes:
        # Without this guard the code below fails with an opaque
        # "list index out of range" for an unknown or mistyped tax ID.
        raise ValueError('no targets found for tax ID %r in %s' % (taxid, path))

    def top_key(scores):
        # Key with the largest value; sorted() is stable, so the last maximal key wins.
        return sorted(scores, key=scores.get)[-1]

    def avg(values):
        # float() keeps Python 2 from doing integer division.
        return float(sum(values)) / len(values)

    # Single-argument print(...) behaves identically on Python 2 and Python 3.

    # get the miR w/ max out-degree
    miR2outDeg = {k: len(v) for k, v in miR2genes.items()}
    miR = top_key(miR2outDeg)
    print(miR)
    print(miR2outDeg[miR])

    # get average out-degree
    print('%.2f' % avg(miR2outDeg.values()))

    # get the gene w/ max in-degree
    gene2inDeg = {k: len(v) for k, v in gene2miRs.items()}
    gene = top_key(gene2inDeg)
    print(gene)
    print(gene2inDeg[gene])

    # get average in-degree
    print('%.2f' % avg(gene2inDeg.values()))

    # get the miR-pair w/ max shared targets
    # Every ordered pair is scored, so (a, b) and (b, a) always tie; the
    # orientation that gets printed depends on dict iteration order.
    pair2size = {(k1, k2): len(v1 & v2)
                 for k1, v1 in miR2genes.items()
                 for k2, v2 in miR2genes.items()
                 if k1 != k2}
    if not pair2size:
        return  # a single miR family has no partner to pair with

    pair = top_key(pair2size)
    print('%s %s' % (pair[0], pair[1]))
    print(pair2size[pair])

In [24]:
# Run the report with the function's default tax ID ('9606').
print_network_info()

miR-124/124ab/506
1654
475.62
TNRC6B
74
6.52
miR-182 miR-96/507/1271
762


In [26]:
# Ask for the tax ID interactively. raw_input() only exists on Python 2 (it was
# renamed input() in Python 3), so pick whichever one is available.
try:
    ask = raw_input
except NameError:
    ask = input
# strip() drops stray whitespace that would otherwise prevent any tax ID from matching
print_network_info(ask("Tax ID:").strip())

Tax ID:9913
miR-124/124ab/506
1526
435.16
TNRC6B
71
6.32
miR-182 miR-96/507/1271
704
