# MAF file and miRNA hit table

Read the maf file in the same folder. Give me a table listing all hit and non-hit miRNAs and the number of mutations on them. For example,

|miR|#hits|
|---|---|
|miR1|2|
|miR2|0|
|miR3|5|

##Reading MAF

In [1]:
import pandas as pd
from pylab import *

FILE = "hgsc.bcm.edu__Mixed_curated_DNA_sequencing_level2.maf"

In [2]:
maf = pd.read_table(FILE, usecols=["Hugo_Symbol", "Chrom", "Start_Position", "End_Position"])
maf.head()

Unnamed: 0,Hugo_Symbol,Chrom,Start_Position,End_Position
0,A1BG,19,58864353,58864353
1,A1CF,10,52573773,52573773
2,A2ML1,12,8995922,8995922
3,A4GALT,22,43089055,43089055
4,A4GALT,22,43089757,43089757


In [3]:
# for demo only (keep only M* genes)
#maf = maf.iloc[9848:11105]
#maf.head()

In [4]:
# how many mutation records are there
print len(maf)

20166


##Reading miRBase

In [5]:
FILE = "../hw6/hsa.gff3"
df = pd.read_table(FILE, comment="#", usecols=["Chromosome", "Start", "End", "miR Name"])
df.head()

Unnamed: 0,Chromosome,Start,End,miR Name
0,chr1,17369,17436,ID=MI0022705;Alias=MI0022705;Name=hsa-mir-6859-1
1,chr1,17409,17431,ID=MIMAT0027618;Alias=MIMAT0027618;Name=hsa-mi...
2,chr1,17369,17391,ID=MIMAT0027619;Alias=MIMAT0027619;Name=hsa-mi...
3,chr1,30366,30503,ID=MI0006363;Alias=MI0006363;Name=hsa-mir-1302-2
4,chr1,30438,30458,ID=MIMAT0005890;Alias=MIMAT0005890;Name=hsa-mi...


In [6]:
# break df into chromosomes for speeding
chr2df = {}
for chr in [str(i) for i in range(1,23)] + ["X", "Y"]:
    chr2df[chr] = df[df["Chromosome"] == "chr" + chr]
    print "chr%s\thas %d\tmiRNA records" % (chr, len(chr2df[chr]))

chr1	has 398	miRNA records
chr2	has 271	miRNA records
chr3	has 227	miRNA records
chr4	has 144	miRNA records
chr5	has 175	miRNA records
chr6	has 162	miRNA records
chr7	has 198	miRNA records
chr8	has 224	miRNA records
chr9	has 207	miRNA records
chr10	has 167	miRNA records
chr11	has 255	miRNA records
chr12	has 186	miRNA records
chr13	has 97	miRNA records
chr14	has 258	miRNA records
chr15	has 158	miRNA records
chr16	has 194	miRNA records
chr17	has 284	miRNA records
chr18	has 83	miRNA records
chr19	has 376	miRNA records
chr20	has 115	miRNA records
chr21	has 54	miRNA records
chr22	has 118	miRNA records
chrX	has 310	miRNA records
chrY	has 4	miRNA records


##Defining a searching function

In [7]:
def pos2miR(chr, start, end):
    chunk = chr2df[str(chr)]  # shrink the search space to the given chr
    out = []
    for idx, row in chunk.iterrows():
        if row["Start"] <= start <= row["End"] or \
           row["Start"] <=  end  <= row["End"] or \
           (start < row["Start"] and row["End"] < end):
            # get the name (either hsa-mir-### or hsa-miR-###)
            miR = row["miR Name"].split("Name=")[1].split(";")[0]
            out.append(miR)
    return out

In [8]:
# test
pos2miR(1, 17370, 17370)

['hsa-mir-6859-1', 'hsa-miR-6859-3p']

In [9]:
# test2
pos2miR("Y", 2477271, 2477370)

['hsa-mir-6089-2', 'hsa-miR-6089']

##Searching for hits

In [10]:
from collections import Counter
miR2hits = Counter()
for idx, row in maf.iterrows():
    print idx, '\r',
    hits = pos2miR(row["Chrom"], row["Start_Position"], row["End_Position"])
    for hit in hits:
        print row["Hugo_Symbol"], row["Chrom"], row["Start_Position"], row["End_Position"], "hits on", hit
        miR2hits[hit] += 1

MIR124-3 20 61809858 61809858 hits on hsa-mir-124-3
MIR135B 1 205417469 205417469 hits on hsa-mir-135b
MIR135B 1 205417469 205417469 hits on hsa-miR-135b-3p
MIR144 17 27188561 27188561 hits on hsa-mir-144
MIR1469 15 96876498 96876498 hits on hsa-mir-1469
MIR1469 15 96876498 96876498 hits on hsa-miR-1469
MIR192 11 64658676 64658676 hits on hsa-mir-192
MIR192 11 64658676 64658676 hits on hsa-miR-192-5p
MIR193A 17 29887061 29887061 hits on hsa-mir-193a
MIR194-2 11 64658846 64658846 hits on hsa-mir-194-2
MIR194-2 11 64658846 64658846 hits on hsa-miR-194-3p
MIR2053 8 113655752 113655752 hits on hsa-mir-2053
MIR218-1 4 20529995 20529995 hits on hsa-mir-218-1
MIR298 20 57393302 57393302 hits on hsa-mir-298
MIR323B 14 101522556 101522556 hits on hsa-mir-323b
MIR34C 11 111384236 111384236 hits on hsa-mir-34c
MIR448 X 114058112 114058112 hits on hsa-mir-448
MIR452 X 151128149 151128149 hits on hsa-mir-452
MIR452 X 151128169 151128169 hits on hsa-mir-452
MIR452 X 151128169 151128169 hits on hsa-m

In [11]:
pd.Series(miR2hits)

hsa-miR-135b-3p      1
hsa-miR-1469         1
hsa-miR-192-5p       1
hsa-miR-194-3p       1
hsa-miR-452-5p       1
hsa-miR-509-3-5p     1
hsa-miR-509-5p       1
hsa-miR-517-5p       1
hsa-miR-520c-5p      1
hsa-miR-520f-3p      1
hsa-miR-596          1
hsa-miR-608          1
hsa-miR-6501-5p      1
hsa-mir-124-3        1
hsa-mir-135b         1
hsa-mir-136          1
hsa-mir-144          1
hsa-mir-1469         1
hsa-mir-192          1
hsa-mir-193a         1
hsa-mir-194-2        1
hsa-mir-2053         1
hsa-mir-218-1        1
hsa-mir-298          1
hsa-mir-323b         1
hsa-mir-34c          1
hsa-mir-448          1
hsa-mir-452          2
hsa-mir-504          1
hsa-mir-506          2
hsa-mir-509-2        1
hsa-mir-509-3        1
hsa-mir-512-1        1
hsa-mir-512-2        2
hsa-mir-517a         1
hsa-mir-520c         1
hsa-mir-520f         1
hsa-mir-542          1
hsa-mir-564          2
hsa-mir-596          1
hsa-mir-608          1
hsa-mir-6084         3
hsa-mir-639          1
hsa-mir-650