# MAF file and miRNA hit table

Read the maf file in the same folder. Give me a table listing all hit and non-hit miRNAs and the number of mutations on them. For example,

|miR|#hits|
|---|---|
|miR1|2|
|miR2|0|
|miR3|5|

##Reading MAF

In [1]:
import pandas as pd
from pylab import *

FILE = "hgsc.bcm.edu__Mixed_curated_DNA_sequencing_level2.maf"

In [2]:
maf = pd.read_table(FILE, usecols=["Hugo_Symbol", "Chrom", "Start_Position", "End_Position", "Tumor_Sample_Barcode"])
maf.head()

Unnamed: 0,Hugo_Symbol,Chrom,Start_Position,End_Position,Tumor_Sample_Barcode
0,A1BG,19,58864353,58864353,TCGA-OR-A5KB-01A-11D-A30A-10
1,A1CF,10,52573773,52573773,TCGA-OR-A5KB-01A-11D-A30A-10
2,A2ML1,12,8995922,8995922,TCGA-OR-A5J4-01A-11D-A29I-10
3,A4GALT,22,43089055,43089055,TCGA-OR-A5JY-01A-31D-A29I-10
4,A4GALT,22,43089757,43089757,TCGA-PK-A5HB-01A-11D-A29I-10


In [5]:
# for demo only (keep only M* genes)
#maf = maf.iloc[9848:11105]
#maf.head()

In [3]:
# how many mutation records are there
print len(maf)

20166


##Reading miRBase

In [8]:
FILE = "../hw6/hsa.gff3"
df = pd.read_table(FILE, comment="#", usecols=["Chromosome", "Start", "End", "Type", "miR Name"])
df.head()

Unnamed: 0,Chromosome,Type,Start,End,miR Name
0,chr1,miRNA_primary_transcript,17369,17436,ID=MI0022705;Alias=MI0022705;Name=hsa-mir-6859-1
1,chr1,miRNA,17409,17431,ID=MIMAT0027618;Alias=MIMAT0027618;Name=hsa-mi...
2,chr1,miRNA,17369,17391,ID=MIMAT0027619;Alias=MIMAT0027619;Name=hsa-mi...
3,chr1,miRNA_primary_transcript,30366,30503,ID=MI0006363;Alias=MI0006363;Name=hsa-mir-1302-2
4,chr1,miRNA,30438,30458,ID=MIMAT0005890;Alias=MIMAT0005890;Name=hsa-mi...


In [13]:
# break df into chromosomes for speeding
chr2df = {}
for chr in [str(i) for i in range(1,23)] + ["X", "Y"]:
    chr2df[chr] = df[df["Chromosome"] == "chr" + chr]
    print "chr%s\thas %d\tmiRNA records" % (chr, len(chr2df[chr]))

chr1	has 398	miRNA records
chr2	has 271	miRNA records
chr3	has 227	miRNA records
chr4	has 144	miRNA records
chr5	has 175	miRNA records
chr6	has 162	miRNA records
chr7	has 198	miRNA records
chr8	has 224	miRNA records
chr9	has 207	miRNA records
chr10	has 167	miRNA records
chr11	has 255	miRNA records
chr12	has 186	miRNA records
chr13	has 97	miRNA records
chr14	has 258	miRNA records
chr15	has 158	miRNA records
chr16	has 194	miRNA records
chr17	has 284	miRNA records
chr18	has 83	miRNA records
chr19	has 376	miRNA records
chr20	has 115	miRNA records
chr21	has 54	miRNA records
chr22	has 118	miRNA records
chrX	has 310	miRNA records
chrY	has 4	miRNA records


##Defining a searching function

In [48]:
def pos2miR(chr, start, end):
    chunk = chr2df[str(chr)]  # shrink the search space to the given chr
    out = []
    for idx, row in chunk.iterrows():
        if row["Start"] <= start <= row["End"] or \
           row["Start"] <=  end  <= row["End"] or \
           (start < row["Start"] and row["End"] < end):
            # get the name (either hsa-mir-### or hsa-miR-###)
            miR = row["Type"] + ";" + str(row["Start"]) + ";" + str(row["End"]) + ";" + row["miR Name"].split("Name=")[1].split(";")[0]
            out.append(miR)
    return out

In [49]:
# test
pos2miR(1, 17370, 17370)

['miRNA_primary_transcript;17369;17436;hsa-mir-6859-1',
 'miRNA;17369;17391;hsa-miR-6859-3p']

In [50]:
# test2
pos2miR("Y", 2477271, 2477370)

['miRNA_primary_transcript;2477232;2477295;hsa-mir-6089-2',
 'miRNA;2477270;2477293;hsa-miR-6089']

##Searching for hits

In [90]:
c = pd.DataFrame()
ms = pd.DataFrame()
me = pd.DataFrame()
rs = pd.DataFrame()
re = pd.DataFrame()
t = pd.DataFrame()
i = pd.DataFrame()
s = pd.DataFrame()
from collections import Counter
miR2hits = Counter()
for idx, row in maf.iterrows():
    print idx, '\r',
    hits = pos2miR(row["Chrom"], row["Start_Position"], row["End_Position"])
    for hit in hits:
        print row["Hugo_Symbol"], row["Chrom"], row["Start_Position"], row["End_Position"], row["Tumor_Sample_Barcode"], "hits on", hit
        c = c.append([row["Chrom"]])
        ms = ms.append([row["Start_Position"]])
        me = me.append([row["End_Position"]])
        rs = rs.append([hit.split(";")[1]])
        re = re.append([hit.split(";")[2]])
        t = t.append([hit.split(";")[0]])
        i = i.append([hit.split(";")[3]])
        s = s.append([row["Tumor_Sample_Barcode"]])
        miR2hits[hit] += 1

MIR124-3 20 61809858 61809858 TCGA-OR-A5JW-01A-11D-A29I-10 hits on miRNA_primary_transcript;61809852;61809938;hsa-mir-124-3
MIR135B 1 205417469 205417469 TCGA-PK-A5HB-01A-11D-A29I-10 hits on miRNA_primary_transcript;205417430;205417526;hsa-mir-135b
MIR135B 1 205417469 205417469 TCGA-PK-A5HB-01A-11D-A29I-10 hits on miRNA;205417451;205417472;hsa-miR-135b-3p
MIR144 17 27188561 27188561 TCGA-OR-A5J5-01A-11D-A29I-10 hits on miRNA_primary_transcript;27188551;27188636;hsa-mir-144
MIR1469 15 96876498 96876498 TCGA-OR-A5KB-01A-11D-A30A-10 hits on miRNA_primary_transcript;96876490;96876536;hsa-mir-1469
MIR1469 15 96876498 96876498 TCGA-OR-A5KB-01A-11D-A30A-10 hits on miRNA;96876490;96876511;hsa-miR-1469
MIR192 11 64658676 64658676 TCGA-OR-A5K4-01A-11D-A29I-10 hits on miRNA_primary_transcript;64658609;64658718;hsa-mir-192
MIR192 11 64658676 64658676 TCGA-OR-A5K4-01A-11D-A29I-10 hits on miRNA;64658675;64658695;hsa-miR-192-5p
MIR193A 17 29887061 29887061 TCGA-OR-A5KX-01A-11D-A29I-10 hits on miRNA_p

In [91]:
pd.Series(miR2hits)

miRNA;102734757;102734781;hsa-miR-608                          1
miRNA;146340329;146340349;hsa-miR-509-5p                       1
miRNA;146341214;146341235;hsa-miR-509-3-5p                     1
miRNA;151128150;151128171;hsa-miR-452-5p                       1
miRNA;1765412;1765432;hsa-miR-596                              1
miRNA;205417451;205417472;hsa-miR-135b-3p                      1
miRNA;34922970;34922991;hsa-miR-6501-5p                        1
miRNA;54185467;54185488;hsa-miR-520f-3p                        1
miRNA;54210722;54210743;hsa-miR-520c-5p                        1
miRNA;54215536;54215557;hsa-miR-517-5p                         1
miRNA;64658675;64658695;hsa-miR-192-5p                         1
miRNA;64658840;64658861;hsa-miR-194-3p                         1
miRNA;96876490;96876511;hsa-miR-1469                           1
miRNA_primary_transcript;100549014;100549089;hsa-mir-875       1
miRNA_primary_transcript;101351039;101351120;hsa-mir-136       1
miRNA_primary_transcript;

In [92]:
c = c.append(["17", "3"])
ms = ms.append(["67302911", "132321037"])
me = me.append(["67302912", "132321037"])
rs = rs.append(["67302907", "132321033"])
re = re.append(["67302913", "132321039"])
t = t.append(["polyA signal", "polyA signal"])
i = i.append(["535691", "501672"])
s = s.append(["TCGA-OR-A5JI-01A-11D-A29I-10", "TCGA-PK-A5HB-01A-11D-A29I-10"])
c.rename(columns={0:"Chrom"}, inplace =True)
ms.rename(columns={0:"mStart"}, inplace=True)
me.rename(columns={0:"mEnd"}, inplace=True)
rs.rename(columns={0:"rStart"}, inplace=True)
re.rename(columns={0:"rEnd"}, inplace=True)
t.rename(columns={0:"Type"}, inplace=True)
i.rename(columns={0:"ID"}, inplace=True)
s.rename(columns={0:"Tumor Sample Barcode"}, inplace=True)
final = pd.concat([c,ms,me,rs,re,t,i,s], axis=1)
final = final.reset_index(drop=True)
final

Unnamed: 0,Chrom,mStart,mEnd,rStart,rEnd,Type,ID,Tumor Sample Barcode
0,20,61809858,61809858,61809852,61809938,miRNA_primary_transcript,hsa-mir-124-3,TCGA-OR-A5JW-01A-11D-A29I-10
1,1,205417469,205417469,205417430,205417526,miRNA_primary_transcript,hsa-mir-135b,TCGA-PK-A5HB-01A-11D-A29I-10
2,1,205417469,205417469,205417451,205417472,miRNA,hsa-miR-135b-3p,TCGA-PK-A5HB-01A-11D-A29I-10
3,17,27188561,27188561,27188551,27188636,miRNA_primary_transcript,hsa-mir-144,TCGA-OR-A5J5-01A-11D-A29I-10
4,15,96876498,96876498,96876490,96876536,miRNA_primary_transcript,hsa-mir-1469,TCGA-OR-A5KB-01A-11D-A30A-10
5,15,96876498,96876498,96876490,96876511,miRNA,hsa-miR-1469,TCGA-OR-A5KB-01A-11D-A30A-10
6,11,64658676,64658676,64658609,64658718,miRNA_primary_transcript,hsa-mir-192,TCGA-OR-A5K4-01A-11D-A29I-10
7,11,64658676,64658676,64658675,64658695,miRNA,hsa-miR-192-5p,TCGA-OR-A5K4-01A-11D-A29I-10
8,17,29887061,29887061,29887015,29887102,miRNA_primary_transcript,hsa-mir-193a,TCGA-OR-A5KX-01A-11D-A29I-10
9,11,64658846,64658846,64658827,64658911,miRNA_primary_transcript,hsa-mir-194-2,TCGA-OR-A5LB-01A-11D-A29I-10


In [93]:
final.to_csv("hittable.txt", sep="\t")

In [94]:
final.to_excel("hittable.xls")