/
mirna_annotation.py
196 lines (149 loc) · 10.2 KB
/
mirna_annotation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
###########################################################################################
### Annotation : Python Script
###########################################################################################
### @author: Sunantha Sethuraman (November, 2018)
###
### Description: Annotate miRNA sequencing reads using miRBase and GENCODE annotation information.
### The annotated information was downloaded and pre-processed and the script is not compatible with user
### supplied annotation, unless the information is pre-processed the same way.
###
### Purpose: The primary purpose and scope of this script is to processes CPTAC miRNA sequencing data.
###
### Usage: mirna_annotation.py --sample <sample_name>
###########################################################################################
###########################################################################################
### Import modules
###########################################################################################
import pandas as pd
import re
import numpy as np
import string
import random
import argparse
###########################################################################################
### Parse arguments
###########################################################################################
parser = argparse.ArgumentParser(description='miRNA annotation pipeline')
parser.add_argument('--sample', type=str, help='sample name', required = True)
args = parser.parse_args()
s_name = args.sample
###########################################################################################
### Assign filenames and check if they exist
###########################################################################################
mirfile = s_name + "_mirbase.bed"
trnafile = s_name + "_tRNAs.bed"
exonfile = s_name + "_exons.bed"
unanfile = s_name + "_unannot.bed"
samfile = s_name + ".sam"
###########################################################################################
### Prioritization within miRNA annotations based on overlap
###########################################################################################
### All mature miRNA sequences will also be present in pre-miRNAs. Thus we need an extra overlap-based prioritization to distinguish mature miRNAs from pre-miRNAs.
### Mature form is prioritized over pre form.
with open(mirfile, "r") as file:
mirs = pd.read_csv(file, sep="\t", header=None)
mirs = mirs.iloc[:, [0,1,2,3,5,6,7,8,9,13,15,16]]
mirs.columns = ["Chr_read", "Start_read", "End_read", "Read_ID", "Strand", "Chr_ann", "Start_ann", "End_ann", "Annotation", "Class", "Tag", "Overlap" ]
mirs['Overlap_adj'] = mirs['Overlap']
mirs.loc[mirs['Class'] == "miRNA", 'Overlap_adj'] = mirs.loc[mirs['Class'] == "miRNA", 'Overlap_adj'] +3
mirs = mirs.sort_values(['Overlap_adj', 'Class'], ascending = [False, True]).groupby(['Chr_read', 'Start_read', 'End_read', 'Read_ID']).head(1)
mirs = mirs.drop('Overlap_adj', axis=1)
###########################################################################################
### tRNA annotation clean up
###########################################################################################
with open(trnafile, "r") as file:
trna = pd.read_csv(file, sep = "\t", header = None)
trna = trna.iloc[:, [0,1,2,3,5,6,7,8,9,13,15,16]]
trna.columns = ["Chr_read", "Start_read", "End_read", "Read_ID", "Strand", "Chr_ann", "Start_ann", "End_ann", "Annotation", "Class", "Tag", "Overlap" ]
trna.Annotation = [re.sub(".*gene_type=(.*?);.*", r"\1", x) for x in trna.Tag]
###########################################################################################
### Exon annotation clean up
###########################################################################################
with open(exonfile, "r") as file:
exon = pd.read_csv(file, sep = "\t", header = None)
exon = exon.iloc[:, [0,1,2,3,5,6,7,8,9,13,15,16]]
exon.columns = ["Chr_read", "Start_read", "End_read", "Read_ID", "Strand", "Chr_ann", "Start_ann", "End_ann", "Annotation", "Class", "Tag", "Overlap" ]
exon.Class = [re.sub(".*gene_type=(.*?);.*", r"\1", x) for x in exon.Tag]
###########################################################################################
### Unannotated reads clean up
###########################################################################################
with open(unanfile, "r") as file:
unan = pd.read_csv(file, sep="\t", header=None)
unan = unan.iloc[:, [0,1,2,3,5]]
unan.columns = ["Chr_read", "Start_read", "End_read", "Read_ID", "Strand"]
unan = unan.reindex(columns=["Chr_read", "Start_read", "End_read", "Read_ID", "Strand", "Chr_ann", "Start_ann", "End_ann", "Annotation", "Class", "Tag", "Overlap" ], fill_value=".")
unan.Class = "unannotated"
###########################################################################################
### Combine all annotations
###########################################################################################
comb = mirs.append(trna)
comb = comb.append(exon)
comb = comb.append(unan)
###########################################################################################
### Remove multiple annotations of the same mapping based on a set priority
###########################################################################################
rna_priorities = ['miRNA', 'miRNA_primary_transcript', 'tRNA', 'rRNA', 'sRNA', 'snRNA', 'snoRNA', 'scaRNA', 'scRNA', 'ribozyme', 'vaultRNA', 'Mt_tRNA', 'Mt_rRNA', 'protein_coding',
'3prime_overlapping_ncRNA', 'bidirectional_promoter_lncRNA', 'lincRNA','macro_lncRNA', 'misc_RNA', 'non_coding',
'antisense', 'sense_intronic', 'sense_overlapping',
'IG_C_gene', 'IG_D_gene', 'IG_J_gene', 'IG_V_gene', 'TR_C_gene', 'TR_D_gene', 'TR_J_gene', 'TR_V_gene', 'IG_C_pseudogene','IG_J_pseudogene', 'IG_V_pseudogene', 'IG_pseudogene', 'TR_J_pseudogene', 'TR_V_pseudogene',
'rRNA_pseudogene', 'processed_transcript', 'pseudogene', 'polymorphic_pseudogene', 'processed_pseudogene', 'transcribed_processed_pseudogene', 'transcribed_unitary_pseudogene', 'transcribed_unprocessed_pseudogene', 'translated_processed_pseudogene',
'unitary_pseudogene', 'unprocessed_pseudogene', 'TEC','unannotated']
comb['Class'] = pd.Categorical(comb['Class'], rna_priorities)
comb = comb.sort_values('Class', ascending = True).groupby(['Chr_read', 'Start_read', 'End_read', 'Read_ID']).head(1)
ann = comb.copy() # Save the annotation information at this stage for later use in annotating the SAM file
###########################################################################################
### Multi-mapping hit management : Multi-mappers within same annotation class
###########################################################################################
## For multi-mapping hits with the same annotation class, pick one random hit to keep
## The code below is in some sense equivalent to running : comb = comb.groupby(['Read_ID', 'Class']).apply(lambda x: x.sample(n=1))
## However this is computationally intensive, so we use the lengthy code below.
randomNum = list(range(0, comb.shape[0]))
random.shuffle(randomNum)
comb['randomNum'] = randomNum
mhits = comb[comb['Read_ID'].groupby(comb['Read_ID']).transform('size') >1]
mhits = mhits.sort_values('randomNum', ascending = True).groupby(['Read_ID', 'Class']).head(1)
comb = comb.loc[~ comb['Read_ID'].isin(mhits.Read_ID)]
comb = comb.append(mhits, ignore_index = True)
comb = comb.drop('randomNum', axis =1)
###########################################################################################
### Multi-mapping hit management : Multi-mappers across annotation classes
###########################################################################################
## For multi-mapping hits across annotation classes, use the RNA priorities as defined before
comb['Class'] = pd.Categorical(comb['Class'], rna_priorities) # Not necessary, better to ensure
comb = comb.sort_values('Class', ascending = True).groupby('Read_ID').head(1)
###########################################################################################
### Write annotation output as a TXT file
###########################################################################################
### In this output, each mapped read, has a single annotation. Multiple mapping and multiple annotations are not reflected in this file.
### For multiple-mapping annotation, see annotated SAM file generated below where are mapping is preserved.
comb.to_csv(s_name + "_Annotated.txt", sep='\t', index=False)
"""
###########################################################################################
### SAM file annotation
###########################################################################################
### Format annotation information for SAM annotation
ann = ann[['Read_ID', 'Chr_read', 'Start_read', 'Annotation', 'Class', 'Tag']]
ann.Class = ann.Class.astype(str)
ann.Start_read = ann.Start_read + 1
ann.Annotation = "YA:Z:"+ ann.Annotation # This structure is essential to make this information readable by samtools
ann.Class = "YC:Z:"+ ann.Class
ann.Tag = "YT:Z:" + ann.Tag
ann = ann.astype('str') # Pandas merge runs into problems unless dtypes match, this ensures that
### Read SAM file and format for annotation
with open(samfile, "r") as file:
sam = pd.read_csv(file, sep = "\t", names = list(string.ascii_lowercase)[0:20], engine = 'c', comment='@', dtype = 'str')
sam = sam.rename(index=str, columns={"a": "Read_ID", "c": "Chr_read", "d": "Start_read"})
sam = sam.astype('str')
### Annotation
sam_ann = pd.merge(sam, ann, 'left')
sam_ann = sam_ann.replace("nan", '', regex=True)
cols = sam_ann.columns.tolist() # SAM format does not support empty columns, this this rearrangement is essential
cols = cols[0:11] + cols[-3:] + cols[11:-3]
sam_ann = sam_ann[cols]
###########################################################################################
### Write annotation output as a SAM file
###########################################################################################
### In this output, each line represents a unique mapping. Some reads might have multiple (up to 10) lines (mappings).
### The output is NOT ready for viewing using samtools. The pipeline will manipulate this further to make it samtools compatible.
sam_ann.to_csv(s_name + "_Annotated.sam", sep='\t', index=False, header=False)
"""