/
ConvertFASTAdnaSEQtoRNA.py
193 lines (122 loc) · 6.63 KB
/
ConvertFASTAdnaSEQtoRNA.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
#ConvertFASTAdnaSEQtoRNA.py by Wayne Decatur
#ver 0.1
#
#
# To GET HELP/MANUAL, enter on command line:
# python ConvertFASTAdnaSEQtoRNA.py --help
#
#*************************************************************************
#USES Python 2.7
# Purpose: Takes as input a file of nucleotide sequences in FASTA format
# from NCBI and changes all 'T's to 'U's in the sequence, effectively
# changing it to mRNA sequence. Importantly, it doesn't alter the description
# line of the FASTA entries, i.e., the line beginning '>' before the sequence.
# Note there is no check of whether these are protein or nucleic acid sequences.
# Thus it will also change T to U in protein sequences if you give it those.
#
#
#
#
# TO RUN:
# For example, enter on the command line, the line
#-----------------------------------
# python ConvertFASTAdnaSEQtoRNA.py <your-FASTA-file>
#-----------------------------------
# replacing <your-FASTA-file> with your file name.
# Easiest use is to put that file in same directory with script.
#
#
#
#*************************************************************************
##################################
# USER ADJUSTABLE VALUES #
##################################
#
#
#
#*******************************************************************************
#*******************************************************************************
#*******************************************************************************
#*******************************************************************************
###DO NOT EDIT BELOW HERE - ENTER VALUES ABOVE###
import os
import sys
import logging
import argparse
from argparse import RawTextHelpFormatter
from Bio import Entrez
import urllib
import re
import time
#import gzip
#DEBUG CONTROL
#comment line below to turn off debug print statements
#logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)
#path_to_folder_with_file = "" # LEFT HERE FOR POSSIBLE USE IN DEBUGGING
#FASTA_protein_sequence_records_file = "30test.faa" # LEFT HERE FOR USE IN DEBUGGING
###---------------------------HELPER FUNCTIONS---------------------------------###
###--------------------------END OF HELPER FUNCTIONS---------------------------###
###-----------------Actual Main function of script---------------------------###
###------------------GET FASTA FILE AND PREPARE TO PARSE---------------------###
#The input file is provided as an argument when calling the program.
#argparser from http://docs.python.org/2/library/argparse.html#module-argparse and http://docs.python.org/2/howto/argparse.html#id1
def build_parser():
    """Build the command-line parser for ConvertFASTAdnaSEQtoRNA.py.

    The parser accepts one required positional argument, ``InputFile``: the
    name of the FASTA file whose sequence lines are to be converted.

    Returns:
        argparse.ArgumentParser: the configured parser.
    """
    # RawTextHelpFormatter preserves the explicit '\n' line breaks written in
    # the description; be aware that it applies to the help text of every
    # argument as well, not just the description.
    # See http://stackoverflow.com/questions/3853722/python-argparse-how-to-insert-newline-the-help-text
    cli_parser = argparse.ArgumentParser(
        prog='ConvertFASTAdnaSEQtoRNA.py',
        description=(
            "ConvertFASTAdnaSEQtoRNA takes as input a file of nucleotide sequences in \n"
            "FASTA format from NCBI and changes all 'T's to 'U's in the sequence,\n"
            "effectively changing it to mRNA sequence.\n"
            "Importantly, it doesn't alter the description line of the FASTA entries.\n"
            "\n\n"
            "Written by Wayne Decatur --> Fomightez @ Github or Twitter. \n"
            "\n\n"
            "Actual example what to enter on command line to run program:\n"
            # The example previously named a different script
            # (GetmRNAforProtein.py); it must name this one.
            "python ConvertFASTAdnaSEQtoRNA.py input_file.txt\n "
        ),
        formatter_class=RawTextHelpFormatter,
    )
    cli_parser.add_argument(
        "InputFile",
        help="name of data file containing a list of FASTA DNA sequences\nfor which you want mRNA sequences. REQUIRED.",
    )
    return cli_parser


parser = build_parser()

# Only read sys.argv when run as a script, so that importing this module does
# not try to interpret another program's command line.
if __name__ == "__main__":
    # With no arguments at all, show the full help rather than argparse's
    # terse usage error, because the input file is required.
    # From http://stackoverflow.com/questions/4042452/display-help-message-with-python-argparse-when-script-is-called-without-any-argu
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args()
def convert_sequence_line(line):
    """Convert one line of a FASTA file from DNA to RNA notation.

    Surrounding whitespace (including the line ending) is stripped first.
    Description lines, i.e. those beginning with '>', are returned otherwise
    unchanged. On every other line each 'T' becomes 'U' and each 't' becomes
    'u'; case is deliberately preserved rather than upper-casing the line, so
    any meaning the user attached to lower-case letters is kept.

    Args:
        line: a line read from the FASTA file.

    Returns:
        The converted line, without a trailing newline.
    """
    stripped = line.strip()
    if stripped.startswith('>'):
        return stripped
    return stripped.replace('T', 'U').replace('t', 'u')


def make_output_file_name(fasta_file_name):
    """Return the output file name for a given input file name.

    "_mRNAconv" is inserted before the extension, or appended when there is
    no extension. os.path.splitext returns an empty extension in that case,
    so no separate check for a '.' in the name is needed.
    """
    main_part, extension = os.path.splitext(fasta_file_name)
    return main_part + "_mRNAconv" + extension


def convert_fasta_file(fasta_file_name, output_file_name):
    """Write a T->U converted copy of a FASTA file.

    Each input line is passed through convert_sequence_line and written to
    the output file followed by a newline.

    Args:
        fasta_file_name: path of the FASTA file to read.
        output_file_name: path of the file to create or overwrite.

    Returns:
        The number of description lines (lines beginning with '>') seen.
    """
    number_of_fasta_entries = 0
    # The with-blocks guarantee both files are closed even if reading or
    # writing fails part-way through.
    with open(fasta_file_name, "r") as fasta_file:
        with open(output_file_name, "w") as output_file:
            for line in fasta_file:
                output_line = convert_sequence_line(line)
                if output_line.startswith('>'):
                    number_of_fasta_entries += 1
                output_file.write(output_line + "\n")
    return number_of_fasta_entries


if __name__ == "__main__":
    # `args` and `parser` are created by the command-line parsing earlier in
    # this script.
    if os.path.isfile(args.InputFile):
        fasta_file_name = args.InputFile
        logging.debug(fasta_file_name)
        output_fileName = make_output_file_name(fasta_file_name)
        entry_count = convert_fasta_file(fasta_file_name, output_fileName)
        logging.debug("%d FASTA entries processed", entry_count)
        # Give user some feedback
        sys.stderr.write("\nResults written to file '" + output_fileName + "'.")
        # Finish the line so the shell prompt starts on a fresh line.
        sys.stderr.write("\n")
    else:
        sys.stderr.write("SORRY. " + args.InputFile + " IS NOT RECOGNIZED AS A FILE.\n\n")
        parser.print_help()
        sys.exit(1)
#*******************************************************************************
#*******************************************************************************