/
score_columns_in_clustal_msa.py
443 lines (339 loc) · 15.3 KB
/
score_columns_in_clustal_msa.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
#!/usr/bin/env python
# score_columns_in_clustal_msa.py
__author__ = "Wayne Decatur" #fomightez on GitHub
__license__ = "MIT"
__version__ = "0.1.0"
# score_columns_in_clustal_msa.py by
# Wayne Decatur
# ver 0.1
#
#*******************************************************************************
# Verified compatible with both Python 2.7 and Python 3.6; written initially in
# Python 3.
#
#
# PURPOSE: Takes a text document of an alignment in CLUSTAL format and generates
# a couple of quick-n-dirty 'scores' based on percent overall conservation in an
# alignment. Specifically, determines conservation of residues in alignment
# columns both including and excluding counting gaps in the column. This is done
# in order to assign some rough scoring metric(s) to an alignment. The metric(s)
# can then be used when comparing separate alignments of related, but distinct
# genetic elements or protein paralogs/orthologs/gene family members/gene
# cluster members in order to objectively assess which of the related elements
# are more highly conserved overall given what a particular 'score' emphasizes.
# Mainly meant to be a rough guide to replace (or confirm) visual inspection.
#
# Excluding gaps in the column is meant to allow insertions in a few sequences
# not to be penalized as much and instead allow greater impact from segments
# that may match well.
#
#
#
#
# Written to run from command line or imported into/pasted/loaded inside a
# Jupyter notebook cell. (Is this working--> When doing in Jupyter (IPython, I
# believe) you can skip the file save intermediate, see https://git.io/vh8Mi for
# similar advanced examples.)
#
#
#
#
#
#
#
#
#
#
# If you are Wayne, see `scoring XXXXXXX XXXXXX among strains in multiple
# sequence alignments.md` for specific impetus behind this script.
#
#
#
# Dependencies beyond the mostly standard libraries/modules:
#
#
#
# VERSION HISTORY:
# v.0.1. basic working version
#
# To do:
# - see if works when alignment provided as in-memory string
#
#
#
#
# TO RUN:
# Examples,
# Enter on the command line of your terminal, the line
#-----------------------------------
# python score_columns_in_clustal_msa.py ALIGNMENT_TEXT_FILE
#-----------------------------------
#
# Issue `score_columns_in_clustal_msa.py -h` for details.
#
#
#
# To use this after importing/pasting or loading into a cell in a Jupyter
# notebook, specify at least the results file (or results as a string) in the
# call to the main function similar to below:
# score_columns_in_clustal_msa("test.clustal")
#
#-or-
# To specify OPTIONAL ....
#
#
#
'''
CURRENT ACTUAL CODE FOR RUNNING/TESTING IN A NOTEBOOK WHEN IMPORTED/LOADED OR
PASTED IN ANOTHER CELL:
score_columns_in_clustal_msa("test.clustal")
'''
#
#
#*******************************************************************************
#
#*******************************************************************************
##################################
# USER ADJUSTABLE VALUES #
##################################
#
suffix_for_saving = "_qnd_scores"#to be used for naming the output automatically
# when running script from command line to act on an input file
gap_indicator = "-" #change to "." if using dots. Used by
# `get_aln_index_and_real_pos()` and `.ungap()`.
percent_threshold = 1.0 # percent that matches need to match in column to be
# counted as conserved
#
#*******************************************************************************
#**********************END USER ADJUSTABLE VARIABLES****************************
#*******************************************************************************
#*******************************************************************************
###DO NOT EDIT BELOW HERE - ENTER VALUES ABOVE###
import sys
import os
from collections import Counter
from Bio import AlignIO
###---------------------------HELPER FUNCTIONS-------------------------------###
def increment_warranted(column_seq, count_gaps=True):
'''
Takes a column from a multiple sequence alignment and returns a `True` if
the percent of matches of residues meets the `percent_threshold`.
If `count_gaps=False`, then rows with gaps are not considered in trying
to determine if conserved. This setting would lessen impact from an insert
in the sequence.
No matter what `count_gaps` and `percent_threshold` are, gaps can never be
considered conserved
'''
if not count_gaps:
column_seq = column_seq.replace(gap_indicator,"")
total_in_column = len(column_seq)
match_percent = (
Counter(column_seq.replace(gap_indicator,"")).most_common(
1)[0][1])/float(total_in_column)
# note that because of the inclusion of
# `column_seq.replace(gap_indicator,"")` gaps can never be considered as
# most abundant. That may becaome important if ever lower threshold from
# 100% might because with `Counter(column_seq)` nine gap characters and one
# nucleotide in a column would be deemed conserved.
if match_percent >= percent_threshold:
return True
else:
return False
def generate_output_file_name(file_name,suffix_for_saving, fa=False, tb = False):
'''
Takes a file name as an argument and returns string for the name of the
output file. The generated name is based on the original file
name.
Added `fa` parameter so can say if FASTA and then make file extension `.fa`.
Added `tb` parameter so can say if tab-delimited table and then make file
extension `.tsv`.
Specific example
=================
Calling function with
("alignment.clustal")
returns
"alignment_ADJUSTED.clustal"
'''
main_part_of_name, file_extension = os.path.splitext(
file_name) #from
#http://stackoverflow.com/questions/541390/extracting-extension-from-filename-in-python
if '.' in file_name: #I don't know if this is needed with the os.path.splitext method but I had it before so left it
if fa:
return main_part_of_name + suffix_for_saving + ".fa"
elif tb:
return main_part_of_name + suffix_for_saving + ".tsv"
else:
return main_part_of_name + suffix_for_saving + file_extension
else:
if fa:
return file_name + suffix_for_saving + ".fa"
elif tb:
return main_part_of_name + suffix_for_saving + ".tsv"
else:
return file_name + suffix_for_saving + ".clustal"
###--------------------------END OF HELPER FUNCTIONS---------------------------###
###--------------------------END OF HELPER FUNCTIONS---------------------------###
#*******************************************************************************
###------------------------'main' function of script---------------------------##
def score_columns_in_clustal_msa(
alignment,
suffix_for_saving = suffix_for_saving, name_basis="alignment.clustal"):
'''
Main function of script.
It will take an alignment text file (clustal form) (or the alignment as a
Python string BUT THIS DOESN'T SEEM TO WORK BECAUSE GET OSERROR ABOUT File
NAME EVEN THOUGH EXCEPT `OSError` already there.)
and return some quick-n-dirty scoring assessments that reflect conservation.
The option to provide the results as a string is to handle where sending the
data directly from shell script to Python without a typical file
intermediate, see the advanced notebook at https://git.io/vpr7i for
examples. The obvious use case for that is when working in the Jupyter
# notebook environment.
'''
# feedback about important option
sys.stderr.write("\n**NOTE: gap indicator in this script is currently set "
"to '{}'. If\nthat does not match what provided alignment uses to "
"indicate gaps,\nplease change the setting within the script code "
"under\n'USER ADJUSTABLE VALUES' around line 120 "
"(give or take a few).**\n".format(gap_indicator))
# Bring in the necessary alignment data:
#---------------------------------------------------------------------------
try:
file_name = alignment
alignment = AlignIO.read(alignment, "clustal")
except (TypeError,OSError,IOError) as e:
file_name = name_basis
try:
from StringIO import StringIO
except ImportError:
from io import StringIO
alignment = AlignIO.read(StringIO(alignment), "clustal")
# Note "FileNotFoundError is a subclass of OSError"
# (https://stackoverflow.com/a/28633573/8508004)
# feedback
sys.stderr.write("Alignment file read...")
# Extract length of alignment
#---------------------------------------------------------------------------
# Sanity check first:
# Verify that alignments are equal in length
lengths_of_alignments = [len(record) for record in alignment]
assert lengths_of_alignments.count(
lengths_of_alignments[0]) == len(lengths_of_alignments), ("The length "
"of all parsed alignments should be the same.") # see where
# `conservation_line` being made in
# `pretty_msa_maker_from_clustal_nucleic.py ` for mechanism used in
# assertion test involving Ivo van der Wijk's solution from
# https://stackoverflow.com/a/3844948/8508004
len_alignment = lengths_of_alignments[0] #-or- use `len(alignment[0])`
# Go though column by column of alignment and increment counter if
# appropriate
#---------------------------------------------------------------------------
matches_with_gaps = 0
matches_without_gaps = 0
for column_no in range(len_alignment):
column_seq = alignment[:, column_no]
if increment_warranted(column_seq, count_gaps=True):
matches_with_gaps += 1
if increment_warranted(column_seq, count_gaps=False):
matches_without_gaps += 1
# Normalize the collected scores based on alignment length
#---------------------------------------------------------------------------
score_with_gaps = matches_with_gaps/len_alignment
score_not_counting_gaps = matches_without_gaps/len_alignment
# Feedback
#---------------------------------------------------------------------------
# report scores
sys.stderr.write("\n\nSimilarity percent including gaps is:\n"
"'{}'.".format(score_with_gaps))
sys.stderr.write("\nSimilarity percent not counting gaps is:\n"
"'{}'.".format(score_not_counting_gaps))
score_with_gaps
score_not_counting_gaps
# Save a table of the scores
import pandas as pd
scores_dict = {
'alignment': file_name,
'score_including_gaps': [score_with_gaps],
'score_without_considering_gaps': [score_not_counting_gaps],
}
# Even though using single items for scores above in making dict, I have to
# make them a list or otherwise when try to make a dataframe with them in
# nextline get error: `ValueError: If using all scalar values, you must
# pass an index`. See https://stackoverflow.com/a/17840195/8508004 .
scores_df = pd.DataFrame(
scores_dict,columns = ['alignment',
'score_including_gaps','score_without_considering_gaps'])
scores_file_name = generate_output_file_name(
file_name,suffix_for_saving,tb=True)
scores_df.to_csv(scores_file_name, sep='\t',index = False)
# Feedback
sys.stderr.write("\n\nScores for provided alignment saved as\n"
"a table '{}'.".format(scores_file_name))
###--------------------------END OF MAIN FUNCTION----------------------------###
###--------------------------END OF MAIN FUNCTION----------------------------###
#*******************************************************************************
###------------------------'main' section of script---------------------------##
def main():
""" Main entry point of the script """
# placing actual main action in a 'helper'script so can call that easily
# with a distinguishing name in Jupyter notebooks, where `main()` may get
# assigned multiple times depending how many scripts imported/pasted in.
kwargs = {}
#kwargs['descr_source'] = descr_source
kwargs['suffix_for_saving'] = suffix_for_saving
score_columns_in_clustal_msa(alignment,**kwargs)
# using https://www.saltycrane.com/blog/2008/01/how-to-use-args-and-kwargs-in-python/#calling-a-function
# to build keyword arguments to pass to the function above
# (see https://stackoverflow.com/a/28986876/8508004 and
# https://stackoverflow.com/a/1496355/8508004
# (maybe https://stackoverflow.com/a/7437238/8508004 might help too) for
# related help). Makes it easy to add more later.
if __name__ == "__main__" and '__file__' in globals():
""" This is executed when run from the command line """
# Code with just `if __name__ == "__main__":` alone will be run if pasted
# into a notebook. The addition of ` and '__file__' in globals()` is based
# on https://stackoverflow.com/a/22923872/8508004
# See also https://stackoverflow.com/a/22424821/8508004 for an option to
# provide arguments when prototyping a full script in the notebook.
###-----------------for parsing command line arguments-----------------------###
import argparse
parser = argparse.ArgumentParser(prog=
'score_columns_in_clustal_msa.py',
description="score_columns_in_clustal_msa.py \
takes a text document of an alignment in CLUSTAL format and generates \
a couple of quick-n-dirty 'scores' based on percent overall \
conservation in an alignment. Specifically, determines conservation of \
residues in alignment columns both including and excluding counting \
gaps in the column. Intended to be used when comparing separate \
alignments of related but distinct genetic elements or protein \
para/orthologs/families in \
order to objectively assess which of the related elements are more \
highly conserved overall. \
**** Script by Wayne Decatur \
(fomightez @ github) ***")
parser.add_argument("align_file", help="Name of file of alignment text \
file (CLUSTAL format).\
", metavar="ALIGNMENT_FILE")
'''parser.add_argument('-ds', '--descr_source', action='store', type=str,
default= None, help="OPTIONAL: Provide FASTA file with same \
ids to use as source of alignment to use as source of descriptions. The \
typical file is output as `*_extracted_ungapped.fa` files from the \
`extract_regions_from_clustal_alignment.py` script\
.")
'''
parser.add_argument('-os', '--output_suffix', action='store', type=str,
default= suffix_for_saving, help="OPTIONAL: Set a suffix for including in file \
name of output. \
If none provided, '{}' will be used.".format(suffix_for_saving))
#I would also like trigger help to display if no arguments provided because
# need at least one for url
if len(sys.argv)==1: #from http://stackoverflow.com/questions/4042452/display-help-message-with-python-argparse-when-script-is-called-without-any-argu
parser.print_help()
sys.exit(1)
args = parser.parse_args()
alignment = args.align_file
suffix_for_saving = args.output_suffix
main()
#*******************************************************************************
###-***********************END MAIN PORTION OF SCRIPT***********************-###
#*******************************************************************************