Skip to content

Commit

Permalink
Add support for ESTScan scores matrices (smat) data type
Browse files Browse the repository at this point in the history
  • Loading branch information
gregvonkuster committed Dec 2, 2016
1 parent 3b84136 commit 86447ec
Show file tree
Hide file tree
Showing 3 changed files with 78 additions and 0 deletions.
3 changes: 3 additions & 0 deletions config/datatypes_conf.xml.sample
Expand Up @@ -547,6 +547,8 @@
<datatype extension="maskinfo-asn1" type="galaxy.datatypes.data:GenericAsn1" mimetype="text/plain" subclass="True" display_in_upload="true" />
<datatype extension="maskinfo-asn1-binary" type="galaxy.datatypes.binary:GenericAsn1Binary" mimetype="application/octet-stream" subclass="True" display_in_upload="true" />
<datatype extension="pssm-asn1" type="galaxy.datatypes.data:GenericAsn1" mimetype="text/plain" subclass="True" display_in_upload="true" />
<!-- PlantTribes datatypes -->
<datatype extension="smat" type="galaxy.datatypes.estscan:Smat" display_in_upload="true" />
</registration>
<sniffers>
<!--
Expand All @@ -556,6 +558,7 @@
defined format first, followed by next-most rigidly defined,
and so on.
-->
<sniffer type="galaxy.datatypes.estscan:Smat"/>
<sniffer type="galaxy.datatypes.mothur:Sabund"/>
<sniffer type="galaxy.datatypes.mothur:Otu"/>
<sniffer type="galaxy.datatypes.mothur:GroupAbund"/>
Expand Down
11 changes: 11 additions & 0 deletions lib/galaxy/datatypes/test/1.smat
@@ -0,0 +1,11 @@
FORMAT: hse.conf CODING REGION 6 3 1 s C+G: 0 43
-1 0 3 -2
2 1 -8 0
1 0 2 -4
-1 -1 4 -3
0 -1 3 -3
3 0 -8 0
0 0 2 -1
-3 0 4 -2
2 -1 1 -3
3 1 -9 -1
64 changes: 64 additions & 0 deletions lib/galaxy/datatypes/text.py
Expand Up @@ -542,3 +542,67 @@ def set_peek( self, dataset, is_multi_byte=False ):
else:
dataset.peek = 'file does not exist'
dataset.blurb = 'file purged from disc'


class Smat(Text):
    """Datatype for ESTScan scores matrices (``smat``) files."""
    file_ext = "smat"

    def display_peek(self, dataset):
        """
        Return the stored peek for ``dataset``.

        If the peek cannot be read, fall back to a short description that
        includes the human-readable dataset size.
        """
        try:
            return dataset.peek
        except Exception:
            # Best-effort fallback; ``Exception`` (rather than a bare except)
            # lets KeyboardInterrupt / SystemExit propagate.
            return "ESTScan scores matrices (%s)" % (nice_size(dataset.get_size()))

    def set_peek(self, dataset, is_multi_byte=False):
        """
        Populate ``dataset.peek`` and ``dataset.blurb``.

        For a purged dataset both fields are set to fixed placeholder
        messages instead of reading the file.
        """
        if not dataset.dataset.purged:
            dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
            dataset.blurb = "ESTScan scores matrices"
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disc'

    def sniff(self, filename):
        """
        Determine whether ``filename`` looks like an ESTScan scores
        matrices file.

        The use of ESTScan implies the creation of scores matrices which
        reflect the codon preferences in the studied organisms.  The
        ESTScan package includes scripts for generating these files.  The
        output of these scripts consists of the matrices, one for each
        isochore, which look like this:

        FORMAT: hse_4is.conf CODING REGION 6 3 1 s C+G: 0 44
        -1 0 2 -2
        2 1 -8 0

        A file is accepted when its first line starts with ``FORMAT`` and
        every other inspected line either starts with ``FORMAT`` or is a
        tab-free row of exactly four integers, with at least one such row
        present.  At most the first 10000 lines are inspected.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname('test_space.txt')
        >>> Smat().sniff(fname)
        False
        >>> fname = get_test_fname('test_tab.bed')
        >>> Smat().sniff(fname)
        False
        >>> fname = get_test_fname('1.smat')
        >>> Smat().sniff(fname)
        True
        """
        max_lines = 10000
        # Compile once instead of re-resolving the pattern for every item.
        is_integer = re.compile(r"[-+]?\d+$").match
        line_no = 0
        matrix_rows = 0
        with open(filename, "r") as fh:
            while True:
                # Bounded read so a huge single-line file cannot be pulled
                # into memory while sniffing.
                line = fh.readline(500)
                if not line:
                    # End of file.
                    break
                line_no += 1
                if line_no > max_lines:
                    # Enough lines validated; stop inspecting.
                    break
                if line.startswith('FORMAT'):
                    continue
                if line_no == 1:
                    # The first line is always the start of a format section.
                    return False
                if '\t' in line:
                    # Smat files are not tabular.
                    return False
                items = line.split()
                if len(items) != 4:
                    return False
                for item in items:
                    # Make sure each item is an integer.
                    if is_integer(item) is None:
                        return False
                matrix_rows += 1
        # A header with no matrix rows (or an empty file) is not a match.
        return matrix_rows > 0

0 comments on commit 86447ec

Please sign in to comment.