Navigation Menu

Skip to content

Commit

Permalink
Add cython code jamescasbon#25
Browse files Browse the repository at this point in the history
  • Loading branch information
James Casbon committed Jun 27, 2012
1 parent 6a54372 commit 3365dd2
Show file tree
Hide file tree
Showing 4 changed files with 97 additions and 4 deletions.
7 changes: 6 additions & 1 deletion setup.py
@@ -1,4 +1,7 @@
from setuptools import setup
from distutils.core import setup
from distutils.extension import Extension
from Cython.Distutils import build_ext

requires = []

Expand Down Expand Up @@ -66,5 +69,7 @@
include_package_data=True,
package_data = {
'': ['*.vcf', '*.gz', '*.tbi'],
}
},
cmdclass = {'build_ext': build_ext},
ext_modules = [Extension("vcf.cparse", ["vcf/cparse.pyx"])]
)
75 changes: 75 additions & 0 deletions vcf/cparse.pyx
@@ -0,0 +1,75 @@
from model import _Call

cdef _map(func, iterable, bad='.'):
    '''Convert each item of ``iterable`` with ``func``, turning ``bad`` items into None.

    Items that compare equal to ``bad`` are never passed to ``func``; None is
    stored in their place. The result is a new list in iteration order.
    '''
    converted = []
    for item in iterable:
        if item != bad:
            converted.append(func(item))
        else:
            converted.append(None)
    return converted

# Type names that parse_samples checks to pick the int() / float() conversion
# for a FORMAT field.
# NOTE: these are C strings (``char *``). ``==`` between two ``char *`` values
# compares pointer values, not characters, so compare them against Python
# string objects (or use strcmp) rather than against another ``char *``.
cdef char *INTEGER = 'Integer'
cdef char *FLOAT = 'Float'
cdef char *NUMERIC = 'Numeric'

def parse_samples(
        list names, list samples, list samp_fmt,
        list samp_fmt_types, list samp_fmt_nums, site):
    '''Parse the sample columns of one record into a list of ``_Call`` objects.

    names: sample names; ``names[i]`` labels ``samples[i]``.
    samples: raw sample strings whose fields are separated by ``:``.
    samp_fmt: FORMAT field names, in column order.
    samp_fmt_types: type name for each FORMAT field; ``'Integer'`` selects
        ``int``, ``'Float'`` and ``'Numeric'`` select ``float``, anything
        else leaves the text unconverted.
    samp_fmt_nums: declared number of values for each FORMAT field.
    site: passed through unchanged as the first argument of every ``_Call``.

    Returns one ``_Call(site, name, sampdict)`` per entry of ``samples``.
    ``sampdict`` has a key for every name in ``samp_fmt``; fields that are
    absent from a short sample string, or given as ``.`` or ``./.``, are None.
    '''
    cdef Py_ssize_t i, j, n_vals
    cdef Py_ssize_t n_samples = len(samples)
    cdef Py_ssize_t n_formats = len(samp_fmt)
    cdef list samp_data = []
    cdef dict sampdict
    cdef list sampvals

    # Coerce the type-name constants to Python objects once. The per-field
    # type is kept as a Python object as well, so the comparisons below are
    # string comparisons; ``==`` between two ``char *`` values would compare
    # addresses instead of text.
    cdef object integer_type = INTEGER
    cdef object float_type = FLOAT
    cdef object numeric_type = NUMERIC

    for i in range(n_samples):
        name = names[i]
        sample = samples[i]

        # start with every FORMAT key mapped to None
        sampdict = dict.fromkeys(samp_fmt)

        sampvals = sample.split(':')

        # a sample may carry fewer fields than FORMAT declares; the
        # remaining keys keep their None default
        n_vals = min(n_formats, len(sampvals))

        for j in range(n_vals):
            fmt = samp_fmt[j]
            vals = sampvals[j]

            # short circuit the most common
            if vals == '.' or vals == './.':
                sampdict[fmt] = None
                continue

            entry_type = samp_fmt_types[j]
            # TODO: entry_num is None for unbounded lists
            entry_num = samp_fmt_nums[j]

            # choose the converter once for both the scalar and list cases
            if entry_type == integer_type:
                convert = int
            elif entry_type == float_type or entry_type == numeric_type:
                convert = float
            else:
                convert = None

            # we don't need to split single entries; a lone value is stored
            # as a scalar even when entry_num is not 1
            if entry_num == 1 or ',' not in vals:
                sampdict[fmt] = vals if convert is None else convert(vals)
                continue

            vals = vals.split(',')

            if convert is None:
                # untyped lists are kept verbatim, including any '.' items
                sampdict[fmt] = vals
            else:
                sampdict[fmt] = _map(convert, vals)

        # create a call object
        samp_data.append(_Call(site, name, sampdict))

    return samp_data
15 changes: 14 additions & 1 deletion vcf/parser.py
Expand Up @@ -16,6 +16,11 @@
except ImportError:
pysam = None

try:
import cparse
except ImportError:
cparse = None


# Metadata parsers/constants
RESERVED_INFO = {
Expand Down Expand Up @@ -846,7 +851,11 @@ def _parse_sample_format(self, samp_fmt):

def _parse_samples(self, samples, samp_fmt, site):
'''Parse a sample entry according to the format specified in the FORMAT
column.'''
column.
NOTE: this method has a cython equivalent and care must be taken
to keep the two methods equivalent
'''

# check whether we already know how to parse this format
if samp_fmt in self._format_cache:
Expand All @@ -857,6 +866,10 @@ def _parse_samples(self, samples, samp_fmt, site):
self._format_cache[samp_fmt] = (sf, samp_fmt_types, samp_fmt_nums)
samp_fmt = sf

if cparse:
return cparse.parse_samples(
self.samples, samples, samp_fmt, samp_fmt_types, samp_fmt_nums, site)

samp_data = []
_map = self._map

Expand Down
4 changes: 2 additions & 2 deletions vcf/test/prof.py
@@ -1,4 +1,4 @@
import vcf
import vcf as vcf
import cProfile
import timeit
import pstats
Expand All @@ -17,7 +17,7 @@ def parse_1kg():
p.strip_dirs().sort_stats('time').print_stats()

elif sys.argv[1] == 'time':
n = 5
n = 1
t = timeit.timeit('parse_1kg()', "from __main__ import parse_1kg", number=n)
print t/n
else:
Expand Down

0 comments on commit 3365dd2

Please sign in to comment.