From 3365dd25a8400d2790466472141deb5b36a25b90 Mon Sep 17 00:00:00 2001
From: James Casbon
Date: Wed, 27 Jun 2012 10:38:15 +0100
Subject: [PATCH] Add cython code #25

---
Review notes (git-am ignores everything between "---" and the first diff):
 * Line structure restored. Indentation of unchanged context lines is a
   reconstruction and should be checked against the target tree.
 * setup.py: Cython is now optional, matching the optional import of
   cparse in vcf/parser.py, and setup() is no longer rebound by a second
   import from distutils.core.
 * vcf/cparse.pyx: the FORMAT type constants and entry_type are Python
   strings. Comparing two "char *" values with == compares pointers, so
   the Integer/Float branches could never be taken.
 * Hunk sizes and the diffstat are updated for the new content. The
   "index" blob ids are those of the original patch and no longer
   describe the modified files.

 setup.py         | 16 ++++++++-
 vcf/cparse.pyx   | 83 +++++++++++++++++++++++++++++++++++++++++++++++
 vcf/parser.py    | 15 ++++++++-
 vcf/test/prof.py |  4 +--
 4 files changed, 114 insertions(+), 4 deletions(-)
 create mode 100644 vcf/cparse.pyx

diff --git a/setup.py b/setup.py
index 212e62d..f62d6ab 100644
--- a/setup.py
+++ b/setup.py
@@ -1,4 +1,17 @@
 from setuptools import setup
+from distutils.extension import Extension
+
+try:
+    from Cython.Distutils import build_ext
+except ImportError:
+    # Cython is optional: vcf/parser.py falls back to pure Python
+    # when the compiled vcf.cparse module cannot be imported.
+    cython_args = {}
+else:
+    cython_args = {
+        'cmdclass': {'build_ext': build_ext},
+        'ext_modules': [Extension("vcf.cparse", ["vcf/cparse.pyx"])],
+    }
 
 requires = []
 
@@ -66,5 +79,6 @@
     include_package_data=True,
     package_data = {
         '': ['*.vcf', '*.gz', '*.tbi'],
-        }
+        },
+    **cython_args
 )
diff --git a/vcf/cparse.pyx b/vcf/cparse.pyx
new file mode 100644
index 0000000..30a05ec
--- /dev/null
+++ b/vcf/cparse.pyx
@@ -0,0 +1,83 @@
+from model import _Call
+
+cdef _map(func, iterable, bad='.'):
+    '''``map``, but make bad values None.'''
+    return [func(x) if x != bad else None
+            for x in iterable]
+
+# Python strings on purpose: ``==`` between two ``char *`` values compares
+# pointers, so C-string constants would never match a FORMAT type.
+INTEGER = 'Integer'
+FLOAT = 'Float'
+NUMERIC = 'Numeric'
+
+def parse_samples(
+        list names, list samples, list samp_fmt,
+        list samp_fmt_types, list samp_fmt_nums, site):
+    '''Build one ``_Call`` per sample column.
+
+    Each sample string is split on ``:`` and every field is converted
+    according to the FORMAT type and number at the same position.
+    '''
+
+    # entry_type stays a Python object so it compares by value below.
+    cdef char *name, *fmt, *sample
+    cdef int i, j
+    cdef list samp_data = []
+    cdef dict sampdict
+    cdef list sampvals
+    n_samples = len(samples)
+    n_formats = len(samp_fmt)
+
+    for i in range(n_samples):
+        name = names[i]
+        sample = samples[i]
+
+        # parse the data for this sample
+        sampdict = dict([(x, None) for x in samp_fmt])
+
+        sampvals = sample.split(':')
+
+        for j in range(n_formats):
+            if j >= len(sampvals):
+                break
+            fmt = samp_fmt[j]
+            vals = sampvals[j]
+            entry_type = samp_fmt_types[j]
+            # TODO: entry_num is None for unbounded lists
+            entry_num = samp_fmt_nums[j]
+
+            # short circuit the most common
+            if vals == '.' or vals == './.':
+                sampdict[fmt] = None
+                continue
+
+            # we don't need to split single entries
+            if entry_num == 1 or ',' not in vals:
+
+                if entry_type == INTEGER:
+                    sampdict[fmt] = int(vals)
+                elif entry_type == FLOAT or entry_type == NUMERIC:
+                    sampdict[fmt] = float(vals)
+                else:
+                    sampdict[fmt] = vals
+
+                if entry_num != 1:
+                    sampdict[fmt] = (sampdict[fmt])
+
+                continue
+
+            vals = vals.split(',')
+
+            if entry_type == INTEGER:
+                sampdict[fmt] = _map(int, vals)
+            elif entry_type == FLOAT or entry_type == NUMERIC:
+                sampdict[fmt] = _map(float, vals)
+            else:
+                sampdict[fmt] = vals
+
+        # create a call object
+        call = _Call(site, name, sampdict)
+        samp_data.append(call)
+
+    return samp_data
diff --git a/vcf/parser.py b/vcf/parser.py
index c79803e..4d2d2d0 100644
--- a/vcf/parser.py
+++ b/vcf/parser.py
@@ -16,6 +16,11 @@ except ImportError:
     pysam = None
 
 
+try:
+    import cparse
+except ImportError:
+    cparse = None
+
 
 # Metadata parsers/constants
 RESERVED_INFO = {
@@ -846,7 +851,11 @@ def _parse_sample_format(self, samp_fmt):
 
     def _parse_samples(self, samples, samp_fmt, site):
         '''Parse a sample entry according to the format specified in the FORMAT
-        column.'''
+        column.
+
+        NOTE: this method has a cython equivalent and care must be taken
+        to keep the two methods equivalent
+        '''
 
         # check whether we already know how to parse this format
         if samp_fmt in self._format_cache:
@@ -857,6 +866,10 @@ def _parse_samples(self, samples, samp_fmt, site):
             self._format_cache[samp_fmt] = (sf, samp_fmt_types, samp_fmt_nums)
             samp_fmt = sf
 
+        if cparse:
+            return cparse.parse_samples(
+                self.samples, samples, samp_fmt, samp_fmt_types, samp_fmt_nums, site)
+
         samp_data = []
         _map = self._map
 
diff --git a/vcf/test/prof.py b/vcf/test/prof.py
index 22852b1..85a09df 100755
--- a/vcf/test/prof.py
+++ b/vcf/test/prof.py
@@ -1,4 +1,4 @@
-import vcf
+import vcf as vcf
 import cProfile
 import timeit
 import pstats
@@ -17,7 +17,7 @@ def parse_1kg():
     p.strip_dirs().sort_stats('time').print_stats()
 
 elif sys.argv[1] == 'time':
-    n = 5
+    n = 1
     t = timeit.timeit('parse_1kg()', "from __main__ import parse_1kg", number=n)
     print t/n
 else: