Add cython code jamescasbon#25

gotgenes · Jun 27, 2012 · 3365dd2 · 3365dd2
1 parent 6a54372
commit 3365dd2
Show file tree

Hide file tree

Showing 4 changed files with 97 additions and 4 deletions.
diff --git a/setup.py b/setup.py
@@ -1,4 +1,7 @@
 from setuptools import setup
+from distutils.core import setup
+from distutils.extension import Extension
+from Cython.Distutils import build_ext
 
 requires = []
 
@@ -66,5 +69,7 @@
     include_package_data=True,
     package_data = {
         '': ['*.vcf', '*.gz', '*.tbi'],
-        }
+        },
+    cmdclass = {'build_ext': build_ext},
+    ext_modules = [Extension("vcf.cparse", ["vcf/cparse.pyx"])]
 )
diff --git a/vcf/cparse.pyx b/vcf/cparse.pyx
@@ -0,0 +1,75 @@
+from model import _Call
+
+cdef _map(func, iterable, bad='.'):
+    '''Apply ``func`` to every item, substituting None for ``bad`` items.'''
+    converted = []
+    for item in iterable:
+        converted.append(func(item) if item != bad else None)
+    return converted
+
+# Kept as Python string objects rather than ``char *``: comparing a
+# ``char *`` value against a Python object coerces it and compares by
+# content, whereas ``char * == char *`` only compares addresses.
+INTEGER = 'Integer'
+FLOAT = 'Float'
+NUMERIC = 'Numeric'
+
+def parse_samples(
+        list names, list samples, list samp_fmt,
+        list samp_fmt_types, list samp_fmt_nums, site):
+    '''Build one ``_Call`` per sample from the raw per-sample strings.
+
+    Each entry of ``samples`` is split on ':' and the j-th piece is stored
+    under the key ``samp_fmt[j]``.  ``samp_fmt_types[j]`` is compared
+    against INTEGER / FLOAT / NUMERIC to choose ``int``, ``float`` or no
+    conversion, and ``samp_fmt_nums[j]`` decides whether a piece is treated
+    as a single value or split on ','.
+
+    Pieces equal to '.' or './.' are stored as None, and format keys that
+    have no corresponding piece keep their initial None.
+
+    ``site`` and ``names[i]`` are handed to ``_Call`` together with the
+    per-sample dict.  Returns the list of calls in the order of ``samples``.
+    '''
+
+    cdef char *name, *fmt, *entry_type, *sample
+    cdef int i, j
+    cdef list samp_data = []
+    cdef dict sampdict
+    cdef list sampvals
+    n_samples = len(samples)
+    n_formats = len(samp_fmt)
+
+    for i in range(n_samples):
+        name = names[i]
+        sample = samples[i]
+
+        # parse the data for this sample
+        # every format key starts out as None so missing pieces stay None
+        sampdict = dict([(x, None) for x in samp_fmt])
+
+        sampvals = sample.split(':')
+
+        for j in range(n_formats):
+            # the sample has fewer pieces than format keys: stop here and
+            # leave the remaining keys at their None default
+            if j >= len(sampvals):
+                break
+            fmt = samp_fmt[j]
+            vals = sampvals[j]
+            entry_type = samp_fmt_types[j]
+            # TODO: entry_num is None for unbounded lists
+            entry_num = samp_fmt_nums[j]
+
+            # short circuit the most common
+            if vals == '.' or vals == './.':
+                sampdict[fmt] = None
+                continue
+
+            # we don't need to split single entries
+            if entry_num == 1 or ',' not in vals:
+
+                if entry_type == INTEGER:
+                    sampdict[fmt] = int(vals)
+                elif entry_type == FLOAT or entry_type == NUMERIC:
+                    sampdict[fmt] = float(vals)
+                else:
+                    sampdict[fmt] = vals
+
+                # NOTE(review): parentheses without a trailing comma do not
+                # build a tuple, so this assignment leaves the value as is;
+                # confirm whether a one-element container was intended
+                if entry_num != 1:
+                    sampdict[fmt] = (sampdict[fmt])
+
+                continue
+
+            # several comma-separated values: convert each one, with '.'
+            # entries becoming None inside _map
+            vals = vals.split(',')
+
+            if entry_type == INTEGER:
+                sampdict[fmt] = _map(int, vals)
+            elif entry_type == FLOAT or entry_type == NUMERIC:
+                sampdict[fmt] = _map(float, vals)
+            else:
+                sampdict[fmt] = vals
+
+        # create a call object
+        call = _Call(site, name, sampdict)
+        samp_data.append(call)
+
+    return samp_data
diff --git a/vcf/parser.py b/vcf/parser.py
@@ -16,6 +16,11 @@
 except ImportError:
     pysam = None
 
+try:
+    import cparse
+except ImportError:
+    cparse = None
+
 
 # Metadata parsers/constants
 RESERVED_INFO = {
@@ -846,7 +851,11 @@ def _parse_sample_format(self, samp_fmt):
 
     def _parse_samples(self, samples, samp_fmt, site):
         '''Parse a sample entry according to the format specified in the FORMAT
-        column.'''
+        column.
+
+        NOTE: this method has a cython equivalent and care must be taken
+        to keep the two methods equivalent
+        '''
 
         # check whether we already know how to parse this format
         if samp_fmt in self._format_cache:
@@ -857,6 +866,10 @@ def _parse_samples(self, samples, samp_fmt, site):
             self._format_cache[samp_fmt] = (sf, samp_fmt_types, samp_fmt_nums)
             samp_fmt = sf
 
+        if cparse:
+            return cparse.parse_samples(
+                self.samples, samples, samp_fmt, samp_fmt_types, samp_fmt_nums, site)
+
         samp_data = []
         _map = self._map
 

diff --git a/vcf/test/prof.py b/vcf/test/prof.py
@@ -1,4 +1,4 @@
-import vcf
+import vcf as vcf
 import cProfile
 import timeit
 import pstats
@@ -17,7 +17,7 @@ def parse_1kg():
     p.strip_dirs().sort_stats('time').print_stats()
 
 elif sys.argv[1] == 'time':
-    n = 5
+    n = 1
     t = timeit.timeit('parse_1kg()',  "from __main__ import parse_1kg", number=n)
     print t/n
 else: