Skip to content
Browse files

initial commit, tabix 0.2.5

  • Loading branch information...
0 parents commit 645fdb21d8278502ff78e84c6d0b3095a84d9508 @ekg committed
Showing with 6,966 additions and 0 deletions.
  1. +593 −0 ChangeLog
  2. +63 −0 Makefile
  3. +126 −0 NEWS
  4. +395 −0 TabixReader.java
  5. +42 −0 bam_endian.h
  6. +156 −0 bedidx.c
  7. +714 −0 bgzf.c
  8. +157 −0 bgzf.h
  9. +206 −0 bgzip.c
  10. BIN example.gtf.gz
  11. BIN example.gtf.gz.tbi
  12. +998 −0 index.c
  13. +486 −0 khash.h
  14. +632 −0 knetfile.c
  15. +75 −0 knetfile.h
  16. +227 −0 kseq.h
  17. +271 −0 ksort.h
  18. +165 −0 kstring.c
  19. +68 −0 kstring.h
  20. +290 −0 main.c
  21. +8 −0 perl/MANIFEST
  22. +8 −0 perl/Makefile.PL
  23. +76 −0 perl/Tabix.pm
  24. +71 −0 perl/Tabix.xs
  25. +41 −0 perl/TabixIterator.pm
  26. +28 −0 perl/t/01local.t
  27. +28 −0 perl/t/02remote.t
  28. +3 −0 perl/typemap
  29. +55 −0 python/setup.py
  30. +408 −0 python/tabixmodule.c
  31. +91 −0 python/test.py
  32. +132 −0 tabix.1
  33. +145 −0 tabix.h
  34. +87 −0 tabix.py
  35. +121 −0 tabix.tex
593 ChangeLog
@@ -0,0 +1,593 @@
+------------------------------------------------------------------------
+r942 | lh3lh3 | 2011-03-31 16:39:50 -0400 (Thu, 31 Mar 2011) | 2 lines
+Changed paths:
+ M /trunk/tabix/main.c
+
+update version number
+
+------------------------------------------------------------------------
+r940 | lh3lh3 | 2011-03-31 16:38:03 -0400 (Thu, 31 Mar 2011) | 2 lines
+Changed paths:
+ M /trunk/tabix/bedidx.c
+ M /trunk/tabix/main.c
+
+fixed two bugs due to recent changes
+
+------------------------------------------------------------------------
+r939 | lh3lh3 | 2011-03-31 16:12:21 -0400 (Thu, 31 Mar 2011) | 2 lines
+Changed paths:
+ M /trunk/tabix/bgzf.c
+ M /trunk/tabix/bgzf.h
+ M /trunk/tabix/main.c
+
+update to the latest bgzf.*
+
+------------------------------------------------------------------------
+r938 | lh3lh3 | 2011-03-31 16:02:21 -0400 (Thu, 31 Mar 2011) | 2 lines
+Changed paths:
+ M /trunk/tabix/index.c
+ M /trunk/tabix/main.c
+ M /trunk/tabix/tabix.h
+
+BED support
+
+------------------------------------------------------------------------
+r937 | lh3lh3 | 2011-03-31 15:03:49 -0400 (Thu, 31 Mar 2011) | 2 lines
+Changed paths:
+ M /trunk/tabix/Makefile
+ A /trunk/tabix/bedidx.c
+ M /trunk/tabix/example.gtf.gz.tbi
+ M /trunk/tabix/index.c
+ A /trunk/tabix/kseq.h
+ M /trunk/tabix/tabix.h
+
+restructure get_intv() for BED support
+
+------------------------------------------------------------------------
+r919 | petulda | 2011-02-24 10:14:14 -0500 (Thu, 24 Feb 2011) | 1 line
+Changed paths:
+ M /trunk/tabix/bgzf.c
+ M /trunk/tabix/bgzf.h
+ M /trunk/tabix/index.c
+ M /trunk/tabix/main.c
+
+New -r (reheader) option for efficient header replacement.
+------------------------------------------------------------------------
+r915 | lh3lh3 | 2011-02-22 09:50:57 -0500 (Tue, 22 Feb 2011) | 2 lines
+Changed paths:
+ A /trunk/tabix/python
+ A /trunk/tabix/python/setup.py (from /trunk/tabix/setup.py:914)
+ A /trunk/tabix/python/tabixmodule.c (from /trunk/tabix/tabixmodule.c:914)
+ A /trunk/tabix/python/test.py (from /trunk/tabix/test.py:914)
+ D /trunk/tabix/setup.py
+ D /trunk/tabix/tabixmodule.c
+ D /trunk/tabix/test.py
+
+move to a new python/ directory
+
+------------------------------------------------------------------------
+r914 | lh3lh3 | 2011-02-22 09:49:35 -0500 (Tue, 22 Feb 2011) | 2 lines
+Changed paths:
+ A /trunk/tabix/setup.py
+ A /trunk/tabix/tabixmodule.c
+ A /trunk/tabix/test.py
+
+CPython C-API by Hyeshik Chang
+
+------------------------------------------------------------------------
+r904 | petulda | 2011-01-28 08:06:27 -0500 (Fri, 28 Jan 2011) | 1 line
+Changed paths:
+ M /trunk/tabix/index.c
+
+Check the number of fields on each line and exit nicely without segfault
+------------------------------------------------------------------------
+r901 | petulda | 2011-01-21 06:45:37 -0500 (Fri, 21 Jan 2011) | 1 line
+Changed paths:
+ M /trunk/tabix/main.c
+
+Fix: Complain only when VCF is newer, not newer or same mtime
+------------------------------------------------------------------------
+r900 | petulda | 2011-01-21 04:23:04 -0500 (Fri, 21 Jan 2011) | 1 line
+Changed paths:
+ M /trunk/tabix/main.c
+
+Prevent the common user mistake and check the timestamps of the vcf and index file
+------------------------------------------------------------------------
+r876 | lh3lh3 | 2010-12-08 12:38:45 -0500 (Wed, 08 Dec 2010) | 2 lines
+Changed paths:
+ M /trunk/tabix/ChangeLog
+ M /trunk/tabix/NEWS
+ M /trunk/tabix/main.c
+
+Release tabix-0.2.3
+
+------------------------------------------------------------------------
+r875 | lh3lh3 | 2010-12-08 12:28:35 -0500 (Wed, 08 Dec 2010) | 2 lines
+Changed paths:
+ M /trunk/tabix/ChangeLog
+ M /trunk/tabix/index.c
+
+Fixed a minor bug in generating index
+
+------------------------------------------------------------------------
+r855 | petulda | 2010-11-25 11:50:13 -0500 (Thu, 25 Nov 2010) | 1 line
+Changed paths:
+ M /trunk/tabix/main.c
+
+Disable "unknown target name or minus interval" warning.
+------------------------------------------------------------------------
+r775 | petulda | 2010-10-26 15:02:30 -0400 (Tue, 26 Oct 2010) | 1 line
+Changed paths:
+ M /trunk/tabix/main.c
+
+Added -h option to print header lines
+------------------------------------------------------------------------
+r742 | jmarshall | 2010-09-27 06:47:23 -0400 (Mon, 27 Sep 2010) | 2 lines
+Changed paths:
+ M /trunk/tabix
+
+Add svn:ignore properties for intermediate and generated files.
+
+------------------------------------------------------------------------
+r725 | lh3lh3 | 2010-09-15 13:01:53 -0400 (Wed, 15 Sep 2010) | 2 lines
+Changed paths:
+ M /trunk/tabix/bgzip.c
+
+patches by Peter Chines
+
+------------------------------------------------------------------------
+r714 | lh3lh3 | 2010-09-07 10:13:25 -0400 (Tue, 07 Sep 2010) | 2 lines
+Changed paths:
+ M /trunk/tabix/TabixReader.java
+ M /trunk/tabix/index.c
+ M /trunk/tabix/main.c
+
+fixed a bug in C/Java when n_off == 0
+
+------------------------------------------------------------------------
+r712 | lh3lh3 | 2010-09-03 09:21:23 -0400 (Fri, 03 Sep 2010) | 2 lines
+Changed paths:
+ M /trunk/tabix/TabixReader.java
+
+fixed a bug in parsing region strings
+
+------------------------------------------------------------------------
+r700 | petulda | 2010-08-25 10:42:37 -0400 (Wed, 25 Aug 2010) | 1 line
+Changed paths:
+ M /trunk/tabix/main.c
+
+Fix: Exit with an error rather than segfault when index is not present and region is queried
+------------------------------------------------------------------------
+r696 | petulda | 2010-08-24 10:24:12 -0400 (Tue, 24 Aug 2010) | 1 line
+Changed paths:
+ M /trunk/tabix/bgzf.c
+ M /trunk/tabix/bgzf.h
+ M /trunk/tabix/index.c
+ M /trunk/tabix/main.c
+
+Complain about not-bgzipped files and check for noncontinuous chromosome blocks
+------------------------------------------------------------------------
+r603 | lh3lh3 | 2010-06-28 10:49:39 -0400 (Mon, 28 Jun 2010) | 2 lines
+Changed paths:
+ M /trunk/tabix/NEWS
+ M /trunk/tabix/TabixReader.java
+ M /trunk/tabix/index.c
+ M /trunk/tabix/main.c
+
+Release tabix-0.2.2
+
+------------------------------------------------------------------------
+r597 | lh3lh3 | 2010-06-13 21:08:29 -0400 (Sun, 13 Jun 2010) | 3 lines
+Changed paths:
+ M /trunk/tabix/index.c
+
+Change the namespace of sorting, to avoid function name collision with samtools.
+
+
+------------------------------------------------------------------------
+r582 | lh3lh3 | 2010-06-03 10:40:25 -0400 (Thu, 03 Jun 2010) | 2 lines
+Changed paths:
+ M /trunk/tabix/NEWS
+ M /trunk/tabix/main.c
+ M /trunk/tabix/tabix.py
+
+Release tabix-0.2.1
+
+------------------------------------------------------------------------
+r581 | lh3lh3 | 2010-05-24 14:24:24 -0400 (Mon, 24 May 2010) | 2 lines
+Changed paths:
+ M /trunk/tabix/tabix.py
+
+OOP interface with the help from Aaron Quinlan
+
+------------------------------------------------------------------------
+r580 | lh3lh3 | 2010-05-23 23:36:05 -0400 (Sun, 23 May 2010) | 2 lines
+Changed paths:
+ M /trunk/tabix/tabix.py
+
+minor change
+
+------------------------------------------------------------------------
+r579 | lh3lh3 | 2010-05-23 23:25:24 -0400 (Sun, 23 May 2010) | 2 lines
+Changed paths:
+ M /trunk/tabix/tabix.py
+
+For Snow Leopard compatibility
+
+------------------------------------------------------------------------
+r575 | lh3lh3 | 2010-05-12 19:31:27 -0400 (Wed, 12 May 2010) | 4 lines
+Changed paths:
+ M /trunk/tabix/Makefile
+ M /trunk/tabix/index.c
+ M /trunk/tabix/tabix.h
+ A /trunk/tabix/tabix.py
+
+ * optionally generate shared library for Mac and Linux
+ * added a python script that directly calls the shared library
+ * added a new API for easy python access
+
+------------------------------------------------------------------------
+r574 | lh3lh3 | 2010-05-11 12:14:27 -0400 (Tue, 11 May 2010) | 2 lines
+Changed paths:
+ M /trunk/tabix/ChangeLog
+ M /trunk/tabix/NEWS
+ M /trunk/tabix/perl/Tabix.pm
+ M /trunk/tabix/perl/TabixIterator.pm
+ M /trunk/tabix/tabix.1
+
+Release tabix-0.2.0
+
+------------------------------------------------------------------------
+r573 | lh3lh3 | 2010-05-11 12:08:30 -0400 (Tue, 11 May 2010) | 2 lines
+Changed paths:
+ M /trunk/tabix/Makefile
+
+Added -fPIC
+
+------------------------------------------------------------------------
+r572 | lh3lh3 | 2010-05-11 11:59:07 -0400 (Tue, 11 May 2010) | 2 lines
+Changed paths:
+ M /trunk/tabix/perl/MANIFEST
+
+update
+
+------------------------------------------------------------------------
+r571 | lh3lh3 | 2010-05-11 11:56:54 -0400 (Tue, 11 May 2010) | 4 lines
+Changed paths:
+ A /trunk/tabix/example.gtf.gz
+ A /trunk/tabix/example.gtf.gz.tbi
+ M /trunk/tabix/index.c
+ M /trunk/tabix/main.c
+ M /trunk/tabix/perl/MANIFEST
+ M /trunk/tabix/perl/Tabix.pm
+ M /trunk/tabix/perl/Tabix.xs
+ A /trunk/tabix/perl/TabixIterator.pm
+ A /trunk/tabix/perl/t
+ A /trunk/tabix/perl/t/01local.t
+ A /trunk/tabix/perl/t/02remote.t
+ M /trunk/tabix/tabix.1
+ M /trunk/tabix/tabix.h
+
+ * improved C/Perl APIs
+ * added test for Perl
+ * added an tiny example
+
+------------------------------------------------------------------------
+r570 | lh3lh3 | 2010-05-11 01:04:21 -0400 (Tue, 11 May 2010) | 2 lines
+Changed paths:
+ M /trunk/tabix/TabixReader.java
+
+fixed the same issue in java
+
+------------------------------------------------------------------------
+r569 | lh3lh3 | 2010-05-11 01:03:24 -0400 (Tue, 11 May 2010) | 3 lines
+Changed paths:
+ M /trunk/tabix/index.c
+ M /trunk/tabix/perl/Tabix.pm
+ M /trunk/tabix/perl/Tabix.xs
+
+ * fixed a potential issue in index.c
+ * improve perl APIs
+
+------------------------------------------------------------------------
+r568 | lh3lh3 | 2010-05-10 23:46:21 -0400 (Mon, 10 May 2010) | 2 lines
+Changed paths:
+ M /trunk/tabix/perl/Tabix.xs
+
+return an array from get_names()
+
+------------------------------------------------------------------------
+r567 | lh3lh3 | 2010-05-10 23:38:46 -0400 (Mon, 10 May 2010) | 4 lines
+Changed paths:
+ M /trunk/tabix/TabixReader.java
+ M /trunk/tabix/index.c
+ A /trunk/tabix/perl
+ A /trunk/tabix/perl/MANIFEST
+ A /trunk/tabix/perl/Makefile.PL
+ A /trunk/tabix/perl/Tabix.pm
+ A /trunk/tabix/perl/Tabix.xs
+ A /trunk/tabix/perl/typemap
+ M /trunk/tabix/tabix.h
+
+ * added the initial perl binding. The interface needs to be improved.
+ * added a new API for perl binding
+ * fixed a potential bug in java.
+
+------------------------------------------------------------------------
+r565 | lh3lh3 | 2010-05-09 23:24:35 -0400 (Sun, 09 May 2010) | 2 lines
+Changed paths:
+ M /trunk/tabix/main.c
+
+Release tabix-0.1.6
+
+------------------------------------------------------------------------
+r564 | lh3lh3 | 2010-05-09 23:01:49 -0400 (Sun, 09 May 2010) | 2 lines
+Changed paths:
+ M /trunk/tabix/index.c
+
+fixed a typo
+
+------------------------------------------------------------------------
+r563 | lh3lh3 | 2010-05-09 22:58:26 -0400 (Sun, 09 May 2010) | 2 lines
+Changed paths:
+ A /trunk/tabix/ChangeLog
+ M /trunk/tabix/NEWS
+ M /trunk/tabix/index.c
+ M /trunk/tabix/main.c
+ M /trunk/tabix/tabix.h
+
+If nothing bad happens, this will become 0.1.6
+
+------------------------------------------------------------------------
+r562 | lh3lh3 | 2010-05-09 19:43:56 -0400 (Sun, 09 May 2010) | 2 lines
+Changed paths:
+ M /trunk/tabix/index.c
+
+Fixed a bug
+
+------------------------------------------------------------------------
+r560 | lh3lh3 | 2010-05-05 10:59:09 -0400 (Wed, 05 May 2010) | 3 lines
+Changed paths:
+ A /trunk/tabix/NEWS
+ M /trunk/tabix/TabixReader.java
+ M /trunk/tabix/index.c
+ M /trunk/tabix/main.c
+ M /trunk/tabix/tabix.1
+ M /trunk/tabix/tabix.h
+
+ * Release tabix-0.1.5 (r560)
+ * Improve seeking efficiency. Index file needs to be rebuilt.
+
+------------------------------------------------------------------------
+r559 | lh3lh3 | 2010-05-04 23:11:42 -0400 (Tue, 04 May 2010) | 2 lines
+Changed paths:
+ M /trunk/tabix/main.c
+
+Release tabix-0.1.4 (r559)
+
+------------------------------------------------------------------------
+r558 | lh3lh3 | 2010-05-01 12:48:01 -0400 (Sat, 01 May 2010) | 2 lines
+Changed paths:
+ M /trunk/tabix/TabixReader.java
+
+implement SAM/VCF support; NOT tested yet
+
+------------------------------------------------------------------------
+r557 | lh3lh3 | 2010-05-01 00:42:34 -0400 (Sat, 01 May 2010) | 2 lines
+Changed paths:
+ A /trunk/tabix/TabixReader.java
+
+The Java implementation of tabix.
+
+------------------------------------------------------------------------
+r556 | lh3lh3 | 2010-04-30 22:34:07 -0400 (Fri, 30 Apr 2010) | 4 lines
+Changed paths:
+ M /trunk/tabix/index.c
+ M /trunk/tabix/knetfile.c
+ M /trunk/tabix/main.c
+
+ * tabix-0.1.3-3 (r556)
+ * fixed a small memory leak in knetfile
+ * fixed a minor bug for remote downloading
+
+------------------------------------------------------------------------
+r555 | lh3lh3 | 2010-04-30 22:15:12 -0400 (Fri, 30 Apr 2010) | 4 lines
+Changed paths:
+ M /trunk/tabix/Makefile
+ M /trunk/tabix/index.c
+ M /trunk/tabix/main.c
+
+ * tabix-0.1.3-2 (r555)
+ * do not overwrite index file by default
+ * a little code cleanup
+
+------------------------------------------------------------------------
+r554 | lh3lh3 | 2010-04-30 21:44:31 -0400 (Fri, 30 Apr 2010) | 2 lines
+Changed paths:
+ M /trunk/tabix/index.c
+
+fixed a potential bug for UCSC-like coordinate
+
+------------------------------------------------------------------------
+r553 | lh3lh3 | 2010-04-28 17:43:41 -0400 (Wed, 28 Apr 2010) | 2 lines
+Changed paths:
+ M /trunk/tabix/tabix.tex
+
+minor clarification to the format spec
+
+------------------------------------------------------------------------
+r552 | lh3lh3 | 2010-04-28 16:33:07 -0400 (Wed, 28 Apr 2010) | 3 lines
+Changed paths:
+ M /trunk/tabix/Makefile
+ M /trunk/tabix/bgzip.c
+ A /trunk/tabix/tabix.tex
+
+ * added the format specification
+ * fixed a typo in bgzip
+
+------------------------------------------------------------------------
+r550 | petulda | 2010-04-22 11:03:24 -0400 (Thu, 22 Apr 2010) | 1 line
+Changed paths:
+ M /trunk/tabix/bgzip.c
+
+The behaviour changed slightly to mimic gzip. Detect if std descriptors are connected to the terminal.
+------------------------------------------------------------------------
+r549 | petulda | 2010-04-22 09:46:10 -0400 (Thu, 22 Apr 2010) | 1 line
+Changed paths:
+ M /trunk/tabix/bgzip.c
+
+Fix in src/dst file detection and slight change of behaviour
+------------------------------------------------------------------------
+r548 | petulda | 2010-04-19 04:39:46 -0400 (Mon, 19 Apr 2010) | 1 line
+Changed paths:
+ M /trunk/tabix/index.c
+
+Close file descriptor in ti_list_chromosomes
+------------------------------------------------------------------------
+r547 | petulda | 2010-04-16 09:27:11 -0400 (Fri, 16 Apr 2010) | 1 line
+Changed paths:
+ M /trunk/tabix/index.c
+ M /trunk/tabix/main.c
+ M /trunk/tabix/tabix.h
+
+Added the -l option for listing chromosomes
+------------------------------------------------------------------------
+r544 | lh3lh3 | 2010-03-29 10:58:48 -0400 (Mon, 29 Mar 2010) | 2 lines
+Changed paths:
+ M /trunk/tabix/main.c
+
+removed a line of debugging code
+
+------------------------------------------------------------------------
+r543 | lh3lh3 | 2010-03-19 12:29:16 -0400 (Fri, 19 Mar 2010) | 3 lines
+Changed paths:
+ M /trunk/tabix/index.c
+ M /trunk/tabix/main.c
+ M /trunk/tabix/tabix.1
+
+ * tabix-0.1.3 (r543)
+ * fixed another off-by-one bug
+
+------------------------------------------------------------------------
+r542 | lh3lh3 | 2010-03-16 22:35:52 -0400 (Tue, 16 Mar 2010) | 2 lines
+Changed paths:
+ M /trunk/tabix/index.c
+ M /trunk/tabix/main.c
+ M /trunk/tabix/tabix.1
+
+Release tabix-0.1.1
+
+------------------------------------------------------------------------
+r506 | lh3lh3 | 2009-11-02 23:20:12 -0500 (Mon, 02 Nov 2009) | 2 lines
+Changed paths:
+ M /trunk/tabix/main.c
+
+Release tabix-0.1.0
+
+------------------------------------------------------------------------
+r505 | lh3lh3 | 2009-11-02 23:15:49 -0500 (Mon, 02 Nov 2009) | 2 lines
+Changed paths:
+ A /trunk/tabix/tabix.1
+
+documentation
+
+------------------------------------------------------------------------
+r504 | lh3lh3 | 2009-11-02 11:08:18 -0500 (Mon, 02 Nov 2009) | 5 lines
+Changed paths:
+ M /trunk/tabix/Makefile
+ M /trunk/tabix/bgzip.c
+ M /trunk/tabix/index.c
+ M /trunk/tabix/main.c
+ M /trunk/tabix/tabix.h
+
+ * tabix-0.0.0-5 (r504)
+ * fixed a critical bug in fetching data (a typo in fact)
+ * support SAM (tested on ex1.sam) and VCF (not tested)
+ * improve the command-line interface
+
+------------------------------------------------------------------------
+r503 | lh3lh3 | 2009-11-02 10:04:43 -0500 (Mon, 02 Nov 2009) | 3 lines
+Changed paths:
+ M /trunk/tabix/Makefile
+ M /trunk/tabix/index.c
+ M /trunk/tabix/main.c
+
+ * tabix-0.0.0-4 (r503)
+ * index files are bgzf compressed
+
+------------------------------------------------------------------------
+r502 | lh3lh3 | 2009-11-02 09:47:25 -0500 (Mon, 02 Nov 2009) | 4 lines
+Changed paths:
+ M /trunk/tabix/index.c
+ M /trunk/tabix/main.c
+ M /trunk/tabix/tabix.h
+
+ * tabix-0.0.0-3 (r502)
+ * support meta lines (not tested)
+ * I am going to make the index file in the BGZF format
+
+------------------------------------------------------------------------
+r501 | lh3lh3 | 2009-11-01 22:03:07 -0500 (Sun, 01 Nov 2009) | 3 lines
+Changed paths:
+ M /trunk/tabix/Makefile
+ M /trunk/tabix/bgzf.h
+ M /trunk/tabix/index.c
+ M /trunk/tabix/main.c
+
+ * tabix-0.0.0-2 (r501)
+ * accelerate ti_readline()
+
+------------------------------------------------------------------------
+r500 | lh3lh3 | 2009-11-01 20:49:52 -0500 (Sun, 01 Nov 2009) | 3 lines
+Changed paths:
+ M /trunk/tabix/Makefile
+ M /trunk/tabix/bgzip.c
+ M /trunk/tabix/index.c
+ M /trunk/tabix/main.c
+
+ * tabix-0.0.0-1 (r500)
+ * apparently working
+
+------------------------------------------------------------------------
+r499 | lh3lh3 | 2009-11-01 14:04:52 -0500 (Sun, 01 Nov 2009) | 2 lines
+Changed paths:
+ D /trunk/tabix/parser.c
+
+obsolete file
+
+------------------------------------------------------------------------
+r498 | lh3lh3 | 2009-11-01 14:04:08 -0500 (Sun, 01 Nov 2009) | 2 lines
+Changed paths:
+ M /trunk/tabix/bgzip.c
+
+bgzip is more like gzip in its command-line interface
+
+------------------------------------------------------------------------
+r497 | lh3lh3 | 2009-11-01 13:43:35 -0500 (Sun, 01 Nov 2009) | 2 lines
+Changed paths:
+ A /trunk/tabix/Makefile
+ A /trunk/tabix/bam_endian.h
+ A /trunk/tabix/bgzf.c
+ A /trunk/tabix/bgzf.h
+ A /trunk/tabix/bgzip.c
+ A /trunk/tabix/index.c
+ A /trunk/tabix/khash.h
+ A /trunk/tabix/knetfile.c
+ A /trunk/tabix/knetfile.h
+ A /trunk/tabix/ksort.h
+ A /trunk/tabix/kstring.c
+ A /trunk/tabix/kstring.h
+ A /trunk/tabix/main.c
+ A /trunk/tabix/parser.c
+ A /trunk/tabix/tabix.h
+
+initial source code. It is BUGGY!
+
+------------------------------------------------------------------------
+r496 | lh3lh3 | 2009-11-01 13:42:39 -0500 (Sun, 01 Nov 2009) | 2 lines
+Changed paths:
+ A /trunk/tabix
+
+A generic indexer for TAB-delimited genome position files
+
+------------------------------------------------------------------------
63 Makefile
@@ -0,0 +1,63 @@
+CC= gcc
+CFLAGS= -g -Wall -O2 -fPIC #-m64 #-arch ppc
+DFLAGS= -D_FILE_OFFSET_BITS=64 -D_USE_KNETFILE
+LOBJS= bgzf.o kstring.o knetfile.o index.o bedidx.o
+AOBJS= main.o
+PROG= tabix bgzip
+INCLUDES=
+SUBDIRS= .
+LIBPATH=
+LIBCURSES=
+
+.SUFFIXES:.c .o
+
+.c.o:
+ $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@
+
+all-recur lib-recur clean-recur cleanlocal-recur install-recur:
+ @target=`echo $@ | sed s/-recur//`; \
+ wdir=`pwd`; \
+ list='$(SUBDIRS)'; for subdir in $$list; do \
+ cd $$subdir; \
+ $(MAKE) CC="$(CC)" DFLAGS="$(DFLAGS)" CFLAGS="$(CFLAGS)" \
+ INCLUDES="$(INCLUDES)" LIBPATH="$(LIBPATH)" $$target || exit 1; \
+ cd $$wdir; \
+ done;
+
+all:$(PROG)
+
+lib:libtabix.a
+
+libtabix.so.1:$(LOBJS)
+ $(CC) -shared -Wl,-soname,libtabix.so -o $@ $(LOBJS) -lc -lz
+
+libtabix.1.dylib:$(LOBJS)
+ libtool -dynamic $(LOBJS) -o $@ -lc -lz
+
+libtabix.a:$(LOBJS)
+ $(AR) -cru $@ $(LOBJS)
+
+tabix:lib $(AOBJS)
+ $(CC) $(CFLAGS) -o $@ $(AOBJS) -lm $(LIBPATH) -lz -L. -ltabix
+
+bgzip:bgzip.o bgzf.o knetfile.o
+ $(CC) $(CFLAGS) -o $@ bgzip.o bgzf.o knetfile.o -lz
+
+TabixReader.class:TabixReader.java
+ javac -cp .:sam.jar TabixReader.java
+
+kstring.o:kstring.h
+knetfile.o:knetfile.h
+bgzf.o:bgzf.h knetfile.h
+index.o:bgzf.h tabix.h khash.h ksort.h kstring.h
+main.o:tabix.h kstring.h bgzf.h
+bgzip.o:bgzf.h
+bedidx.o:kseq.h khash.h
+
+tabix.pdf:tabix.tex
+ pdflatex tabix.tex
+
+cleanlocal:
+ rm -fr gmon.out *.o a.out *.dSYM $(PROG) *~ *.a tabix.aux tabix.log tabix.pdf *.class libtabix.*.dylib libtabix.so*
+
+clean:cleanlocal-recur
126 NEWS
@@ -0,0 +1,126 @@
+Release 0.2.4 (10 April, 2011)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Notable changes:
+
+ * Give an error if the index file is older than the data file.
+
+ * Avoid a segfault given flawed input.
+
+ * Added Python APIs contributed by Hyeshik Chang. The new APIs do not bind to
+ the dynamic library and are reported to be faster. Pysam also comes with a
+ tabix binding.
+
+ * Added option "-r" for efficient header replacement.
+
+ * Added BED support.
+
+ * Synchronized the BGZF library between tabix and samtools.
+
+(0.2.4: 10 April 2011, r949)
+
+
+
+Beta Release 0.2.3 (8 December, 2010)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Notable changes:
+
+ * Fixed a minor bug where the first record in a headerless file may be
+ missed.
+
+ * Added an option to print header lines.
+
+ * Fixed a rare bug which may occasionally happen when retrieving data
+ from a region without any records.
+
+ * Enhanced error reporting.
+
+ * Fixed a bug in bgzip which may delete the original file even if not
+ intended.
+
+(0.2.3: 8 December 2010, r876)
+
+
+
+Beta Release 0.2.2 (28 June, 2010)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Notable changes:
+
+ * Dropped the VCF3 support. Added VCF4 support.
+
+ * Avoided the function name collision with samtools.
+
+(0.2.2: 28 June 2010, r603)
+
+
+
+Beta Release 0.2.1 (3 June, 2010)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Notable changes:
+
+ * Allow shared library to be compiled. Added python binding to the
+ shared library.
+
+(0.2.1: 3 June 2010, r582)
+
+
+
+Beta Release 0.2.0 (11 May, 2010)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Notable changes:
+
+ * Fixed an issue for random access given an interval end larger than
+ 2^29.
+
+ * Updated the Java binding.
+
+ * Added a Perl module using XS.
+
+ * Improved the C APIs.
+
+(0.2.0: 11 May 2010, r574)
+
+
+
+Beta Release 0.1.6 (9 May, 2010)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Notable changes:
+
+ * Improved backward compatibility. Release 0.1.5 does not work with the
+ buggy index file generated by 0.1.2.
+
+ * Fixed a bug in building linear index. The bug does not affect the
+ results, only affects efficiency in rare cases.
+
+ * Reduced the number of seek calls given an index generated by old
+ version of tabix.
+
+ * Added new APIs for retrieving data via an iterator. The old callback
+ APIs are not changed, although internally it uses iterator to
+ retrieve data.
+
+I am trying to freeze tabix. I just hope I am committing new bugs.
+
+(0.1.6: 9 May 2010, r563)
+
+
+
+Beta Release 0.1.5 (5 May, 2010)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Notable changes:
+
+ * Clarified that tabix is released under MIT/X11.
+
+ * Improved the robustness of indexing and retrieval.
+
+ * Reduced the number of seek calls when the specified region starts
+ from a 16kb block with no data. The index format is the same, but the
+ content is changed a little.
+
+(0.1.5: 5 May 2010, r560)
395 TabixReader.java
@@ -0,0 +1,395 @@
+/* The MIT License
+
+ Copyright (c) 2010 Broad Institute.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+/* Contact: Heng Li <hengli@broadinstitute.org> */
+
+import net.sf.samtools.util.BlockCompressedInputStream;
+
+import java.io.*;
+import java.nio.*;
+import java.util.HashMap;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.lang.StringBuffer;
+
+public class TabixReader
+{
+ private String mFn;
+ private BlockCompressedInputStream mFp;
+
+ private int mPreset;
+ private int mSc;
+ private int mBc;
+ private int mEc;
+ private int mMeta;
+ private int mSkip;
+ private String[] mSeq;
+
+ private HashMap<String, Integer> mChr2tid;
+
+ private static int MAX_BIN = 37450;
+ private static int TAD_MIN_CHUNK_GAP = 32768;
+ private static int TAD_LIDX_SHIFT = 14;
+
+ private class TPair64 implements Comparable<TPair64> {
+ long u, v;
+ public TPair64(final long _u, final long _v) {
+ u = _u; v = _v;
+ }
+ public TPair64(final TPair64 p) {
+ u = p.u; v = p.v;
+ }
+ public int compareTo(final TPair64 p) {
+ return u == p.u? 0 : ((u < p.u) ^ (u < 0) ^ (p.u < 0))? -1 : 1; // unsigned 64-bit comparison
+ }
+ };
+
+ private class TIndex {
+ HashMap<Integer, TPair64[]> b; // binning index
+ long[] l; // linear index
+ };
+ private TIndex[] mIndex;
+
+ private class TIntv {
+ int tid, beg, end;
+ };
+
+ private static boolean less64(final long u, final long v) { // unsigned 64-bit comparison
+ return (u < v) ^ (u < 0) ^ (v < 0);
+ }
+
+ /**
+ * The constructor
+ *
+ * @param fn File name of the data file
+ */
+ public TabixReader(final String fn) throws IOException {
+ mFn = fn;
+ mFp = new BlockCompressedInputStream(new File(fn));
+ readIndex();
+ }
+
+ private static int reg2bins(final int beg, final int _end, final int[] list) {
+ int i = 0, k, end = _end;
+ if (beg >= end) return 0;
+ if (end >= 1<<29) end = 1<<29;
+ --end;
+ list[i++] = 0;
+ for (k = 1 + (beg>>26); k <= 1 + (end>>26); ++k) list[i++] = k;
+ for (k = 9 + (beg>>23); k <= 9 + (end>>23); ++k) list[i++] = k;
+ for (k = 73 + (beg>>20); k <= 73 + (end>>20); ++k) list[i++] = k;
+ for (k = 585 + (beg>>17); k <= 585 + (end>>17); ++k) list[i++] = k;
+ for (k = 4681 + (beg>>14); k <= 4681 + (end>>14); ++k) list[i++] = k;
+ return i;
+ }
+
+ public static int readInt(final InputStream is) throws IOException {
+ byte[] buf = new byte[4];
+ is.read(buf);
+ return ByteBuffer.wrap(buf).order(ByteOrder.LITTLE_ENDIAN).getInt();
+ }
+
+ public static long readLong(final InputStream is) throws IOException {
+ byte[] buf = new byte[8];
+ is.read(buf);
+ return ByteBuffer.wrap(buf).order(ByteOrder.LITTLE_ENDIAN).getLong();
+ }
+
+ public static String readLine(final InputStream is) throws IOException {
+ StringBuffer buf = new StringBuffer();
+ int c;
+ while ((c = is.read()) >= 0 && c != '\n')
+ buf.append((char)c);
+ if (c < 0) return null;
+ return buf.toString();
+ }
+
+ /**
+ * Read the Tabix index from a file
+ *
+ * @param fp File pointer
+ */
+ public void readIndex(final File fp) throws IOException {
+ if (fp == null) return;
+ BlockCompressedInputStream is = new BlockCompressedInputStream(fp);
+ byte[] buf = new byte[4];
+
+ is.read(buf, 0, 4); // read "TBI\1"
+ mSeq = new String[readInt(is)]; // # sequences
+ mChr2tid = new HashMap<String, Integer>();
+ mPreset = readInt(is);
+ mSc = readInt(is);
+ mBc = readInt(is);
+ mEc = readInt(is);
+ mMeta = readInt(is);
+ mSkip = readInt(is);
+ // read sequence dictionary
+ int i, j, k, l = readInt(is);
+ buf = new byte[l];
+ is.read(buf);
+ for (i = j = k = 0; i < buf.length; ++i) {
+ if (buf[i] == 0) {
+ byte[] b = new byte[i - j];
+ System.arraycopy(buf, j, b, 0, b.length);
+ String s = new String(b);
+ mChr2tid.put(s, k);
+ mSeq[k++] = s;
+ j = i + 1;
+ }
+ }
+ // read the index
+ mIndex = new TIndex[mSeq.length];
+ for (i = 0; i < mSeq.length; ++i) {
+ // the binning index
+ int n_bin = readInt(is);
+ mIndex[i] = new TIndex();
+ mIndex[i].b = new HashMap<Integer, TPair64[]>();
+ for (j = 0; j < n_bin; ++j) {
+ int bin = readInt(is);
+ TPair64[] chunks = new TPair64[readInt(is)];
+ for (k = 0; k < chunks.length; ++k) {
+ long u = readLong(is);
+ long v = readLong(is);
+ chunks[k] = new TPair64(u, v); // in C, this is inefficient
+ }
+ mIndex[i].b.put(bin, chunks);
+ }
+ // the linear index
+ mIndex[i].l = new long[readInt(is)];
+ for (k = 0; k < mIndex[i].l.length; ++k)
+ mIndex[i].l[k] = readLong(is);
+ }
+ // close
+ is.close();
+ }
+
+ /**
+ * Read the Tabix index from the default file.
+ */
+ public void readIndex() throws IOException {
+ readIndex(new File(mFn + ".tbi"));
+ }
+
+ /**
+ * Read one line from the data file.
+ */
+ public String readLine() throws IOException {
+ return readLine(mFp);
+ }
+
+ private int chr2tid(final String chr) {
+ if (mChr2tid.containsKey(chr)) return mChr2tid.get(chr);
+ else return -1;
+ }
+
+ /**
+ * Parse a region in the format of "chr1", "chr1:100" or "chr1:100-1000"
+ *
+ * @param reg Region string
+ * @return An array where the three elements are sequence_id,
+ * region_begin and region_end. On failure, sequence_id==-1.
+ */
+ public int[] parseReg(final String reg) { // FIXME: NOT working when the sequence name contains : or -.
+ String chr;
+ int colon, hyphen;
+ int[] ret = new int[3];
+ colon = reg.indexOf(':'); hyphen = reg.indexOf('-');
+ chr = colon >= 0? reg.substring(0, colon) : reg;
+ ret[1] = colon >= 0? Integer.parseInt(reg.substring(colon+1, hyphen >= 0? hyphen : reg.length())) - 1 : 0;
+ ret[2] = hyphen >= 0? Integer.parseInt(reg.substring(hyphen+1)) : 0x7fffffff;
+ ret[0] = chr2tid(chr);
+ return ret;
+ }
+
+ private TIntv getIntv(final String s) {
+ TIntv intv = new TIntv();
+ int col = 0, end = 0, beg = 0;
+ while ((end = s.indexOf('\t', beg)) >= 0 || end == -1) {
+ ++col;
+ if (col == mSc) {
+ intv.tid = chr2tid(s.substring(beg, end));
+ } else if (col == mBc) {
+ intv.beg = intv.end = Integer.parseInt(s.substring(beg, end));
+ if ((mPreset&0x10000) != 0) ++intv.end;
+ else --intv.beg;
+ if (intv.beg < 0) intv.beg = 0;
+ if (intv.end < 1) intv.end = 1;
+ } else { // FIXME: SAM supports are not tested yet
+ if ((mPreset&0xffff) == 0) { // generic
+ if (col == mEc)
+ intv.end = Integer.parseInt(s.substring(beg, end));
+ } else if ((mPreset&0xffff) == 1) { // SAM
+ if (col == 6) { // CIGAR
+ int l = 0, i, j;
+ String cigar = s.substring(beg, end);
+ for (i = j = 0; i < cigar.length(); ++i) {
+ if (cigar.charAt(i) > '9') {
+ int op = cigar.charAt(i);
+ if (op == 'M' || op == 'D' || op == 'N')
+ l += Integer.parseInt(cigar.substring(j, i));
+ }
+ }
+ intv.end = intv.beg + l;
+ }
+ } else if ((mPreset&0xffff) == 2) { // VCF
+ String alt;
+ alt = end >= 0? s.substring(beg, end) : s.substring(beg);
+ if (col == 4) { // REF
+ if (alt.length() > 0) intv.end = intv.beg + alt.length();
+ } else if (col == 8) { // INFO
+ int e_off = -1, i = alt.indexOf("END=");
+ if (i == 0) e_off = 4;
+ else if (i > 0) {
+ i = alt.indexOf(";END=");
+ if (i >= 0) e_off = i + 5;
+ }
+ if (e_off > 0) {
+ i = alt.indexOf(";", e_off);
+ intv.end = Integer.parseInt(i > e_off? alt.substring(e_off, i) : alt.substring(e_off));
+ }
+ }
+ }
+ }
+ if (end == -1) break;
+ beg = end + 1;
+ }
+ return intv;
+ }
+
+ public class Iterator {
+ private int i, n_seeks;
+ private int tid, beg, end;
+ private TPair64[] off;
+ private long curr_off;
+ private boolean iseof;
+
+ public Iterator(final int _tid, final int _beg, final int _end, final TPair64[] _off) {
+ i = -1; n_seeks = 0; curr_off = 0; iseof = false;
+ off = _off; tid = _tid; beg = _beg; end = _end;
+ }
+
+ public String next() throws IOException {
+ if (iseof) return null;
+ for (;;) {
+ if (curr_off == 0 || !less64(curr_off, off[i].v)) { // then jump to the next chunk
+ if (i == off.length - 1) break; // no more chunks
+ if (i >= 0) assert(curr_off == off[i].v); // otherwise bug
+ if (i < 0 || off[i].v != off[i+1].u) { // not adjacent chunks; then seek
+ mFp.seek(off[i+1].u);
+ curr_off = mFp.getFilePointer();
+ ++n_seeks;
+ }
+ ++i;
+ }
+ String s;
+ if ((s = readLine(mFp)) != null) {
+ TIntv intv;
+ char[] str = s.toCharArray();
+ curr_off = mFp.getFilePointer();
+ if (str.length == 0 || str[0] == mMeta) continue;
+ intv = getIntv(s);
+ if (intv.tid != tid || intv.beg >= end) break; // no need to proceed
+ else if (intv.end > beg && intv.beg < end) return s; // overlap; return
+ } else break; // end of file
+ }
+ iseof = true;
+ return null;
+ }
+ };
+
+ public Iterator query(final int tid, final int beg, final int end) {
+ TPair64[] off, chunks;
+ long min_off;
+ TIndex idx = mIndex[tid];
+ int[] bins = new int[MAX_BIN];
+ int i, l, n_off, n_bins = reg2bins(beg, end, bins);
+ if (idx.l.length > 0)
+ min_off = (beg>>TAD_LIDX_SHIFT >= idx.l.length)? idx.l[idx.l.length-1] : idx.l[beg>>TAD_LIDX_SHIFT];
+ else min_off = 0;
+ for (i = n_off = 0; i < n_bins; ++i) {
+ if ((chunks = idx.b.get(bins[i])) != null)
+ n_off += chunks.length;
+ }
+ if (n_off == 0) return null;
+ off = new TPair64[n_off];
+ for (i = n_off = 0; i < n_bins; ++i)
+ if ((chunks = idx.b.get(bins[i])) != null)
+ for (int j = 0; j < chunks.length; ++j)
+ if (less64(min_off, chunks[j].v))
+ off[n_off++] = new TPair64(chunks[j]);
+ if (n_off == 0) return null;
+ Arrays.sort(off, 0, n_off);
+ // resolve completely contained adjacent blocks
+ for (i = 1, l = 0; i < n_off; ++i) {
+ if (less64(off[l].v, off[i].v)) {
+ ++l;
+ off[l].u = off[i].u; off[l].v = off[i].v;
+ }
+ }
+ n_off = l + 1;
+ // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
+ for (i = 1; i < n_off; ++i)
+ if (!less64(off[i-1].v, off[i].u)) off[i-1].v = off[i].u;
+ // merge adjacent blocks
+ for (i = 1, l = 0; i < n_off; ++i) {
+ if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v;
+ else {
+ ++l;
+ off[l].u = off[i].u;
+ off[l].v = off[i].v;
+ }
+ }
+ n_off = l + 1;
+ // return
+ TPair64[] ret = new TPair64[n_off];
+ for (i = 0; i < n_off; ++i) ret[i] = new TPair64(off[i].u, off[i].v); // in C, this is inefficient
+ return new TabixReader.Iterator(tid, beg, end, ret);
+ }
+
+ public Iterator query(final String reg) {
+ int[] x = parseReg(reg);
+ return query(x[0], x[1], x[2]);
+ }
+
+ public static void main(String[] args) {
+ if (args.length < 1) {
+ System.out.println("Usage: java -cp .:sam.jar TabixReader <in.gz> [region]");
+ System.exit(1);
+ }
+ try {
+ TabixReader tr = new TabixReader(args[0]);
+ String s;
+ if (args.length == 1) { // no region is specified; print the whole file
+ while ((s = tr.readLine()) != null)
+ System.out.println(s);
+ } else { // a region is specified; random access
+ TabixReader.Iterator iter = tr.query(args[1]); // get the iterator
+ while (iter != null && (s = iter.next()) != null)
+ System.out.println(s);
+ }
+ } catch (IOException e) {
+ }
+ }
+}
42 bam_endian.h
@@ -0,0 +1,42 @@
+#ifndef BAM_ENDIAN_H
+#define BAM_ENDIAN_H
+
+#include <stdint.h>
+
+static inline int bam_is_big_endian()
+{
+ long one= 1;
+ return !(*((char *)(&one)));
+}
+static inline uint16_t bam_swap_endian_2(uint16_t v)
+{
+ return (uint16_t)(((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8));
+}
+static inline void *bam_swap_endian_2p(void *x)
+{
+ *(uint16_t*)x = bam_swap_endian_2(*(uint16_t*)x);
+ return x;
+}
+static inline uint32_t bam_swap_endian_4(uint32_t v)
+{
+ v = ((v & 0x0000FFFFU) << 16) | (v >> 16);
+ return ((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8);
+}
+static inline void *bam_swap_endian_4p(void *x)
+{
+ *(uint32_t*)x = bam_swap_endian_4(*(uint32_t*)x);
+ return x;
+}
+static inline uint64_t bam_swap_endian_8(uint64_t v)
+{
+ v = ((v & 0x00000000FFFFFFFFLLU) << 32) | (v >> 32);
+ v = ((v & 0x0000FFFF0000FFFFLLU) << 16) | ((v & 0xFFFF0000FFFF0000LLU) >> 16);
+ return ((v & 0x00FF00FF00FF00FFLLU) << 8) | ((v & 0xFF00FF00FF00FF00LLU) >> 8);
+}
+static inline void *bam_swap_endian_8p(void *x)
+{
+ *(uint64_t*)x = bam_swap_endian_8(*(uint64_t*)x);
+ return x;
+}
+
+#endif
156 bedidx.c
@@ -0,0 +1,156 @@
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+#include <zlib.h>
+
+#include "ksort.h"
+KSORT_INIT_GENERIC(uint64_t)
+
+#include "kseq.h"
+KSTREAM_INIT(gzFile, gzread, 8192)
+
+typedef struct {
+ int n, m;
+ uint64_t *a;
+ int *idx;
+} bed_reglist_t;
+
+#include "khash.h"
+KHASH_MAP_INIT_STR(reg, bed_reglist_t)
+
+#define LIDX_SHIFT 13
+
+typedef kh_reg_t reghash_t;
+
+int *bed_index_core(int n, uint64_t *a, int *n_idx)
+{
+ int i, j, m, *idx;
+ m = *n_idx = 0; idx = 0;
+ for (i = 0; i < n; ++i) {
+ int beg, end;
+ beg = a[i]>>32 >> LIDX_SHIFT; end = ((uint32_t)a[i]) >> LIDX_SHIFT;
+ if (m < end + 1) {
+ int oldm = m;
+ m = end + 1;
+ kroundup32(m);
+ idx = realloc(idx, m * sizeof(int));
+ for (j = oldm; j < m; ++j) idx[j] = -1;
+ }
+ if (beg == end) {
+ if (idx[beg] < 0) idx[beg] = i;
+ } else {
+ for (j = beg; j <= end; ++j)
+ if (idx[j] < 0) idx[j] = i;
+ }
+ *n_idx = end + 1;
+ }
+ return idx;
+}
+
+void bed_index(void *_h)
+{
+ reghash_t *h = (reghash_t*)_h;
+ khint_t k;
+ for (k = 0; k < kh_end(h); ++k) {
+ if (kh_exist(h, k)) {
+ bed_reglist_t *p = &kh_val(h, k);
+ if (p->idx) free(p->idx);
+ ks_introsort(uint64_t, p->n, p->a);
+ p->idx = bed_index_core(p->n, p->a, &p->m);
+ }
+ }
+}
+
+int bed_overlap_core(const bed_reglist_t *p, int beg, int end)
+{
+ int i, min_off;
+ if (p->n == 0) return 0;
+ min_off = (beg>>LIDX_SHIFT >= p->n)? p->idx[p->n-1] : p->idx[beg>>LIDX_SHIFT];
+ if (min_off < 0) { // TODO: this block can be improved, but speed should not matter too much here
+ int n = beg>>LIDX_SHIFT;
+ if (n > p->n) n = p->n;
+ for (i = n - 1; i >= 0; --i)
+ if (p->idx[i] >= 0) break;
+ min_off = i >= 0? p->idx[i] : 0;
+ }
+ for (i = min_off; i < p->n; ++i) {
+ if ((int)(p->a[i]>>32) >= end) break; // out of range; no need to proceed
+ if ((int32_t)p->a[i] > beg && (int32_t)(p->a[i]>>32) < end)
+ return 1; // find the overlap; return
+ }
+ return 0;
+}
+
+int bed_overlap(const void *_h, const char *chr, int beg, int end)
+{
+ const reghash_t *h = (const reghash_t*)_h;
+ khint_t k;
+ if (!h) return 0;
+ k = kh_get(reg, h, chr);
+ if (k == kh_end(h)) return 0;
+ return bed_overlap_core(&kh_val(h, k), beg, end);
+}
+
+void *bed_read(const char *fn)
+{
+ reghash_t *h = kh_init(reg);
+ gzFile fp;
+ kstream_t *ks;
+ int dret;
+ kstring_t *str;
+ // read the list
+ fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
+ if (fp == 0) return 0;
+ str = calloc(1, sizeof(kstring_t));
+ ks = ks_init(fp);
+ while (ks_getuntil(ks, 0, str, &dret) >= 0) { // read the chr name
+ int beg = -1, end = -1;
+ bed_reglist_t *p;
+ khint_t k = kh_get(reg, h, str->s);
+ if (k == kh_end(h)) { // absent from the hash table
+ int ret;
+ char *s = strdup(str->s);
+ k = kh_put(reg, h, s, &ret);
+ memset(&kh_val(h, k), 0, sizeof(bed_reglist_t));
+ }
+ p = &kh_val(h, k);
+ if (dret != '\n') { // if the lines has other characters
+ if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) {
+ beg = atoi(str->s); // begin
+ if (dret != '\n') {
+ if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0]))
+ end = atoi(str->s); // end
+ }
+ }
+ }
+ if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n'); // skip the rest of the line
+ if (end < 0 && beg > 0) end = beg, beg = beg - 1; // if there is only one column
+ if (beg >= 0 && end > beg) {
+ if (p->n == p->m) {
+ p->m = p->m? p->m<<1 : 4;
+ p->a = realloc(p->a, p->m * 8);
+ }
+ p->a[p->n++] = (uint64_t)beg<<32 | end;
+ }
+ }
+ ks_destroy(ks);
+ gzclose(fp);
+ free(str->s); free(str);
+ bed_index(h);
+ return h;
+}
+
+void bed_destroy(void *_h)
+{
+ reghash_t *h = (reghash_t*)_h;
+ khint_t k;
+ for (k = 0; k < kh_end(h); ++k) {
+ if (kh_exist(h, k)) {
+ free(kh_val(h, k).a);
+ free(kh_val(h, k).idx);
+ free((char*)kh_key(h, k));
+ }
+ }
+ kh_destroy(reg, h);
+}
714 bgzf.c
@@ -0,0 +1,714 @@
+/* The MIT License
+
+ Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+*/
+
+/*
+ 2009-06-29 by lh3: cache recent uncompressed blocks.
+ 2009-06-25 by lh3: optionally use my knetfile library to access file on a FTP.
+ 2009-06-12 by lh3: support a mode string like "wu" where 'u' for uncompressed output */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include "bgzf.h"
+
+#include "khash.h"
+typedef struct {
+ int size;
+ uint8_t *block;
+ int64_t end_offset;
+} cache_t;
+KHASH_MAP_INIT_INT64(cache, cache_t)
+
+#if defined(_WIN32) || defined(_MSC_VER)
+#define ftello(fp) ftell(fp)
+#define fseeko(fp, offset, whence) fseek(fp, offset, whence)
+#else
+extern off_t ftello(FILE *stream);
+extern int fseeko(FILE *stream, off_t offset, int whence);
+#endif
+
+typedef int8_t bgzf_byte_t;
+
+static const int DEFAULT_BLOCK_SIZE = 64 * 1024;
+static const int MAX_BLOCK_SIZE = 64 * 1024;
+
+static const int BLOCK_HEADER_LENGTH = 18;
+static const int BLOCK_FOOTER_LENGTH = 8;
+
+static const int GZIP_ID1 = 31;
+static const int GZIP_ID2 = 139;
+static const int CM_DEFLATE = 8;
+static const int FLG_FEXTRA = 4;
+static const int OS_UNKNOWN = 255;
+static const int BGZF_ID1 = 66; // 'B'
+static const int BGZF_ID2 = 67; // 'C'
+static const int BGZF_LEN = 2;
+static const int BGZF_XLEN = 6; // BGZF_LEN+4
+
+static const int GZIP_WINDOW_BITS = -15; // no zlib header
+static const int Z_DEFAULT_MEM_LEVEL = 8;
+
+
+inline
+void
+packInt16(uint8_t* buffer, uint16_t value)
+{
+ buffer[0] = value;
+ buffer[1] = value >> 8;
+}
+
+inline
+int
+unpackInt16(const uint8_t* buffer)
+{
+ return (buffer[0] | (buffer[1] << 8));
+}
+
+inline
+void
+packInt32(uint8_t* buffer, uint32_t value)
+{
+ buffer[0] = value;
+ buffer[1] = value >> 8;
+ buffer[2] = value >> 16;
+ buffer[3] = value >> 24;
+}
+
+static inline
+int
+bgzf_min(int x, int y)
+{
+ return (x < y) ? x : y;
+}
+
+static
+void
+report_error(BGZF* fp, const char* message) {
+ fp->error = message;
+}
+
+int bgzf_check_bgzf(const char *fn)
+{
+ BGZF *fp;
+ uint8_t buf[10],magic[10]="\037\213\010\4\0\0\0\0\0\377";
+ int n;
+
+ if ((fp = bgzf_open(fn, "r")) == 0)
+ {
+ fprintf(stderr, "[bgzf_check_bgzf] failed to open the file: %s\n",fn);
+ return -1;
+ }
+
+#ifdef _USE_KNETFILE
+ n = knet_read(fp->x.fpr, buf, 10);
+#else
+ n = fread(buf, 1, 10, fp->file);
+#endif
+ bgzf_close(fp);
+
+ if ( n!=10 )
+ return -1;
+
+ if ( !memcmp(magic, buf, 10) ) return 1;
+ return 0;
+}
+
+static BGZF *bgzf_read_init()
+{
+ BGZF *fp;
+ fp = calloc(1, sizeof(BGZF));
+ fp->uncompressed_block_size = MAX_BLOCK_SIZE;
+ fp->uncompressed_block = malloc(MAX_BLOCK_SIZE);
+ fp->compressed_block_size = MAX_BLOCK_SIZE;
+ fp->compressed_block = malloc(MAX_BLOCK_SIZE);
+ fp->cache_size = 0;
+ fp->cache = kh_init(cache);
+ return fp;
+}
+
+static
+BGZF*
+open_read(int fd)
+{
+#ifdef _USE_KNETFILE
+ knetFile *file = knet_dopen(fd, "r");
+#else
+ FILE* file = fdopen(fd, "r");
+#endif
+ BGZF* fp;
+ if (file == 0) return 0;
+ fp = bgzf_read_init();
+ fp->file_descriptor = fd;
+ fp->open_mode = 'r';
+#ifdef _USE_KNETFILE
+ fp->x.fpr = file;
+#else
+ fp->file = file;
+#endif
+ return fp;
+}
+
+static
+BGZF*
+open_write(int fd, int compress_level) // compress_level==-1 for the default level
+{
+ FILE* file = fdopen(fd, "w");
+ BGZF* fp;
+ if (file == 0) return 0;
+ fp = malloc(sizeof(BGZF));
+ fp->file_descriptor = fd;
+ fp->open_mode = 'w';
+ fp->owned_file = 0;
+ fp->compress_level = compress_level < 0? Z_DEFAULT_COMPRESSION : compress_level; // Z_DEFAULT_COMPRESSION==-1
+ if (fp->compress_level > 9) fp->compress_level = Z_DEFAULT_COMPRESSION;
+#ifdef _USE_KNETFILE
+ fp->x.fpw = file;
+#else
+ fp->file = file;
+#endif
+ fp->uncompressed_block_size = DEFAULT_BLOCK_SIZE;
+ fp->uncompressed_block = NULL;
+ fp->compressed_block_size = MAX_BLOCK_SIZE;
+ fp->compressed_block = malloc(MAX_BLOCK_SIZE);
+ fp->block_address = 0;
+ fp->block_offset = 0;
+ fp->block_length = 0;
+ fp->error = NULL;
+ return fp;
+}
+
+BGZF*
+bgzf_open(const char* __restrict path, const char* __restrict mode)
+{
+ BGZF* fp = NULL;
+ if (strchr(mode, 'r') || strchr(mode, 'R')) { /* The reading mode is preferred. */
+#ifdef _USE_KNETFILE
+ knetFile *file = knet_open(path, mode);
+ if (file == 0) return 0;
+ fp = bgzf_read_init();
+ fp->file_descriptor = -1;
+ fp->open_mode = 'r';
+ fp->x.fpr = file;
+#else
+ int fd, oflag = O_RDONLY;
+#ifdef _WIN32
+ oflag |= O_BINARY;
+#endif
+ fd = open(path, oflag);
+ if (fd == -1) return 0;
+ fp = open_read(fd);
+#endif
+ } else if (strchr(mode, 'w') || strchr(mode, 'W')) {
+ int fd, compress_level = -1, oflag = O_WRONLY | O_CREAT | O_TRUNC;
+#ifdef _WIN32
+ oflag |= O_BINARY;
+#endif
+ fd = open(path, oflag, 0666);
+ if (fd == -1) return 0;
+ { // set compress_level
+ int i;
+ for (i = 0; mode[i]; ++i)
+ if (mode[i] >= '0' && mode[i] <= '9') break;
+ if (mode[i]) compress_level = (int)mode[i] - '0';
+ if (strchr(mode, 'u')) compress_level = 0;
+ }
+ fp = open_write(fd, compress_level);
+ }
+ if (fp != NULL) fp->owned_file = 1;
+ return fp;
+}
+
+BGZF*
+bgzf_fdopen(int fd, const char * __restrict mode)
+{
+ if (fd == -1) return 0;
+ if (mode[0] == 'r' || mode[0] == 'R') {
+ return open_read(fd);
+ } else if (mode[0] == 'w' || mode[0] == 'W') {
+ int i, compress_level = -1;
+ for (i = 0; mode[i]; ++i)
+ if (mode[i] >= '0' && mode[i] <= '9') break;
+ if (mode[i]) compress_level = (int)mode[i] - '0';
+ if (strchr(mode, 'u')) compress_level = 0;
+ return open_write(fd, compress_level);
+ } else {
+ return NULL;
+ }
+}
+
+static
+int
+deflate_block(BGZF* fp, int block_length)
+{
+ // Deflate the block in fp->uncompressed_block into fp->compressed_block.
+ // Also adds an extra field that stores the compressed block length.
+
+ bgzf_byte_t* buffer = fp->compressed_block;
+ int buffer_size = fp->compressed_block_size;
+
+ // Init gzip header
+ buffer[0] = GZIP_ID1;
+ buffer[1] = GZIP_ID2;
+ buffer[2] = CM_DEFLATE;
+ buffer[3] = FLG_FEXTRA;
+ buffer[4] = 0; // mtime
+ buffer[5] = 0;
+ buffer[6] = 0;
+ buffer[7] = 0;
+ buffer[8] = 0;
+ buffer[9] = OS_UNKNOWN;
+ buffer[10] = BGZF_XLEN;
+ buffer[11] = 0;
+ buffer[12] = BGZF_ID1;
+ buffer[13] = BGZF_ID2;
+ buffer[14] = BGZF_LEN;
+ buffer[15] = 0;
+ buffer[16] = 0; // placeholder for block length
+ buffer[17] = 0;
+
+ // loop to retry for blocks that do not compress enough
+ int input_length = block_length;
+ int compressed_length = 0;
+ while (1) {
+ z_stream zs;
+ zs.zalloc = NULL;
+ zs.zfree = NULL;
+ zs.next_in = fp->uncompressed_block;
+ zs.avail_in = input_length;
+ zs.next_out = (void*)&buffer[BLOCK_HEADER_LENGTH];
+ zs.avail_out = buffer_size - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
+
+ int status = deflateInit2(&zs, fp->compress_level, Z_DEFLATED,
+ GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY);
+ if (status != Z_OK) {
+ report_error(fp, "deflate init failed");
+ return -1;
+ }
+ status = deflate(&zs, Z_FINISH);
+ if (status != Z_STREAM_END) {
+ deflateEnd(&zs);
+ if (status == Z_OK) {
+ // Not enough space in buffer.
+ // Can happen in the rare case the input doesn't compress enough.
+ // Reduce the amount of input until it fits.
+ input_length -= 1024;
+ if (input_length <= 0) {
+ // should never happen
+ report_error(fp, "input reduction failed");
+ return -1;
+ }
+ continue;
+ }
+ report_error(fp, "deflate failed");
+ return -1;
+ }
+ status = deflateEnd(&zs);
+ if (status != Z_OK) {
+ report_error(fp, "deflate end failed");
+ return -1;
+ }
+ compressed_length = zs.total_out;
+ compressed_length += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
+ if (compressed_length > MAX_BLOCK_SIZE) {
+ // should never happen
+ report_error(fp, "deflate overflow");
+ return -1;
+ }
+ break;
+ }
+
+ packInt16((uint8_t*)&buffer[16], compressed_length-1);
+ uint32_t crc = crc32(0L, NULL, 0L);
+ crc = crc32(crc, fp->uncompressed_block, input_length);
+ packInt32((uint8_t*)&buffer[compressed_length-8], crc);
+ packInt32((uint8_t*)&buffer[compressed_length-4], input_length);
+
+ int remaining = block_length - input_length;
+ if (remaining > 0) {
+ if (remaining > input_length) {
+ // should never happen (check so we can use memcpy)
+ report_error(fp, "remainder too large");
+ return -1;
+ }
+ memcpy(fp->uncompressed_block,
+ fp->uncompressed_block + input_length,
+ remaining);
+ }
+ fp->block_offset = remaining;
+ return compressed_length;
+}
+
+static
+int
+inflate_block(BGZF* fp, int block_length)
+{
+ // Inflate the block in fp->compressed_block into fp->uncompressed_block
+
+ z_stream zs;
+ int status;
+ zs.zalloc = NULL;
+ zs.zfree = NULL;
+ zs.next_in = fp->compressed_block + 18;
+ zs.avail_in = block_length - 16;
+ zs.next_out = fp->uncompressed_block;
+ zs.avail_out = fp->uncompressed_block_size;
+
+ status = inflateInit2(&zs, GZIP_WINDOW_BITS);
+ if (status != Z_OK) {
+ report_error(fp, "inflate init failed");
+ return -1;
+ }
+ status = inflate(&zs, Z_FINISH);
+ if (status != Z_STREAM_END) {
+ inflateEnd(&zs);
+ report_error(fp, "inflate failed");
+ return -1;
+ }
+ status = inflateEnd(&zs);
+ if (status != Z_OK) {
+ report_error(fp, "inflate failed");
+ return -1;
+ }
+ return zs.total_out;
+}
+
+static
+int
+check_header(const bgzf_byte_t* header)
+{
+ return (header[0] == GZIP_ID1 &&
+ header[1] == (bgzf_byte_t) GZIP_ID2 &&
+ header[2] == Z_DEFLATED &&
+ (header[3] & FLG_FEXTRA) != 0 &&
+ unpackInt16((uint8_t*)&header[10]) == BGZF_XLEN &&
+ header[12] == BGZF_ID1 &&
+ header[13] == BGZF_ID2 &&
+ unpackInt16((uint8_t*)&header[14]) == BGZF_LEN);
+}
+
+static void free_cache(BGZF *fp)
+{
+ khint_t k;
+ khash_t(cache) *h = (khash_t(cache)*)fp->cache;
+ if (fp->open_mode != 'r') return;
+ for (k = kh_begin(h); k < kh_end(h); ++k)
+ if (kh_exist(h, k)) free(kh_val(h, k).block);
+ kh_destroy(cache, h);
+}
+
+static int load_block_from_cache(BGZF *fp, int64_t block_address)
+{
+ khint_t k;
+ cache_t *p;
+ khash_t(cache) *h = (khash_t(cache)*)fp->cache;
+ k = kh_get(cache, h, block_address);
+ if (k == kh_end(h)) return 0;
+ p = &kh_val(h, k);
+ if (fp->block_length != 0) fp->block_offset = 0;
+ fp->block_address = block_address;
+ fp->block_length = p->size;
+ memcpy(fp->uncompressed_block, p->block, MAX_BLOCK_SIZE);
+#ifdef _USE_KNETFILE
+ knet_seek(fp->x.fpr, p->end_offset, SEEK_SET);
+#else
+ fseeko(fp->file, p->end_offset, SEEK_SET);
+#endif
+ return p->size;
+}
+
+static void cache_block(BGZF *fp, int size)
+{
+ int ret;
+ khint_t k;
+ cache_t *p;
+ khash_t(cache) *h = (khash_t(cache)*)fp->cache;
+ if (MAX_BLOCK_SIZE >= fp->cache_size) return;
+ if ((kh_size(h) + 1) * MAX_BLOCK_SIZE > fp->cache_size) {
+ /* A better way would be to remove the oldest block in the
+ * cache, but here we remove a random one for simplicity. This
+ * should not have a big impact on performance. */
+ for (k = kh_begin(h); k < kh_end(h); ++k)
+ if (kh_exist(h, k)) break;
+ if (k < kh_end(h)) {
+ free(kh_val(h, k).block);
+ kh_del(cache, h, k);
+ }
+ }
+ k = kh_put(cache, h, fp->block_address, &ret);
+ if (ret == 0) return; // if this happens, a bug!
+ p = &kh_val(h, k);
+ p->size = fp->block_length;
+ p->end_offset = fp->block_address + size;
+ p->block = malloc(MAX_BLOCK_SIZE);
+ memcpy(kh_val(h, k).block, fp->uncompressed_block, MAX_BLOCK_SIZE);
+}
+
+int
+bgzf_read_block(BGZF* fp)
+{
+ bgzf_byte_t header[BLOCK_HEADER_LENGTH];
+ int count, size = 0, block_length, remaining;
+#ifdef _USE_KNETFILE
+ int64_t block_address = knet_tell(fp->x.fpr);
+ if (load_block_from_cache(fp, block_address)) return 0;
+ count = knet_read(fp->x.fpr, header, sizeof(header));
+#else
+ int64_t block_address = ftello(fp->file);
+ if (load_block_from_cache(fp, block_address)) return 0;
+ count = fread(header, 1, sizeof(header), fp->file);
+#endif
+ if (count == 0) {
+ fp->block_length = 0;
+ return 0;
+ }
+ size = count;
+ if (count != sizeof(header)) {
+ report_error(fp, "read failed");
+ return -1;
+ }
+ if (!check_header(header)) {
+ report_error(fp, "invalid block header");
+ return -1;
+ }
+ block_length = unpackInt16((uint8_t*)&header[16]) + 1;
+ bgzf_byte_t* compressed_block = (bgzf_byte_t*) fp->compressed_block;
+ memcpy(compressed_block, header, BLOCK_HEADER_LENGTH);
+ remaining = block_length - BLOCK_HEADER_LENGTH;
+#ifdef _USE_KNETFILE
+ count = knet_read(fp->x.fpr, &compressed_block[BLOCK_HEADER_LENGTH], remaining);
+#else
+ count = fread(&compressed_block[BLOCK_HEADER_LENGTH], 1, remaining, fp->file);
+#endif
+ if (count != remaining) {
+ report_error(fp, "read failed");
+ return -1;
+ }
+ size += count;
+ count = inflate_block(fp, block_length);
+ if (count < 0) return -1;
+ if (fp->block_length != 0) {
+ // Do not reset offset if this read follows a seek.
+ fp->block_offset = 0;
+ }
+ fp->block_address = block_address;
+ fp->block_length = count;
+ cache_block(fp, size);
+ return 0;
+}
+
+int
+bgzf_read(BGZF* fp, void* data, int length)
+{
+ if (length <= 0) {
+ return 0;
+ }
+ if (fp->open_mode != 'r') {
+ report_error(fp, "file not open for reading");
+ return -1;
+ }
+
+ int bytes_read = 0;
+ bgzf_byte_t* output = data;
+ while (bytes_read < length) {
+ int copy_length, available = fp->block_length - fp->block_offset;
+ bgzf_byte_t *buffer;
+ if (available <= 0) {
+ if (bgzf_read_block(fp) != 0) {
+ return -1;
+ }
+ available = fp->block_length - fp->block_offset;
+ if (available <= 0) {
+ break;
+ }
+ }
+ copy_length = bgzf_min(length-bytes_read, available);
+ buffer = fp->uncompressed_block;
+ memcpy(output, buffer + fp->block_offset, copy_length);
+ fp->block_offset += copy_length;
+ output += copy_length;
+ bytes_read += copy_length;
+ }
+ if (fp->block_offset == fp->block_length) {
+#ifdef _USE_KNETFILE
+ fp->block_address = knet_tell(fp->x.fpr);
+#else
+ fp->block_address = ftello(fp->file);
+#endif
+ fp->block_offset = 0;
+ fp->block_length = 0;
+ }
+ return bytes_read;
+}
+
+int bgzf_flush(BGZF* fp)
+{
+ while (fp->block_offset > 0) {
+ int count, block_length;
+ block_length = deflate_block(fp, fp->block_offset);
+ if (block_length < 0) return -1;
+#ifdef _USE_KNETFILE
+ count = fwrite(fp->compressed_block, 1, block_length, fp->x.fpw);
+#else
+ count = fwrite(fp->compressed_block, 1, block_length, fp->file);
+#endif
+ if (count != block_length) {
+ report_error(fp, "write failed");
+ return -1;
+ }
+ fp->block_address += block_length;
+ }
+ return 0;
+}
+
+int bgzf_flush_try(BGZF *fp, int size)
+{
+ if (fp->block_offset + size > fp->uncompressed_block_size)
+ return bgzf_flush(fp);
+ return -1;
+}
+
+int bgzf_write(BGZF* fp, const void* data, int length)
+{
+ const bgzf_byte_t *input = data;
+ int block_length, bytes_written;
+ if (fp->open_mode != 'w') {
+ report_error(fp, "file not open for writing");
+ return -1;
+ }
+
+ if (fp->uncompressed_block == NULL)
+ fp->uncompressed_block = malloc(fp->uncompressed_block_size);
+
+ input = data;
+ block_length = fp->uncompressed_block_size;
+ bytes_written = 0;
+ while (bytes_written < length) {
+ int copy_length = bgzf_min(block_length - fp->block_offset, length - bytes_written);
+ bgzf_byte_t* buffer = fp->uncompressed_block;
+ memcpy(buffer + fp->block_offset, input, copy_length);
+ fp->block_offset += copy_length;
+ input += copy_length;
+ bytes_written += copy_length;
+ if (fp->block_offset == block_length) {
+ if (bgzf_flush(fp) != 0) {
+ break;
+ }
+ }
+ }
+ return bytes_written;
+}
+
+int bgzf_close(BGZF* fp)
+{
+ if (fp->open_mode == 'w') {
+ if (bgzf_flush(fp) != 0) return -1;
+ { // add an empty block
+ int count, block_length = deflate_block(fp, 0);
+#ifdef _USE_KNETFILE
+ count = fwrite(fp->compressed_block, 1, block_length, fp->x.fpw);
+#else
+ count = fwrite(fp->compressed_block, 1, block_length, fp->file);
+#endif
+ }
+#ifdef _USE_KNETFILE
+ if (fflush(fp->x.fpw) != 0) {
+#else
+ if (fflush(fp->file) != 0) {
+#endif
+ report_error(fp, "flush failed");
+ return -1;
+ }
+ }
+ if (fp->owned_file) {
+#ifdef _USE_KNETFILE
+ int ret;
+ if (fp->open_mode == 'w') ret = fclose(fp->x.fpw);
+ else ret = knet_close(fp->x.fpr);
+ if (ret != 0) return -1;
+#else
+ if (fclose(fp->file) != 0) return -1;
+#endif
+ }
+ free(fp->uncompressed_block);
+ free(fp->compressed_block);
+ free_cache(fp);
+ free(fp);
+ return 0;
+}
+
+void bgzf_set_cache_size(BGZF *fp, int cache_size)
+{
+ if (fp) fp->cache_size = cache_size;
+}
+
+int bgzf_check_EOF(BGZF *fp)
+{
+ static uint8_t magic[28] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0";
+ uint8_t buf[28];
+ off_t offset;
+#ifdef _USE_KNETFILE
+ offset = knet_tell(fp->x.fpr);
+ if (knet_seek(fp->x.fpr, -28, SEEK_END) != 0) return -1;
+ knet_read(fp->x.fpr, buf, 28);
+ knet_seek(fp->x.fpr, offset, SEEK_SET);
+#else
+ offset = ftello(fp->file);
+ if (fseeko(fp->file, -28, SEEK_END) != 0) return -1;
+ fread(buf, 1, 28, fp->file);
+ fseeko(fp->file, offset, SEEK_SET);
+#endif
+ return (memcmp(magic, buf, 28) == 0)? 1 : 0;
+}
+
+int64_t bgzf_seek(BGZF* fp, int64_t pos, int where)
+{
+ int block_offset;
+ int64_t block_address;
+
+ if (fp->open_mode != 'r') {
+ report_error(fp, "file not open for read");
+ return -1;
+ }
+ if (where != SEEK_SET) {
+ report_error(fp, "unimplemented seek option");
+ return -1;
+ }
+ block_offset = pos & 0xFFFF;
+ block_address = (pos >> 16) & 0xFFFFFFFFFFFFLL;
+#ifdef _USE_KNETFILE
+ if (knet_seek(fp->x.fpr, block_address, SEEK_SET) != 0) {
+#else
+ if (fseeko(fp->file, block_address, SEEK_SET) != 0) {
+#endif
+ report_error(fp, "seek failed");
+ return -1;
+ }
+ fp->block_length = 0; // indicates current block is not loaded
+ fp->block_address = block_address;
+ fp->block_offset = block_offset;
+ return 0;
+}
157 bgzf.h
@@ -0,0 +1,157 @@
+/* The MIT License
+
+ Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+*/
+
+#ifndef __BGZF_H
+#define __BGZF_H
+
+#include <stdint.h>
+#include <stdio.h>
+#include <zlib.h>
+#ifdef _USE_KNETFILE
+#include "knetfile.h"
+#endif
+
+//typedef int8_t bool;
+
+typedef struct {
+ int file_descriptor;
+ char open_mode; // 'r' or 'w'
+ int16_t owned_file, compress_level;
+#ifdef _USE_KNETFILE
+ union {
+ knetFile *fpr;
+ FILE *fpw;
+ } x;
+#else
+ FILE* file;
+#endif
+ int uncompressed_block_size;
+ int compressed_block_size;
+ void* uncompressed_block;
+ void* compressed_block;
+ int64_t block_address;
+ int block_length;
+ int block_offset;
+ int cache_size;
+ const char* error;
+ void *cache; // a pointer to a hash table
+} BGZF;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Open an existing file descriptor for reading or writing.
+ * Mode must be either "r" or "w".
+ * A subsequent bgzf_close will not close the file descriptor.
+ * Returns null on error.
+ */
+BGZF* bgzf_fdopen(int fd, const char* __restrict mode);
+
+/*
+ * Open the specified file for reading or writing.
+ * Mode must be either "r" or "w".
+ * Returns null on error.
+ */
+BGZF* bgzf_open(const char* path, const char* __restrict mode);
+
+/*
+ * Close the BGZ file and free all associated resources.
+ * Does not close the underlying file descriptor if created with bgzf_fdopen.
+ * Returns zero on success, -1 on error.
+ */
+int bgzf_close(BGZF* fp);
+
+/*
+ * Read up to length bytes from the file storing into data.
+ * Returns the number of bytes actually read.
+ * Returns zero on end of file.
+ * Returns -1 on error.
+ */
+int bgzf_read(BGZF* fp, void* data, int length);
+
+/*
+ * Write length bytes from data to the file.
+ * Returns the number of bytes written.
+ * Returns -1 on error.
+ */
+int bgzf_write(BGZF* fp, const void* data, int length);
+
+/*
+ * Return a virtual file pointer to the current location in the file.
+ * No interpetation of the value should be made, other than a subsequent
+ * call to bgzf_seek can be used to position the file at the same point.
+ * Return value is non-negative on success.
+ * Returns -1 on error.
+ */
+#define bgzf_tell(fp) ((fp->block_address << 16) | (fp->block_offset & 0xFFFF))
+
+/*
+ * Set the file to read from the location specified by pos, which must
+ * be a value previously returned by bgzf_tell for this file (but not
+ * necessarily one returned by this file handle).
+ * The where argument must be SEEK_SET.
+ * Seeking on a file opened for write is not supported.
+ * Returns zero on success, -1 on error.
+ */
+int64_t bgzf_seek(BGZF* fp, int64_t pos, int where);
+
+/*
+ * Set the cache size. Zero to disable. By default, caching is
+ * disabled. The recommended cache size for frequent random access is
+ * about 8M bytes.
+ */
+void bgzf_set_cache_size(BGZF *fp, int cache_size);
+
+int bgzf_check_EOF(BGZF *fp);
+int bgzf_read_block(BGZF* fp);
+int bgzf_flush(BGZF* fp);
+int bgzf_flush_try(BGZF *fp, int size);
+int bgzf_check_bgzf(const char *fn);
+
+#ifdef __cplusplus
+}
+#endif
+
+static inline int bgzf_getc(BGZF *fp)
+{
+ int c;
+ if (fp->block_offset >= fp->block_length) {
+ if (bgzf_read_block(fp) != 0) return -2; /* error */
+ if (fp->block_length == 0) return -1; /* end-of-file */
+ }
+ c = ((unsigned char*)fp->uncompressed_block)[fp->block_offset++];
+ if (fp->block_offset == fp->block_length) {
+#ifdef _USE_KNETFILE
+ fp->block_address = knet_tell(fp->x.fpr);
+#else
+ fp->block_address = ftello(fp->file);
+#endif
+ fp->block_offset = 0;
+ fp->block_length = 0;
+ }
+ return c;
+}
+
+#endif
206 bgzip.c
@@ -0,0 +1,206 @@
+/* The MIT License
+
+ Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+*/
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <errno.h>
+#include <sys/select.h>
+#include <sys/stat.h>
+#include "bgzf.h"
+
+static const int WINDOW_SIZE = 64 * 1024;
+
+static int bgzip_main_usage()
+{
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Usage: bgzip [options] [file] ...\n\n");
+ fprintf(stderr, "Options: -c write on standard output, keep original files unchanged\n");
+ fprintf(stderr, " -d decompress\n");
+ fprintf(stderr, " -f overwrite files without asking\n");
+ fprintf(stderr, " -b INT decompress at virtual file pointer INT\n");
+ fprintf(stderr, " -s INT decompress INT bytes in the uncompressed file\n");
+ fprintf(stderr, " -h give this help\n");
+ fprintf(stderr, "\n");
+ return 1;
+}
+
+static int write_open(const char *fn, int is_forced)
+{
+ int fd = -1;
+ char c;
+ if (!is_forced) {
+ if ((fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL, 0666)) < 0 && errno == EEXIST) {
+ fprintf(stderr, "[bgzip] %s already exists; do you wish to overwrite (y or n)? ", fn);
+ scanf("%c", &c);
+ if (c != 'Y' && c != 'y') {
+ fprintf(stderr, "[bgzip] not overwritten\n");
+ exit(1);
+ }
+ }
+ }
+ if (fd < 0) {
+ if ((fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC, 0666)) < 0) {
+ fprintf(stderr, "[bgzip] %s: Fail to write\n", fn);
+ exit(1);
+ }
+ }
+ return fd;
+}
+
+static void fail(BGZF* fp)
+{
+ fprintf(stderr, "Error: %s\n", fp->error);
+ exit(1);
+}
+
+int main(int argc, char **argv)
+{
+ int c, compress, pstdout, is_forced;
+ BGZF *fp;
+ void *buffer;
+ long start, end, size;
+
+ compress = 1; pstdout = 0; start = 0; size = -1; end = -1; is_forced = 0;
+ while((c = getopt(argc, argv, "cdhfb:s:")) >= 0){
+ switch(c){
+ case 'h': return bgzip_main_usage();
+ case 'd': compress = 0; break;
+ case 'c': pstdout = 1; break;
+ case 'b': start = atol(optarg); break;
+ case 's': size = atol(optarg); break;
+ case 'f': is_forced = 1; break;
+ }
+ }
+ if (size >= 0) end = start + size;
+ if (end >= 0 && end < start) {
+ fprintf(stderr, "[bgzip] Illegal region: [%ld, %ld]\n", start, end);
+ return 1;
+ }
+ if (compress == 1) {
+ struct stat sbuf;
+ int f_src = fileno(stdin);
+ int f_dst = fileno(stdout);
+
+ if ( argc>optind )
+ {
+ if ( stat(argv[optind],&sbuf)<0 )
+ {
+ fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]);
+ return 1;
+ }
+
+ if ((f_src = open(argv[optind], O_RDONLY)) < 0) {
+ fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]);
+ return 1;
+ }
+
+ if (pstdout)
+ f_dst = fileno(stdout);
+ else
+ {
+ char *name = malloc(strlen(argv[optind]) + 5);
+ strcpy(name, argv[optind]);
+ strcat(name, ".gz");
+ f_dst = write_open(name, is_forced);
+ if (f_dst < 0) return 1;
+ free(name);
+ }
+ }
+ else if (!pstdout && isatty(fileno((FILE *)stdout)) )
+ return bgzip_main_usage();
+
+ fp = bgzf_fdopen(f_dst, "w");
+ buffer = malloc(WINDOW_SIZE);
+ while ((c = read(f_src, buffer, WINDOW_SIZE)) > 0)
+ if (bgzf_write(fp, buffer, c) < 0) fail(fp);
+ // f_dst will be closed here
+ if (bgzf_close(fp) < 0) fail(fp);
+ if (argc > optind && !pstdout) unlink(argv[optind]);
+ free(buffer);
+ close(f_src);
+ return 0;
+ } else {
+ struct stat sbuf;
+ int f_dst;
+
+ if ( argc>optind )
+ {
+ if ( stat(argv[optind],&sbuf)<0 )
+ {
+ fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]);
+ return 1;
+ }
+ char *name;
+ int len = strlen(argv[optind]);
+ if ( strcmp(argv[optind]+len-3,".gz") )
+ {
+ fprintf(stderr, "[bgzip] %s: unknown suffix -- ignored\n", argv[optind]);
+ return 1;
+ }
+ fp = bgzf_open(argv[optind], "r");
+ if (fp == NULL) {
+ fprintf(stderr, "[bgzip] Could not open file: %s\n", argv[optind]);
+ return 1;
+ }
+
+ if (pstdout) {
+ f_dst = fileno(stdout);
+ }
+ else {
+ name = strdup(argv[optind]);
+ name[strlen(name) - 3] = '\0';
+ f_dst = write_open(name, is_forced);
+ free(name);
+ }
+ }
+ else if (!pstdout && isatty(fileno((FILE *)stdin)) )
+ return bgzip_main_usage();
+ else
+ {
+ f_dst = fileno(stdout);
+ fp = bgzf_fdopen(fileno(stdin), "r");
+ if (fp == NULL) {
+ fprintf(stderr, "[bgzip] Could not read from stdin: %s\n", strerror(errno));
+ return 1;
+ }
+ }
+ buffer = malloc(WINDOW_SIZE);
+ if (bgzf_seek(fp, start, SEEK_SET) < 0) fail(fp);
+ while (1) {
+ if (end < 0) c = bgzf_read(fp, buffer, WINDOW_SIZE);
+ else c = bgzf_read(fp, buffer, (end - start > WINDOW_SIZE)? WINDOW_SIZE:(end - start));
+ if (c == 0) break;
+ if (c < 0) fail(fp);
+ start += c;
+ write(f_dst, buffer, c);
+ if (end >= 0 && start >= end) break;
+ }
+ free(buffer);
+ if (bgzf_close(fp) < 0) fail(fp);
+ if (!pstdout) unlink(argv[optind]);
+ return 0;
+ }
+}
BIN example.gtf.gz
Binary file not shown.
BIN example.gtf.gz.tbi
Binary file not shown.
998 index.c
@@ -0,0 +1,998 @@
+#include <ctype.h>
+#include <assert.h>
+#include <sys/stat.h>
+#include "khash.h"
+#include "ksort.h"
+#include "kstring.h"
+#include "bam_endian.h"
+#ifdef _USE_KNETFILE
+#include "knetfile.h"
+#endif
+#include "tabix.h"
+
+#define TAD_MIN_CHUNK_GAP 32768
+// 1<<14 is the size of minimum bin.
+#define TAD_LIDX_SHIFT 14
+
+typedef struct {
+ uint64_t u, v;
+} pair64_t;
+
+#define pair64_lt(a,b) ((a).u < (b).u)
+KSORT_INIT(offt, pair64_t, pair64_lt)
+