Skip to content

Commit

Permalink
15.0.58
Browse files Browse the repository at this point in the history
  • Loading branch information
Divon Lan committed May 24, 2024
1 parent 7e858b9 commit 28e772e
Show file tree
Hide file tree
Showing 125 changed files with 2,849 additions and 1,018 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ launch.json
*.back
.conda-timestamp
.archive.tar.gz
autocomplete.sh
*.o
!src/objdir.*/secure/license.o
!src/objdir.mac/secure/license.old_mac.o
Expand Down
8 changes: 4 additions & 4 deletions .vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,14 @@
// "program": "${workspaceFolder}/src/genozip-opt.exe",
// "program": "${workspaceFolder}/src/genocat-opt.exe",
// "program": "${workspaceFolder}/src/genozip.exe",
//"program": "${workspaceFolder}/src/genounzip.exe",
// "program": "${workspaceFolder}/src/genounzip.exe",
// "program": "${workspaceFolder}/src/genols-debug.exe",

"args" : ["--echo", "-fo", "./private/test/tmp/copy.vcf.gz", "-e", "./public/hs37d5.v15.ref.genozip", "./private/test/tmp/output.genozip"],

"args" : ["-ft", "private/test/test.starling.vcf"],

"environment": [
{ "name": "GENOZIP_TEST", "value": "1", }, // needed for VER2 macro to work
// { "name": "GENOZIP_REFERENCE", "value": "c:\\Users\\divon\\genozip/public", },
{ "name": "GENOZIP_REFERENCE", "value": "c:\\Users\\divon\\genozip/public", },
],

"stopAtEntry": false,
Expand Down
2 changes: 1 addition & 1 deletion LICENSE.txt
Original file line number Diff line number Diff line change
Expand Up @@ -158,5 +158,5 @@ ABOVE STATED REMEDY FAILS OF ITS ESSENTIAL PURPOSE.

END OF TERMS AND CONDITIONS

Genozip license version: 15.0.57
Genozip license version: 15.0.58

14 changes: 13 additions & 1 deletion RELEASE_NOTES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,19 @@ Note on versioning:
- Minor version changes with bug fixes and minor feature updates
- Some minor versions are skipped due to failed deployment pipelines

15.0.57
15.0.58 24/5/2024
- FASTA: New method (FAF) for better compressing FASTA files that contain sequencer reads (as opposed to assembled contigs), including support for --reference
- FASTA: genozip option --index is now required in order to enable genocat --grep and --regions on files with more than 10,000 contigs (files with fewer contigs are indexed by default)
- FASTQ/FASTA: better compression of interleaved data
- SAM/BAM: better compression of files containing OQ:Z
- VCF: better compression of files containing Mobile Elements data, and files generated by Illumina Starling
- bash: autocomplete of files and command line options in bash. .bashrc is modified during registration to activate the autocompletion. For existing users, you may manually add the following line to your .bashrc :
source <...genozip-directory...>/autocomplete.sh ; complete -F _genozip genozip; complete -F _genounzip genounzip ; complete -F _genocat genocat ; complete -F _genols genols
- feature matrix changes: --optimize is now available only in Genozip Enterprise, Premium and Academic (not Genozip Standard) (no change for current Genozip Standard customers)
- some command line options renamed: --show-reference, --show-ref-contigs, --show-ref-iupacs and --show-filename have been renamed to --print-reference, --print-ref-contigs, --print-ref-iupacs and --print-filename respectively. The old names still work too, for backward compatibility.
- new diagnostic options: --no-FAF and --no-interleaved to disable the FAF and Interleaved methods respectively ; --debug-upgrade

15.0.57 21/4/2024
- genounzip/genocat: GENOZIP_REFERENCE environment variable can now accept either a file name
(of the reference file) OR the directory where it is located (the latter is new).
- Support for Mac with Apple Silicon (requires MacOS version 13 or higher)
Expand Down
5 changes: 4 additions & 1 deletion genozip.code-workspace
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,10 @@
"license.h": "c",
"windows.h": "c",
"cstdlib": "c",
"reconstruct.h": "c"
"reconstruct.h": "c",
"seg.h": "c",
"compare": "c",
"context.h": "c"
}
}
}
2 changes: 1 addition & 1 deletion installers/LICENSE.html
Original file line number Diff line number Diff line change
Expand Up @@ -34,4 +34,4 @@
10. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides Genozip on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Genozip and assume any risks associated with Your exercise of permissions under this License.<br><br>
11. LIMITATION OF LIABILITY. TO THE FULLEST EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, STRICT LIABILITY OR OTHER LEGAL OR EQUITABLE THEORY, SHALL LICENSOR OR DEVELOPER BE LIABLE FOR DAMAGES, INCLUDING ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES OF ANY CHARACTER ARISING AS A RESULT OF THIS LICENSE OR OUT OF THE USE OR INABILITY TO USE GENOZIP (INCLUDING BUT NOT LIMITED TO DAMAGES FOR LOSS OF GOODWILL, WORK STOPPAGE, COMPUTER FAILURE OR MALFUNCTION, FILE CORRUPTION, DATA LOSS, OR ANY AND ALL OTHER COMMERCIAL DAMAGES OR LOSSES), EVEN IF LICENSOR OR DEVELOPER HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. IN NO EVENT WILL LICENSOR'S OR DEVELOPER'S TOTAL LIABILITY TO LICENSEE FOR ALL DAMAGES (OTHER THAN AS MAY BE REQUIRED BY APPLICABLE LAW IN CASES INVOLVING PERSONAL INJURY) EXCEED THE AMOUNT OF $500 USD. THE FOREGOING LIMITATIONS WILL APPLY EVEN IF THE ABOVE STATED REMEDY FAILS OF ITS ESSENTIAL PURPOSE.<br><br>
END OF TERMS AND CONDITIONS<br><br>
Genozip license version: 15.0.57<br><br>
Genozip license version: 15.0.58<br><br>
Binary file modified installers/genozip-installer.exe
Binary file not shown.
Binary file modified installers/genozip-linux-x86_64.tar
Binary file not shown.
Binary file modified installers/genozip-osx-arm.tar
Binary file not shown.
Binary file modified installers/genozip-osx-x86.tar
Binary file not shown.
28 changes: 19 additions & 9 deletions src/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -142,13 +142,13 @@ SRC_DIRS = secure zlib bzlib lzma bsc libdeflate_1.7 libdeflate_1.19 libdeflate_

MY_SRCS = genozip.c genols.c context.c container.c strings.c stats.c arch.c tip.c seg_id.c zip_dyn_int.c \
data_types.c bits.c progress.c writer.c zriter.c tar.c chrom.c qname.c tokenizer.c mutex.c threads.c \
zip.c piz.c reconstruct.c recon_history.c recon_peek.c seg.c zfile.c aligner.c flags.c digest.c \
reference.c contigs.c ref_lock.c refhash.c ref_make.c ref_contigs.c ref_iupacs.c ref_cache.c \
zip.c piz.c reconstruct.c recon_history.c recon_peek.c seg.c zfile.c aligner.c flags.c specials.c \
reference.c contigs.c ref_lock.c refhash.c ref_make.c ref_contigs.c ref_iupacs.c ref_cache.c digest.c \
vcf_piz.c vcf_seg.c vcf_vblock.c vcf_header.c vcf_info.c vcf_samples.c vcf_hgvs.c \
vcf_format_GT.c vcf_format_PS_PID.c vcf_dbsnp.c vcf_giab.c vcf_vep.c vcf_qual.c \
vcf_format_GT.c vcf_format_PS_PID.c vcf_dbsnp.c vcf_giab.c vcf_vep.c vcf_qual.c vcf_1000G.c \
vcf_refalt.c vcf_format.c vcf_illum_gtyping.c vcf_gwas.c vcf_vagrent.c vcf_svaba.c vcf_pbsv.c \
vcf_icgc.c vcf_snpeff.c vcf_cosmic.c vcf_mastermind.c vcf_isaac.c vcf_manta.c vcf_pos.c vcf_ultima.c \
vcf_platypus.c vcf_info_AC_AF_AN.c vcf_format_GQ.c vcf_gatk.c vcf_sv.c vcf_gnomad.c \
vcf_platypus.c vcf_info_AC_AF_AN.c vcf_format_GQ.c vcf_gatk.c vcf_sv.c vcf_gnomad.c vcf_me.c \
sam_seg.c sam_piz.c sam_shared.c sam_header.c sam_md.c sam_nm.c sam_tlen.c sam_cigar.c sam_fields.c \
sam_sa.c bam_seg.c bam_seq.c bam_show.c sam_pacbio.c sam_ultima.c sam_xcons.c cram.c agilent.c \
sam_seq.c sam_qual.c sam_sag_zip.c sam_sag_piz.c sam_sag_load.c sam_sag_ingest.c sam_sag_scan.c \
Expand All @@ -160,7 +160,7 @@ MY_SRCS = genozip.c genols.c context.c container.c strings.c stats.c arch.c tip.
buffer.c buf_struct.c buf_list.c random_access.c sections.c base64.c bgzf.c coverage.c txtheader.c \
codec.c codec_bz2.c codec_lzma.c codec_acgt.c codec_domq.c codec_bsc.c codec_pacb.c \
codec_pbwt.c codec_none.c codec_htscodecs.c codec_longr.c codec_normq.c codec_homp.c codec_t0.c \
codec_smux.c \
codec_smux.c codec_oq.c \
txtfile.c profiler.c file.c filename.c dispatcher.c crypt.c aes.c md5.c segconf.c biopsy.c gencomp.c \
vblock.c regions.c optimize.c dict_id.c aliases.c hash.c stream.c url.c bases_filter.c dict_io.c \
recon_plan_io.c version.c huffman.c user_message.c b250.c qname_filter.c
Expand Down Expand Up @@ -335,9 +335,9 @@ DEBUG_OBJS := $(addprefix $(OBJDIR)/,$(C_SRCS:.c=.debug-o)) $(addprefix $(OBJDIR
OPT_OBJS := $(addprefix $(OBJDIR)/,$(C_SRCS:.c=.opt-o)) $(addprefix $(OBJDIR)/,$(CXX_SRCS:.cpp=.opt-o)) $(IGZIP_OBJS) $(BSC_OPT_OBJS) # optimized but with debug info, for debugging issues that only manifest with compiler optimization
DEPS := $(addprefix $(OBJDIR)/,$(C_SRCS:.c=.d)) $(addprefix $(OBJDIR)/,$(CXX_SRCS:.cpp=.d)) $(IGZIP_DEPS)

EXECUTABLES = genozip$(EXE) genounzip$(EXE) genocat$(EXE) genols$(EXE)
DEBUG_EXECUTABLES = genozip-debug$(EXE) genounzip-debug$(EXE) genocat-debug$(EXE) genols-debug$(EXE)
OPT_EXECUTABLES = genozip-opt$(EXE) genounzip-opt$(EXE) genocat-opt$(EXE) genols-opt$(EXE)
EXECUTABLES = genozip$(EXE) genounzip$(EXE) genocat$(EXE) genols$(EXE) autocomplete.sh
DEBUG_EXECUTABLES = genozip-debug$(EXE) genounzip-debug$(EXE) genocat-debug$(EXE) genols-debug$(EXE) autocomplete-debug.sh
OPT_EXECUTABLES = genozip-opt$(EXE) genounzip-opt$(EXE) genocat-opt$(EXE) genols-opt$(EXE) autocomplete-debug.sh

all : CFLAGS += $(OPTFLAGS)
all : CXXFLAGS += $(OPTFLAGS)
Expand All @@ -356,6 +356,16 @@ opt : CFLAGS += -ggdb3 $(OPTFLAGS) -DDEBUG
opt : CXXFLAGS += $(OPTFLAGS)
opt : $(OBJDIR) $(OPT_EXECUTABLES)

# Generates the bash autocompletion script by running autocomplete_gen.sh (re-run when the
# generator or flags.c change), then publishes it one directory up.
# A hard link is attempted first; if that fails, a symbolic link is created instead.
# The symlink target must be expressed relative to the directory that holds the link (..),
# hence the "<this-directory-name>/" prefix: a bare "$@" target would make ../$@ point to itself.
autocomplete.sh : autocomplete_gen.sh flags.c
	@echo "Generating $@"
	@./autocomplete_gen.sh
	@if ! ln -f $@ ../$@; then ln -fs "$(notdir $(CURDIR))/$@" ../$@; fi

# Generates the debug flavor of the bash autocompletion script by running
# "autocomplete_gen.sh debug" (re-run when the generator or flags.c change), then publishes it
# one directory up.
# A hard link is attempted first; if that fails, a symbolic link is created instead.
# The symlink target must be expressed relative to the directory that holds the link (..),
# hence the "<this-directory-name>/" prefix: a bare "$@" target would make ../$@ point to itself.
autocomplete-debug.sh : autocomplete_gen.sh flags.c
	@echo "Generating $@"
	@./autocomplete_gen.sh debug
	@if ! ln -f $@ ../$@; then ln -fs "$(notdir $(CURDIR))/$@" ../$@; fi

ifdef osx_x86
objdir.mac/secure/license.o : CFLAGS := $(patsubst -march=% ,-march=x86-64 ,$(CFLAGS)) # plain vanilla x86-64, so it runs on Rosetta x86-64 simulator on M1/M2/M3
endif
Expand Down Expand Up @@ -441,7 +451,7 @@ GENDICT_OBJS := $(addprefix $(OBJDIR)/,$(GENDICT_SRCS:.c=.o))
# Step 2: dict_id_gen.sh compiles dict_id_gen.c and generate dict_id_gen[.exe]
# Step 3. dict_id_gen.sh generates dict_id_gen.h: it uses dict_id_gen[.exe]to generate the field constant, and then adds the fields enum and mapping
ifdef Windows
dict_id_gen.h : $(shell grep -w "pragma GENDICT" *.h | cut -d: -f1 | uniq) dict_id_gen.sh
dict_id_gen.h : $(shell egrep -w "GENDICT|SPECIAL" *.h | cut -d: -f1 | uniq) dict_id_gen.sh
@echo Generating $@
@./dict_id_gen.sh $(CC)

Expand Down
56 changes: 37 additions & 19 deletions src/aligner.c
Original file line number Diff line number Diff line change
Expand Up @@ -253,33 +253,40 @@ MappingType aligner_seg_seq (VBlockP vb, STRp(seq), bool is_pair_2, PosType64 pa
ref_get_genome (gref, &genome, &emoneg, &genome_nbases);

bool is_forward=false, is_all_ref=false;
bool is_interleaved_r2 = (segconf.is_interleaved && vb->line_i % 2);

// our aligner algorithm only works for short reads - long reads tend to have many Indel differences (mostly errors) vs the reference
PosType64 gpos = (seq_len <= MAX_SHORT_READ_LEN)
? aligner_best_match (VB, STRa(seq), pair_gpos, genome, emoneg, genome_nbases, &is_forward, &is_all_ref) : NO_GPOS;

if (gpos == NO_GPOS || gpos > genome_nbases - seq_len || gpos > MAX_ALIGNER_GPOS - seq_len || gpos < 0/*never happens*/) {
gpos_ctx->last_value.i = NO_GPOS;

COPY_TIMER (aligner_seg_seq);
return MAPPING_NO_MAPPING;
}

if (flag.show_aligner)
iprintf ("%s: gpos=%"PRId64" forward=%s\n", LN_NAME, gpos, TF(is_forward));
iprintf ("%s: gpos=%"PRId64" forward=%s perfect=%s\n", LN_NAME, gpos, TF(is_forward), TF(is_all_ref));

if ((strand_ctx->local.size & ~3ULL) * 8 == strand_ctx->local.nbits)
buf_alloc_do (vb, &strand_ctx->local, strand_ctx->local.size + sizeof(uint64_t), CTX_GROWTH, NULL, __FUNCLINE);

buf_alloc (vb, &gpos_ctx->local, 1, 0, uint32_t, CTX_GROWTH, NULL);
ContextP this_strand_ctx = is_interleaved_r2 ? strand_r2_ctx : strand_ctx;

buf_add_bit (&strand_ctx->local, pair_gpos == NO_GPOS ? is_forward // pair 1 is unaligned - just store the strand
: (is_forward == pair_is_forward)); // pair 1 is aligned - store equality, expected to he 1 in most cases

if (is_pair_2)
fastq_seg_pair2_gpos (vb, pair_gpos, gpos);
if ((this_strand_ctx->local.size & ~3ULL) * 8 == this_strand_ctx->local.nbits)
buf_alloc_do (vb, &this_strand_ctx->local, this_strand_ctx->local.size + sizeof(uint64_t), CTX_GROWTH, NULL, __FUNCLINE);

buf_add_bit (&this_strand_ctx->local, pair_gpos == NO_GPOS ? is_forward // pair 1 is unaligned - just store the strand
: (is_forward == pair_is_forward)); // pair 1 is aligned - store equality, expected to he 1 in most cases

if (is_pair_2 || is_interleaved_r2)
fastq_seg_r2_gpos (vb, pair_gpos, gpos);

// store the GPOS in local if its not a 2nd pair, or if it is, but the delta is not small enough
else
BNXT32 (gpos_ctx->local) = (uint32_t)gpos;
else
seg_integer_fixed (VB, gpos_ctx, &gpos, false, 0);

// note regarding interleaved: we always store last_value in gpos/strand (never in gpos_r2/strand_r2)
gpos_ctx->last_value.i = gpos;
strand_ctx->last_value.i = is_forward;

// lock region of reference to protect is_set
if (IS_REF_EXT_STORE) {
Expand Down Expand Up @@ -350,33 +357,44 @@ void aligner_reconstruct_seq (VBlockP vb, uint32_t seq_len, bool is_pair_2, bool
{
START_TIMER;
declare_seq_contexts;
bool is_interleaved_r2 = (segconf.is_interleaved && vb->line_i % 2);

if (!bitmap_ctx->is_loaded) return; // if case we need to skip the SEQ field (for the entire file)

if (is_perfect_alignment || buf_is_alloc (&bitmap_ctx->local)) { // not all non-ref

bool is_forward;
PosType64 gpos;

// first file of a pair (R1) or a non-pair fastq or sam
if (!is_pair_2) {
if (!is_pair_2 && !is_interleaved_r2) {
gpos = gpos_ctx->last_value.i = NEXTLOCAL (uint32_t, gpos_ctx);
is_forward = NEXTLOCALBIT (strand_ctx);
is_forward = NEXTLOCALBIT (strand_ctx);

if (segconf.is_interleaved) bitmap_ctx->r1_is_aligned = PAIR1_ALIGNED;
}

// 2nd file of a pair (R2)
else {
is_forward = fastq_piz_get_pair2_is_forward (vb); // MUST be called before gpos reconstruction as it inquires GPOS.localR1.next
else if (is_pair_2) {
is_forward = fastq_piz_get_r2_is_forward (vb); // MUST be called before gpos reconstruction as it inquires GPOS.localR1.next

// gpos: don't reconstruct just get last_value
reconstruct_from_ctx (vb, gpos_ctx->did_i, 0, false); // calls fastq_special_PAIR2_GPOS
gpos = gpos_ctx->last_value.i;
}

if (flag.deep) ctx_set_last_value (vb, strand_ctx, (int64_t)is_forward); // consumed by sam_piz_deep_add_seq
// R2 in a single-file interleaved FASTQ
else { // is_interleaved_r2
is_forward = fastq_piz_get_interleaved_r2_is_forward (vb);

reconstruct_from_ctx (vb, gpos_r2_ctx->did_i, 0, false); // calls fastq_special_PAIR2_GPOS
gpos = gpos_r2_ctx->last_value.i;
}

ctx_set_last_value (vb, strand_ctx, (int64_t)is_forward); // consumed by sam_piz_deep_add_seq and next line's fastq_piz_get_interleaved_r2_is_forward

if (flag.show_aligner)
iprintf ("%s: gpos=%"PRId64" forward=%s\n", LN_NAME, gpos, TF(is_forward));
iprintf ("%s: gpos=%"PRId64" forward=%s perfect=%s\n", LN_NAME, gpos, TF(is_forward), TF(is_perfect_alignment));

const Bits *genome=NULL;
PosType64 genome_nbases;
Expand Down
16 changes: 9 additions & 7 deletions src/aligner.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,15 @@
#include "genozip.h"

#define declare_seq_contexts ContextP __attribute__((unused)) \
bitmap_ctx = CTX(SAM_SQBITMAP), \
nonref_ctx = CTX(SAM_NONREF), \
gpos_ctx = CTX(SAM_GPOS), \
gpos_d_ctx = CTX(SAM_GPOS_DELTA), \
strand_ctx = CTX(SAM_STRAND), \
seqmis_ctx = CTX(SAM_SEQMIS_A), \
seqins_ctx = CTX(SAM_SEQINS_A)
bitmap_ctx = CTX(SAM_SQBITMAP), \
nonref_ctx = CTX(SAM_NONREF), \
gpos_ctx = CTX(SAM_GPOS), \
gpos_d_ctx = CTX(SAM_GPOS_DELTA), \
gpos_r2_ctx = CTX(SAM_GPOS_R2), \
strand_ctx = CTX(SAM_STRAND), \
strand_r2_ctx = CTX(SAM_STRAND_R2), \
seqmis_ctx = CTX(SAM_SEQMIS_A), \
seqins_ctx = CTX(SAM_SEQINS_A)


typedef enum { MAPPING_NO_MAPPING, MAPPING_ALIGNED, MAPPING_PERFECT } MappingType;
Expand Down
36 changes: 36 additions & 0 deletions src/arch.c
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,7 @@ StrText arch_get_filesystem_type (void)
NAME (0x19830326, "FhGFS"); // https://www.beegfs.io/docs/SC13_FHGFS_Presentation.pdf
NAME (0x53464846, "wslfs"); // WSL1: https://github.com/MicrosoftDocs/WSL/issues/465
NAME (0x1021994, "tmpfs"); // Heap Backing Filesystem
NAME (0x2011bab0, "exFAT"); // Filesystem for flash memory: https://en.wikipedia.org/wiki/ExFAT
default: snprintf (s.s, sizeof (s.s), "0x%lx", fs.f_type);
}

Expand Down Expand Up @@ -467,3 +468,38 @@ bool arch_is_process_alive (uint32_t pid)
return is_alive;
}

// Returns true if the executable named `exec` can be located through the PATH.
// - Windows: launches "where.exe /Q <exec>" via stream_create, and reports success when
//   stream_close (called with STREAM_WAIT_FOR_PROCESS) returns 0.
// - Elsewhere: runs "which <exec>" via system(), with stdout and stderr redirected to /dev/null.
//   Success requires system() to return 0 and "/dev/stdout" to exist.
// NOTE(review): the earlier comment on this function warned that on Windows the executable itself
// gets run, so only executables that terminate immediately are suitable - confirm whether that
// caution still applies.
static bool arch_is_exec_in_path (rom exec)
{
#ifndef _WIN32
    // sized for the fixed text of the command, the executable name and the terminating nul
    char which_cmd[32 + strlen(exec)];
    snprintf (which_cmd, sizeof (which_cmd), "which %s > /dev/null 2>&1", exec);

    if (system (which_cmd) != 0) return false; // non-zero return: treated as "not in path"

    return file_exists ("/dev/stdout");

#else
    StreamP where_proc = stream_create (0, 0, 0, 0, 0, 0, 0, "where.exe", "where.exe", "/Q", exec, NULL);

    return stream_close (&where_proc, STREAM_WAIT_FOR_PROCESS) == 0;
#endif
}

// Returns whether wget may be used. The answer is computed on the first call and kept in a
// function-local static, so later calls return the stored value without probing again.
bool wget_available (void)
{
    static thool cached = unknown;

    if (cached != unknown) return cached; // already determined by an earlier call

    // note: wget not used on Windows, bc I can't get it to output to stdout, and also earlier wget versions may be adding \r ... : https://stackoverflow.com/questions/8522983/wget-of-binary-file-piped-into-other-commands-on-windows-breaks-the-binary
    bool found = false;
    if (!flag.is_windows)
        found = arch_is_exec_in_path ("wget");

    cached = found;
    return cached;
}

// Returns whether curl was found by arch_is_exec_in_path. The answer is computed on the first
// call and kept in a function-local static, so later calls return the stored value.
bool curl_available (void)
{
    static thool cached = unknown;

    if (cached != unknown) return cached; // already determined by an earlier call

    cached = arch_is_exec_in_path ("curl");
    return cached;
}

2 changes: 2 additions & 0 deletions src/arch.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ extern bool arch_is_first_compression (void);
extern Timestamp arch_timestamp (void);
extern bool arch_is_process_alive (uint32_t pid);
extern uint64_t arch_get_max_resident_set (void);
extern bool wget_available (void);
extern bool curl_available (void);

static inline uint32_t arch_time_lap (uint128_t ts_start) // in msec
{ return (uint32_t)((arch_timestamp() - ts_start) / 1000000); }
Expand Down

0 comments on commit 28e772e

Please sign in to comment.