Skip to content

Commit

Permalink
Adding a check to automagically create index for VCI and VCF files if…
Browse files Browse the repository at this point in the history
… they do not exist
  • Loading branch information
mattjvincent committed Jun 5, 2018
1 parent bf32c18 commit 52cf234
Show file tree
Hide file tree
Showing 9 changed files with 51 additions and 12 deletions.
5 changes: 5 additions & 0 deletions HISTORY.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,11 @@
History
-------

0.2.4 (06/05/2018)
~~~~~~~~~~~~~~~~~~

* Automatically generates file index if not found

0.2.3 (06/05/2018)
~~~~~~~~~~~~~~~~~~

Expand Down
2 changes: 1 addition & 1 deletion g2gtools/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
__version__ = '0.2.3'
__version__ = '0.2.4'
__author__ = 'Matthew Vincent and Kwangbom \"KB\" Choi, The Jackson Laboratory'
__email__ = 'matt.vincent@jax.org'
2 changes: 0 additions & 2 deletions g2gtools/fasta_patch.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,8 +270,6 @@ def process(filename_fasta, filename_vci, regions, filename_output=None, bgzip=F
dump_fasta = True
LOG.debug("Temporary fasta file: {}".format(filename_output))

print('hello')

fasta_file = fasta.FastaFile(filename_fasta)
vci_file = vci.VCIFile(filename_vci)

Expand Down
4 changes: 1 addition & 3 deletions g2gtools/fasta_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -519,7 +519,7 @@ def process(filename_fasta, filename_vci, regions, filename_output=None, bgzip=F
:type num_processes: int
:return: Nothing
"""
LOG.error("in process of fixing")
#LOG.error("in process of fixing")


start = time.time()
Expand Down Expand Up @@ -657,8 +657,6 @@ def process(filename_fasta, filename_vci, regions, filename_output=None, bgzip=F
args = zip(all_params)
LOG.debug(args)



pool = multiprocessing.Pool(num_processes)
results = pool.map(wrapper, args)

Expand Down
42 changes: 38 additions & 4 deletions g2gtools/g2g_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -293,7 +293,7 @@ def bgzip_file(original_file, new_file, delete_original=False, force=True):
delete_file(original_file)


def index_file(original_file, file_format="vcf"):
def has_index_file(original_file, file_format=None):
"""
:param original_file:
Expand All @@ -302,15 +302,49 @@ def index_file(original_file, file_format="vcf"):
:return:
"""

if not file_format:
# try to guess the file format
if original_file.lower().endswith(".fa") or original_file.lower().endswith(".fasta"):
file_format = 'fa'
elif original_file.lower().endswith(".vcf"):
file_format = 'vcf'
elif original_file.lower().endswith(".vci"):
file_format = 'vci'
else:
raise G2GValueError("Cannot determine file format")

if file_format.lower() == 'fa':
pysam.faidx(original_file)
ext = 'fai'
elif file_format.lower() == 'vcf':
pysam.tabix_index(original_file, preset="vcf", force=True)
ext = 'tbi'
elif file_format.lower() == 'vci':
pysam.tabix_index(original_file, seq_col=0, start_col=1, end_col=1, force=True)
ext = 'tbi'
else:
raise G2GValueError("Unknown file format: {0}".format(file_format))

idx_file = '{}.{}'.format(original_file, ext)

return os.path.exists(idx_file)


def index_file(original_file, file_format="vcf", overwrite=False):
    """
    Build the index for a file, skipping the work when an index already
    exists and ``overwrite`` is False.

    :param original_file: path of the file to index
    :param file_format: 'fa', 'vcf' or 'vci' (compared case-insensitively)
    :param overwrite: True to always rebuild the index; False to build it
                      only when has_index_file() reports that it is missing
    :return: Nothing
    :raises G2GValueError: if file_format is not a known format
    """
    fmt = file_format.lower()

    # Reject unknown formats up front so the error does not depend on which
    # branch below would have been taken.
    if fmt not in ('fa', 'vcf', 'vci'):
        raise G2GValueError("Unknown file format: {0}".format(file_format))

    # The previous guard was "not overwrite or not has_index_file(...)",
    # which is always true for the default overwrite=False (re-indexing on
    # every call) and skipped the rebuild when overwrite=True and an index
    # existed -- the opposite of what the flag name promises.
    if not overwrite and has_index_file(original_file, file_format=file_format):
        return

    if fmt == 'fa':
        pysam.faidx(original_file)
    elif fmt == 'vcf':
        pysam.tabix_index(original_file, preset="vcf", force=True)
    else:
        # 'vci': tab-delimited with sequence name in column 0 and a single
        # position column used as both start and end.
        pysam.tabix_index(original_file, seq_col=0, start_col=1, end_col=1, force=True)


def bgzip_and_index_file(original_file, new_file, delete_original=False, force=True, file_format="vcf"):
bgzip_file(original_file, new_file, delete_original, force)
Expand Down
2 changes: 2 additions & 0 deletions g2gtools/vcf2vci.py
Original file line number Diff line number Diff line change
Expand Up @@ -436,6 +436,8 @@ def process(vcf_files, fasta_file, output_file, strain, vcf_keep=False, passed=F
for file_name in vcf_files:
vcf_file = g2g_utils.check_file(file_name)
LOG.info("VCF file: {0}".format(vcf_file))
LOG.info("Checking for index file, creating if needed...")
g2g_utils.index_file(original_file=vcf_file, file_format="vcf", overwrite=False)

vcf_discard_file = None
if vcf_keep:
Expand Down
2 changes: 2 additions & 0 deletions g2gtools/vci.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,8 @@ def __init__(self, filename, mode='r', parser=None, index=None, encoding="ascii"

self._tabix_file = pysam.TabixFile(self.filename, mode=mode, parser=parser, index=index, encoding=encoding)

g2g_utils.index_file(original_file=filename, file_format="vci", overwrite=False)

self.parse_header()

def __getattr__(self, name):
Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.2.3
current_version = 0.2.4
commit = True
tag = True

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@

setup(
name='g2gtools',
version='0.2.3',
version='0.2.4',
description="A suite of tools for the reconstruction of personal diploid genomes and better coordinate conversion",
long_description=readme + '\n\n' + history,
author='Matthew J. Vincent and Kwangbom "KB" Choi, The Jackson Laboratory',
Expand Down

0 comments on commit 52cf234

Please sign in to comment.