#!/usr/bin/env python
"""Prepare GFF transcript files for use as input to RNA-seq pipelines
Usage, from within the main genome directory of your organism:
prepare_tx_gff.py <organism> <org_build>
requires these python and external packages which come pre-installed
with bcbio using bioconda:
mysql-python
gffutils
requests
picard
kallisto
"""
from __future__ import print_function
import csv
import gzip
import os
import sys
import shutil
import collections
import datetime
import subprocess
import tempfile
import glob
from argparse import ArgumentParser
import gffutils
import requests
try:
import MySQLdb
except ImportError:
MySQLdb = None
from bcbio.utils import chdir, safe_makedir, file_exists, get_program_python
from bcbio.rnaseq.gtf import gtf_to_fasta
# ## Version and retrieval details for Ensembl and UCSC
ensembl_release = "95"
base_ftp = "ftp://ftp.ensembl.org/pub/release-{release}/gtf"
supported_oldbuilds = {"GRCh37": "75", "hg19": "75"}
build_subsets = {"hg38-noalt": "hg38"}
ucsc_db = "genome-mysql.cse.ucsc.edu"
ucsc_user = "genome"
# Chromosome name remappings thanks to Devon Ryan
# https://github.com/dpryan79/ChromosomeMappings
manual_remaps = {"hg38":
"https://raw.githubusercontent.com/dpryan79/ChromosomeMappings/master/GRCh38_ensembl2UCSC.txt"}
def which(program):
def is_exe(fpath):
return os.path.isfile(fpath) and os.access(fpath, os.X_OK)
fpath, fname = os.path.split(program)
if fpath:
if is_exe(program):
return program
else:
for path in os.environ["PATH"].split(os.pathsep):
path = path.strip('"')
exe_file = os.path.join(path, program)
if is_exe(exe_file):
return exe_file
return None
def manual_ucsc_ensembl_map(org_build):
org_build = build_subsets.get(org_build, org_build)
requests.packages.urllib3.disable_warnings()
r = requests.get(manual_remaps[org_build], verify=False)
out = {}
for line in r.text.split("\n"):
try:
ensembl, ucsc = line.split()
out[ensembl] = ucsc
except ValueError:
pass
return out
def ucsc_ensembl_map_via_download(org_build):
"""Compare .dict files by md5, then length to compare two builds.
"""
ensembl_dict_file = get_ensembl_dict(org_build)
ucsc_dict_file = get_ucsc_dict(org_build)
ensembl_dict = parse_sequence_dict(ensembl_dict_file)
ucsc_dict = parse_sequence_dict(ucsc_dict_file)
return ensembl_to_ucsc(ensembl_dict, ucsc_dict, org_build)
def ensembl_to_ucsc(ensembl_dict, ucsc_dict, org_build):
name_map = {}
for md5, name in ensembl_dict.items():
if ucsc_dict.get(md5):
name_map[name] = ucsc_dict[md5]
map_file = "%s-map.csv" % (org_build)
with open(map_file, "w") as out_handle:
writer = csv.writer(out_handle)
writer.writerow(["ensembl", "ucsc"])
for md5, name in ensembl_dict.items():
ucsc = ucsc_dict.get(md5)
if ucsc is not None:
writer.writerow([name, ucsc])
return name_map
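# The generated <org_build>-map.csv is a two-column CSV, e.g. (illustrative):
#   ensembl,ucsc
#   1,chr1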
def ucsc_ensembl_map_via_query(org_build):
"""Retrieve UCSC to Ensembl name mappings from UCSC MySQL database.
"""
org_build = build_subsets.get(org_build, org_build)
# if MySQLdb is not installed, figure it out via download
if not MySQLdb:
return ucsc_ensembl_map_via_download(org_build)
db = MySQLdb.connect(host=ucsc_db, user=ucsc_user, db=org_build)
cursor = db.cursor()
cursor.execute("select * from ucscToEnsembl")
ucsc_map = {}
for fields in cursor.fetchall():
ucsc = fields[0]
ensembl = fields[-1]
# workaround for GRCh37/hg19 additional haplotype contigs.
# Coordinates differ between builds so do not include these regions.
if org_build == "hg19" and "hap" in ucsc:
continue
else:
ucsc_map[ensembl] = ucsc
return ucsc_map
# taxname: Ensembl species name, used to build FTP download paths
# biomart_name: name of the ensembl gene_id dataset on biomart (None if unused)
# ucsc_map: function mapping Ensembl to UCSC chromosome names (None to skip remapping)
# fbase: the base filename for ensembl files using this genome
Build = collections.namedtuple("Build", ["taxname", "biomart_name",
"ucsc_map", "fbase"])
build_info = {
"hg19": Build("homo_sapiens", "hsapiens_gene_ensembl",
ucsc_ensembl_map_via_query,
"Homo_sapiens.GRCh37." + supported_oldbuilds["GRCh37"]),
"GRCh37": Build("homo_sapiens", "hsapiens_gene_ensembl",
None,
"Homo_sapiens.GRCh37." + supported_oldbuilds["hg19"]),
"mm9": Build("mus_musculus", "mmusculus_gene_ensembl",
ucsc_ensembl_map_via_query,
"Mus_musculus.NCBIM37.67"),
"mm10": Build("mus_musculus", "mmusculus_gene_ensembl",
ucsc_ensembl_map_via_query,
"Mus_musculus.GRCm38." + ensembl_release),
"rn5": Build("rattus_norvegicus", None,
ucsc_ensembl_map_via_download,
"Rattus_norvegicus.Rnor_5.0." + ensembl_release),
"rn6": Build("rattus_norvegicus", None,
ucsc_ensembl_map_via_download,
"Rattus_norvegicus.Rnor_6.0." + ensembl_release),
"hg38": Build("homo_sapiens", "hsapiens_gene_ensembl",
manual_ucsc_ensembl_map,
"Homo_sapiens.GRCh38." + ensembl_release),
"hg38-noalt": Build("homo_sapiens", "hsapiens_gene_ensembl",
manual_ucsc_ensembl_map,
"Homo_sapiens.GRCh38." + ensembl_release),
"canFam3": Build("canis_familiaris", None,
ucsc_ensembl_map_via_download,
"Canis_familiaris.CanFam3.1." + ensembl_release),
"sacCer3": Build("saccharomyces_cerevisiae", None,
ucsc_ensembl_map_via_download,
"Saccharomyces_cerevisiae.R64-1-1." + ensembl_release),
"WBcel235": Build("caenorhabditis_elegans", None,
ucsc_ensembl_map_via_download,
"Caenorhabditis_elegans.WBcel235." + ensembl_release),
"dm3": Build("drosophila_melanogaster", None,
ucsc_ensembl_map_via_download,
"Drosophila_melanogaster.BDGP5." + ensembl_release),
"Zv9": Build("danio_rerio", None,
ucsc_ensembl_map_via_download,
"Danio_rerio.Zv9." + ensembl_release),
"GRCz11": Build("danio_rerio", None, None,
"Danio_rerio.GRCz11." + ensembl_release),
"xenTro3": Build("xenopus_tropicalis", None,
ucsc_ensembl_map_via_download,
"Xenopus_tropicalis.JGI_4.2." + ensembl_release),
"Sscrofa11.1": Build("sus_scrofa", None, None,
"Sus_scrofa.Sscrofa11.1." + ensembl_release),
}
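# A Picard-style sequence dictionary (.dict) holds one @SQ line per sequence,
# e.g. (illustrative, fields abridged):
#   @SQ\tSN:1\tLN:248956422\tM5:<md5 of the sequence>
# parse_sequence_dict keys each sequence name (SN) by its M5 checksum, so two
# builds can be matched by sequence content rather than by name.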
def parse_sequence_dict(fasta_dict):
def _tuples_from_line(line):
attrs = {}
for tag, val in [x.split(":", 1) for x in line.strip().split("\t")[1:]]:
attrs[tag] = val
return attrs["SN"], attrs["LN"], attrs["M5"]
out = {}
with open(fasta_dict) as dict_handle:
for name, length, md5 in [_tuples_from_line(x) for x in dict_handle if x.startswith("@SQ")]:
out[md5] = name
return out
class SequenceDictParser(object):
def __init__(self, fname):
self.fname = fname
def _get_sequences_in_genome_dict(self):
with open(self.fname) as genome_handle:
sequences = [self._sequence_from_line(x) for x in genome_handle if "@SQ" in x]
return sequences
def _sequence_from_line(self, line):
name = line.split("\t")[1].split(":")[1]
md5 = line.split("\t")[4].split(":")[1]
return md5, name
def get_ensembl_dict(org_build):
genome_dict = org_build + ".dict"
if not os.path.exists(genome_dict):
org_fa = org_build + ".fa.gz"
if not os.path.exists(org_fa):
genome = _download_ensembl_genome(org_build)
shutil.move(genome, org_fa)
genome_dict = make_fasta_dict(org_fa)
return genome_dict
def get_ucsc_dict(org_build):
fa_dict = os.path.join(os.getcwd(), os.pardir, "seq", org_build + ".dict")
if not file_exists(fa_dict):
fa_file = os.path.splitext(fa_dict)[0] + ".fa"
fa_dict = make_fasta_dict(fa_file)
return fa_dict
def make_fasta_dict(fasta_file):
dict_file = os.path.splitext(fasta_file.replace(".fa.gz", ".fa"))[0] + ".dict"
if not os.path.exists(dict_file):
subprocess.check_call("picard -Xms1g -Xmx3g CreateSequenceDictionary R={fasta_file} "
"O={dict_file}".format(**locals()), shell=True)
return dict_file
def _download_ensembl_genome(org_build):
build = build_info[org_build]
# reference files do not use the ensembl_release version so split it off
fname = os.path.splitext(build.fbase)[0] + ".dna_sm.toplevel.fa.gz"
dl_url = ("ftp://ftp.ensembl.org/pub/release-{release}/"
"fasta/{taxname}/dna/{fname}").format(release=ensembl_release,
taxname=build.taxname,
fname=fname)
out_file = os.path.basename(dl_url)
if not os.path.exists(out_file):
subprocess.check_call(["wget", "-c", dl_url])
return out_file
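# For example (illustrative), for hg38 at release 95 the URL above resolves to:
#   ftp://ftp.ensembl.org/pub/release-95/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna_sm.toplevel.fa.gz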
def write_version(build=None, gtf_file=None):
gtf_file = build.fbase if build else gtf_file
gtf_file = os.path.abspath(gtf_file)
version_file = "version.txt"
with open(version_file, "w") as out_handle:
out_handle.write("Created from: %s" % gtf_file)
return version_file
# ## Main driver functions
def main(org_build, gtf_file, genome_fasta, genome_dir, cores, args):
genome_dir = genome_dir if genome_dir else os.curdir
build_dir = os.path.abspath(os.path.join(genome_dir, org_build))
work_dir = os.path.join(build_dir, "tmpcbl")
safe_makedir(work_dir)
ens_version = supported_oldbuilds.get(org_build, ensembl_release)
out_dir = os.path.join(build_dir,
"rnaseq-%s_%s" % (datetime.datetime.now().strftime("%Y-%m-%d"), ens_version))
tophat_dir = os.path.join(out_dir, "tophat")
gtf_file = os.path.abspath(gtf_file) if gtf_file else gtf_file
if genome_fasta:
genome_fasta = os.path.abspath(genome_fasta)
work_fasta = os.path.join(work_dir, os.path.basename(genome_fasta))
if not os.path.exists(work_fasta):
shutil.copy(genome_fasta, work_fasta)
genome_fasta = work_fasta
with chdir(work_dir):
if not genome_fasta:
genome_fasta = get_genome_fasta(org_build)
if not gtf_file:
write_version(build=build_info[org_build])
build = build_info[org_build]
gtf_file = prepare_tx_gff(build, org_build)
else:
write_version(gtf_file=gtf_file)
work_gtf = os.path.join(work_dir, "ref-transcripts.gtf")
if not os.path.exists(work_gtf):
shutil.copy(gtf_file, work_gtf)
gtf_file = work_gtf
gtf_file = clean_gtf(gtf_file, genome_fasta)
db = _get_gtf_db(gtf_file)
os.remove(gtf_file)
gtf_file = db_to_gtf(db, gtf_file)
gtf_to_refflat(gtf_file)
gtf_to_bed(gtf_file)
prepare_tx2gene(gtf_file)
prepare_dexseq(gtf_file)
mask_gff = prepare_mask_gtf(gtf_file)
rrna_gtf = prepare_rrna_gtf(gtf_file)
if file_exists(rrna_gtf):
gtf_to_interval(rrna_gtf, genome_fasta)
if args.tophat:
prepare_tophat_index(gtf_file, org_build, genome_fasta)
transcriptome_fasta = make_transcriptome_fasta(gtf_file, genome_fasta)
if args.kallisto:
prepare_kallisto_index(transcriptome_fasta, org_build)
make_hisat2_splicesites(gtf_file)
cleanup(work_dir, out_dir, org_build)
rnaseq_dir = os.path.join(build_dir, "rnaseq")
if os.path.exists(rnaseq_dir):
if os.path.islink(rnaseq_dir):
os.unlink(rnaseq_dir)
else:
shutil.rmtree(rnaseq_dir)
os.symlink(out_dir, rnaseq_dir)
tar_dirs = [os.path.relpath(out_dir)]
tarball = create_tarball(tar_dirs, org_build)
def make_hisat2_splicesites(gtf_file):
base, _ = os.path.splitext(gtf_file)
    out_file = base + "-splicesites.txt"
executable = get_program_python("hisat2")
hisat2_script = os.path.join(os.path.dirname(executable),
"hisat2_extract_splice_sites.py")
cmd = "{executable} {hisat2_script} {gtf_file} > {out_file}"
if file_exists(out_file):
return out_file
if not file_exists(hisat2_script):
return None
subprocess.check_call(cmd.format(**locals()), shell=True)
return out_file
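# The splice sites file from hisat2_extract_splice_sites.py is tab-separated,
# one intron per line, roughly: chromosome, intron start, intron end, strand.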
def make_transcriptome_fasta(gtf_file, genome_fasta):
base, _ = os.path.splitext(gtf_file)
    out_file = base + ".fa"
out_file = gtf_to_fasta(gtf_file, genome_fasta, out_file=out_file)
return out_file
def clean_gtf(gtf_file, genome_fasta):
"""
remove transcripts that have the following properties
1) don't have a corresponding ID in the reference
2) gencode Selenocysteine features which break many downstream tools
3) are not associated with a gene (no gene_id field)
"""
temp_gtf = tempfile.NamedTemporaryFile(suffix=".gtf").name
fa_names = get_fasta_names(genome_fasta)
with open(gtf_file) as in_gtf, open(temp_gtf, "w") as out_gtf:
for line in in_gtf:
if line.startswith("#"):
continue
# these cause problems with downstream tools and we don't use them
if "Selenocysteine" in line:
continue
if line.split()[0].strip() not in fa_names:
continue
if 'gene_id' not in line:
continue
out_gtf.write(line)
shutil.move(temp_gtf, gtf_file)
return gtf_file
def get_genome_fasta(org_build):
fa_path = os.path.abspath(os.path.join(os.curdir, os.pardir, "seq",
org_build + ".fa"))
return fa_path
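# samtools faidx writes a five-column .fai index (name, length, offset,
# linebases, linewidth); get_fasta_names only needs the name column.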
def get_fasta_names(genome_fasta):
fa_dict = genome_fasta + ".fai"
if not os.path.exists(fa_dict):
subprocess.check_call("samtools faidx %s" % genome_fasta, shell=True)
with open(fa_dict) as in_handle:
return [line.split("\t")[0] for line in in_handle]
def cleanup(work_dir, out_dir, org_build):
for fname in [os.path.join(work_dir, org_build + ".dict"),
os.path.join(work_dir, org_build + ".fa"),
os.path.join(work_dir, org_build + ".fa.gz"),
os.path.join(work_dir, org_build + "-map.csv")]:
if os.path.exists(fname):
os.remove(fname)
if os.path.exists(os.path.join(work_dir, "bcbiotx")):
shutil.rmtree(os.path.join(work_dir, "bcbiotx"))
shutil.move(work_dir, out_dir)
def create_tarball(tar_dirs, org_build):
str_tar_dirs = " ".join(tar_dirs)
tarball = "{org}-{dir}.tar.xz".format(org=org_build, dir=os.path.basename(tar_dirs[0]))
if not os.path.exists(tarball):
subprocess.check_call("tar -cvpf - {out_dir} | xz -zc - > {tarball}".format(
out_dir=str_tar_dirs, tarball=tarball), shell=True)
return tarball
def upload_to_s3(tarball):
upload_script = os.path.join(os.path.dirname(__file__), "s3_multipart_upload.py")
subprocess.check_call([sys.executable, upload_script, tarball, "biodata",
os.path.join("annotation", os.path.basename(tarball)),
"--public"])
def genepred_to_UCSC_table(genepred):
header = ["#bin", "name", "chrom", "strand",
"txStart", "txEnd", "cdsStart", "cdsEnd",
"exonCount", "exonStarts", "exonEnds", "score",
"name2", "cdsStartStat", "cdsEndStat",
"exonFrames"]
out_file = os.path.splitext(genepred)[0] + ".UCSCTable"
if file_exists(out_file):
return out_file
with open(genepred) as in_handle, open(out_file, "w") as out_handle:
counter = -1
current_item = None
out_handle.write("\t".join(header) + "\n")
for l in in_handle:
item = l.split("\t")[0]
if current_item != item:
current_item = item
counter = counter + 1
out_handle.write("\t".join([str(counter), l]))
return out_file
def gtf_to_genepred(gtf):
out_file = os.path.splitext(gtf)[0] + ".genePred"
if file_exists(out_file):
return out_file
cmd = "gtfToGenePred -allErrors -ignoreGroupsWithoutExons -genePredExt {gtf} {out_file}"
subprocess.check_call(cmd.format(**locals()), shell=True)
return out_file
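# refFlat is the genePred format with an extra leading geneName column;
# gtf_to_refflat fills that column by duplicating the first genePred column.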
def gtf_to_refflat(gtf):
out_file = os.path.splitext(gtf)[0] + ".refFlat"
if file_exists(out_file):
return out_file
genepred = gtf_to_genepred(gtf)
with open(genepred) as in_handle, open(out_file, "w") as out_handle:
for l in in_handle:
first = l.split("\t")[0]
out_handle.write("\t".join([first, l]))
return out_file
def gtf_to_bed(gtf):
db = _get_gtf_db(gtf)
out_file = os.path.splitext(gtf)[0] + ".bed"
if file_exists(out_file):
return out_file
with open(out_file, "w") as out_handle:
for feature in db.features_of_type('transcript'):
chrom = feature.chrom
            start = feature.start - 1  # GTF is 1-based; BED starts are 0-based
end = feature.end
attributes = feature.attributes.keys()
strand = feature.strand
name = (feature['gene_name'][0] if 'gene_name' in attributes else
feature['gene_id'][0])
line = "\t".join(map(str, [chrom, start, end, name, ".", strand]))
out_handle.write(line + "\n")
return out_file
def _is_selenocysteine(feature):
    return feature.featuretype == "Selenocysteine"
def db_to_gtf(db, out_file):
if file_exists(out_file):
return out_file
print("Writing out merged GTF file to %s." % out_file)
with open(out_file, "w") as out_handle:
for feature in db.all_features():
if _is_selenocysteine(feature):
continue
out_handle.write(str(feature) + "\n")
return out_file
def make_miso_events(gtf, org_build):
genepred = gtf_to_genepred(gtf)
genepred = genepred_to_UCSC_table(genepred)
pred_dir = tempfile.mkdtemp()
miso_dir = os.path.join(os.path.dirname(gtf), "miso")
tmp_pred = os.path.join(pred_dir, "ensGene.txt")
os.symlink(os.path.abspath(genepred), tmp_pred)
make_miso_annotation(pred_dir, miso_dir, org_build)
gff_files = glob.glob(os.path.join(miso_dir, "commonshortest", "*.gff3"))
cmd = "index_gff --index {f} {prefix}"
for f in gff_files:
prefix = f.split(".")[0] + "_indexed"
if not file_exists(prefix):
subprocess.check_call(cmd.format(**locals()), shell=True)
def prepare_bowtie_index(genome_fasta, bowtie_dir):
if os.path.exists(bowtie_dir + ".1.bt2"):
return bowtie_dir
safe_makedir(bowtie_dir)
cmd = "bowtie2-build {genome_fasta} {bowtie_dir}"
subprocess.check_call(cmd.format(**locals()), shell=True)
return bowtie_dir
def prepare_tophat_index(gtf, org_build, genome_fasta):
tophat_dir = os.path.abspath(os.path.join(os.path.dirname(gtf), "tophat",
org_build + "_transcriptome"))
bowtie_dir = os.path.abspath(os.path.join(os.path.dirname(gtf),
os.path.pardir, "bowtie2",
org_build))
bowtie_dir = prepare_bowtie_index(genome_fasta, bowtie_dir)
out_dir = tempfile.mkdtemp()
fastq = _create_dummy_fastq()
cmd = ("tophat --transcriptome-index {tophat_dir} -G {gtf} "
"-o {out_dir} {bowtie_dir} {fastq}")
subprocess.check_call(cmd.format(**locals()), shell=True)
shutil.rmtree(out_dir)
os.remove(fastq)
def prepare_kallisto_index(transcriptome_fasta, org_build):
kallisto = which("kallisto")
if not kallisto:
return None
base_dir = os.path.abspath(os.path.dirname(transcriptome_fasta))
kallisto_dir = os.path.join(base_dir, "kallisto")
safe_makedir(kallisto_dir)
kallisto_index = os.path.join(kallisto_dir, org_build)
if not os.path.exists(kallisto_index):
cmd = ("kallisto index -i {kallisto_index} {transcriptome_fasta}")
subprocess.check_call(cmd.format(**locals()), shell=True)
return kallisto_index
def prepare_sailfish_index(transcriptome_fasta, org_build):
sailfish = which("sailfish")
if not sailfish:
return None
base_dir = os.path.abspath(os.path.dirname(transcriptome_fasta))
sailfish_dir = os.path.join(base_dir, "sailfish")
safe_makedir(sailfish_dir)
sailfish_index = os.path.join(sailfish_dir, org_build)
cmd = ("sailfish index -t {sailfish_index} -o {sailfish_index}")
subprocess.check_call(cmd.format(**locals()), shell=True)
return sailfish_index
def _create_dummy_fastq():
read = ("@HWI-ST333_0178_FC:5:1101:1107:2112#ATCTCG/1\n"
"GGNCTTTCCTGCTTCTATGTCTTGATCGCCTGTAGGCAGG\n"
"+HWI-ST333_0178_FC:5:1101:1107:2112#ATCTCG/1\n"
"[[BS\\a`ceeagfhhhhhaefhcdfhcf`efeg[cg_b__\n")
fn = "dummy.fq"
with open(fn, "w") as out_handle:
out_handle.write(read)
return fn
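# Picard interval_list files start with a sequence-dictionary header (the
# .dict lines copied below) followed by one tab-separated record per interval:
# chrom, start, end, strand, name (coordinates 1-based inclusive).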
def gtf_to_interval(gtf, genome_fasta):
fa_dict = make_fasta_dict(genome_fasta)
db = _get_gtf_db(gtf)
out_file = os.path.splitext(gtf)[0] + ".interval_list"
if file_exists(out_file):
return out_file
with open(out_file, "w") as out_handle:
with open(fa_dict) as in_handle:
for l in in_handle:
out_handle.write(l)
for l in db.all_features():
out_handle.write("\t".join([str(l.seqid), str(l.start),
str(l.end), str(l.strand),
str(l.attributes.get("transcript_id",
["."])[0])]) + "\n")
return out_file
def prepare_mask_gtf(gtf):
"""
make a mask file of usually-masked RNA biotypes
"""
mask_biotype = ["rRNA", "Mt_rRNA", "misc_RNA", "snRNA", "snoRNA",
"tRNA", "Mt_tRNA"]
mask_chrom = ["MT"]
out_file = os.path.join(os.path.dirname(gtf), "ref-transcripts-mask.gtf")
if file_exists(out_file):
return out_file
biotype_lookup = _biotype_lookup_fn(gtf)
# if we can't find a biotype column, skip this
if not biotype_lookup:
return None
db = _get_gtf_db(gtf)
with open(out_file, "w") as out_handle:
for g in db.all_features():
biotype = biotype_lookup(g)
if (biotype in mask_biotype) or (g.chrom in mask_chrom):
out_handle.write(str(g) + "\n")
return out_file
def prepare_rrna_gtf(gtf):
"""
extract out just the rRNA biotypes, for assessing rRNA contamination
"""
mask_biotype = ["rRNA", "Mt_rRNA", "tRNA", "MT_tRNA"]
out_file = os.path.join(os.path.dirname(gtf), "rRNA.gtf")
if os.path.exists(out_file):
return out_file
db = _get_gtf_db(gtf)
biotype_lookup = _biotype_lookup_fn(gtf)
# if we can't find a biotype column, skip this
if not biotype_lookup:
return None
with open(out_file, "w") as out_handle:
for feature in db.all_features():
biotype = biotype_lookup(feature)
if biotype in mask_biotype:
out_handle.write(str(feature) + "\n")
return out_file
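# tx2gene.csv pairs transcript and gene identifiers, one per line, e.g.
# (illustrative Ensembl IDs): ENST00000456328,ENSG00000223972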
def prepare_tx2gene(gtf):
"""
prepare a file mapping transcripts to genes
"""
db = _get_gtf_db(gtf)
out_file = os.path.join(os.path.dirname(gtf), "tx2gene.csv")
if file_exists(out_file):
return out_file
with open(out_file, "w") as out_handle:
for transcript in db.features_of_type('transcript'):
gene_id = transcript['gene_id'][0]
transcript_id = transcript['transcript_id'][0]
out_handle.write(",".join([transcript_id, gene_id]) + "\n")
return out_file
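# Where the biotype lives varies by annotation: older Ensembl GTFs put it in
# the source column (column 2), newer ones in a gene_biotype attribute, and
# some use a plain biotype attribute; _biotype_lookup_fn probes all three.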
def _biotype_lookup_fn(gtf):
"""
return a function that will look up the biotype of a feature
this checks for either gene_biotype or biotype being set or for the source
column to have biotype information
"""
db = _get_gtf_db(gtf)
sources = set([feature.source for feature in db.all_features()])
gene_biotypes = set([feature.attributes.get("gene_biotype", [None])[0]
for feature in db.all_features()])
biotypes = set([feature.attributes.get("biotype", [None])[0]
for feature in db.all_features()])
if "protein_coding" in sources:
return lambda feature: feature.source
elif "protein_coding" in biotypes:
return lambda feature: feature.attributes.get("biotype", [None])[0]
elif "protein_coding" in gene_biotypes:
return lambda feature: feature.attributes.get("gene_biotype", [None])[0]
else:
return None
def prepare_tx_gff(build, org_name):
"""Prepare UCSC ready transcript file given build information.
"""
ensembl_gff = _download_ensembl_gff(build, org_name)
# if we need to do the name remapping
if build.ucsc_map:
ucsc_name_map = build.ucsc_map(org_name)
tx_gff = _remap_gff(ensembl_gff, ucsc_name_map)
os.remove(ensembl_gff)
else:
tx_gff = "ref-transcripts.gtf"
os.rename(ensembl_gff, tx_gff)
return tx_gff
def _remap_gff(base_gff, name_map):
"""Remap chromosome names to UCSC instead of Ensembl
"""
out_file = "ref-transcripts.gtf"
wrote_missing = set([])
if not os.path.exists(out_file):
with open(out_file, "w") as out_handle, \
open(base_gff) as in_handle:
for line in in_handle:
parts = line.split("\t")
ucsc_name = name_map.get(parts[0], None)
if ucsc_name:
out_handle.write("\t".join([ucsc_name] + parts[1:]))
elif parts[0] not in wrote_missing and not line.startswith("#"):
print("Missing", parts[0])
wrote_missing.add(parts[0])
return out_file
def _download_ensembl_gff(build, org_name):
"""Given build details, download and extract the relevant ensembl GFF.
"""
fname = build.fbase + ".gtf.gz"
dl_url = "/".join([base_ftp, build.taxname, fname]).format(
release=supported_oldbuilds.get(org_name, ensembl_release))
out_file = os.path.splitext(os.path.basename(dl_url))[0]
if not os.path.exists(out_file):
subprocess.check_call(["wget", dl_url])
subprocess.check_call(["gunzip", os.path.basename(dl_url)])
return out_file
def _create_tiny_gffutils_db(gtf_file):
_, ext = os.path.splitext(gtf_file)
tmp_out = tempfile.NamedTemporaryFile(suffix=".gtf", delete=False).name
with open(tmp_out, "w") as out_handle:
count = 0
        # gzip.open defaults to binary mode; open as text so the lines can be
        # written to the text-mode out_handle
        in_handle = open(gtf_file) if ext != ".gz" else gzip.open(gtf_file, "rt")
for line in in_handle:
if count > 1000:
break
out_handle.write(line)
count += 1
in_handle.close()
db = gffutils.create_db(tmp_out, dbfn=":memory:",
disable_infer_genes=True,
disable_infer_transcripts=True)
os.remove(tmp_out)
return db
def subfeature_handler(f):
"""
Given a gffutils.Feature object (which does not yet have its ID assigned),
figure out what its ID should be.
This is intended to be used for CDS, UTR, start_codon, and stop_codon
features in the Ensembl release 81 GTF files. I figured a reasonable
unique ID would consist of the parent transcript and the feature type,
followed by an autoincrementing number.
See https://pythonhosted.org/gffutils/database-ids.html#id-spec for
details and other options.
Grabbed from Ryan Dale: https://www.biostars.org/p/152517/
"""
return ''.join(
['autoincrement:',
f.attributes['transcript_id'][0],
'_',
f.featuretype])
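# With the "autoincrement:" prefix, gffutils appends a counter to the string
# returned above, yielding IDs like ENST00000456328_CDS_1 (illustrative).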
def guess_disable_infer_extent(gtf_file):
"""
guess if we need to use disable the infer gene or transcript extent option
when making a gffutils database by making a tiny database of 1000 lines
from the original GTF and looking for all of the features
"""
db = _create_tiny_gffutils_db(gtf_file)
features = [x for x in db.featuretypes()]
disable_infer_transcript = "transcript" in features
disable_infer_gene = "gene" in features
return disable_infer_transcript, disable_infer_gene
def guess_id_spec(gtf_file):
"""
guess at the id spec in a GTF file by examining the first 1000 lines
assigns unique ids to features that may not have them
"""
db = _create_tiny_gffutils_db(gtf_file)
id_spec = {}
attributes = set()
for f in db.all_features():
attributes.update(f.attributes)
if "gene_id" in attributes:
id_spec["gene"] = "gene_id"
attributes.remove("gene_id")
if "transcript_id" in attributes:
id_spec["transcript"] = "transcript_id"
attributes.remove("transcript_id")
return id_spec
def _get_gtf_db(gtf):
db_file = gtf + ".db"
if not file_exists(db_file):
print("Creating gffutils database for %s." % (gtf))
disable_infer_transcripts, disable_infer_genes = guess_disable_infer_extent(gtf)
if not disable_infer_transcripts or not disable_infer_genes:
print("'transcript' or 'gene' entries not found, so inferring "
"their extent. This can be very slow.")
id_spec = guess_id_spec(gtf)
gffutils.create_db(gtf, dbfn=db_file,
disable_infer_genes=disable_infer_genes,
disable_infer_transcripts=disable_infer_transcripts,
id_spec=id_spec,
merge_strategy="create_unique",
keep_order=True,
verbose=True)
return gffutils.FeatureDB(db_file)
def _dexseq_preparation_path():
PREP_FILE = "python_scripts/dexseq_prepare_annotation.py"
try:
cmd = "%s/Rscript -e 'find.package(\"DEXSeq\")'" % os.path.dirname(os.path.realpath(sys.executable))
output = subprocess.check_output(cmd, shell=True)
except subprocess.CalledProcessError:
return None
for line in output.decode().split("\n"):
if line.startswith("["):
dirname = line.split("[1]")[1].replace("\"", "").strip()
path = os.path.join(dirname, PREP_FILE)
if os.path.exists(path):
return path
return None
def prepare_dexseq(gtf):
out_file = os.path.splitext(gtf)[0] + ".dexseq.gff3"
if file_exists(out_file):
return out_file
dexseq_path = _dexseq_preparation_path()
if not dexseq_path:
return None
executable = get_program_python("htseq-count")
cmd = "{executable} {dexseq_path} {gtf} {out_file}"
subprocess.check_call(cmd.format(**locals()), shell=True)
return out_file
if __name__ == "__main__":
parser = ArgumentParser(description="Prepare the transcriptome files for an "
"organism.")
parser.add_argument("-c", "--cores", default=1,
help="number of cores to use")
parser.add_argument("--gtf",
help="Optional GTF file (instead of downloading from Ensembl)",
default=None),
parser.add_argument("--fasta",
help="Optional genomic FASTA file (instead of downloading from Ensembl)",
default=None),
parser.add_argument("--genome-dir",
help=("Optional location of the root genome directory. "
"For example --genome-dir=/foo will install the files "
"for a Hsapiens hg19 genome to /foo/Hsapiens/hg19."))
parser.add_argument("--tophat", help="Build TopHat indices",
default=False, action="store_true")
parser.add_argument("--kallisto", help="Build Kallisto indices",
default=False, action="store_true")
parser.add_argument("organism", help="Short name of organism (for example Hsapiens)")
parser.add_argument("org_build", help="Build of organism to run.")
args = parser.parse_args()
if args.genome_dir:
genome_dir = os.path.join(args.genome_dir, args.organism)
else:
genome_dir = os.curdir
main(args.org_build, args.gtf, args.fasta, genome_dir, args.cores, args)