Skip to content

Commit

Permalink
Merge pull request #299 from eweitz/instant-gene-description
Browse files Browse the repository at this point in the history
Instant gene description
  • Loading branch information
eweitz committed Apr 6, 2022
2 parents b3ff53e + 52dcbbb commit 04b6ccb
Show file tree
Hide file tree
Showing 35 changed files with 45,609 additions and 395,677 deletions.
19,678 changes: 0 additions & 19,678 deletions dist/data/cache/bos-taurus-genes.tsv

This file was deleted.

Binary file added dist/data/cache/bos-taurus-genes.tsv.gz
Binary file not shown.
46,908 changes: 0 additions & 46,908 deletions dist/data/cache/caenorhabditis-elegans-genes.tsv

This file was deleted.

Binary file added dist/data/cache/caenorhabditis-elegans-genes.tsv.gz
Binary file not shown.
18,160 changes: 0 additions & 18,160 deletions dist/data/cache/canis-lupus-familiaris-genes.tsv

This file was deleted.

Binary file added dist/data/cache/canis-lupus-familiaris-genes.tsv.gz
Binary file not shown.
31,984 changes: 0 additions & 31,984 deletions dist/data/cache/danio-rerio-genes.tsv

This file was deleted.

Binary file added dist/data/cache/danio-rerio-genes.tsv.gz
Binary file not shown.
Binary file added dist/data/cache/drosophila-melanogaster-genes.tsv.gz
Binary file not shown.
17,112 changes: 0 additions & 17,112 deletions dist/data/cache/equus-caballus-genes.tsv

This file was deleted.

Binary file added dist/data/cache/equus-caballus-genes.tsv.gz
Binary file not shown.
17,300 changes: 0 additions & 17,300 deletions dist/data/cache/felis-catus-genes.tsv

This file was deleted.

Binary file added dist/data/cache/felis-catus-genes.tsv.gz
Binary file not shown.
13,480 changes: 0 additions & 13,480 deletions dist/data/cache/gallus-gallus-genes.tsv

This file was deleted.

Binary file added dist/data/cache/gallus-gallus-genes.tsv.gz
Binary file not shown.
45,473 changes: 45,473 additions & 0 deletions dist/data/cache/homo-sapiens-genes-new.tsv

Large diffs are not rendered by default.

60,620 changes: 0 additions & 60,620 deletions dist/data/cache/homo-sapiens-genes.tsv

This file was deleted.

Binary file added dist/data/cache/homo-sapiens-genes.tsv.gz
Binary file not shown.
20,529 changes: 0 additions & 20,529 deletions dist/data/cache/macaca-fascicularis-genes.tsv

This file was deleted.

Binary file added dist/data/cache/macaca-fascicularis-genes.tsv.gz
Binary file not shown.
23,397 changes: 0 additions & 23,397 deletions dist/data/cache/macaca-mulatta-genes.tsv

This file was deleted.

Binary file added dist/data/cache/macaca-mulatta-genes.tsv.gz
Binary file not shown.
55,405 changes: 0 additions & 55,405 deletions dist/data/cache/mus-musculus-genes.tsv

This file was deleted.

Binary file added dist/data/cache/mus-musculus-genes.tsv.gz
Binary file not shown.
22,504 changes: 0 additions & 22,504 deletions dist/data/cache/pan-troglodytes-genes.tsv

This file was deleted.

Binary file added dist/data/cache/pan-troglodytes-genes.tsv.gz
Binary file not shown.
32,231 changes: 0 additions & 32,231 deletions dist/data/cache/rattus-norvegicus-genes.tsv

This file was deleted.

Binary file added dist/data/cache/rattus-norvegicus-genes.tsv.gz
Binary file not shown.
16,285 changes: 0 additions & 16,285 deletions dist/data/cache/sus-scrofa-genes.tsv

This file was deleted.

Binary file added dist/data/cache/sus-scrofa-genes.tsv.gz
Binary file not shown.
146 changes: 86 additions & 60 deletions scripts/python/cache/gene_cache.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Convert Ensembl GTF files to minimal TSVs for Ideogram.js gene caches
"""Convert Ensembl GFF files to minimal TSVs for Ideogram.js gene caches
Ideogram uses cached gene data to drastically simplify and speed up rendering.
Expand All @@ -7,6 +7,7 @@
import argparse
import codecs
import csv
import gzip
import os
import re
import sys
Expand Down Expand Up @@ -35,41 +36,42 @@
"Bos taurus": "ARS-UCD1.2",
"Sus scrofa": "Sscrofa11.1",
# "Anopheles gambiae": "AgamP4.51",
"Caenorhabditis elegans": "WBcel235"
"Caenorhabditis elegans": "WBcel235",
"Drosophila melanogaster": "BDGP6.28"
}

interesting_genes_by_organism = {
"Homo sapiens": "gene-hints.tsv",
"Mus musculus": "pubmed-citations.tsv",
"Rattus norvegicus": "pubmed-citations.tsv",
"Canis lupus familiaris": "pubmed-citations.tsv",
"Felis catus": "pubmed-citations.tsv",
# "Mus musculus": "pubmed-citations.tsv",
# "Rattus norvegicus": "pubmed-citations.tsv",
# "Canis lupus familiaris": "pubmed-citations.tsv",
# "Felis catus": "pubmed-citations.tsv",
}

# metazoa = {
# "Anopheles gambiae".AgamP4.51.chr.gtf.gz "
# "Anopheles gambiae".AgamP4.51.gff3.gz "
# }

def get_gtf_url(organism):
"""Get URL to GTF file
E.g. https://ftp.ensembl.org/pub/release-102/gtf/homo_sapiens/Homo_sapiens.GRCh38.102.chr.gtf.gz
def get_gff_url(organism):
"""Get URL to GFF file
E.g. https://ftp.ensembl.org/pub/release-102/gff3/homo_sapiens/Homo_sapiens.GRCh38.102.gff3.gz
"""
release = "102"
base = f"https://ftp.ensembl.org/pub/release-{release}/gtf/"
base = f"https://ftp.ensembl.org/pub/release-{release}/gff3/"
asm = assemblies_by_org[organism]
org_us = organism.replace(" ", "_")
org_lcus = org_us.lower()

url = f"{base}{org_lcus}/{org_us}.{asm}.{release}.chr.gtf.gz"
url = f"{base}{org_lcus}/{org_us}.{asm}.{release}.gff3.gz"
return url

def parse_gtf_info_field(info):
"""Parse a GTF "INFO" field into a dictionary
def parse_gff_info_field(info):
"""Parse a GFF "INFO" field into a dictionary
Example INFO field:
gene_id "ENSMUSG00000102628"; gene_version "2"; gene_name "Gm37671"; gene_source "havana"; gene_biotype "TEC";
"""
fields = [f.strip() for f in info.split(';')][:-1]
kvs = [f.split(" ") for f in fields]
kvs = [f.split("=") for f in fields]
info_dict = {}
for kv in kvs:
info_dict[kv[0]] = kv[1].strip('"')
Expand All @@ -78,7 +80,7 @@ def parse_gtf_info_field(info):
def detect_prefix(id):
"""Find the prefix of a feature ID
Feature IDs in a given GTF file have a constant "prefix".
Feature IDs in a given GFF file have a constant "prefix".
E.g. ID ENSMUSG00000102628 has prefix ENSMUSG
"""
prefix = re.match(r"[A-Za-z]+", id).group()
Expand All @@ -97,53 +99,77 @@ def trim_id(id, prefix):
slim_id = id.replace(insignificant, "")
return slim_id

def parse_gene(gtf_row):
"""Parse gene from CSV-reader-split row of GTF file"""
def parse_gene(gff_row):
"""Parse gene from CSV-reader-split row of GFF file"""

if gtf_row[0][0] == "#":
if gff_row[0][0] == "#":
# Skip header
return None

chr = gtf_row[0]
feat_type = gtf_row[2]
if feat_type != "gene":
chr = gff_row[0]
feat_type = gff_row[2]

# feature types that are modeled as genes in Ensembl, i.e.
# that have an Ensembl accession beginning ENSG in human
loose_gene_types = [
"gene", "miRNA", "ncRNA", "ncRNA_gene", "rRNA",
"scRNA", "snRNA", "snoRNA", "tRNA"
]
if feat_type not in loose_gene_types:
return None

start = gtf_row[3]
stop = gtf_row[4]
info = gtf_row[8]
start = gff_row[3]
stop = gff_row[4]
info = gff_row[8]
# print("info", info)
info_dict = parse_gtf_info_field(info)
info_dict = parse_gff_info_field(info)

# For GTF
# id = info_dict["gene_id"]
# symbol = info_dict.get("gene_name")

# For GFF

if "gene_id" not in info_dict:
return None
id = info_dict["gene_id"]
symbol = info_dict.get("gene_name")
symbol = info_dict.get("Name")

# Change e.g.:
# description=transmembrane protein 88B [Source:HGNC Symbol%3BAcc:HGNC:37099]
# to
# transmembrane protein 88B
description = info_dict.get("description", "")
description = description.split(" [")[0]

if symbol is None:
return None

return [chr, start, stop, id, symbol]
return [chr, start, stop, id, symbol, description]

def trim_gtf(gtf_path):
"""Parse GTF into a list of compact genes
def trim_gff(gff_path):
"""Parse GFF into a list of compact genes
"""
print(f"Parsing GTF: {gtf_path}")
print(f"Parsing GFF: {gff_path}")
slim_genes = []
prefix = None

with open(gtf_path) as file:
with open(gff_path) as file:
reader = csv.reader(file, delimiter="\t")
for row in reader:
parsed_gene = parse_gene(row)

if parsed_gene == None:
continue

[chr, start, stop, id, symbol] = parsed_gene
[chr, start, stop, id, symbol, desc] = parsed_gene
length = str(int(stop) - int(start))

if prefix == None:
prefix = detect_prefix(id)
slim_id = trim_id(id, prefix)

slim_genes.append([chr, start, length, slim_id, symbol])
slim_genes.append([chr, start, length, slim_id, symbol, desc])

return [slim_genes, prefix]

Expand Down Expand Up @@ -198,61 +224,61 @@ def sort_by_interest(slim_genes, organism):


class GeneCache():
"""Convert Ensembl GTF files to minimal TSVs for Ideogram.js gene caches
"""Convert Ensembl GFF files to minimal TSVs for Ideogram.js gene caches
"""

def __init__(self, output_dir="data/", reuse_gtf=False):
def __init__(self, output_dir="data/", reuse_gff=False):
self.output_dir = output_dir
self.tmp_dir = "data/"
self.reuse_gtf = reuse_gtf
self.reuse_gff = reuse_gff

if not os.path.exists(self.output_dir):
os.makedirs(self.output_dir)
if not os.path.exists(self.tmp_dir):
os.makedirs(self.tmp_dir)

def fetch_ensembl_gtf(self, organism):
"""Download and decompress an organism's GTF file from Ensembl
def fetch_ensembl_gff(self, organism):
"""Download and decompress an organism's GFF file from Ensembl
"""
print(f"Fetching Ensembl GTF for {organism}")
url = get_gtf_url(organism)
gtf_dir = self.tmp_dir + "gtf/"
if not os.path.exists(gtf_dir):
os.makedirs(gtf_dir)
gtf_path = gtf_dir + url.split("/")[-1]
print(f"Fetching Ensembl GFF for {organism}")
url = get_gff_url(organism)
gff_dir = self.tmp_dir + "gff3/"
if not os.path.exists(gff_dir):
os.makedirs(gff_dir)
gff_path = gff_dir + url.split("/")[-1]
try:
download_gzip(url, gtf_path, cache=self.reuse_gtf)
download_gzip(url, gff_path, cache=self.reuse_gff)
except urllib.error.HTTPError:
# E.g. for C. elegans
url = url.replace("chr.", "")
download_gzip(url, gtf_path, cache=self.reuse_gtf)
return [gtf_path, url]
download_gzip(url, gff_path, cache=self.reuse_gff)
return [gff_path, url]

def write(self, genes, organism, prefix, gtf_url):
def write(self, genes, organism, prefix, gff_url):
"""Save fetched and transformed gene data to cache file
"""
headers = (
f"## Ideogram.js gene cache for {organism}\n" +
f"## Derived from {gtf_url}\n"
f"## Derived from {gff_url}\n"
f"## prefix: {prefix}\n"
f"# chr\tstart\tlength\tslim_id\tsymbol\n"
f"# chr\tstart\tlength\tslim_id\tsymbol\tdescription\n"
)
gene_lines = "\n".join(["\t".join(g) for g in genes])
content = headers + gene_lines

org_lch = organism.lower().replace(" ", "-")
output_path = f"{self.output_dir}{org_lch}-genes.tsv"
with open(output_path, "w") as f:
output_path = f"{self.output_dir}{org_lch}-genes.tsv.gz"
with gzip.open(output_path, "wt") as f:
f.write(content)
print(f"Wrote gene cache: {output_path}")

def populate_by_org(self, organism):
"""Fill gene caches for a configured organism
"""
[gtf_path, gtf_url] = self.fetch_ensembl_gtf(organism)
[slim_genes, prefix] = trim_gtf(gtf_path)
[gff_path, gff_url] = self.fetch_ensembl_gff(organism)
[slim_genes, prefix] = trim_gff(gff_path)
sorted_slim_genes = sort_by_interest(slim_genes, organism)
self.write(sorted_slim_genes, organism, prefix, gtf_url)
self.write(sorted_slim_genes, organism, prefix, gff_url)

def populate(self):
"""Fill gene caches for all configured organisms
Expand All @@ -276,14 +302,14 @@ def populate(self):
default="data/"
)
parser.add_argument(
"--reuse-gtf",
"--reuse-gff",
help=(
"Whether to use previously-downloaded raw GTFs"
"Whether to use previously-downloaded raw GFFs"
),
action="store_true"
)
args = parser.parse_args()
output_dir = args.output_dir
reuse_gtf = args.reuse_gtf
reuse_gff = args.reuse_gff

GeneCache(output_dir, reuse_gtf).populate()
GeneCache(output_dir, reuse_gff).populate()
1 change: 1 addition & 0 deletions scripts/python/lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ def download_gzip(url, output_path, cache=0):
if is_cached(output_path, cache, 1):
return
headers={"Accept-Encoding": "gzip"}
print(f"Requesting {url}")
request = urllib.request.Request(url, headers=headers)
response = urllib.request.urlopen(request, context=ctx)
content = gzip.decompress(response.read()).decode()
Expand Down
29 changes: 21 additions & 8 deletions src/js/gene-cache.js
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
* - test if a given string is a gene name, e.g. for gene search
* - find genomic position of a given gene (or all genes)
*/
import {decompressSync, strFromU8} from 'fflate';

import {slug, getEarlyTaxid, getDir} from './lib';
import {organismMetadata} from './init/organism-metadata';
Expand All @@ -18,13 +19,13 @@ import version from './version';
let perfTimes;

/** Get URL for gene cache file */
function getCacheUrl(orgName, cacheDir, ideo) {
function getCacheUrl(orgName, cacheDir) {
const organism = slug(orgName);
if (!cacheDir) {
cacheDir = getDir('cache/');
}

const cacheUrl = cacheDir + organism + '-genes.tsv';
const cacheUrl = cacheDir + organism + '-genes.tsv.gz';

return cacheUrl;
}
Expand Down Expand Up @@ -80,7 +81,9 @@ function parseCache(rawTsv, orgName) {
const names = [];
const nameCaseMap = {};
const namesById = {};
const fullNamesById = {};
const idsByName = {};
const idsByFullName = {};
const lociByName = {};
const lociById = {};
const preAnnots = [];
Expand All @@ -101,18 +104,21 @@ function parseCache(rawTsv, orgName) {
continue;
}
const [
chromosome, rawStart, rawLength, slimEnsemblId, gene
chromosome, rawStart, rawLength, slimEnsemblId, gene, rawFullName
] = line.trim().split(/\t/);
const fullName = decodeURIComponent(rawFullName);
const start = parseInt(rawStart);
const stop = start + parseInt(rawLength);
const ensemblId = getEnsemblId(ensemblPrefix, slimEnsemblId);
preAnnots.push([chromosome, start, stop, ensemblId, gene]);
preAnnots.push([chromosome, start, stop, ensemblId, gene, fullName]);
const locus = [chromosome, start, stop];

names.push(gene);
nameCaseMap[gene.toLowerCase()] = gene;
namesById[ensemblId] = gene;
fullNamesById[ensemblId] = fullName;
idsByName[gene] = ensemblId;
idsByFullName[fullName] = ensemblId;
lociByName[gene] = locus;
lociById[ensemblId] = locus;
};
Expand All @@ -123,7 +129,8 @@ function parseCache(rawTsv, orgName) {
perfTimes.parseAnnots = Math.round(performance.now() - t1);

return [
names, nameCaseMap, namesById, idsByName, lociByName, lociById,
names, nameCaseMap, namesById, fullNamesById,
idsByName, idsByFullName, lociByName, lociById,
sortedAnnots
];
}
Expand Down Expand Up @@ -181,21 +188,27 @@ export default async function initGeneCache(orgName, ideo, cacheDir=null) {
const fetchStartTime = performance.now();
const response = await cacheFetch(cacheUrl);

const data = await response.text();
// const data = await response.text();

const blob = await response.blob();
const uint8Array = new Uint8Array(await blob.arrayBuffer());
const data = strFromU8(decompressSync(uint8Array));
const fetchEndTime = performance.now();
perfTimes.fetch = Math.round(fetchEndTime - fetchStartTime);

const [
interestingNames, nameCaseMap, namesById, idsByName,
lociByName, lociById, sortedAnnots
interestingNames, nameCaseMap, namesById, fullNamesById,
idsByName, idsByFullName, lociByName, lociById, sortedAnnots
] = parseCache(data, orgName);
perfTimes.parseCache = Math.round(performance.now() - fetchEndTime);

ideo.geneCache = {
interestingNames, // Array ordered by general or scholarly interest
nameCaseMap, // Maps of lowercase gene names to proper gene names
namesById,
fullNamesById,
idsByName,
idsByFullName,
lociByName, // Object of gene positions, keyed by gene name
lociById,
sortedAnnots // Ideogram annotations sorted by genomic position
Expand Down
Loading

0 comments on commit 04b6ccb

Please sign in to comment.