Skip to content

Commit

Permalink
Merge pull request #299 from eweitz/instant-gene-description
Browse files Browse the repository at this point in the history
Instant gene description
  • Loading branch information
eweitz committed Apr 6, 2022
2 parents b3ff53e + 52dcbbb commit 04b6ccb
Show file tree
Hide file tree
Showing 35 changed files with 45,609 additions and 395,677 deletions.
19,678 changes: 0 additions & 19,678 deletions dist/data/cache/bos-taurus-genes.tsv

This file was deleted.

Binary file added dist/data/cache/bos-taurus-genes.tsv.gz
Binary file not shown.
46,908 changes: 0 additions & 46,908 deletions dist/data/cache/caenorhabditis-elegans-genes.tsv

This file was deleted.

Binary file added dist/data/cache/caenorhabditis-elegans-genes.tsv.gz
Binary file not shown.
18,160 changes: 0 additions & 18,160 deletions dist/data/cache/canis-lupus-familiaris-genes.tsv

This file was deleted.

Binary file added dist/data/cache/canis-lupus-familiaris-genes.tsv.gz
Binary file not shown.
31,984 changes: 0 additions & 31,984 deletions dist/data/cache/danio-rerio-genes.tsv

This file was deleted.

Binary file added dist/data/cache/danio-rerio-genes.tsv.gz
Binary file not shown.
Binary file added dist/data/cache/drosophila-melanogaster-genes.tsv.gz
Binary file not shown.
17,112 changes: 0 additions & 17,112 deletions dist/data/cache/equus-caballus-genes.tsv

This file was deleted.

Binary file added dist/data/cache/equus-caballus-genes.tsv.gz
Binary file not shown.
17,300 changes: 0 additions & 17,300 deletions dist/data/cache/felis-catus-genes.tsv

This file was deleted.

Binary file added dist/data/cache/felis-catus-genes.tsv.gz
Binary file not shown.
13,480 changes: 0 additions & 13,480 deletions dist/data/cache/gallus-gallus-genes.tsv

This file was deleted.

Binary file added dist/data/cache/gallus-gallus-genes.tsv.gz
Binary file not shown.
45,473 changes: 45,473 additions & 0 deletions dist/data/cache/homo-sapiens-genes-new.tsv

Large diffs are not rendered by default.

60,620 changes: 0 additions & 60,620 deletions dist/data/cache/homo-sapiens-genes.tsv

This file was deleted.

Binary file added dist/data/cache/homo-sapiens-genes.tsv.gz
Binary file not shown.
20,529 changes: 0 additions & 20,529 deletions dist/data/cache/macaca-fascicularis-genes.tsv

This file was deleted.

Binary file added dist/data/cache/macaca-fascicularis-genes.tsv.gz
Binary file not shown.
23,397 changes: 0 additions & 23,397 deletions dist/data/cache/macaca-mulatta-genes.tsv

This file was deleted.

Binary file added dist/data/cache/macaca-mulatta-genes.tsv.gz
Binary file not shown.
55,405 changes: 0 additions & 55,405 deletions dist/data/cache/mus-musculus-genes.tsv

This file was deleted.

Binary file added dist/data/cache/mus-musculus-genes.tsv.gz
Binary file not shown.
22,504 changes: 0 additions & 22,504 deletions dist/data/cache/pan-troglodytes-genes.tsv

This file was deleted.

Binary file added dist/data/cache/pan-troglodytes-genes.tsv.gz
Binary file not shown.
32,231 changes: 0 additions & 32,231 deletions dist/data/cache/rattus-norvegicus-genes.tsv

This file was deleted.

Binary file added dist/data/cache/rattus-norvegicus-genes.tsv.gz
Binary file not shown.
16,285 changes: 0 additions & 16,285 deletions dist/data/cache/sus-scrofa-genes.tsv

This file was deleted.

Binary file added dist/data/cache/sus-scrofa-genes.tsv.gz
Binary file not shown.
146 changes: 86 additions & 60 deletions scripts/python/cache/gene_cache.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Convert Ensembl GTF files to minimal TSVs for Ideogram.js gene caches
"""Convert Ensembl GFF files to minimal TSVs for Ideogram.js gene caches
Ideogram uses cached gene data to drastically simplify and speed up rendering.
Expand All @@ -7,6 +7,7 @@
import argparse
import codecs
import csv
import gzip
import os
import re
import sys
Expand Down Expand Up @@ -35,41 +36,42 @@
"Bos taurus": "ARS-UCD1.2",
"Sus scrofa": "Sscrofa11.1",
# "Anopheles gambiae": "AgamP4.51",
"Caenorhabditis elegans": "WBcel235"
"Caenorhabditis elegans": "WBcel235",
"Drosophila melanogaster": "BDGP6.28"
}

interesting_genes_by_organism = {
"Homo sapiens": "gene-hints.tsv",
"Mus musculus": "pubmed-citations.tsv",
"Rattus norvegicus": "pubmed-citations.tsv",
"Canis lupus familiaris": "pubmed-citations.tsv",
"Felis catus": "pubmed-citations.tsv",
# "Mus musculus": "pubmed-citations.tsv",
# "Rattus norvegicus": "pubmed-citations.tsv",
# "Canis lupus familiaris": "pubmed-citations.tsv",
# "Felis catus": "pubmed-citations.tsv",
}

# metazoa = {
# "Anopheles gambiae".AgamP4.51.chr.gtf.gz "
# "Anopheles gambiae".AgamP4.51.gff3.gz "
# }

def get_gtf_url(organism):
"""Get URL to GTF file
E.g. https://ftp.ensembl.org/pub/release-102/gtf/homo_sapiens/Homo_sapiens.GRCh38.102.chr.gtf.gz
def get_gff_url(organism):
"""Get URL to GFF file
E.g. https://ftp.ensembl.org/pub/release-102/gff3/homo_sapiens/Homo_sapiens.GRCh38.102.gff3.gz
"""
release = "102"
base = f"https://ftp.ensembl.org/pub/release-{release}/gtf/"
base = f"https://ftp.ensembl.org/pub/release-{release}/gff3/"
asm = assemblies_by_org[organism]
org_us = organism.replace(" ", "_")
org_lcus = org_us.lower()

url = f"{base}{org_lcus}/{org_us}.{asm}.{release}.chr.gtf.gz"
url = f"{base}{org_lcus}/{org_us}.{asm}.{release}.gff3.gz"
return url

def parse_gtf_info_field(info):
"""Parse a GTF "INFO" field into a dictionary
def parse_gff_info_field(info):
"""Parse a GFF "INFO" field into a dictionary
Example INFO field:
gene_id "ENSMUSG00000102628"; gene_version "2"; gene_name "Gm37671"; gene_source "havana"; gene_biotype "TEC";
"""
fields = [f.strip() for f in info.split(';')][:-1]
kvs = [f.split(" ") for f in fields]
kvs = [f.split("=") for f in fields]
info_dict = {}
for kv in kvs:
info_dict[kv[0]] = kv[1].strip('"')
Expand All @@ -78,7 +80,7 @@ def parse_gtf_info_field(info):
def detect_prefix(id):
"""Find the prefix of a feature ID
Feature IDs in a given GTF file have a constant "prefix".
Feature IDs in a given GFF file have a constant "prefix".
E.g. ID ENSMUSG00000102628 has prefix ENSMUSG
"""
prefix = re.match(r"[A-Za-z]+", id).group()
Expand All @@ -97,53 +99,77 @@ def trim_id(id, prefix):
slim_id = id.replace(insignificant, "")
return slim_id

def parse_gene(gtf_row):
"""Parse gene from CSV-reader-split row of GTF file"""
def parse_gene(gff_row):
"""Parse gene from CSV-reader-split row of GFF file"""

if gtf_row[0][0] == "#":
if gff_row[0][0] == "#":
# Skip header
return None

chr = gtf_row[0]
feat_type = gtf_row[2]
if feat_type != "gene":
chr = gff_row[0]
feat_type = gff_row[2]

# feature types that are modeled as genes in Ensembl, i.e.
# that have an Ensembl accession beginning ENSG in human
loose_gene_types = [
"gene", "miRNA", "ncRNA", "ncRNA_gene", "rRNA",
"scRNA", "snRNA", "snoRNA", "tRNA"
]
if feat_type not in loose_gene_types:
return None

start = gtf_row[3]
stop = gtf_row[4]
info = gtf_row[8]
start = gff_row[3]
stop = gff_row[4]
info = gff_row[8]
# print("info", info)
info_dict = parse_gtf_info_field(info)
info_dict = parse_gff_info_field(info)

# For GTF
# id = info_dict["gene_id"]
# symbol = info_dict.get("gene_name")

# For GFF

if "gene_id" not in info_dict:
return None
id = info_dict["gene_id"]
symbol = info_dict.get("gene_name")
symbol = info_dict.get("Name")

# Change e.g.:
# description=transmembrane protein 88B [Source:HGNC Symbol%3BAcc:HGNC:37099]
# to
# transmembrane protein 88B
description = info_dict.get("description", "")
description = description.split(" [")[0]

if symbol is None:
return None

return [chr, start, stop, id, symbol]
return [chr, start, stop, id, symbol, description]

def trim_gtf(gtf_path):
"""Parse GTF into a list of compact genes
def trim_gff(gff_path):
"""Parse GFF into a list of compact genes
"""
print(f"Parsing GTF: {gtf_path}")
print(f"Parsing GFF: {gff_path}")
slim_genes = []
prefix = None

with open(gtf_path) as file:
with open(gff_path) as file:
reader = csv.reader(file, delimiter="\t")
for row in reader:
parsed_gene = parse_gene(row)

if parsed_gene == None:
continue

[chr, start, stop, id, symbol] = parsed_gene
[chr, start, stop, id, symbol, desc] = parsed_gene
length = str(int(stop) - int(start))

if prefix == None:
prefix = detect_prefix(id)
slim_id = trim_id(id, prefix)

slim_genes.append([chr, start, length, slim_id, symbol])
slim_genes.append([chr, start, length, slim_id, symbol, desc])

return [slim_genes, prefix]

Expand Down Expand Up @@ -198,61 +224,61 @@ def sort_by_interest(slim_genes, organism):


class GeneCache():
"""Convert Ensembl GTF files to minimal TSVs for Ideogram.js gene caches
"""Convert Ensembl GFF files to minimal TSVs for Ideogram.js gene caches
"""

def __init__(self, output_dir="data/", reuse_gtf=False):
def __init__(self, output_dir="data/", reuse_gff=False):
self.output_dir = output_dir
self.tmp_dir = "data/"
self.reuse_gtf = reuse_gtf
self.reuse_gff = reuse_gff

if not os.path.exists(self.output_dir):
os.makedirs(self.output_dir)
if not os.path.exists(self.tmp_dir):
os.makedirs(self.tmp_dir)

def fetch_ensembl_gtf(self, organism):
"""Download and decompress an organism's GTF file from Ensembl
def fetch_ensembl_gff(self, organism):
"""Download and decompress an organism's GFF file from Ensembl
"""
print(f"Fetching Ensembl GTF for {organism}")
url = get_gtf_url(organism)
gtf_dir = self.tmp_dir + "gtf/"
if not os.path.exists(gtf_dir):
os.makedirs(gtf_dir)
gtf_path = gtf_dir + url.split("/")[-1]
print(f"Fetching Ensembl GFF for {organism}")
url = get_gff_url(organism)
gff_dir = self.tmp_dir + "gff3/"
if not os.path.exists(gff_dir):
os.makedirs(gff_dir)
gff_path = gff_dir + url.split("/")[-1]
try:
download_gzip(url, gtf_path, cache=self.reuse_gtf)
download_gzip(url, gff_path, cache=self.reuse_gff)
except urllib.error.HTTPError:
# E.g. for C. elegans
url = url.replace("chr.", "")
download_gzip(url, gtf_path, cache=self.reuse_gtf)
return [gtf_path, url]
download_gzip(url, gff_path, cache=self.reuse_gff)
return [gff_path, url]

def write(self, genes, organism, prefix, gtf_url):
def write(self, genes, organism, prefix, gff_url):
"""Save fetched and transformed gene data to cache file
"""
headers = (
f"## Ideogram.js gene cache for {organism}\n" +
f"## Derived from {gtf_url}\n"
f"## Derived from {gff_url}\n"
f"## prefix: {prefix}\n"
f"# chr\tstart\tlength\tslim_id\tsymbol\n"
f"# chr\tstart\tlength\tslim_id\tsymbol\tdescription\n"
)
gene_lines = "\n".join(["\t".join(g) for g in genes])
content = headers + gene_lines

org_lch = organism.lower().replace(" ", "-")
output_path = f"{self.output_dir}{org_lch}-genes.tsv"
with open(output_path, "w") as f:
output_path = f"{self.output_dir}{org_lch}-genes.tsv.gz"
with gzip.open(output_path, "wt") as f:
f.write(content)
print(f"Wrote gene cache: {output_path}")

def populate_by_org(self, organism):
"""Fill gene caches for a configured organism
"""
[gtf_path, gtf_url] = self.fetch_ensembl_gtf(organism)
[slim_genes, prefix] = trim_gtf(gtf_path)
[gff_path, gff_url] = self.fetch_ensembl_gff(organism)
[slim_genes, prefix] = trim_gff(gff_path)
sorted_slim_genes = sort_by_interest(slim_genes, organism)
self.write(sorted_slim_genes, organism, prefix, gtf_url)
self.write(sorted_slim_genes, organism, prefix, gff_url)

def populate(self):
"""Fill gene caches for all configured organisms
Expand All @@ -276,14 +302,14 @@ def populate(self):
default="data/"
)
parser.add_argument(
"--reuse-gtf",
"--reuse-gff",
help=(
"Whether to use previously-downloaded raw GTFs"
"Whether to use previously-downloaded raw GFFs"
),
action="store_true"
)
args = parser.parse_args()
output_dir = args.output_dir
reuse_gtf = args.reuse_gtf
reuse_gff = args.reuse_gff

GeneCache(output_dir, reuse_gtf).populate()
GeneCache(output_dir, reuse_gff).populate()
1 change: 1 addition & 0 deletions scripts/python/lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ def download_gzip(url, output_path, cache=0):
if is_cached(output_path, cache, 1):
return
headers={"Accept-Encoding": "gzip"}
print(f"Requesting {url}")
request = urllib.request.Request(url, headers=headers)
response = urllib.request.urlopen(request, context=ctx)
content = gzip.decompress(response.read()).decode()
Expand Down
29 changes: 21 additions & 8 deletions src/js/gene-cache.js
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
* - test if a given string is a gene name, e.g. for gene search
* - find genomic position of a given gene (or all genes)
*/
import {decompressSync, strFromU8} from 'fflate';

import {slug, getEarlyTaxid, getDir} from './lib';
import {organismMetadata} from './init/organism-metadata';
Expand All @@ -18,13 +19,13 @@ import version from './version';
let perfTimes;

/** Get URL for gene cache file */
function getCacheUrl(orgName, cacheDir, ideo) {
function getCacheUrl(orgName, cacheDir) {
const organism = slug(orgName);
if (!cacheDir) {
cacheDir = getDir('cache/');
}

const cacheUrl = cacheDir + organism + '-genes.tsv';
const cacheUrl = cacheDir + organism + '-genes.tsv.gz';

return cacheUrl;
}
Expand Down Expand Up @@ -80,7 +81,9 @@ function parseCache(rawTsv, orgName) {
const names = [];
const nameCaseMap = {};
const namesById = {};
const fullNamesById = {};
const idsByName = {};
const idsByFullName = {};
const lociByName = {};
const lociById = {};
const preAnnots = [];
Expand All @@ -101,18 +104,21 @@ function parseCache(rawTsv, orgName) {
continue;
}
const [
chromosome, rawStart, rawLength, slimEnsemblId, gene
chromosome, rawStart, rawLength, slimEnsemblId, gene, rawFullName
] = line.trim().split(/\t/);
const fullName = decodeURIComponent(rawFullName);
const start = parseInt(rawStart);
const stop = start + parseInt(rawLength);
const ensemblId = getEnsemblId(ensemblPrefix, slimEnsemblId);
preAnnots.push([chromosome, start, stop, ensemblId, gene]);
preAnnots.push([chromosome, start, stop, ensemblId, gene, fullName]);
const locus = [chromosome, start, stop];

names.push(gene);
nameCaseMap[gene.toLowerCase()] = gene;
namesById[ensemblId] = gene;
fullNamesById[ensemblId] = fullName;
idsByName[gene] = ensemblId;
idsByFullName[fullName] = ensemblId;
lociByName[gene] = locus;
lociById[ensemblId] = locus;
};
Expand All @@ -123,7 +129,8 @@ function parseCache(rawTsv, orgName) {
perfTimes.parseAnnots = Math.round(performance.now() - t1);

return [
names, nameCaseMap, namesById, idsByName, lociByName, lociById,
names, nameCaseMap, namesById, fullNamesById,
idsByName, idsByFullName, lociByName, lociById,
sortedAnnots
];
}
Expand Down Expand Up @@ -181,21 +188,27 @@ export default async function initGeneCache(orgName, ideo, cacheDir=null) {
const fetchStartTime = performance.now();
const response = await cacheFetch(cacheUrl);

const data = await response.text();
// const data = await response.text();

const blob = await response.blob();
const uint8Array = new Uint8Array(await blob.arrayBuffer());
const data = strFromU8(decompressSync(uint8Array));
const fetchEndTime = performance.now();
perfTimes.fetch = Math.round(fetchEndTime - fetchStartTime);

const [
interestingNames, nameCaseMap, namesById, idsByName,
lociByName, lociById, sortedAnnots
interestingNames, nameCaseMap, namesById, fullNamesById,
idsByName, idsByFullName, lociByName, lociById, sortedAnnots
] = parseCache(data, orgName);
perfTimes.parseCache = Math.round(performance.now() - fetchEndTime);

ideo.geneCache = {
interestingNames, // Array ordered by general or scholarly interest
nameCaseMap, // Maps of lowercase gene names to proper gene names
namesById,
fullNamesById,
idsByName,
idsByFullName,
lociByName, // Object of gene positions, keyed by gene name
lociById,
sortedAnnots // Ideogram annotations sorted by genomic position
Expand Down
Loading

0 comments on commit 04b6ccb

Please sign in to comment.