added method to obtain refseq ids corresponding to gencode transcript…

…s; added refseq queries to user interface elements. This commit implements #13
cmbi · Apr 18, 2019 · da4d678 · da4d678
1 parent 3747c6f
commit da4d678
Show file tree

Hide file tree

Showing 5 changed files with 68 additions and 7 deletions.
diff --git a/metadome/default_settings.py b/metadome/default_settings.py
@@ -48,6 +48,7 @@
 GENCODE_HG_ANNOTATION_FILE_GFF3 = DATA_DIR+"Gencode/gencode.v19.annotation.gff3"
 GENCODE_HG_TRANSCRIPTION_FILE = DATA_DIR+"Gencode/gencode.v19.pc_transcripts.fa"
 GENCODE_HG_TRANSLATION_FILE = DATA_DIR+"Gencode/gencode.v19.pc_translations.fa"
+GENCODE_REFSEQ_FILE = DATA_DIR+"Gencode/gencode.v19.metadata.RefSeq"
 GENCODE_SWISSPROT_FILE = DATA_DIR+"Gencode/gencode.v19.metadata.SwissProt"
 GENCODE_BASIC_FILE = DATA_DIR+"Gencode/ucsc.gencode.v19.wgEncodeGencodeBasic.txt"
 

diff --git a/metadome/domain/wrappers/gencode.py b/metadome/domain/wrappers/gencode.py
@@ -1,7 +1,7 @@
 import logging
 from metadome.default_settings import GENCODE_HG_TRANSLATION_FILE,\
     GENCODE_SWISSPROT_FILE, GENCODE_HG_TRANSCRIPTION_FILE,\
-    GENCODE_HG_ANNOTATION_FILE_GFF3, GENCODE_BASIC_FILE
+    GENCODE_HG_ANNOTATION_FILE_GFF3, GENCODE_BASIC_FILE, GENCODE_REFSEQ_FILE
 from metadome.domain.parsers import gff3
 from Bio.Seq import translate
 import urllib
@@ -383,4 +383,36 @@ def retrieve_all_protein_coding_gene_names():
                 # add the gene name to the set
                 gene_names.add(tokens[5])
 
-    return list(gene_names)
+    return list(gene_names)
+
+def retrieve_refseq_identifiers_for_transcript(gencode_id):
+    """Retrieves the refseq identifiers for a Gencode transcript"""
+    result = {}
+    result['NP'] = []
+    result['NM'] = []
+    result['NR'] = []
+    with open(GENCODE_REFSEQ_FILE) as gencode_refseq:
+        # read the lines in the file
+        lines = gencode_refseq.readlines()
+        for line in lines:
+            # check if the unique identifier is on the current line
+            if gencode_id in line:
+                #Add the result to hits
+                tokens = line.split('\t')
+
+                # Only add the translation to the translation list if the gene_name exactly matches the one we are looking for
+                if gencode_id == tokens[0]:
+                    # add the results
+                    for token in tokens[1:]:
+                        token = token.strip()
+                        if token.startswith('NP'):
+                            result['NP'].append(token)
+                        elif token.startswith('NM'):
+                            result['NM'].append(token)
+                        elif token.startswith('NR'):
+                            result['NR'].append(token)
+                        elif len(token) == 0:
+                            continue
+                        else:
+                            _log.warning("When retrieving matching RefSeq ids for "+gencode_id+" unexpected token: "+token)    
+    return result
diff --git a/metadome/presentation/api/routes.py b/metadome/presentation/api/routes.py
@@ -14,6 +14,7 @@
 from metadome.controllers.job import (create_visualization_job_if_needed,
                                       get_visualization_status,
                                       retrieve_visualization)
+from metadome.domain.wrappers.gencode import retrieve_refseq_identifiers_for_transcript
 
 
 _log = logging.getLogger(__name__)
@@ -30,7 +31,7 @@ def get_transcript_ids_for_gene(gene_name):
     _log.debug('get_transcript_ids_for_gene')
     # retrieve the transcript ids for this gene
     trancripts = GeneRepository.retrieve_all_transcript_ids(gene_name)
-
+    
     # check if there was any return value
     if len(trancripts) > 0:
         message = "Retrieved transcripts for gene '"+trancripts[0].gene_name+"'"
@@ -39,9 +40,14 @@ def get_transcript_ids_for_gene(gene_name):
 
     transcript_results = []
     for t in trancripts:
+        # retrieve matching refseq identifiers for this transcript 
+        refseq_ids = retrieve_refseq_identifiers_for_transcript(t.gencode_transcription_id)
+        refseq_nm_numbers = ", ".join(nm_number for nm_number in refseq_ids['NM'])
+
         transcript_entry = {}
         transcript_entry['aa_length'] = t.sequence_length
         transcript_entry['gencode_id'] = t.gencode_transcription_id
+        transcript_entry['refseq_nm_numbers'] = refseq_nm_numbers
         transcript_entry['has_protein_data'] = not t.protein_id is None
         transcript_results.append(transcript_entry)
 

diff --git a/metadome/presentation/web/templates/js/dashboard.js b/metadome/presentation/web/templates/js/dashboard.js
@@ -400,7 +400,12 @@ function getTranscript() {
 				for (var i = 0; i < transcript_id_results.trancript_ids.length; i++) {
 					var opt = new Option();
 					opt.value = i;
-					opt.text = transcript_id_results.trancript_ids[i].gencode_id + " ("+ transcript_id_results.trancript_ids[i].aa_length +"aa)" ;
+					if ( transcript_id_results.trancript_ids[i].refseq_nm_numbers === ""){
+						opt.text = transcript_id_results.trancript_ids[i].gencode_id + " ("+ transcript_id_results.trancript_ids[i].aa_length +"aa)" ;
+					}
+					else {
+						opt.text = transcript_id_results.trancript_ids[i].gencode_id + " / "+ transcript_id_results.trancript_ids[i].refseq_nm_numbers+" ("+ transcript_id_results.trancript_ids[i].aa_length +"aa)" ;
+					}
 
 					if (!transcript_id_results.trancript_ids[i].has_protein_data){
 						opt.disabled = true;
@@ -529,9 +534,22 @@ function getVisualizationResult(transcript_id) {
             $("#graph_control_field").removeClass('is-hidden');
             var geneName = document.getElementById("geneName").value;
             var geneDetails = document.getElementById("geneDetails");
+
+            var refSeqLinks = "";
+            if (obj.refseq_ids.length > 0) {
+            	refSeqLinks += "RefSeq: "; 	
+	            for (var i = 0; i < obj.refseq_ids.length; i++){
+	            	if (i > 0){
+	            		refSeqLinks += ', ';
+	            	}
+	            	refSeqLinks += '<a href="https://www.ncbi.nlm.nih.gov/nuccore/'+obj.refseq_ids[i]+'" target="_blank">'+obj.refseq_ids[i]+'</a>';
+	            }
+	            refSeqLinks += ','
+            }
+
             geneDetails.innerHTML = 'Gene: ' + obj.gene_name + ' (transcript: <a href="http://grch37.ensembl.org/Homo_sapiens/Transcript/Summary?t='
                                   + obj.transcript_id + '" target="_blank">' + obj.transcript_id
-                                  + '</a>, protein: <a href="https://www.uniprot.org/uniprot/'
+                                  + '</a>, '+refSeqLinks+' protein: <a href="https://www.uniprot.org/uniprot/'
                                   + obj.protein_ac + '" target="_blank">' + obj.protein_ac + '</a>)';
             // Draw the graph
             createGraph(obj);

diff --git a/metadome/tasks.py b/metadome/tasks.py
@@ -20,6 +20,7 @@
 import os
 import logging
 from metadome.domain.models.gene import Strand
+from metadome.domain.wrappers.gencode import retrieve_refseq_identifiers_for_transcript
 
 _log = logging.getLogger(__name__)
 
@@ -153,7 +154,6 @@ def retrieve_metadomain_annotation(transcript_id, protein_position, domain_posit
             transcript_ids = [meta_snvs[meta_snv_repr][0]['gencode_transcription_id'] for meta_snv_repr in meta_snvs.keys()]
             transcripts_to_gene = GeneRepository.retrieve_gene_names_for_multiple_transcript_ids(transcript_ids)
 
-#             hier ergens iets doen: retrieve_gene_names_for_multiple_transcript_ids???
             # iterate over meta_codons and add to metadom_entry
             for meta_snv_repr in meta_snvs.keys():
                 if not current_codon.unique_str_representation() in meta_snv_repr:
@@ -197,6 +197,7 @@ def retrieve_metadomain_annotation(transcript_id, protein_position, domain_posit
         domain_results[domain_id]["alignment_depth"] = alignment_depth
 
     return domain_results
+
 def analyse_transcript(transcript_id):
     # Retrieve the gene from the database
     try:
@@ -206,6 +207,9 @@ def analyse_transcript(transcript_id):
 
     # build the gene region
     gene_region = GeneRegion(gene)
+
+    # Retrieve the refseq ids    
+    refseq_ids = retrieve_refseq_identifiers_for_transcript(transcript_id)
 
     if not gene_region is None:
         # generate the positional annotation for this gene by first computing the tolerance landscape
@@ -289,7 +293,7 @@ def analyse_transcript(transcript_id):
 
                         if domain["metadomain"] and len(consensus_positions)>0:
                             d['domains'][domain['ID']] = create_meta_domain_entry(gene_region, meta_domains[domain['ID']], consensus_positions, db_position)
-        result = {"transcript_id":transcript_id, "protein_ac":gene_region.uniprot_ac, "gene_name":gene_region.gene_name, "positional_annotation":region_positional_annotation, "domains":Pfam_domains}
+        result = {"transcript_id":transcript_id, "refseq_ids":refseq_ids['NM'], "protein_ac":gene_region.uniprot_ac, "gene_name":gene_region.gene_name, "positional_annotation":region_positional_annotation, "domains":Pfam_domains}
     else:
         result = {'error': 'No gene region could be build for transcript '+str(transcript_id)}