Skip to content

Commit

Permalink
added method to obtain refseq ids corresponding to gencode transcript…
Browse files Browse the repository at this point in the history
…s; added refseq queries to user interface elements. This commit implements #13
  • Loading branch information
laurensvdwiel committed Apr 18, 2019
1 parent 3747c6f commit da4d678
Show file tree
Hide file tree
Showing 5 changed files with 68 additions and 7 deletions.
1 change: 1 addition & 0 deletions metadome/default_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
GENCODE_HG_ANNOTATION_FILE_GFF3 = DATA_DIR+"Gencode/gencode.v19.annotation.gff3"
GENCODE_HG_TRANSCRIPTION_FILE = DATA_DIR+"Gencode/gencode.v19.pc_transcripts.fa"
GENCODE_HG_TRANSLATION_FILE = DATA_DIR+"Gencode/gencode.v19.pc_translations.fa"
GENCODE_REFSEQ_FILE = DATA_DIR+"Gencode/gencode.v19.metadata.RefSeq"
GENCODE_SWISSPROT_FILE = DATA_DIR+"Gencode/gencode.v19.metadata.SwissProt"
GENCODE_BASIC_FILE = DATA_DIR+"Gencode/ucsc.gencode.v19.wgEncodeGencodeBasic.txt"

Expand Down
36 changes: 34 additions & 2 deletions metadome/domain/wrappers/gencode.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import logging
from metadome.default_settings import GENCODE_HG_TRANSLATION_FILE,\
GENCODE_SWISSPROT_FILE, GENCODE_HG_TRANSCRIPTION_FILE,\
GENCODE_HG_ANNOTATION_FILE_GFF3, GENCODE_BASIC_FILE
GENCODE_HG_ANNOTATION_FILE_GFF3, GENCODE_BASIC_FILE, GENCODE_REFSEQ_FILE
from metadome.domain.parsers import gff3
from Bio.Seq import translate
import urllib
Expand Down Expand Up @@ -383,4 +383,36 @@ def retrieve_all_protein_coding_gene_names():
# add the gene name to the set
gene_names.add(tokens[5])

return list(gene_names)
return list(gene_names)

def retrieve_refseq_identifiers_for_transcript(gencode_id):
"""Retrieves the refseq identifiers for a Gencode transcript"""
result = {}
result['NP'] = []
result['NM'] = []
result['NR'] = []
with open(GENCODE_REFSEQ_FILE) as gencode_refseq:
# read the lines in the file
lines = gencode_refseq.readlines()
for line in lines:
# check if the unique identifier is on the current line
if gencode_id in line:
#Add the result to hits
tokens = line.split('\t')

# Only add the translation to the translation list if the gene_name exactly matches the one we are looking for
if gencode_id == tokens[0]:
# add the results
for token in tokens[1:]:
token = token.strip()
if token.startswith('NP'):
result['NP'].append(token)
elif token.startswith('NM'):
result['NM'].append(token)
elif token.startswith('NR'):
result['NR'].append(token)
elif len(token) == 0:
continue
else:
_log.warning("When retrieving matching RefSeq ids for "+gencode_id+" unexpected token: "+token)
return result
8 changes: 7 additions & 1 deletion metadome/presentation/api/routes.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from metadome.controllers.job import (create_visualization_job_if_needed,
get_visualization_status,
retrieve_visualization)
from metadome.domain.wrappers.gencode import retrieve_refseq_identifiers_for_transcript


_log = logging.getLogger(__name__)
Expand All @@ -30,7 +31,7 @@ def get_transcript_ids_for_gene(gene_name):
_log.debug('get_transcript_ids_for_gene')
# retrieve the transcript ids for this gene
trancripts = GeneRepository.retrieve_all_transcript_ids(gene_name)

# check if there was any return value
if len(trancripts) > 0:
message = "Retrieved transcripts for gene '"+trancripts[0].gene_name+"'"
Expand All @@ -39,9 +40,14 @@ def get_transcript_ids_for_gene(gene_name):

transcript_results = []
for t in trancripts:
# retrieve matching refseq identifiers for this transcript
refseq_ids = retrieve_refseq_identifiers_for_transcript(t.gencode_transcription_id)
refseq_nm_numbers = ", ".join(nm_number for nm_number in refseq_ids['NM'])

transcript_entry = {}
transcript_entry['aa_length'] = t.sequence_length
transcript_entry['gencode_id'] = t.gencode_transcription_id
transcript_entry['refseq_nm_numbers'] = refseq_nm_numbers
transcript_entry['has_protein_data'] = not t.protein_id is None
transcript_results.append(transcript_entry)

Expand Down
22 changes: 20 additions & 2 deletions metadome/presentation/web/templates/js/dashboard.js
Original file line number Diff line number Diff line change
Expand Up @@ -400,7 +400,12 @@ function getTranscript() {
for (var i = 0; i < transcript_id_results.trancript_ids.length; i++) {
var opt = new Option();
opt.value = i;
opt.text = transcript_id_results.trancript_ids[i].gencode_id + " ("+ transcript_id_results.trancript_ids[i].aa_length +"aa)" ;
if ( transcript_id_results.trancript_ids[i].refseq_nm_numbers === ""){
opt.text = transcript_id_results.trancript_ids[i].gencode_id + " ("+ transcript_id_results.trancript_ids[i].aa_length +"aa)" ;
}
else {
opt.text = transcript_id_results.trancript_ids[i].gencode_id + " / "+ transcript_id_results.trancript_ids[i].refseq_nm_numbers+" ("+ transcript_id_results.trancript_ids[i].aa_length +"aa)" ;
}

if (!transcript_id_results.trancript_ids[i].has_protein_data){
opt.disabled = true;
Expand Down Expand Up @@ -529,9 +534,22 @@ function getVisualizationResult(transcript_id) {
$("#graph_control_field").removeClass('is-hidden');
var geneName = document.getElementById("geneName").value;
var geneDetails = document.getElementById("geneDetails");

var refSeqLinks = "";
if (obj.refseq_ids.length > 0) {
refSeqLinks += "RefSeq: ";
for (var i = 0; i < obj.refseq_ids.length; i++){
if (i > 0){
refSeqLinks += ', ';
}
refSeqLinks += '<a href="https://www.ncbi.nlm.nih.gov/nuccore/'+obj.refseq_ids[i]+'" target="_blank">'+obj.refseq_ids[i]+'</a>';
}
refSeqLinks += ','
}

geneDetails.innerHTML = 'Gene: ' + obj.gene_name + ' (transcript: <a href="http://grch37.ensembl.org/Homo_sapiens/Transcript/Summary?t='
+ obj.transcript_id + '" target="_blank">' + obj.transcript_id
+ '</a>, protein: <a href="https://www.uniprot.org/uniprot/'
+ '</a>, '+refSeqLinks+' protein: <a href="https://www.uniprot.org/uniprot/'
+ obj.protein_ac + '" target="_blank">' + obj.protein_ac + '</a>)';
// Draw the graph
createGraph(obj);
Expand Down
8 changes: 6 additions & 2 deletions metadome/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import os
import logging
from metadome.domain.models.gene import Strand
from metadome.domain.wrappers.gencode import retrieve_refseq_identifiers_for_transcript

_log = logging.getLogger(__name__)

Expand Down Expand Up @@ -153,7 +154,6 @@ def retrieve_metadomain_annotation(transcript_id, protein_position, domain_posit
transcript_ids = [meta_snvs[meta_snv_repr][0]['gencode_transcription_id'] for meta_snv_repr in meta_snvs.keys()]
transcripts_to_gene = GeneRepository.retrieve_gene_names_for_multiple_transcript_ids(transcript_ids)

# hier ergens iets doen: retrieve_gene_names_for_multiple_transcript_ids???
# iterate over meta_codons and add to metadom_entry
for meta_snv_repr in meta_snvs.keys():
if not current_codon.unique_str_representation() in meta_snv_repr:
Expand Down Expand Up @@ -197,6 +197,7 @@ def retrieve_metadomain_annotation(transcript_id, protein_position, domain_posit
domain_results[domain_id]["alignment_depth"] = alignment_depth

return domain_results

def analyse_transcript(transcript_id):
# Retrieve the gene from the database
try:
Expand All @@ -206,6 +207,9 @@ def analyse_transcript(transcript_id):

# build the gene region
gene_region = GeneRegion(gene)

# Retrieve the refseq ids
refseq_ids = retrieve_refseq_identifiers_for_transcript(transcript_id)

if not gene_region is None:
# generate the positional annotation for this gene by first computing the tolerance landscape
Expand Down Expand Up @@ -289,7 +293,7 @@ def analyse_transcript(transcript_id):

if domain["metadomain"] and len(consensus_positions)>0:
d['domains'][domain['ID']] = create_meta_domain_entry(gene_region, meta_domains[domain['ID']], consensus_positions, db_position)
result = {"transcript_id":transcript_id, "protein_ac":gene_region.uniprot_ac, "gene_name":gene_region.gene_name, "positional_annotation":region_positional_annotation, "domains":Pfam_domains}
result = {"transcript_id":transcript_id, "refseq_ids":refseq_ids['NM'], "protein_ac":gene_region.uniprot_ac, "gene_name":gene_region.gene_name, "positional_annotation":region_positional_annotation, "domains":Pfam_domains}
else:
result = {'error': 'No gene region could be build for transcript '+str(transcript_id)}

Expand Down

0 comments on commit da4d678

Please sign in to comment.