Skip to content

Commit

Permalink
added functionality to replace outdated with current hgnc symbols in …
Browse files Browse the repository at this point in the history
…PC v12
  • Loading branch information
ri23 committed Apr 19, 2020
1 parent 43a6d12 commit 387eb0f
Showing 1 changed file with 41 additions and 9 deletions.
50 changes: 41 additions & 9 deletions genewalk/resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
import shutil
import logging
import urllib.request
import pandas
from indra.databases import hgnc_client

logger = logging.getLogger('genewalk.resources')

Expand Down Expand Up @@ -31,13 +33,17 @@ def get_goa_gaf(self):
return fname

def get_pc(self):
    """Return the path of the Pathway Commons v12 SIF file that uses
    current HGNC symbols, creating it first if it is missing.

    If the updated file is already in the resource folder its path is
    returned immediately. Otherwise the original Pathway Commons file is
    obtained (via ``download_gz`` when it is not on disk yet) and passed
    to ``_replace_outdated_hgnc_symbols``, which is expected to write the
    updated file.

    Returns
    -------
    str
        Path to ``PathwayCommons12.All.hgnc_current.sif`` inside
        ``self.resource_folder``.
    """
    fname_current = os.path.join(self.resource_folder,
                                 'PathwayCommons12.All.hgnc_current.sif')
    # Guard clause: nothing to do when the updated file already exists.
    if os.path.exists(fname_current):
        return fname_current

    fname = os.path.join(self.resource_folder,
                         'PathwayCommons12.All.hgnc.sif')
    if not os.path.exists(fname):
        url_pc = ('http://www.pathwaycommons.org/archives/PC2/v12/'
                  'PathwayCommons12.All.hgnc.sif.gz')
        # NOTE(review): download_gz is defined elsewhere in this module;
        # presumably it downloads and decompresses into fname - confirm.
        download_gz(fname, url_pc)
    self._replace_outdated_hgnc_symbols(fname, fname_current)
    return fname_current

def get_mgi_entrez(self):
fname = os.path.join(self.resource_folder, 'MGI_EntrezGene.rpt')
Expand All @@ -49,20 +55,46 @@ def get_mgi_entrez(self):

def _get_resource_folder(self):
    """Return the ``resources`` directory under ``self.base_folder``,
    creating it if necessary.

    Creation is best effort: if the directory cannot be created, a
    warning with the underlying error is logged and the path is still
    returned, so callers see the failure when they first use the folder.

    Returns
    -------
    str
        Path of the resource directory.
    """
    resource_dir = os.path.join(self.base_folder, 'resources')
    try:
        # exist_ok=True makes this safe when the folder already exists,
        # including when another process creates it concurrently.
        os.makedirs(resource_dir, exist_ok=True)
    except OSError as err:
        # Report the real cause (permissions, path is a regular file, ...)
        # rather than assuming the folder already exists.
        logger.warning('Could not create resource folder %s: %s',
                       resource_dir, err)
    return resource_dir

def _replace_outdated_hgnc_symbols(self, pc_old, pc_current):
    """Rewrite a Pathway Commons SIF file with up-to-date HGNC symbols.

    The tab-separated file ``pc_old`` (no header; the first three columns
    are source, relation type and target) is read, every source/target
    entry that ``hgnc_client`` maps to a single current HGNC ID with a
    different symbol is replaced by that symbol, and the result is
    written to ``pc_current``. ``pc_old`` is deleted afterwards.

    Entries starting with ``CHEBI:`` are left alone, as are symbols with
    no current HGNC ID and outdated symbols that map to several genes.

    Parameters
    ----------
    pc_old : str
        Path of the existing SIF file; removed on success.
    pc_current : str
        Path the updated SIF file is written to.
    """
    logger.info('Replacing outdated HGNC symbols in %s and save as %s',
                pc_old, pc_current)
    # na_filter=False keeps every field as the literal text in the file.
    # With the pandas defaults, strings such as "NA" or "NULL" become NaN,
    # which would crash sym.startswith() below and be written back as an
    # empty field.
    pc = pandas.read_csv(pc_old, sep='\t', dtype=str, header=None,
                         na_filter=False)
    pc = pc.rename(columns={0: 'source', 1: 'rel_type', 2: 'target'})
    node_columns = ('source', 'target')

    all_symbols = set(pc['source']).union(pc['target'])
    symbol_map = {}
    for sym in all_symbols:
        if sym.startswith('CHEBI:'):
            continue
        hgnc_id = hgnc_client.get_current_hgnc_id(sym)
        # A falsy result means no current ID; a list means the outdated
        # gene symbol is ambiguous (maps to multiple genes).
        if not hgnc_id or isinstance(hgnc_id, list):
            continue
        latest_symbol = hgnc_client.get_hgnc_name(hgnc_id)
        # Never map a symbol to an empty/None name.
        if latest_symbol and latest_symbol != sym:
            symbol_map[sym] = latest_symbol

    if symbol_map:
        # Only the node columns hold symbols; leave rel_type untouched.
        for col in node_columns:
            pc[col] = pc[col].map(lambda s: symbol_map.get(s, s))

    # Write to a temporary file and rename it into place, so an interrupted
    # write never leaves a truncated file at pc_current (its mere existence
    # is what marks the conversion as done).
    tmp_path = pc_current + '.tmp'
    pc.to_csv(tmp_path, sep='\t', header=False, index=False)
    os.replace(tmp_path, pc_current)
    os.remove(pc_old)

def download_all(self):
    """Fetch every resource by calling each getter in a fixed order:
    GO OBO, GOA GAF, Pathway Commons, then MGI Entrez.
    """
    getter_names = (
        'get_go_obo',
        'get_goa_gaf',
        'get_pc',
        'get_mgi_entrez',
    )
    for getter_name in getter_names:
        # Look up each getter just before calling it, one after another.
        getattr(self, getter_name)()


def download_url(url, fname):
logger.info('Downloading %s into %s' % (url, fname))
Expand Down

0 comments on commit 387eb0f

Please sign in to comment.