# MitoCarta2: Mapping UniProt Accessions to Entrez Gene IDs

- http://www.uniprot.org/help/api_idmapping

In [1]:
import tempfile
import omin
import pandas as pd
from urllib import parse
from urllib import request

def uniprot_query(start, end, query):
    """Return the raw text of a query against the UniProt RESTful API.

    Parameters
    ----------
    start : str
        Value sent as the ``from`` form field, e.g. ``'P_ENTREZGENEID'``.
    end : str
        Value sent as the ``to`` form field, e.g. ``'ACC'``.
    query : str
        Value sent as the ``query`` form field. This notebook passes
        identifiers separated by single spaces.

    Returns
    -------
    str
        The response body decoded as UTF-8. The ``format`` field is fixed
        to ``'tab'``.

    Raises
    ------
    urllib.error.URLError
        Propagated unchanged from ``urlopen`` when the request fails.
    """
    # NOTE(review): this is UniProt's upload-lists endpoint; confirm it is
    # still served before relying on this function.
    url = 'http://www.uniprot.org/uploadlists/'

    params = {'from': start,
              'to': end,
              'format': 'tab',
              'query': query}

    # Supplying a request body makes urllib issue a POST.
    payload = parse.urlencode(params).encode('utf-8')
    req = request.Request(url, payload)

    # The context manager closes the connection even if read() raises.
    with request.urlopen(req) as response:
        return response.read().decode('utf-8')

In [2]:
# FIXME: Before publishing make sure to include derivation of omin.MitoCartaTwo.data
# Build the query: every MouseGeneID cast to str and separated by single spaces.
all_gene_ids = ' '.join(omin.MitoCartaTwo.data.MouseGeneID.astype(str).tolist())
# Ask UniProt to map the IDs from 'P_ENTREZGENEID' to 'ACC'.
res = uniprot_query(start='P_ENTREZGENEID', end='ACC', query=all_gene_ids)

In [3]:
# Reformat the raw response text into a clean tab-separated table.
# Discard the first two tab-delimited fields of the response
# (presumably the two leading header labels -- confirm against a live response).
trimmed = '\t'.join(res.split('\t')[2:])
# Keep only non-empty lines.
lines = [line for line in trimmed.split('\n') if line]
# Prefix the first line with an 'Entrez' column label.
lines[0] = '\t'.join(['Entrez', lines[0]])
# Remove empty fields within each line.
# NOTE(review): dropping empty fields shifts later columns left on any row
# that has a blank cell -- confirm this is intended.
lines = ['\t'.join(field for field in line.split('\t') if field) for line in lines]
r = '\n'.join(lines)

In [4]:
# Round-trip the formatted text through a temporary file so pandas can parse it.
# The context manager closes (and thereby destroys) the temp file even if
# writing or parsing raises.
with tempfile.TemporaryFile(mode='w+') as temp:
    temp.write(r)
    # Rewind so pandas reads from the start of the file.
    temp.seek(0)
    # Read the results into a pandas DataFrame.
    req_table = pd.read_table(temp, sep='\t')

In [5]:
# Keep the first three columns of the results table and give them the names
# used in the rest of the notebook.
leading_columns = req_table.columns[:3]
areq = (
    req_table[leading_columns]
    .copy()
    .rename(columns={'Entrez': 'MouseGeneID',
                     'Entry': 'Accession',
                     'Entry name': 'UniProtKB'})
)
# Reverse the column order.
areq = areq[areq.columns[::-1]]

In [6]:
# Work on a copy of the MitoCarta2 DataFrame so merge doesn't complain.
new_mc = omin.MitoCartaTwo.data.copy()
# Left join on MouseGeneID: every row of the UniProt table is kept.
nmc = pd.merge(areq, new_mc, how="left", on="MouseGeneID")
# Persist the merged table as a gzip-compressed pickle.
nmc.to_pickle('uniprotkb_mitocarta2.p.gz', compression='gzip')

In [7]:
# Sanity check: list the MitoCarta2 gene IDs that are absent from the merged table.
mgi_mc = set(new_mc.MouseGeneID.tolist())
mgi_mc2 = set(nmc.MouseGeneID.tolist())

mgi_mc - mgi_mc2

{15925, 245347}

These are the only gene