In [35]:
# %load parser.py
# import the pandas, requests, io, and mygene libraries
import pandas as pd
import requests
import io
import mygene

# construct the query data: transmembrane-annotated entries for human and
# mouse, returning the accession and the subcellular-location comment as a
# tab-separated table
query = {
  "query": "annotation:(type:transmem) (organism:\"Homo sapiens (Human) [9606]\" OR organism:\"Mus musculus (Mouse) [10090]\")",
  "columns": "id,comment(SUBCELLULAR LOCATION)",
  "format": "tab",
}

# get the data from UniProt
# A timeout keeps the cell from hanging forever on a stalled connection.
r = requests.get("http://www.uniprot.org/uniprot/", params=query, timeout=300)

# Fail here on an HTTP error instead of letting a later cell parse an error
# page as if it were the result table.
r.raise_for_status()

In [36]:
# Simple expression to find annotation data
def findAnnotation(text):
  """Return the contents of the first ``{...}`` group found in ``text``.

  ``text`` is converted with ``str()`` first, so non-string values (for
  example a missing cell) are accepted.

  Returns:
    The characters between the first ``{`` and the first ``}`` that follows
    it. If the group is never closed, everything after the ``{`` is
    returned. If there is no ``{`` at all, the string ``"None"`` is returned.
  """
  text = str(text)

  start = text.find("{")
  if start == -1:
    return "None"

  # Search for the closing brace only *after* the opening one; a stray "}"
  # earlier in the text must not be taken as the end of the annotation.
  end = text.find("}", start + 1)
  if end == -1:
    # Unterminated annotation: slicing with -1 would silently drop the last
    # character, so keep the whole remainder instead.
    return text[start + 1:]

  return text[start + 1:end]

# expression to check for the type of protein
def findType(entry):
  """Classify a subcellular-location description by membrane topology.

  ``entry`` is converted with ``str()`` and searched for the keywords
  "Single-pass", "Multi-pass" and "Beta-barrel", in that order; the label of
  the first keyword present is returned. When none is present the result is
  "Data unknown".
  """
  description = str(entry)

  # Order matters: the first keyword found decides the label.
  topology_labels = (
    ("Single-pass", "Single-pass membrane protein"),
    ("Multi-pass", "Multi-pass membrane protein"),
    ("Beta-barrel", "Beta-barrel membrane protein"),
  )
  for keyword, label in topology_labels:
    if keyword in description:
      return label

  return "Data unknown"

# Parse the tab-separated response body; the first line is treated as the
# header row and the two columns are given our own names.
data = pd.read_table(
    io.StringIO(r.text),
    header=0,
    names=["UniProtID", "Reference"],
)

# Derive the membrane-topology label from the location text
data["Type"] = data["Reference"].apply(findType)

# Derive the first {...} annotation from the location text
data["Annotations"] = data["Reference"].apply(findAnnotation)

In [37]:
# Look up every UniProt ID through mygene to collect additional identifiers.
# returnall=True makes the result a dict; later cells read its "out" and
# "dup" entries.
mg = mygene.MyGeneInfo()
out = mg.querymany(
    data["UniProtID"],
    scopes="uniprot",
    fields="entrezgene,ensembl.gene,refseq,symbol,taxid",
    returnall=True,
)

querying 1-1000...done.
querying 1001-2000...done.
querying 2001-3000...done.
querying 3001-4000...done.
querying 4001-5000...done.
querying 5001-6000...done.
querying 6001-7000...done.
querying 7001-8000...done.
querying 8001-9000...done.
querying 9001-10000...done.
querying 10001-11000...done.
querying 11001-12000...done.
querying 12001-13000...done.
querying 13001-14000...done.
querying 14001-15000...done.
querying 15001-16000...done.
querying 16001-17000...done.
querying 17001-18000...done.
querying 18001-19000...done.
querying 19001-20000...done.
querying 20001-21000...done.
querying 21001-22000...done.
querying 22001-23000...done.
querying 23001-24000...done.
querying 24001-25000...done.
querying 25001-26000...done.
querying 26001-27000...done.
querying 27001-28000...done.
querying 28001-29000...done.
querying 29001-30000...done.
querying 30001-31000...done.
querying 31001-32000...done.
querying 32001-33000...done.
querying 33001-34000...done.
querying 34001-35000...done.
queryin

In [38]:
def getTax(value):
    """Translate a taxonomy id into a species label.

    ``value`` is compared by its ``str()`` form, so ints and strings are
    both accepted. "9606" maps to "Human" and "10090" to "Mouse"; anything
    else yields "Unknown".
    """
    species_by_taxid = {
        "9606": "Human",
        "10090": "Mouse",
    }
    return species_by_taxid.get(str(value), "Unknown")
    
# change all values to mouse/human
# The entries are rewritten in place, so this cell may run more than once on
# the same `out`. Values that are already one of getTax()'s labels are left
# alone; feeding "Human"/"Mouse" back through getTax() would turn them into
# "Unknown".
for entry in out["out"]:
    if "taxid" in entry and entry["taxid"] not in ("Human", "Mouse", "Unknown"):
        entry["taxid"] = getTax(entry["taxid"])

In [40]:
# out["dup"] is a sequence of (query, value) pairs; turn it into a lookup
# keyed by query (isUnique below treats the value as a count)
duplicates = dict(out["dup"])

# every hit whose query appears in that lookup
dups = list(filter(lambda hit: hit["query"] in duplicates, out["out"]))

# whether or not an entry is unique
def isUnique(entry):
    """Decide whether ``entry`` should be kept, merging duplicate hits into it.

    Reads the module-level ``duplicates`` mapping and ``dups`` list.

    * If the entry's query is not in ``duplicates``, return True.
    * If it is and its value there is greater than 1, this is the first hit
      seen for that query: each of the fields entrezgene, ensembl, refseq,
      symbol and taxid on ``entry`` is replaced by a list of that field's
      values across all hits in ``dups`` with the same query (hits lacking
      the field are skipped), the value in ``duplicates`` is set to 1, and
      True is returned.
    * Otherwise the query was already merged; return False.

    Side effects: mutates ``entry`` and ``duplicates``.
    """
    query_id = entry["query"]

    # Queries that never produced multiple hits pass straight through.
    if query_id not in duplicates:
        return True

    # A value of 1 (or less) marks a query whose hits were merged earlier.
    if not (duplicates[query_id] > 1):
        return False

    # All hits returned for this query, including `entry` itself when it is
    # part of `dups`.
    siblings = [hit for hit in dups if hit["query"] == query_id]

    # Collapse each identifier field into a list across the sibling hits.
    for field in ("entrezgene", "ensembl", "refseq", "symbol", "taxid"):
        entry[field] = [hit[field] for hit in siblings if field in hit]

    # Flag the query as merged so the remaining siblings are dropped.
    duplicates[query_id] = 1
    return True

# Keep one hit per query. isUnique() merges the duplicate hits into the first
# one it sees and rejects the rest, so the iteration order of out["out"]
# matters.
parsed = list(filter(isUnique, out["out"]))

In [41]:
# Attach the mygene identifiers to the table.
#
# The hits are joined on the UniProt ID that produced them rather than on
# their position in `parsed`: a positional assignment silently misaligns rows
# (or leaves NaN at the tail) whenever `parsed` is shorter, longer or ordered
# differently than `data`.
hits_by_query = {}
for hit in parsed:
    # keep the first hit seen for a query
    hits_by_query.setdefault(hit["query"], hit)

# (output column, field name in the mygene hit)
for column, field in [
    ("EntrezID", "entrezgene"),
    ("EnsemblID", "ensembl"),
    ("RefSeqID", "refseq"),
    ("Symbol", "symbol"),
    ("TaxID", "taxid"),
]:
    data[column] = pd.Series(
        [
            hits_by_query.get(uniprot_id, {}).get(field, 'Not Found')
            for uniprot_id in data["UniProtID"]
        ],
        # same index as the frame so the assignment cannot realign rows
        index=data.index,
    )

In [42]:
# Save the annotated table as a tab-separated file with a header row and
# without the row index. The separator is written as an escape so it cannot
# be mistaken for spaces.
data.to_csv("formatted.tab", sep="\t", header=True, index=False)

In [43]:
# Preview the first rows only; rendering the full frame bloats the notebook.
data.head(10)

Unnamed: 0,UniProtID,Reference,Type,Annotations,EntrezID,EnsemblID,RefSeqID,Symbol,TaxID
0,Q9NYW0,SUBCELLULAR LOCATION: Membrane; Multi-pass mem...,Multi-pass membrane protein,,50839,"[{'gene': 'ENSG00000272805'}, {'gene': 'ENSG00...","{'genomic': ['NC_000012.12', 'NC_018923.2', 'N...",TAS2R10,Human
1,Q9NYV9,SUBCELLULAR LOCATION: Membrane; Multi-pass mem...,Multi-pass membrane protein,,50838,"[{'gene': 'ENSG00000273457'}, {'gene': 'ENSG00...","{'genomic': ['NC_000012.12', 'NC_018923.2', 'N...",TAS2R13,Human
2,P59538,SUBCELLULAR LOCATION: Membrane; Multi-pass mem...,Multi-pass membrane protein,,259290,"[{'gene': 'ENSG00000256436'}, {'gene': 'ENSG00...","{'genomic': ['NC_000012.12', 'NC_018923.2', 'N...",TAS2R31,Human
3,Q7TQA6,SUBCELLULAR LOCATION: Membrane; Multi-pass mem...,Multi-pass membrane protein,,387513,{'gene': 'ENSMUSG00000058250'},"{'genomic': 'NC_000072.6', 'protein': 'NP_0010...",Tas2r138,Mouse
4,Q7TQA5,SUBCELLULAR LOCATION: Membrane; Multi-pass mem...,Multi-pass membrane protein,,353148,{'gene': 'ENSMUSG00000047102'},"{'genomic': 'NC_000072.6', 'protein': 'NP_8517...",Tas2r139,Mouse
5,Q7TQB8,SUBCELLULAR LOCATION: Membrane; Multi-pass mem...,Multi-pass membrane protein,,387515,{'gene': 'ENSMUSG00000051917'},"{'genomic': 'NC_000072.6', 'protein': 'NP_0010...",Tas2r144,Mouse
6,P59539,SUBCELLULAR LOCATION: Membrane; Multi-pass mem...,Multi-pass membrane protein,,Not Found,Not Found,Not Found,Not Found,Not Found
7,Q8C0G2,SUBCELLULAR LOCATION: Membrane {ECO:0000305}; ...,Single-pass membrane protein,ECO:0000305,215243,{'gene': 'ENSMUSG00000037318'},"{'genomic': 'NC_000067.6', 'protein': 'NP_6947...",Traf3ip3,Mouse
8,Q96CE8,SUBCELLULAR LOCATION: Membrane {ECO:0000305}; ...,Multi-pass membrane protein,ECO:0000305,116441,{'gene': 'ENSG00000163762'},"{'genomic': ['NC_000003.12', 'NC_018914.2'], '...",TM4SF18,Human
9,Q91XD3,SUBCELLULAR LOCATION: Membrane {ECO:0000250}; ...,Multi-pass membrane protein,ECO:0000250,Not Found,Not Found,Not Found,Not Found,Not Found
