In [1]:
import pandas as pd
from utils import *
import numpy as np

## Human Protein Atlas
* We keep the categories "Validated", "Supported", "Approved"
* the rest is [tidy data](http://garrettgman.github.io/tidying/)
* if a protein resides in multiple locations, it will appear in multiple rows

In [2]:
# Load the raw Human Protein Atlas subcellular-location table (tab-separated).
hpa = pd.read_csv("./data/hpa_subcellular_location.tsv", sep="\t")

In [3]:
# Reshape the HPA table into long form: one row per
# (gene, reliability status, subcellular location).
hpa_filtered = (
    hpa[["Gene name", "Validated", "Supported", "Approved"]]
    # one row per (gene, status column); the cell value becomes the location
    .melt(id_vars="Gene name", var_name="status", value_name="subcellular_location")
    .rename(columns={"Gene name": "hgnc"})
    # genes without an entry for a given status carry NaN there
    .dropna()
    # tidy_split comes from utils; it is handed the ';' separator so that
    # multi-location entries end up as one row per location
    .pipe(tidy_split, "subcellular_location", ";")
)
hpa_filtered.head()

Unnamed: 0,hgnc,status,subcellular_location
11,CYP51A1,Validated,Endoplasmic reticulum
14,BAD,Validated,Mitochondria
31,RBM5,Validated,Nucleoplasm
36,FKBP4,Validated,Cytosol
36,FKBP4,Validated,Nucleoplasm


In [4]:
# Sanity check: (rows, columns) of the filtered HPA table.
hpa_filtered.shape

(18420, 3)

In [5]:
# Export the filtered HPA table as TSV; the DataFrame index is not written.
hpa_filtered.to_csv("./results/hpa_filtered.tsv", sep="\t", index=False)

## Deeploc
* extract uniprot identifiers from ID and map them to HGNC symbols
    * the SYMBOL_HUMAN part of the ID (e.g. F161B_HUMAN) is a UniProt entry name, not an HGNC symbol
    * the mapping was obtained from Ensembl BioMart
* there are multiple entries per identifier -> isoforms
* **For the start, we remove all genes that have isoforms with differing predicted locations**.
* Remove entries predicted as Plastid (plastids do not exist in human cells)

In [6]:
# Build a lookup table from UniProt accession (index) to HGNC symbol
# out of the BioMart export.
uniprot_hgnc = (
    pd.read_csv("./data/biomart_hgnc_uniprot.tsv", sep="\t")
    .rename(columns={"HGNC symbol": "hgnc", "UniProtKB Gene Name ID": "uniprot"})
    # discard rows with any missing value
    .dropna()
    .set_index("uniprot")
)
uniprot_hgnc.head()

Unnamed: 0_level_0,hgnc
uniprot,Unnamed: 1_level_1
U5Z754,MT-ND1
P03886,MT-ND1
Q7GXY9,MT-ND2
P03891,MT-ND2
U5YWV7,MT-CO1


In [7]:
# Load the raw DeepLoc prediction table (tab-separated).
deeploc = pd.read_csv("./data/deeploc_predictions.txt", sep="\t")

In [8]:
# Preview the first rows of the raw DeepLoc table.
deeploc.head()

Unnamed: 0,#ID,Prediction,Membrane-bound,Nucleus,Cytoplasm,Extracellular,Mitochondrion,Cell_membrane,Endoplasmic_reticulum,Plastid,Golgi_apparatus,Lysosome/Vacuole,Peroxisome
0,sp|Q96MY7-2|F161B_HUMAN Isoform 2 of Protein F...,Nucleus,0.0787,0.5774,0.4075,0.0001,0.0003,0.0117,0.0002,0.0001,0.0019,0.0006,0.0003
1,sp|Q92541|RTF1_HUMAN RNA polymerase-associated...,Nucleus,0.3232,0.9968,0.0016,0.0,0.0013,0.0001,0.0,0.0,0.0001,0.0,0.0
2,sp|Q8N5V2|NGEF_HUMAN Ephexin-1 OS=Homo sapiens...,Nucleus,0.0188,0.7316,0.2641,0.0,0.0001,0.0016,0.0004,0.0,0.0015,0.0005,0.0001
3,sp|Q8IXQ6-3|PARP9_HUMAN Isoform 3 of Poly [ADP...,Cytoplasm,0.1902,0.1717,0.5183,0.0063,0.0338,0.0448,0.0365,0.0119,0.0347,0.0412,0.1008
4,sp|Q6ZW61|BBS12_HUMAN Bardet-Biedl syndrome 12...,Cytoplasm,0.2224,0.3059,0.4059,0.0013,0.132,0.0365,0.018,0.0026,0.0463,0.0303,0.0212


In [9]:
def get_isoform(id_):
    """Return the isoform number encoded in a '|'-delimited sequence ID.

    The second '|'-separated field is taken as the accession
    (e.g. ``Q96MY7-2``) and the integer following its first '-' is
    returned. If the ID has no second field, or the accession carries no
    '-' suffix, ``np.nan`` is returned instead.
    """
    fields = id_.split("|")
    if len(fields) < 2:
        return np.nan
    accession_parts = fields[1].split("-")
    if len(accession_parts) < 2:
        return np.nan
    return int(accession_parts[1])
    
def get_uniprot_id(id_):
    """Return the accession from a '|'-delimited sequence ID, without isoform suffix.

    The second '|'-separated field is taken (e.g. ``Q96MY7-2``) and
    everything from its first '-' onwards is discarded. Raises
    ``IndexError`` when the ID contains no '|'.
    """
    accession = id_.split("|")[1]
    base, _, _ = accession.partition("-")
    return base

In [10]:
# Map each DeepLoc entry to an HGNC symbol and keep only the genes whose
# entries all agree on a single predicted location.
deeploc_filtered = (
    deeploc
    .assign(
        isoform=lambda df: df["#ID"].apply(get_isoform),
        uniprot=lambda df: df["#ID"].apply(get_uniprot_id),
    )
    # inner join: entries whose accession has no HGNC mapping are dropped
    .join(uniprot_hgnc, on="uniprot", how="inner")
    .loc[:, ["hgnc", "Prediction"]]
    # collapse to the distinct (gene, prediction) pairs, sorted by the group keys
    .groupby(["hgnc", "Prediction"])
    .count()
    .reset_index()
    # a gene that still has several rows has conflicting predictions;
    # keep=False removes every one of its rows
    .drop_duplicates("hgnc", keep=False)
    .query("Prediction != 'Plastid'")
)

deeploc_filtered.head()

Unnamed: 0,hgnc,Prediction
4,A2M,Extracellular
7,A3GALT2,Endoplasmic_reticulum
8,A4GALT,Golgi_apparatus
9,A4GNT,Golgi_apparatus
10,AAAS,Nucleus


In [11]:
# Sanity check: (rows, columns) of the filtered DeepLoc table.
deeploc_filtered.shape

(14845, 2)

In [12]:
# Export the filtered DeepLoc table as TSV; the DataFrame index is not written.
deeploc_filtered.to_csv("./results/deeploc_filtered.tsv", sep="\t", index=False)

## Swissprot
* pre-parsed using perl script
* only human proteins
* genes without a name are removed
* only evidence 'ECO:0000269' (experimental evidence used in manual assertion) is kept.

In [13]:
# Load the pre-parsed subcellular-location table (tab-separated); "." is read as missing.
swissprot = pd.read_csv("results/subcellular_location.swissprot.tsv", sep="\t", na_values=["."])

In [14]:
# Restrict to human SwissProt annotations with evidence code ECO:0000269,
# then drop incomplete rows and the now-constant source column.
swissprot_filtered = (
    swissprot
    .query("annotation_source == 'SwissProt'")
    .query("species == 'Homo sapiens'")
    .query("annotation_evidence == 'ECO:0000269'")
    # rows with any missing field (e.g. no gene name) are discarded
    .dropna()
    .drop(columns="annotation_source")
    .rename(columns={"gene_name": "hgnc"})
)

swissprot_filtered.head()

Unnamed: 0,uniprot_id,species,hgnc,subcellular_location,annotation_evidence
133,Q9NRA8,Homo sapiens,EIF4ENIF1,Cytoplasm,ECO:0000269
134,Q9NRA8,Homo sapiens,EIF4ENIF1,Nucleus,ECO:0000269
135,Q9NRA8,Homo sapiens,EIF4ENIF1,Nucleus,ECO:0000269
136,Q9NRA8,Homo sapiens,EIF4ENIF1,Nucleus speckle,ECO:0000269
151,Q676U5,Homo sapiens,ATG16L1,Cytoplasm,ECO:0000269


In [15]:
# Sanity check: (rows, columns) of the filtered SwissProt table.
swissprot_filtered.shape

(8062, 5)

In [16]:
# Export the filtered SwissProt table as TSV; the DataFrame index is not written.
swissprot_filtered.to_csv("./results/swissprot_filtered.tsv", sep="\t", index=False)