In [1]:
import pandas as pd
from utils import *

## Human Protein Atlas
* We keep the categories "Validated", "Supported", "Approved"
* the rest is [tidy data](http://garrettgman.github.io/tidying/)
* if a protein resides in multiple locations, it will appear on multiple rows

In [2]:
# Load the Human Protein Atlas subcellular-location table (tab-separated).
hpa = pd.read_csv("./data/hpa_subcellular_location.tsv", sep="\t")

In [3]:
# Reshape the HPA table from wide to long form, keeping only the gene name
# and the three category columns.
hpa_filtered = (
    hpa
    # A list (not a tuple) selects several columns; pandas treats a tuple as
    # a single multi-level key.
    .loc[:, ["Gene name", "Validated", "Supported", "Approved"]]
    # Wide -> long: the former column name goes to `status`, the cell
    # content goes to `value`.
    .melt(id_vars="Gene name", var_name="status")
    # Drop rows with any missing field (e.g. a gene without an entry for a
    # given status).
    .dropna()
    # tidy_split comes from utils and is applied to `value` with ';' as the
    # separator.
    # NOTE(review): presumably it emits one row per ';'-separated location
    # while keeping the original row index - confirm against utils.
    .pipe(tidy_split, 'value', ';')
)
hpa_filtered.head()

Unnamed: 0,Gene name,status,value
11,CYP51A1,Validated,Endoplasmic reticulum
14,BAD,Validated,Mitochondria
31,RBM5,Validated,Nucleoplasm
36,FKBP4,Validated,Cytosol
36,FKBP4,Validated,Nucleoplasm


In [14]:
# (n_rows, n_columns) of the filtered HPA table.
hpa_filtered.shape

(18420, 3)

In [4]:
# Write the filtered HPA table as TSV. No `index=` argument is passed, so the
# row index is written as the leading, unnamed column.
hpa_filtered.to_csv("./results/hpa_filtered.tsv", sep="\t")

## Deeploc
* extract gene symbols from identifier
* there are multiple entries per identifier -> isoforms
* **For the start, we remove all genes that have isoforms with differing predicted locations**.

In [5]:
# Load the DeepLoc prediction table (tab-separated).
deeploc = pd.read_csv("./data/deeploc_predictions.txt", sep="\t")

In [6]:
# Derive a symbol from the `#ID` column and keep only symbols whose entries
# all share one predicted location.
deeploc_filtered = (
    deeploc
    .assign(
        # Third '|'-separated field of the first space-delimited token of
        # `#ID`, with every "_HUMAN" occurrence removed. Kept element-wise so
        # that a malformed identifier raises instead of silently becoming NaN.
        # NOTE(review): values such as 1433B look like UniProt entry-name
        # mnemonics rather than gene symbols - confirm they match the gene
        # names used in the other result tables.
        gene_symbol=lambda df: df["#ID"].apply(
            lambda ident: ident.split(" ")[0].split("|")[2].replace("_HUMAN", "")
        )
    )
    # A list (not a tuple) selects several columns; pandas treats a tuple as
    # a single multi-level key.
    .loc[:, ["gene_symbol", "Prediction"]]
    # With no value columns left, groupby + count + reset_index reduces the
    # frame to one row per distinct (gene_symbol, Prediction) pair.
    .groupby(["gene_symbol", "Prediction"])
    .count()
    .reset_index()
    # keep=False removes every symbol that still occurs more than once,
    # i.e. symbols with more than one distinct prediction.
    .drop_duplicates("gene_symbol", keep=False)
)

deeploc_filtered.head()

Unnamed: 0,gene_symbol,Prediction
0,1433B,Cytoplasm
1,1433E,Cytoplasm
2,1433F,Cytoplasm
3,1433G,Cytoplasm
4,1433S,Cytoplasm


In [13]:
# (n_rows, n_columns) of the filtered DeepLoc table.
deeploc_filtered.shape

(16655, 2)

In [7]:
# Write the filtered DeepLoc table as TSV. No `index=` argument is passed, so
# the row index is written as the leading, unnamed column.
deeploc_filtered.to_csv("./results/deeploc_filtered.tsv", sep="\t")

## Swissprot
* pre-parsed using perl script
* only human proteins
* genes without a name are removed
* only evidence 'ECO:0000305' (curator inference used in manual assertion) is kept.

In [8]:
# Load the pre-parsed SwissProt subcellular-location table (tab-separated);
# cells containing "." are read as missing values.
swissprot = pd.read_csv("results/subcellular_location.swissprot.tsv", sep="\t", na_values=["."])

In [9]:
# Narrow the pre-parsed table to human SwissProt rows carrying evidence code
# ECO:0000305, discard incomplete rows, then remove the source column, which
# is constant after the first filter.
swissprot_filtered = (
    swissprot
    .query("annotation_source == 'SwissProt'")
    .query("species == 'Homo sapiens'")
    .query("annotation_evidence == 'ECO:0000305'")
    # Removes rows with a missing value in any column, not only gene_name.
    .dropna()
    .drop(columns="annotation_source")
)

swissprot_filtered.head()

Unnamed: 0,uniprot_id,species,gene_name,subcellular_location,annotation_evidence
293,Q99758,Homo sapiens,ABCA3,Membrane,ECO:0000305
593,Q6ZNF0,Homo sapiens,ACP7,Secreted,ECO:0000305
738,Q6P461,Homo sapiens,ACSM6,Mitochondrion,ECO:0000305
740,Q9H6R3,Homo sapiens,ACSS3,Mitochondrion,ECO:0000305
771,Q5VUY2,Homo sapiens,AADACL4,Membrane,ECO:0000305


In [12]:
# (n_rows, n_columns) of the filtered SwissProt table.
swissprot_filtered.shape

(4374, 5)

In [10]:
# Write the filtered SwissProt table as TSV. No `index=` argument is passed,
# so the row index is written as the leading, unnamed column.
swissprot_filtered.to_csv("./results/swissprot_filtered.tsv", sep="\t")