In [1]:
# Dependencies
import numpy as np
import modules.string as string
import pandas as pd

In [2]:
# Constants: input file locations, given as relative paths

# String dataset
STRING_GZ_PATH = 'data/string.txt.gz'
# GO terms dataset
GO_CSV_PATH = 'data/go.csv'
# Test set csv
HUMAN_CSV_PATH = 'data/human.csv'

In [3]:
# Read the tab-separated human proteome table
human_proteome = pd.read_csv(HUMAN_CSV_PATH, sep='\t')

# Normalise the STRING identifier column: cast each value to text, drop every
# ';' and trim surrounding whitespace. Because of the str() cast, a missing
# value ends up as the literal text 'nan'.
human_proteome['string_id'] = human_proteome['string_id'].map(
    lambda raw_id: str(raw_id).replace(';', '').strip()
)

# Preview the first rows
human_proteome.head()

Unnamed: 0,entry_ac,entry_name,protein_name,len,go,pdb_ids,pfam_ids,string_id,PF00397
0,Q9Y263,PLAP_HUMAN,Phospholipase A-2-activating protein (PLA2P) (...,795,cell [GO:0005623]; cell junction [GO:0030054];...,2K89;2K8A;2K8B;2K8C;3EBB;,PF09070;PF08324;PF00400;,9606.ENSP00000380460,False
1,Q96RE7,NACC1_HUMAN,Nucleus accumbens-associated protein 1 (NAC-1)...,527,cell junction [GO:0030054]; cytoplasm [GO:0005...,3GA1;4U2N;,PF10523;PF00651;,9606.ENSP00000292431,False
2,O43312,MTSS1_HUMAN,Protein MTSS 1 (Metastasis suppressor YGL-1) (...,755,actin cytoskeleton [GO:0015629]; cytoplasm [GO...,2D1K;,PF08397;PF02205;,9606.ENSP00000322804,False
3,Q9NP80,PLPL8_HUMAN,Calcium-independent phospholipase A2-gamma (EC...,782,endoplasmic reticulum membrane [GO:0005789]; G...,,PF01734;,9606.ENSP00000410804,False
4,Q15319,PO4F3_HUMAN,"POU domain, class 4, transcription factor 3 (B...",338,cytoplasm [GO:0005737]; nuclear chromatin [GO:...,,PF00046;PF00157;,9606.ENSP00000230732,False


In [4]:
# Stand-in for the original dataset: the first five rows (by position) of the
# human proteome table
original = human_proteome.iloc[0:5]

# Preview
original.head()

Unnamed: 0,entry_ac,entry_name,protein_name,len,go,pdb_ids,pfam_ids,string_id,PF00397
0,Q9Y263,PLAP_HUMAN,Phospholipase A-2-activating protein (PLA2P) (...,795,cell [GO:0005623]; cell junction [GO:0030054];...,2K89;2K8A;2K8B;2K8C;3EBB;,PF09070;PF08324;PF00400;,9606.ENSP00000380460,False
1,Q96RE7,NACC1_HUMAN,Nucleus accumbens-associated protein 1 (NAC-1)...,527,cell junction [GO:0030054]; cytoplasm [GO:0005...,3GA1;4U2N;,PF10523;PF00651;,9606.ENSP00000292431,False
2,O43312,MTSS1_HUMAN,Protein MTSS 1 (Metastasis suppressor YGL-1) (...,755,actin cytoskeleton [GO:0015629]; cytoplasm [GO...,2D1K;,PF08397;PF02205;,9606.ENSP00000322804,False
3,Q9NP80,PLPL8_HUMAN,Calcium-independent phospholipase A2-gamma (EC...,782,endoplasmic reticulum membrane [GO:0005789]; G...,,PF01734;,9606.ENSP00000410804,False
4,Q15319,PO4F3_HUMAN,"POU domain, class 4, transcription factor 3 (B...",338,cytoplasm [GO:0005737]; nuclear chromatin [GO:...,,PF00046;PF00157;,9606.ENSP00000230732,False


In [5]:
# Load gene ontology dataset (tab-separated), forcing the accession, GO id and
# GO description columns to be parsed as text rather than inferred types.
# The builtin `str` is used as the dtype: the `np.unicode_` alias was removed
# in NumPy 2.0, so referencing it raises AttributeError on current NumPy.
gene_ontology = pd.read_csv(GO_CSV_PATH, sep='\t', dtype={
    'entry_ac': str,
    'go_id': str,
    'go_descr': str
})
# Preview the first rows
gene_ontology.head()

Unnamed: 0,entry_ac,go_id,go_descr
0,Q9Y263,5623,cell
1,Q9Y263,30054,cell junction
2,Q9Y263,5737,cytoplasm
3,Q9Y263,70062,extracellular exosome
4,Q9Y263,5634,nucleus


In [6]:
# Load interactions in human proteome
# `string` here is the project module imported as `modules.string`, not the
# standard-library `string` module.
# NOTE(review): judging by the displayed output, the result is a table with
# protein1, protein2 and combined_score columns - confirm against the helper.
interactions = string.load(STRING_GZ_PATH)
# Preview the first rows
interactions.head()

Unnamed: 0,protein1,protein2,combined_score
0,9606.ENSP00000000233,9606.ENSP00000272298,490
1,9606.ENSP00000000233,9606.ENSP00000253401,198
2,9606.ENSP00000000233,9606.ENSP00000401445,159
3,9606.ENSP00000000233,9606.ENSP00000418915,606
4,9606.ENSP00000000233,9606.ENSP00000327801,167


In [7]:
# STRING ids of the proteins in the original dataset (these must be kept)
original_string_ids = set(original['string_id'].tolist())

# Direct interactors: the protein2 value of every interaction row whose
# protein1 is one of the original ids
is_from_original = interactions['protein1'].isin(original_string_ids)
direct_interactors = interactions.loc[is_from_original, 'protein2']

# Preview
direct_interactors.head()

713648    9606.ENSP00000226193
713649    9606.ENSP00000280190
713650    9606.ENSP00000297991
713651    9606.ENSP00000381932
713652    9606.ENSP00000352264
Name: protein2, dtype: object

In [11]:
# Ids of the direct interactors, de-duplicated
interactors_string_ids = set(direct_interactors.tolist())
# Every id to keep: the original proteins together with their interactors
all_string_ids = original_string_ids.union(interactors_string_ids)

# Keep the human proteome rows whose STRING id is in that union.
# NOTE: this rebinds `original`, the same name that original_string_ids was
# built from; re-running that earlier step after this cell would therefore
# start from the expanded table instead of the initial one.
keep_row = human_proteome['string_id'].isin(all_string_ids)
original = human_proteome[keep_row]

# Preview
original.head()

Unnamed: 0,entry_ac,entry_name,protein_name,len,go,pdb_ids,pfam_ids,string_id,PF00397
0,Q9Y263,PLAP_HUMAN,Phospholipase A-2-activating protein (PLA2P) (...,795,cell [GO:0005623]; cell junction [GO:0030054];...,2K89;2K8A;2K8B;2K8C;3EBB;,PF09070;PF08324;PF00400;,9606.ENSP00000380460,False
1,Q96RE7,NACC1_HUMAN,Nucleus accumbens-associated protein 1 (NAC-1)...,527,cell junction [GO:0030054]; cytoplasm [GO:0005...,3GA1;4U2N;,PF10523;PF00651;,9606.ENSP00000292431,False
2,O43312,MTSS1_HUMAN,Protein MTSS 1 (Metastasis suppressor YGL-1) (...,755,actin cytoskeleton [GO:0015629]; cytoplasm [GO...,2D1K;,PF08397;PF02205;,9606.ENSP00000322804,False
3,Q9NP80,PLPL8_HUMAN,Calcium-independent phospholipase A2-gamma (EC...,782,endoplasmic reticulum membrane [GO:0005789]; G...,,PF01734;,9606.ENSP00000410804,False
4,Q15319,PO4F3_HUMAN,"POU domain, class 4, transcription factor 3 (B...",338,cytoplasm [GO:0005737]; nuclear chromatin [GO:...,,PF00046;PF00157;,9606.ENSP00000230732,False


In [13]:
# (rows, columns) of the current `original` table
original.shape

(2640, 9)