In [1]:
import os

DATA_FOLDER = os.path.join('..', 'data')
FIGURE_FOLDER = os.path.join('..', 'figures')

notebook_name = '020_get_nucleus_cytoplasm_genes'

data_folder = os.path.join(DATA_FOLDER, notebook_name)
figure_folder = os.path.join(FIGURE_FOLDER, notebook_name)

! mkdir -p $data_folder
! mkdir -p $figure_folder

input_folder = os.path.join(DATA_FOLDER, '000_filter_for_differentially_localized_proteins')

In [2]:
import pandas as pd

csv = os.path.join(input_folder, 'v17_antibody_localization_tidy.csv')
localization = pd.read_csv(csv)
print(localization.shape)
localization.head()

(50095, 6)


Unnamed: 0,antibody_id,tissue_name,cellular_component,level_a,level_b,level_c
0,167,A-431,Nuclear_speckles,nuclear_speckles,nuclear_bodies,nucleus
1,167,U-2 OS,Cytosol,cytosol,cytosol,cytoplasm
2,167,U-2 OS,Nuclear_speckles,nuclear_speckles,nuclear_bodies,nucleus
3,167,U-251 MG,Cytosol,cytosol,cytosol,cytoplasm
4,167,U-251 MG,Nuclear_speckles,nuclear_speckles,nuclear_bodies,nucleus


In [3]:
csv = os.path.join(input_folder, 'v17_antibody_metadata.csv')
metadata = pd.read_csv(csv)
print(metadata.shape)
metadata.head()

(33456, 7)


Unnamed: 0,antibody_id,prest_id,tissue_name,ensg_id,gene_name,uniprot_id,aa_sequencing_sequence
0,167,231776,A-431,ENSG00000130830,MPP1,Q00013,TEEMTRNISANEFLEFGSYQGNMFGTKFETVHQIHKQNKIAILDIE...
1,167,231776,U-2 OS,ENSG00000130830,MPP1,Q00013,TEEMTRNISANEFLEFGSYQGNMFGTKFETVHQIHKQNKIAILDIE...
2,167,231776,U-251 MG,ENSG00000130830,MPP1,Q00013,TEEMTRNISANEFLEFGSYQGNMFGTKFETVHQIHKQNKIAILDIE...
3,243,230049,A-431,ENSG00000036473,OTC,P00480,ILADYLTLQEHYSSLKGLTLSWIGDGNNILHSIMMSAAKFGMHLQA...
4,243,230049,U-2 OS,ENSG00000036473,OTC,P00480,ILADYLTLQEHYSSLKGLTLSWIGDGNNILHSIMMSAAKFGMHLQA...


### Get metadata for singly localized antibodies across cell lines

They were only found in one place no matter which cell line. Use "Level C" which is coarsest granularity.

In [21]:
merge_cols = ['antibody_id', 'tissue_name']
localization_metadata = localization.merge(metadata, left_on=merge_cols, right_on=merge_cols)
print(localization_metadata.shape)
localization_metadata.head()

(50095, 11)


Unnamed: 0,antibody_id,tissue_name,cellular_component,level_a,level_b,level_c,prest_id,ensg_id,gene_name,uniprot_id,aa_sequencing_sequence
0,167,A-431,Nuclear_speckles,nuclear_speckles,nuclear_bodies,nucleus,231776,ENSG00000130830,MPP1,Q00013,TEEMTRNISANEFLEFGSYQGNMFGTKFETVHQIHKQNKIAILDIE...
1,167,U-2 OS,Cytosol,cytosol,cytosol,cytoplasm,231776,ENSG00000130830,MPP1,Q00013,TEEMTRNISANEFLEFGSYQGNMFGTKFETVHQIHKQNKIAILDIE...
2,167,U-2 OS,Nuclear_speckles,nuclear_speckles,nuclear_bodies,nucleus,231776,ENSG00000130830,MPP1,Q00013,TEEMTRNISANEFLEFGSYQGNMFGTKFETVHQIHKQNKIAILDIE...
3,167,U-251 MG,Cytosol,cytosol,cytosol,cytoplasm,231776,ENSG00000130830,MPP1,Q00013,TEEMTRNISANEFLEFGSYQGNMFGTKFETVHQIHKQNKIAILDIE...
4,167,U-251 MG,Nuclear_speckles,nuclear_speckles,nuclear_bodies,nucleus,231776,ENSG00000130830,MPP1,Q00013,TEEMTRNISANEFLEFGSYQGNMFGTKFETVHQIHKQNKIAILDIE...


## Use medium level of granularity (level B)

Use Level B granulari5ty

In [109]:
LEVEL = "level_b"
sorted(localization[LEVEL].dropna().unique())

['actin_filaments',
 'cytoplasmic_bodies',
 'cytosol',
 'endoplasmic_reticulum',
 'golgi_apparatus',
 'intermediate_filaments',
 'microtubule_organizing_center',
 'microtubules',
 'mitochondria',
 'mitotic_filament',
 'nuclear_bodies',
 'nuclear_membrane',
 'nucleoli',
 'nucleoplasm',
 'plasma_membrane',
 'vesicles']

In [100]:
single_level_a = localization_metadata.groupby(['antibody_id']).filter(lambda x: len(x[LEVEL].unique()) == 1)
print(single_level_a.shape)
single_level_a.head()

(14087, 11)


Unnamed: 0,antibody_id,tissue_name,cellular_component,level_a,level_b,level_c,prest_id,ensg_id,gene_name,uniprot_id,aa_sequencing_sequence
5,243,A-431,Mitochondria,mitochondria,mitochondria,cytoplasm,230049,ENSG00000036473,OTC,P00480,ILADYLTLQEHYSSLKGLTLSWIGDGNNILHSIMMSAAKFGMHLQA...
6,243,U-2 OS,Mitochondria,mitochondria,mitochondria,cytoplasm,230049,ENSG00000036473,OTC,P00480,ILADYLTLQEHYSSLKGLTLSWIGDGNNILHSIMMSAAKFGMHLQA...
7,243,U-251 MG,Mitochondria,mitochondria,mitochondria,cytoplasm,230049,ENSG00000036473,OTC,P00480,ILADYLTLQEHYSSLKGLTLSWIGDGNNILHSIMMSAAKFGMHLQA...
8,244,MCF7,Nucleoplasm,nucleoplasm,nucleoplasm,nucleus,230064,ENSG00000049769,PPP1R3F,Q6ZSY5,TDGGMSPSHPLGILTDRDLILKWPGPERALNSALAEEITLHYARLG...
9,244,U-2 OS,Nucleoplasm,nucleoplasm,nucleoplasm,nucleus,230064,ENSG00000049769,PPP1R3F,Q6ZSY5,TDGGMSPSHPLGILTDRDLILKWPGPERALNSALAEEITLHYARLG...


## Use ENSG ids since that's how we'll be getting the sequences

In [101]:
single_level_a = localization_metadata.groupby(['ensg_id']).filter(lambda x: len(x[LEVEL].unique()) == 1)
print(single_level_a.shape)
single_level_a.head()

(12388, 11)


Unnamed: 0,antibody_id,tissue_name,cellular_component,level_a,level_b,level_c,prest_id,ensg_id,gene_name,uniprot_id,aa_sequencing_sequence
5,243,A-431,Mitochondria,mitochondria,mitochondria,cytoplasm,230049,ENSG00000036473,OTC,P00480,ILADYLTLQEHYSSLKGLTLSWIGDGNNILHSIMMSAAKFGMHLQA...
6,243,U-2 OS,Mitochondria,mitochondria,mitochondria,cytoplasm,230049,ENSG00000036473,OTC,P00480,ILADYLTLQEHYSSLKGLTLSWIGDGNNILHSIMMSAAKFGMHLQA...
7,243,U-251 MG,Mitochondria,mitochondria,mitochondria,cytoplasm,230049,ENSG00000036473,OTC,P00480,ILADYLTLQEHYSSLKGLTLSWIGDGNNILHSIMMSAAKFGMHLQA...
8,244,MCF7,Nucleoplasm,nucleoplasm,nucleoplasm,nucleus,230064,ENSG00000049769,PPP1R3F,Q6ZSY5,TDGGMSPSHPLGILTDRDLILKWPGPERALNSALAEEITLHYARLG...
9,244,U-2 OS,Nucleoplasm,nucleoplasm,nucleoplasm,nucleus,230064,ENSG00000049769,PPP1R3F,Q6ZSY5,TDGGMSPSHPLGILTDRDLILKWPGPERALNSALAEEITLHYARLG...


Remove redundant/unneeded columns

In [102]:


# singly_localized = single_level_c.merge(metadata, left_on=merge_cols, right_on=merge_cols)
# print(singly_localized.shape)

## Ignore aa_sequencing_sequence as that's where the antigen binds and isn't hte whole protein sequence
all_columns = set(['tissue_name', 'cellular_component', 'level_a', 'level_b', 'level_c',
                    'aa_sequencing_sequence', 
                    'antibody_id', 'prest_id'])
columns_to_drop = list(all_columns - set([LEVEL]))

singly_localized = single_level_c.drop(columns=columns_to_drop)
singly_localized = singly_localized.drop_duplicates()
print(singly_localized.shape)
singly_localized.head()

(4629, 4)


Unnamed: 0,level_b,ensg_id,gene_name,uniprot_id
5,mitochondria,ENSG00000036473,OTC,P00480
8,nucleoplasm,ENSG00000049769,PPP1R3F,Q6ZSY5
33,vesicles,ENSG00000164292,RHOBTB3,O94955
36,nucleoplasm,ENSG00000156504,FAM122B,Q7Z309
39,cytosol,ENSG00000102096,PIM2,Q9P1W9


In [111]:
nucleus_cytoplasm_names = ["nucleoplasm", "cytosol"]

ind = singly_localized[LEVEL].isin(nucleus_cytoplasm_names)
nucleus_cytoplasm = singly_localized.loc[ind]
print(nucleus_cytoplasm.shape)
nucleus_cytoplasm.head()

(2272, 4)


Unnamed: 0,level_b,ensg_id,gene_name,uniprot_id
8,nucleoplasm,ENSG00000049769,PPP1R3F,Q6ZSY5
36,nucleoplasm,ENSG00000156504,FAM122B,Q7Z309
39,cytosol,ENSG00000102096,PIM2,Q9P1W9
46,nucleoplasm,ENSG00000068394,GPKOW,Q92917
73,nucleoplasm,ENSG00000133131,MORC4,Q8TE76


In [124]:
csv = os.path.join(data_folder, 'nucleus_cytoplasm.csv')
nucleus_cytoplasm.to_csv(csv, index=False)

How many non-unique uniprot ids per antibody?

In [125]:
(nucleus_cytoplasm.groupby('uniprot_id').size() != 1).sum()

3

In [126]:
nucleus_cytoplasm['ensg_id']

8                                          ENSG00000049769
36                                         ENSG00000156504
39                                         ENSG00000102096
46                                         ENSG00000068394
73                                         ENSG00000133131
93                                         ENSG00000269335
126                                        ENSG00000102241
131                                        ENSG00000198157
139                                        ENSG00000102057
142                                        ENSG00000239900
186                                        ENSG00000062096
208                                        ENSG00000169057
225                                        ENSG00000100439
273                                        ENSG00000090061
298                                        ENSG00000182901
307                                        ENSG00000087842
310                                        ENSG000001007

In [127]:
nucleus_cytoplasm['uniprot_id']

8                             Q6ZSY5
36                            Q7Z309
39                            Q9P1W9
46                            Q92917
73                            Q8TE76
93                            Q9Y6K9
126                           O43719
131                           P82970
139                           Q9NSA2
142                           P30566
186                           P54793
208                           P51608
225                           Q8TB40
273                           O75909
298                           P49802
307                           O00625
310                           P11586
321                           P55957
328                           Q5JUX0
345                           Q9Y3L3
347                           Q9Y4F1
364                           Q99538
374                           Q969R5
411                           P20062
414                           P50458
417                           Q9NUQ3
441                           Q8N806
4

In [128]:
nucleus_cytoplasm.isnull().sum()

level_b        0
ensg_id        0
gene_name      0
uniprot_id    17
dtype: int64

### Since there's a bunch of null UNIPROT ids, So let's use the ENSG ids

"unroll" the comma-separated ENSG ids

In [129]:
multiple_ensg_ids = nucleus_cytoplasm.loc[nucleus_cytoplasm['ensg_id'].str.contains(",")]
print(multiple_ensg_ids.shape)
multiple_ensg_ids.head(20)

(72, 4)


Unnamed: 0,level_b,ensg_id,gene_name,uniprot_id
522,nucleoplasm,"ENSG00000126945,ENSG00000169045","HNRNPH2,HNRNPH1","P55795,P31943"
1760,cytosol,"ENSG00000173674,ENSG00000198692","EIF1AX,EIF1AY","P47813,O14602"
2170,nucleoplasm,"ENSG00000213920,ENSG00000255526","MDP1,NEDD8-MDP1",Q86V88
2282,nucleoplasm,"ENSG00000146757,ENSG00000173041","ZNF92,ZNF680","Q03936,Q8NEM1"
2308,cytosol,"ENSG00000169951,ENSG00000169955","ZNF764,ZNF747","Q96H86,Q9BV97"
2943,nucleoplasm,"ENSG00000167081,ENSG00000185630","PBX3,PBX1","P40426,P40424"
3115,cytosol,"ENSG00000038210,ENSG00000281028","PI4K2B,RP11-717K11.2",Q8TCG2
3867,cytosol,"ENSG00000168970,ENSG00000243789","JMJD7-PLA2G4B,JMJD7",P0C870
3967,nucleoplasm,"ENSG00000122566,ENSG00000135486,ENSG00000170144","HNRNPA2B1,HNRNPA1,HNRNPA3","P22626,P09651,P51991"
5148,cytosol,"ENSG00000250506,ENSG00000261408","CDK3,TEN1-CDK3",Q00526


Test out the function with `range()` so it only does the first 20 rows

In [130]:
for j, (i, row) in zip(range(20), multiple_ensg_ids.iterrows()):
    ensg_id = row['ensg_id'].split(',')
    gene_name = row['gene_name'].split(',')
    uniprot_id = row['uniprot_id'].split(",")
    if len(uniprot_id) == 1:
        print("ensg_id:", uniprot_id)
        print("uniprot_id:", uniprot_id)
        uniprot_id = [uniprot_id[0]]*len(ensg_id)
    
    df = pd.DataFrame(dict(ensg_id=ensg_id, gene_name=gene_name, uniprot_id=uniprot_id))
#     df['antibody_id'] = row['antibody_id']
    df[LEVEL] = row[LEVEL]
#     df['prest_id'] = row['prest_id']
    print(df)

           ensg_id gene_name uniprot_id      level_b
0  ENSG00000126945   HNRNPH2     P55795  nucleoplasm
1  ENSG00000169045   HNRNPH1     P31943  nucleoplasm
           ensg_id gene_name uniprot_id  level_b
0  ENSG00000173674    EIF1AX     P47813  cytosol
1  ENSG00000198692    EIF1AY     O14602  cytosol
ensg_id: ['Q86V88']
uniprot_id: ['Q86V88']
           ensg_id   gene_name uniprot_id      level_b
0  ENSG00000213920        MDP1     Q86V88  nucleoplasm
1  ENSG00000255526  NEDD8-MDP1     Q86V88  nucleoplasm
           ensg_id gene_name uniprot_id      level_b
0  ENSG00000146757     ZNF92     Q03936  nucleoplasm
1  ENSG00000173041    ZNF680     Q8NEM1  nucleoplasm
           ensg_id gene_name uniprot_id  level_b
0  ENSG00000169951    ZNF764     Q96H86  cytosol
1  ENSG00000169955    ZNF747     Q9BV97  cytosol
           ensg_id gene_name uniprot_id      level_b
0  ENSG00000167081      PBX3     P40426  nucleoplasm
1  ENSG00000185630      PBX1     P40424  nucleoplasm
ensg_id: ['Q8TCG2']
u

ValueError: arrays must all be same length

Do it on the whole thing

In [None]:
dfs = []

for i, row in multiple_ensg_ids.iterrows():
    ensg_id = row['ensg_id'].split(',')

    
    try:
        df = pd.DataFrame(dict(ensg_id=ensg_id, #gene_name=gene_name, 
                               #uniprot_id=uniprot_id
                              ))
    except ValueError:
        print("no df -- gene_name:", gene_name)
        print("no df -- ensg_id:", ensg_id)
        print("no df -- uniprot_id:", uniprot_id)
#     df['antibody_id'] = row['antibody_id']
    df[LEVEL] = row[LEVEL]
#     df['prest_id'] = row['prest_id']
    dfs.append(df)


In [131]:
single_ensg_ids = nucleus_cytoplasm.loc[~nucleus_cytoplasm['ensg_id'].str.contains(",")]
single_ensg_ids = single_ensg_ids.drop(columns=['gene_name', 'uniprot_id'])
print(single_ensg_ids.shape)
single_ensg_ids.head()

(2200, 2)


Unnamed: 0,level_b,ensg_id
8,nucleoplasm,ENSG00000049769
36,nucleoplasm,ENSG00000156504
39,cytosol,ENSG00000102096
46,nucleoplasm,ENSG00000068394
73,nucleoplasm,ENSG00000133131


In [132]:
nucleus_cytoplasm_ensg_unrolled = pd.concat([single_ensg_ids, pd.concat(dfs)])
nucleus_cytoplasm_ensg_unrolled = nucleus_cytoplasm_ensg_unrolled.drop_duplicates()
print(nucleus_cytoplasm_ensg_unrolled.shape)
nucleus_cytoplasm_ensg_unrolled.head()

(2383, 2)


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  """Entry point for launching an IPython kernel.


Unnamed: 0,ensg_id,level_b
8,ENSG00000049769,nucleoplasm
36,ENSG00000156504,nucleoplasm
39,ENSG00000102096,cytosol
46,ENSG00000068394,nucleoplasm
73,ENSG00000133131,nucleoplasm


In [133]:
nucleus_cytoplasm_ensg_unrolled.ensg_id.is_unique

True

In [134]:
csv = os.path.join(data_folder, 'nucleus_cytoplasm_single_ensg.csv')
nucleus_cytoplasm_ensg_unrolled.to_csv(csv, index=False)