In [1]:
import os

DATA_FOLDER = os.path.join('..', 'data')
FIGURE_FOLDER = os.path.join('..', 'figures')

notebook_name = '020_get_nucleus_cytoplasm_genes'

data_folder = os.path.join(DATA_FOLDER, notebook_name)
figure_folder = os.path.join(FIGURE_FOLDER, notebook_name)

! mkdir -p $data_folder
! mkdir -p $figure_folder

input_folder = os.path.join(DATA_FOLDER, '000_filter_for_differentially_localized_proteins')

In [2]:
import pandas as pd

csv = os.path.join(input_folder, 'v17_antibody_localization_tidy.csv')
localization = pd.read_csv(csv)
print(localization.shape)
localization.head()

(50095, 6)


Unnamed: 0,antibody_id,tissue_name,cellular_component,level_a,level_b,level_c
0,167,A-431,Nuclear_speckles,nuclear_speckles,nuclear_bodies,nucleus
1,167,U-2 OS,Cytosol,cytosol,cytosol,cytoplasm
2,167,U-2 OS,Nuclear_speckles,nuclear_speckles,nuclear_bodies,nucleus
3,167,U-251 MG,Cytosol,cytosol,cytosol,cytoplasm
4,167,U-251 MG,Nuclear_speckles,nuclear_speckles,nuclear_bodies,nucleus


In [3]:
csv = os.path.join(input_folder, 'v17_antibody_metadata.csv')
metadata = pd.read_csv(csv)
print(metadata.shape)
metadata.head()

(33456, 7)


Unnamed: 0,antibody_id,prest_id,tissue_name,ensg_id,gene_name,uniprot_id,aa_sequencing_sequence
0,167,231776,A-431,ENSG00000130830,MPP1,Q00013,TEEMTRNISANEFLEFGSYQGNMFGTKFETVHQIHKQNKIAILDIE...
1,167,231776,U-2 OS,ENSG00000130830,MPP1,Q00013,TEEMTRNISANEFLEFGSYQGNMFGTKFETVHQIHKQNKIAILDIE...
2,167,231776,U-251 MG,ENSG00000130830,MPP1,Q00013,TEEMTRNISANEFLEFGSYQGNMFGTKFETVHQIHKQNKIAILDIE...
3,243,230049,A-431,ENSG00000036473,OTC,P00480,ILADYLTLQEHYSSLKGLTLSWIGDGNNILHSIMMSAAKFGMHLQA...
4,243,230049,U-2 OS,ENSG00000036473,OTC,P00480,ILADYLTLQEHYSSLKGLTLSWIGDGNNILHSIMMSAAKFGMHLQA...


### Get metadata for singly localized antibodies across cell lines

They were only found in one place no matter which cell line. Use "Level C" which is coarsest granularity.

In [4]:
single_level_c = localization.groupby(['antibody_id']).filter(lambda x: len(x['level_c'].unique()) == 1)
print(single_level_c.shape)
single_level_c.head()

(16426, 6)


Unnamed: 0,antibody_id,tissue_name,cellular_component,level_a,level_b,level_c
5,243,A-431,Mitochondria,mitochondria,mitochondria,cytoplasm
6,243,U-2 OS,Mitochondria,mitochondria,mitochondria,cytoplasm
7,243,U-251 MG,Mitochondria,mitochondria,mitochondria,cytoplasm
8,244,MCF7,Nucleoplasm,nucleoplasm,nucleoplasm,nucleus
9,244,U-2 OS,Nucleoplasm,nucleoplasm,nucleoplasm,nucleus


In [5]:
merge_cols = ['antibody_id', 'tissue_name']

singly_localized = single_level_c.merge(metadata, left_on=merge_cols, right_on=merge_cols)
print(singly_localized.shape)

## Ignore aa_sequencing_sequence as that's where the antigen binds and isn't hte whole protein sequence
singly_localized = singly_localized.drop(
    columns=['tissue_name', 'cellular_component', 'level_a', 'level_b', 'aa_sequencing_sequence'])
singly_localized = singly_localized.drop_duplicates()
print(singly_localized.shape)
singly_localized.head()

(16426, 11)
(6444, 6)


Unnamed: 0,antibody_id,level_c,prest_id,ensg_id,gene_name,uniprot_id
0,243,cytoplasm,230049,ENSG00000036473,OTC,P00480
3,244,nucleus,230064,ENSG00000049769,PPP1R3F,Q6ZSY5
6,248,secretory,230547,ENSG00000147383,NSDHL,Q15738
12,265,secretory,252500,ENSG00000164292,RHOBTB3,O94955
15,266,nucleus,231887,ENSG00000156504,FAM122B,Q7Z309


In [6]:
nucleus_cytoplasm = singly_localized.query('level_c in ["nucleus", "cytoplasm"]')
print(nucleus_cytoplasm.shape)
nucleus_cytoplasm.head()

(4356, 6)


Unnamed: 0,antibody_id,level_c,prest_id,ensg_id,gene_name,uniprot_id
0,243,cytoplasm,230049,ENSG00000036473,OTC,P00480
3,244,nucleus,230064,ENSG00000049769,PPP1R3F,Q6ZSY5
15,266,nucleus,231887,ENSG00000156504,FAM122B,Q7Z309
18,285,cytoplasm,231642,ENSG00000102096,PIM2,Q9P1W9
21,287,nucleus,230080,ENSG00000068394,GPKOW,Q92917


In [7]:
csv = os.path.join(data_folder, 'nucleus_cytoplasm.csv')
nucleus_cytoplasm.to_csv(csv, index=False)

How many non-unique uniprot ids per antibody?

In [8]:
(nucleus_cytoplasm.groupby('uniprot_id').size() != 1).sum()

510

In [9]:
nucleus_cytoplasm['ensg_id']

0                                          ENSG00000036473
3                                          ENSG00000049769
15                                         ENSG00000156504
18                                         ENSG00000102096
21                                         ENSG00000068394
25                                         ENSG00000133131
27                                         ENSG00000269335
30                                         ENSG00000101811
40                                         ENSG00000147082
45                                         ENSG00000102241
48                                         ENSG00000101825
50                                         ENSG00000198157
52                                         ENSG00000102057
55                                         ENSG00000239900
60                                         ENSG00000153234
65                                         ENSG00000062096
68                                         ENSG000001471

In [10]:
nucleus_cytoplasm['uniprot_id']

0                             P00480
3                             Q6ZSY5
15                            Q7Z309
18                            Q9P1W9
21                            Q92917
25                            Q8TE76
27                            Q9Y6K9
30                            P33240
40                            Q8WWL7
45                            O43719
48                            Q9NR99
50                            P82970
52                            Q9NSA2
55                            P30566
60                            P43354
65                            P54793
68                            Q96HB5
80                            P51608
83                            Q8TB40
85                            Q6UXV4
88                            Q9Y2U9
94                            O75909
95                            P49802
98                            Q15459
101                           O00625
104                           P11586
107                           P55957
1

In [11]:
nucleus_cytoplasm.isnull().sum()

antibody_id     0
level_c         0
prest_id        0
ensg_id         0
gene_name       0
uniprot_id     35
dtype: int64

### Since there's a bunch of null UNIPROT ids, So let's use the ENSG ids

"unroll" the comma-separated ENSG ids

In [12]:
multiple_ensg_ids = nucleus_cytoplasm.loc[nucleus_cytoplasm['ensg_id'].str.contains(",")]
print(multiple_ensg_ids.shape)
multiple_ensg_ids.head(20)

(126, 6)


Unnamed: 0,antibody_id,level_c,prest_id,ensg_id,gene_name,uniprot_id
153,866,cytoplasm,140707,"ENSG00000213463,ENSG00000258644","SYNJ2BP,SYNJ2BP-COX16",P57105
169,914,nucleus,230385,"ENSG00000126945,ENSG00000169045","HNRNPH2,HNRNPH1","P55795,P31943"
357,1648,cytoplasm,232450,"ENSG00000067048,ENSG00000215301","DDX3Y,DDX3X","O15523,O00571"
529,2561,cytoplasm,240037,"ENSG00000173674,ENSG00000198692","EIF1AX,EIF1AY","P47813,O14602"
640,3064,nucleus,141559,"ENSG00000213920,ENSG00000255526","MDP1,NEDD8-MDP1",Q86V88
667,3173,nucleus,340157,"ENSG00000146757,ENSG00000173041","ZNF92,ZNF680","Q03936,Q8NEM1"
689,3203,cytoplasm,340153,"ENSG00000169951,ENSG00000169955","ZNF764,ZNF747","Q96H86,Q9BV97"
856,3881,nucleus,400177,"ENSG00000167081,ENSG00000185630","PBX3,PBX1","P40426,P40424"
926,4099,cytoplasm,650004,"ENSG00000038210,ENSG00000281028","PI4K2B,RP11-717K11.2",Q8TCG2
1127,5726,cytoplasm,1370182,"ENSG00000168970,ENSG00000243789","JMJD7-PLA2G4B,JMJD7",P0C870


Test out the function with `range()` so it only does the first 20 rows

In [13]:
for j, (i, row) in zip(range(20), multiple_ensg_ids.iterrows()):
    ensg_id = row['ensg_id'].split(',')
    gene_name = row['gene_name'].split(',')
    uniprot_id = row['uniprot_id'].split(",")
    if len(uniprot_id) == 1:
        print("ensg_id:", uniprot_id)
        print("uniprot_id:", uniprot_id)
        uniprot_id = [uniprot_id[0]]*len(ensg_id)
    
    df = pd.DataFrame(dict(ensg_id=ensg_id, gene_name=gene_name, uniprot_id=uniprot_id))
    df['antibody_id'] = row['antibody_id']
    df['level_c'] = row['level_c']
    df['prest_id'] = row['prest_id']
    print(df)

ensg_id: ['P57105']
uniprot_id: ['P57105']
           ensg_id      gene_name uniprot_id  antibody_id    level_c  prest_id
0  ENSG00000213463        SYNJ2BP     P57105          866  cytoplasm    140707
1  ENSG00000258644  SYNJ2BP-COX16     P57105          866  cytoplasm    140707
           ensg_id gene_name uniprot_id  antibody_id  level_c  prest_id
0  ENSG00000126945   HNRNPH2     P55795          914  nucleus    230385
1  ENSG00000169045   HNRNPH1     P31943          914  nucleus    230385
           ensg_id gene_name uniprot_id  antibody_id    level_c  prest_id
0  ENSG00000067048     DDX3Y     O15523         1648  cytoplasm    232450
1  ENSG00000215301     DDX3X     O00571         1648  cytoplasm    232450
           ensg_id gene_name uniprot_id  antibody_id    level_c  prest_id
0  ENSG00000173674    EIF1AX     P47813         2561  cytoplasm    240037
1  ENSG00000198692    EIF1AY     O14602         2561  cytoplasm    240037
ensg_id: ['Q86V88']
uniprot_id: ['Q86V88']
           ensg_i

Do it on the whole thing

In [14]:
dfs = []

for i, row in multiple_ensg_ids.iterrows():
    ensg_id = row['ensg_id'].split(',')

    
    try:
        df = pd.DataFrame(dict(ensg_id=ensg_id, #gene_name=gene_name, 
                               #uniprot_id=uniprot_id
                              ))
    except ValueError:
        print("no df -- gene_name:", gene_name)
        print("no df -- ensg_id:", ensg_id)
        print("no df -- uniprot_id:", uniprot_id)
    df['antibody_id'] = row['antibody_id']
    df['level_c'] = row['level_c']
    df['prest_id'] = row['prest_id']
    dfs.append(df)


In [15]:
single_ensg_ids = nucleus_cytoplasm.loc[~nucleus_cytoplasm['ensg_id'].str.contains(",")]
single_ensg_ids = single_ensg_ids.drop(columns=['gene_name', 'uniprot_id'])
print(single_ensg_ids.shape)
single_ensg_ids.head()

(4230, 4)


Unnamed: 0,antibody_id,level_c,prest_id,ensg_id
0,243,cytoplasm,230049,ENSG00000036473
3,244,nucleus,230064,ENSG00000049769
15,266,nucleus,231887,ENSG00000156504
18,285,cytoplasm,231642,ENSG00000102096
21,287,nucleus,230080,ENSG00000068394


In [16]:
nucleus_cytoplasm_ensg_unrolled = pd.concat([single_ensg_ids, pd.concat(dfs)])
print(nucleus_cytoplasm_ensg_unrolled.shape)
nucleus_cytoplasm_ensg_unrolled.head()

(4553, 4)


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  """Entry point for launching an IPython kernel.


Unnamed: 0,antibody_id,ensg_id,level_c,prest_id
0,243,ENSG00000036473,cytoplasm,230049
3,244,ENSG00000049769,nucleus,230064
15,266,ENSG00000156504,nucleus,231887
18,285,ENSG00000102096,cytoplasm,231642
21,287,ENSG00000068394,nucleus,230080


In [19]:
csv = os.path.join(data_folder, 'nucleus_cytoplasm_single_ensg.csv')
nucleus_cytoplasm_ensg_unrolled.to_csv(csv, index=False)