This notebook consolidates protein groups that share the same set of ENSG IDs. For a more detailed explanation, please read https://czbiohub.atlassian.net/l/cp/vS2GnF2b

In [21]:
import sys

# The notebook requires scripts from OpenCell (https://github.com/czbiohub/opencell/)
# designate where the OpenCell repo is cloned, and import modules
sys.path.append('../../opencell/')

# appending path for Pyseus
sys.path.append('../')

from opencell.database import ms_utils, utils

from pyseus import basic_processing as ip

import pandas as pd
import sqlalchemy
from sqlalchemy.orm import sessionmaker


In [23]:
# The ensembl_uniprot table is used to map protein IDs to ENSG ids.
# Build the OC database URL from the credentials file and create the engine.
url = utils.url_from_credentials('../data/OC_database/db-credentials-cap.json')
engine = sqlalchemy.create_engine(url)

# Fail fast if the database is unreachable. Using the connection as a context
# manager releases it as soon as the check is done, instead of discarding the
# handle from a bare engine.connect() and relying on garbage collection.
with engine.connect():
    pass

# initiate session
Session = sessionmaker()
Session.configure(bind=engine)
session = Session()

# Read the whole association table into a DataFrame.
ensembl_uniprot = pd.read_sql('select * from ensembl_uniprot_association', engine)

In [18]:
# Input: the renamed proteinGroups table from the OC_Plate_22-25_MBR output.
# The file is tab-separated and its first column is used as the index.
pgroups = pd.read_csv(
    '../data/OC_Plate_22-25_MBR/proteinGroups_renamed.txt',
    sep='\t',
    index_col=0,
    low_memory=False,
)

# Basic filtering with Pyseus to remove irrelevant rows.
process = ip.RawTables(
    proteingroup=pgroups,
    file_designated=True,
    intensity_type='LFQ intensity',
)
process.filter_table()

# Keep an independent copy of the filtered result for downstream work.
filtered_table = process.filtered_table.copy()

# Protein group consolidation only needs the 'Protein IDs' column for now;
# it is kept as a single-column DataFrame.
protein_ids = filtered_table.loc[:, ['Protein IDs']].copy()

Filtered 169 of 3829 rows. Now 3660 rows.


In [None]:
# first, hash the Protein IDs


In [16]:
# Display the filtered protein-groups table stored on `process` by filter_table().
process.filtered_table

Unnamed: 0,Protein IDs,Majority protein IDs,Protein names,Gene names,P022_RANBP2_01,P022_COMMD1_01,P022_COMMD2_01,P022_COMMD4_01,P022_COMMD6_01,P022_RPL5_01,...,P025_ARL14EP_03,P025_ARPC3_03,P025_YWHAE_03,P025_TRAPPC2_03,P025_ARL6_03,P025_APPL1_03.1,P025_YWHAB_03,P025_TRAPP_03,P025_ARL8B_03,P025_ACTR2_03
0,A0A024QZP7;P06493-2;P06493;E5RIU6;A0A087WZZ9,A0A024QZP7;P06493-2;P06493;E5RIU6;A0A087WZZ9,Cyclin-dependent kinase 1,CDC2;CDK1,11384.0,8629.3,4886.0,16573.0,7598.7,4839.70,...,34538.00,12880.0,85629.0,27395.000,127870.0,9876.0,23344.00,117880.00,74614.0,145260.0
1,A0A087WYK3;A0A0J9YXZ4;A0A087WU71;A0A087WTV5;A0...,A0A087WYK3;A0A0J9YXZ4;A0A087WU71;A0A087WTV5;A0...,Transcriptional adapter 2-alpha,TADA2A;TADA2L,0.0,0.0,0.0,0.0,0.0,0.00,...,0.00,0.0,0.0,0.000,0.0,0.0,0.00,0.00,0.0,0.0
2,E9PNH7;A0A024R3B9;P02511;E9PJL7;E9PRA8,E9PNH7;A0A024R3B9;P02511;E9PJL7;E9PRA8,Alpha-crystallin B chain,CRYAB,0.0,0.0,0.0,0.0,0.0,0.00,...,0.00,0.0,0.0,0.000,0.0,0.0,0.00,0.00,0.0,0.0
3,Q00341;A0A024R4E5;Q00341-2;H0Y394;C9JK79;C9JIZ...,Q00341;A0A024R4E5;Q00341-2;H0Y394,Vigilin,HDLBP,7018.7,9912.4,9951.9,6666.3,7593.9,17462.00,...,0.00,12494.0,16726.0,24440.000,16161.0,10380.0,79663.00,13942.00,60322.0,135710.0
4,A0A024RA52;P25787;H3BT36;H7C402;C9JCK5,A0A024RA52;P25787;H3BT36,Proteasome subunit alpha type;Proteasome subun...,PSMA2,0.0,0.0,3319.9,0.0,0.0,671.73,...,0.00,0.0,14305.0,0.000,0.0,6410.2,49793.00,5806.70,0.0,139070.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3765,Q9Y6X8;H0YKA3,Q9Y6X8,Zinc fingers and homeoboxes protein 2,ZHX2,0.0,0.0,0.0,0.0,0.0,0.00,...,0.00,0.0,0.0,0.000,0.0,0.0,0.00,0.00,0.0,0.0
3767,R4GMX3;R4GMW6;Q9UBI1;P35226;H0Y6Z9;Q5T8Z4;R4GN...,R4GMX3;R4GMW6;Q9UBI1;P35226,COMM domain-containing protein 3;Polycomb comp...,COMMD3-BMI1;COMMD3;BMI1,0.0,3565200.0,222050.0,893110.0,158940.0,0.00,...,0.00,0.0,0.0,0.000,0.0,0.0,658.64,3541.90,0.0,0.0
3768,R4GNH3;P17980;E9PN50;E9PM69;E9PMD8;E9PKD5;E9PLG2,R4GNH3;P17980;E9PN50;E9PM69;E9PMD8;E9PKD5,26S protease regulatory subunit 6A,PSMC3,7202.9,1929.8,10323.0,10158.0,2223.0,7470.80,...,8712.90,0.0,60981.0,9802.600,6008.2,35789.0,40056.00,3697.30,6545.5,121270.0
3827,S4R3H4;E7EQT4;Q9UKV3-5;Q9UKV3;G3V3B0;Q9UKV3-3;...,S4R3H4;E7EQT4;Q9UKV3-5;Q9UKV3,Apoptotic chromatin condensation inducer in th...,ACIN1,0.0,10847.0,0.0,0.0,1153.6,0.00,...,800.16,0.0,1074.5,73.715,0.0,0.0,232.47,319.37,1695.1,364.5
