In [1]:
import pandas as pd

In [2]:
# Directory that holds the processed Kac peptide tables
path = 'processed_files/Kac_peptides/'

# One table per experiment, read in experiment order (Exp1, Exp2, Exp3)
dkos3, tacmi, dkotac = (
    pd.read_csv(path + csv_name)
    for csv_name in (
        'Exp1_DKOvsSirt3_RO_Kac.csv',
        'Exp2_DKOvsTACMI_RO_Kac.csv',
        'Exp3_DKOTAC_RO_Kac.csv',
    )
)

In [3]:
# Preview the first five rows of the Exp1 table
dkos3.head(n=5)

Unnamed: 0,Name,Symbol,Accession,Description,Entrez,Gene Symbol,Modifications,MitoCarta2_List,Matrix,IMS,...,DKOvsDFC_p_value,S3KOvsS3FL_p_value,DKOvsS3KO_p_value,DFCvsS3FL_p_value,F,F.p.value,DKOvsDFC_significant,S3KOvsS3FL_significant,DKOvsS3KO_significant,DFCvsS3FL_significant
0,17beta-hydroxysteroid dehydrogenase type 10/sh...,Hsd17b10,Q99N15,17beta-hydroxysteroid dehydrogenase type 10/sh...,15108.0,Hsd17b10,1xTMT6plex [K6]; 1xTMT6plex [N-Term]; 1xAcetyl...,True,True,False,...,1.686036e-07,3.829899e-06,0.302903,0.005327,120.36096,3.242435e-07,1.0,1.0,0.0,0.0
1,"Acetyl-CoA acetyltransferase, mitochondrial",Acat1,Q8QZT1,"Acetyl-CoA acetyltransferase, mitochondrial OS...",110446.0,Acat1,1xTMT6plex [K8]; 1xTMT6plex [N-Term]; 1xAcetyl...,True,True,False,...,7.039662e-10,5.57704e-09,0.477078,0.00046,501.688984,8.92374e-10,1.0,1.0,0.0,-1.0
2,"Trifunctional enzyme subunit alpha, mitochondrial",Hadha,Q8BMS1,"Trifunctional enzyme subunit alpha, mitochondr...",97212.0,Hadha,1xTMT6plex [K7]; 1xTMT6plex [N-Term]; 1xAcetyl...,True,True,False,...,4.146363e-05,0.0003045467,0.592053,0.239908,31.824466,6.417063e-05,1.0,1.0,0.0,0.0
3,"Trifunctional enzyme subunit alpha, mitochondrial",Hadha,Q8BMS1,"Trifunctional enzyme subunit alpha, mitochondr...",97212.0,Hadha,2xTMT6plex [K]; 1xTMT6plex [N-Term]; 2xAcetyl ...,True,True,False,...,9.051101e-07,0.02060254,0.061715,6.9e-05,59.627143,5.523857e-06,1.0,1.0,0.0,-1.0
4,"Trifunctional enzyme subunit beta, mitochondrial",Hadhb,Q99JY0,"Trifunctional enzyme subunit beta, mitochondri...",231086.0,Hadhb,1xTMT6plex [K9]; 1xTMT6plex [N-Term]; 1xAcetyl...,True,False,False,...,2.292058e-07,4.067101e-06,0.254298,0.012443,113.422559,4.130911e-07,1.0,1.0,0.0,0.0


# Process Experiments to extract the mitochondrial acetylated residues

This analysis looks only at mitochondrial Kac sites because the model targets this intracellular compartment.

## Exp1

In [4]:
# Break each 'Modifications in Proteins' string into one column per entry.
# The negative lookahead leaves '; ' separators that are followed by 'K'
# untouched, so multi-site lists stay inside a single entry.
dkos3_kac_sites = (
    dkos3['Modifications in Proteins']
    .str.split(pat='; (?!K)', expand=True)
)

In [5]:
# list to hold the sites associated with the same master protein accession
dkos3_sites = []
# Walk the split columns directly (labelled 0..n-1 by expand=True); this
# replaces np.arange, which relied on a numpy import this notebook never makes.
for i in dkos3_kac_sites.columns:
    # True where the first space-separated token of the entry equals the
    # row's own Accession
    bool_mask = dkos3.Accession == dkos3_kac_sites[i].str.split(' ', expand=True)[0]
    try:
        # Split on whitespace that is not preceded by ';' and keep the third
        # field (presumably '<accession> <modification> [<sites>]' -- confirm
        # against the raw column).
        temp_sites = dkos3_kac_sites[i].loc[bool_mask].str.split(r'(?<!;)\s', expand=True)[2]
    except KeyError:
        # No selected entry in this column yielded a third field; skip it.
        continue
    dkos3_sites.append(temp_sites)

In [6]:
# Stack the per-column results into a single Series (row labels are kept)
dkos3_sites = pd.concat(dkos3_sites)

# Remove the parenthesised numeric confidence, then unwrap the brackets.
# regex=True is explicit because pandas >= 2.0 treats the pattern as a
# literal string by default, which would leave the values unchanged.
dkos3_sites = (
    dkos3_sites
    .str.replace(r'\(\d{1,6}\)', '', regex=True)
    .str.replace(r'\[(.+)\]', r'\g<1>', regex=True)
)

# Name the Series so it becomes the 'Acetylated Residues' column when merged
dkos3_sites = dkos3_sites.rename('Acetylated Residues')

dkos3_sites.head(10)

1           K260
2           K289
3     K411; K414
4           K189
5            K46
6           K406
7           K316
8            K70
9            K46
11          K350
Name: Acetylated Residues, dtype: object

In [7]:
# Attach the extracted residues to the Exp1 table by row index; the inner
# join keeps only rows for which residues were extracted.
dkos3 = pd.merge(
    dkos3,
    dkos3_sites.to_frame(),
    how='inner',
    left_index=True,
    right_index=True,
)

In [8]:
# Keep rows flagged in MitoCarta2_List, index the residues by accession and
# spread '; '-separated residues over one column each.
dkos3_kac_sites = (
    dkos3
    .loc[dkos3.MitoCarta2_List, ['Accession', 'Acetylated Residues']]
    .set_index('Accession')['Acetylated Residues']
    .str.split('; ', expand=True)
)

dkos3_kac_sites.head()

Unnamed: 0_level_0,0,1
Accession,Unnamed: 1_level_1,Unnamed: 2_level_1
Q99N15,K104,
Q8QZT1,K260,
Q8BMS1,K289,
Q8BMS1,K411,K414
Q99JY0,K189,


In [9]:
# Reshape to long form: one (Accession, kac_residue) pair per row, with the
# empty slots left by the expand step removed, ordered by accession.
dkos3_kac_sites = (
    dkos3_kac_sites
    .reset_index()
    .melt(id_vars=['Accession'], value_name='kac_residue')
    .drop(columns='variable')
    .dropna()
    .sort_values('Accession')
)

dkos3_kac_sites.head()

Unnamed: 0,Accession,kac_residue
50,A0A0R4J023,K179
236,A0A0R4J083,K43
209,A0A0R4J083,K42
244,A0A0R4J083,K92
59,A0A0R4J094,K233


## Exp2 

In [10]:
# Break each 'Modifications in Proteins' string into one column per entry.
# The negative lookahead leaves '; ' separators that are followed by 'K'
# untouched, so multi-site lists stay inside a single entry.
tacmi_kac_sites = (
    tacmi['Modifications in Proteins']
    .str.split(pat='; (?!K)', expand=True)
)

In [11]:
# list to hold the sites associated with the same master protein accession
tacmi_sites = []
# Walk the split columns directly (labelled 0..n-1 by expand=True); this
# replaces np.arange, which relied on a numpy import this notebook never makes.
for i in tacmi_kac_sites.columns:
    # True where the first space-separated token of the entry equals the
    # row's own Accession
    bool_mask = tacmi.Accession == tacmi_kac_sites[i].str.split(' ', expand=True)[0]
    try:
        # Split on whitespace that is not preceded by ';' and keep the third
        # field (presumably '<accession> <modification> [<sites>]' -- confirm
        # against the raw column).
        temp_sites = tacmi_kac_sites[i].loc[bool_mask].str.split(r'(?<!;)\s', expand=True)[2]
    except KeyError:
        # No selected entry in this column yielded a third field; skip it.
        continue
    tacmi_sites.append(temp_sites)

In [12]:
# Stack the per-column results into a single Series (row labels are kept)
tacmi_sites = pd.concat(tacmi_sites)

# Remove the parenthesised numeric confidence, then unwrap the brackets.
# regex=True is explicit because pandas >= 2.0 treats the pattern as a
# literal string by default, which would leave the values unchanged.
tacmi_sites = (
    tacmi_sites
    .str.replace(r'\(\d{1,6}\)', '', regex=True)
    .str.replace(r'\[(.+)\]', r'\g<1>', regex=True)
)

# Name the Series so it becomes the 'Acetylated Residues' column when merged
tacmi_sites = tacmi_sites.rename('Acetylated Residues')

tacmi_sites.head(10)

0           K60
1    K411; K413
2    K411; K413
3          K350
4          K209
5          K283
6          K393
7          K209
8          K239
9           K60
Name: Acetylated Residues, dtype: object

In [13]:
# Attach the extracted residues to the Exp2 table by row index; the inner
# join keeps only rows for which residues were extracted.
tacmi = pd.merge(
    tacmi,
    tacmi_sites.to_frame(),
    how='inner',
    left_index=True,
    right_index=True,
)

In [14]:
# Keep rows flagged in MitoCarta2_List, index the residues by accession and
# spread '; '-separated residues over one column each.
tacmi_kac_sites = (
    tacmi
    .loc[tacmi.MitoCarta2_List, ['Accession', 'Acetylated Residues']]
    .set_index('Accession')['Acetylated Residues']
    .str.split('; ', expand=True)
)

tacmi_kac_sites.head()

Unnamed: 0_level_0,0,1
Accession,Unnamed: 1_level_1,Unnamed: 2_level_1
Q9WUR2,K60,
Q8BMS1,K411,K413
Q8BMS1,K411,K413
Q8BMS1,K350,
Q8BWT1,K209,


In [15]:
# Reshape to long form: one (Accession, kac_residue) pair per row, with the
# empty slots left by the expand step removed, ordered by accession.
tacmi_kac_sites = (
    tacmi_kac_sites
    .reset_index()
    .melt(id_vars=['Accession'], value_name='kac_residue')
    .drop(columns='variable')
    .dropna()
    .sort_values('Accession')
)

tacmi_kac_sites.head()

Unnamed: 0,Accession,kac_residue
272,A0A0A0MQF6,K295
541,A0A0A0MQF6,K239
668,A0A0A0MQF6,K218
304,A0A0R4J023,K179
330,A0A0R4J023,K179


## Exp3

In [16]:
# Break each 'Modifications in Proteins' string into one column per entry.
# The negative lookahead leaves '; ' separators that are followed by 'K'
# untouched, so multi-site lists stay inside a single entry.
dkotac_kac_sites = (
    dkotac['Modifications in Proteins']
    .str.split(pat='; (?!K)', expand=True)
)

In [17]:
# list to hold the sites associated with the same master protein accession
dkotac_sites = []
# Walk the split columns directly (labelled 0..n-1 by expand=True); this
# replaces np.arange, which relied on a numpy import this notebook never makes.
for i in dkotac_kac_sites.columns:
    # True where the first space-separated token of the entry equals the
    # row's own Accession
    bool_mask = dkotac.Accession == dkotac_kac_sites[i].str.split(' ', expand=True)[0]
    try:
        # Split on whitespace that is not preceded by ';' and keep the third
        # field (presumably '<accession> <modification> [<sites>]' -- confirm
        # against the raw column).
        temp_sites = dkotac_kac_sites[i].loc[bool_mask].str.split(r'(?<!;)\s', expand=True)[2]
    except KeyError:
        # No selected entry in this column yielded a third field; skip it.
        continue
    dkotac_sites.append(temp_sites)

In [18]:
# Stack the per-column results into a single Series (row labels are kept)
dkotac_sites = pd.concat(dkotac_sites)

# Remove the parenthesised numeric confidence, then unwrap the brackets.
# regex=True is explicit because pandas >= 2.0 treats the pattern as a
# literal string by default, which would leave the values unchanged.
dkotac_sites = (
    dkotac_sites
    .str.replace(r'\(\d{1,6}\)', '', regex=True)
    .str.replace(r'\[(.+)\]', r'\g<1>', regex=True)
)

# Name the Series so it becomes the 'Acetylated Residues' column when merged
dkotac_sites = dkotac_sites.rename('Acetylated Residues')

dkotac_sites.head(10)

0     K265
3     K542
4     K406
5      K70
6     K331
7      K94
8     K189
9      K46
10    K259
11    K282
Name: Acetylated Residues, dtype: object

In [19]:
# Attach the extracted residues to the Exp3 table by row index; the inner
# join keeps only rows for which residues were extracted.
dkotac = pd.merge(
    dkotac,
    dkotac_sites.to_frame(),
    how='inner',
    left_index=True,
    right_index=True,
)

In [20]:
# Keep rows flagged in MitoCarta2_List, index the residues by accession and
# spread '; '-separated residues over one column each.
dkotac_kac_sites = (
    dkotac
    .loc[dkotac.MitoCarta2_List, ['Accession', 'Acetylated Residues']]
    .set_index('Accession')['Acetylated Residues']
    .str.split('; ', expand=True)
)

dkotac_kac_sites.head()

Unnamed: 0_level_0,0,1,2
Accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Q8QZT1,K265,,
Q99N15,K104,,
P97807-2,K427,,
D3Z7X0,K542,,
Q8BMS1,K406,,


In [21]:
# Reshape to long form: one (Accession, kac_residue) pair per row, with the
# empty slots left by the expand step removed, ordered by accession.
dkotac_kac_sites = (
    dkotac_kac_sites
    .reset_index()
    .melt(id_vars=['Accession'], value_name='kac_residue')
    .drop(columns='variable')
    .dropna()
    .sort_values('Accession')
)

dkotac_kac_sites.head()

Unnamed: 0,Accession,kac_residue
1368,A0A0A0MQF6,K131
1014,A0A0A0MQF6,K358
1632,A0A0A0MQF6,K163
1462,A0A0A0MQF6,K163
1519,A0A0A0MQF6,K110


# Compare the overlap

While performing Experiments 1-3, we initially had trouble with polyethylene glycol (PEG) contaminants. As a result, the earlier experiments had fewer quantified acetylpeptides (these PEG compounds suppressed ionization on the instrument). Ultimately, we traced the PEG to the inside of the microcentrifuge tubes we used. To reduce this interference, we washed all the tubes used in later experiments with methanol (then let them dry) followed by ethanol (then let them dry). These tubes were used within a few weeks of washing to ensure that leaching from the polymer matrix didn't reverse our efforts.

In [22]:
# Build a single '<Accession> <residue>' key per site so that sites can be
# compared across experiments.
dkos3_kac_sites['combined'] = dkos3_kac_sites['Accession'].str.cat(
    dkos3_kac_sites['kac_residue'], sep=' ')
tacmi_kac_sites['combined'] = tacmi_kac_sites['Accession'].str.cat(
    tacmi_kac_sites['kac_residue'], sep=' ')
dkotac_kac_sites['combined'] = dkotac_kac_sites['Accession'].str.cat(
    dkotac_kac_sites['kac_residue'], sep=' ')

In [23]:
# One-column frames of the distinct site keys seen in each experiment; the
# column is named after the experiment so the later merges stay readable.
dkos3_unique = pd.DataFrame({'dkos3': dkos3_kac_sites['combined'].unique()})
tacmi_unique = pd.DataFrame({'tacmi': tacmi_kac_sites['combined'].unique()})
dkotac_unique = pd.DataFrame({'dkotac': dkotac_kac_sites['combined'].unique()})

In [26]:
# Report how many distinct sites each experiment contributed
print('Number of Unique Acetyl Sites')
for exp_number, unique_sites in enumerate((dkos3_unique, tacmi_unique, dkotac_unique), start=1):
    print(f'Exp. {exp_number}: {len(unique_sites)}')

Number of Unique Acetyl Sites
Exp. 1: 242
Exp. 2: 623
Exp. 3: 1216


In [41]:
# Inner joins on the site key: each result holds the sites shared by a pair
# of experiments.
exp1_exp2 = pd.merge(dkos3_unique, tacmi_unique, how='inner',
                     left_on='dkos3', right_on='tacmi')

exp1_exp3 = pd.merge(dkos3_unique, dkotac_unique, how='inner',
                     left_on='dkos3', right_on='dkotac')

exp2_exp3 = pd.merge(tacmi_unique, dkotac_unique, how='inner',
                     left_on='tacmi', right_on='dkotac')

In [42]:
# Share of the unique Exp1 sites that were also found in Exp2
fraction = len(exp1_exp2) / len(dkos3_unique)

f'Fraction of Mito Kac sites found Exp1 also found in Exp2: {fraction:.0%}'

'Fraction of Mito Kac sites found Exp1 also found in Exp2: 86%'

In [43]:
# Share of the unique Exp1 sites that were also found in Exp3
fraction = len(exp1_exp3) / len(dkos3_unique)

f'Fraction of Mito Kac sites found Exp1 also found in Exp3: {fraction:.0%}'

'Fraction of Mito Kac sites found Exp1 also found in Exp3: 93%'

In [44]:
# Share of the unique Exp2 sites that were also found in Exp3
fraction = len(exp2_exp3) / len(tacmi_unique)

f'Fraction of Mito Kac sites found Exp2 also found in Exp3: {fraction:.0%}'

'Fraction of Mito Kac sites found Exp2 also found in Exp3: 90%'