In [2]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
from matplotlib import ticker
from IPython.display import display, HTML
import re # finding specific patterns in str
import textwrap # split text into equal parts
import collections # return repeated items in list
from collections import OrderedDict
import time
import sys
import ast # convert str back to tuple/list/int, etc

# FUNCTIONS
def glycansource(df, colname):
    """Insert a 'GlycanSource' column right after ``colname`` that classifies each row.

    The Byonic glycan ('Glycans[Byonic]') is compared with the pGlyco glycan
    ('GlycanComposition_ByonicStyle[pGlyco]'); -1 marks an absent value:

        'B'   -> glycan present only in Byonic
        'P'   -> glycan present only in pGlyco
        'B=P' -> present in both and identical
        'B≠P' -> present in both but different
        '-1'  -> absent in both

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both glycan columns. It is modified in place.
    colname : str
        Existing column after which 'GlycanSource' is inserted.

    Returns
    -------
    None
    """
    byonic = df['Glycans[Byonic]']
    pglyco = df['GlycanComposition_ByonicStyle[pGlyco]']
    conditions = [
        (byonic != -1) & (pglyco == -1),
        (byonic == -1) & (pglyco != -1),
        (byonic != -1) & (pglyco != -1) & (byonic == pglyco),
        (byonic != -1) & (pglyco != -1) & (byonic != pglyco)]
    choices = ['B', 'P', 'B=P', 'B≠P']
    # The default must be a string like the choices: older NumPy silently cast an
    # integer -1 to '-1', while newer NumPy refuses to mix str choices with an int
    # default and raises TypeError. Using '-1' gives the same result everywhere.
    glycan_source = np.select(conditions, choices, default='-1')
    df.insert(df.columns.get_loc(colname) + 1, 'GlycanSource', glycan_source, True)
# define mask function
def threshold_masks_colorind(df):
    """Compute the module-level boolean masks and row-index lists used for colouring.

    Nothing is returned: every mask (``*_mask``) and every index list (``*_ind``)
    is published as a global so that other code in the notebook can read it.

    Byonic "passes" when Score > 200 and |PEP 2D| < 0.001; pGlyco "passes" when
    PepScore > 5 and GlyScore > 4. For ETD rows Byonic has no threshold, only a
    presence check (value != -1).
    """
    global b_hcd_mask, p_hcd_mask, both_hcd_mask, b_etd_mask, p_etd_mask, both_etd_mask
    global b_glycansource_mask, p_glycansource_mask, byos_exclusiveOr_mask, byos_and_mask, byos_bothsame_mask
    global lightgreen_ind, lightblue_ind, lightorange_ind, normalgreen_ind, normalblue_ind, normalorange_ind, deepgreen_ind, deepblue_ind, lightpink_ind, deeppink_ind, yellow_ind

    # ---- Byonic vs pGlyco -------------------------------------------------
    frag_b = df['Fragment\r\nType[Byonic]']
    score_b = df['Score[Byonic]']
    pep2d_b = df['PEP\r\n2D[Byonic]']
    pep_p = df['PepScore[pGlyco]']
    gly_p = df['GlyScore[pGlyco]']
    frag_p = df['FragmentType[pGlyco]']

    # reusable building blocks
    byonic_pass = (score_b > 200) & (pep2d_b.abs() < 0.001)
    byonic_fail = (score_b <= 200) | (pep2d_b.abs() >= 0.001)
    byonic_present = (score_b != -1) & (pep2d_b != -1)
    byonic_absent = (score_b == -1) & (pep2d_b == -1)
    pglyco_pass = (pep_p > 5) & (gly_p > 4)
    pglyco_fail = (pep_p <= 5) | (gly_p <= 4)

    # HCD
    b_hcd_mask = (frag_b == 'hcd') & byonic_pass & pglyco_fail
    p_hcd_mask = (frag_p == 'hcd') & byonic_fail & pglyco_pass
    both_hcd_mask = (frag_b == 'hcd') & (frag_p == 'hcd') & byonic_pass & pglyco_pass
    # ETD: Byonic only has to be present (or absent, for the pGlyco-only case)
    b_etd_mask = (frag_b == 'ethcd') & byonic_present & pglyco_fail
    p_etd_mask = (frag_p == 'ethcd') & byonic_absent & pglyco_pass
    both_etd_mask = (frag_b == 'ethcd') & (frag_p == 'ethcd') & byonic_present & pglyco_pass

    # (HCD xor ETD) rows whose glycan was reported by both engines: these get the deep colours
    source = df['GlycanSource']
    in_both_engines = (source == 'B=P') ^ (source == 'B≠P')
    byonic_only_pass = b_hcd_mask ^ b_etd_mask
    pglyco_only_pass = p_hcd_mask ^ p_etd_mask
    b_glycansource_mask = in_both_engines & byonic_only_pass & (~pglyco_only_pass)
    p_glycansource_mask = in_both_engines & pglyco_only_pass & (~byonic_only_pass)

    # ---- Byonic vs Byos ---------------------------------------------------
    mass_byos = df['Calc.M[Byos]']
    mass_byonic = df['Calc.M[Byonic]']
    seq_byos = df['PureSequence[Byos]']
    seq_byonic = df['PureSequence[Byonic]']
    both_masses_present = (mass_byos != -1) & (mass_byonic != -1)
    mass_differs = mass_byos != mass_byonic
    seq_differs = seq_byos != seq_byonic
    byos_exclusiveOr_mask = both_masses_present & (mass_differs ^ seq_differs)
    byos_and_mask = both_masses_present & mass_differs & seq_differs
    byos_bothsame_mask = both_masses_present & (mass_byos == mass_byonic) & (seq_byos == seq_byonic)

    # ---- row labels per colour -------------------------------------------
    lightgreen_ind = b_hcd_mask[b_hcd_mask].index.tolist()
    lightblue_ind = p_hcd_mask[p_hcd_mask].index.tolist()
    lightorange_ind = both_hcd_mask[both_hcd_mask].index.tolist()
    normalgreen_ind = b_etd_mask[b_etd_mask].index.tolist()
    normalblue_ind = p_etd_mask[p_etd_mask].index.tolist()
    normalorange_ind = both_etd_mask[both_etd_mask].index.tolist()
    deepgreen_ind = b_glycansource_mask[b_glycansource_mask].index.tolist()
    deepblue_ind = p_glycansource_mask[p_glycansource_mask].index.tolist()
    lightpink_ind = byos_exclusiveOr_mask[byos_exclusiveOr_mask].index.tolist()
    deeppink_ind = byos_and_mask[byos_and_mask].index.tolist()
    yellow_ind = byos_bothsame_mask[byos_bothsame_mask].index.tolist()
def bg_color(x):
    """Return a frame of CSS 'background-color' strings shaped like ``x``.

    Reads the module-level masks (``b_hcd_mask`` ... ``byos_bothsame_mask``) and
    paints the '[Byonic]'/'[pGlyco]' columns or the '[Byos]' columns of every
    matching row. Rules are applied in the listed order, so a later rule
    overrides an earlier one on the same cells. A colour summary is printed.
    """
    # start from an all-empty style frame with the same labels as the input
    styles = pd.DataFrame('', index=x.index, columns=x.columns)
    every_col = styles.columns.tolist()
    # byonic & byos coloring range
    bb_range = [name for name in every_col if '[Byos]' in name]
    # byonic & pglyco coloring range
    bp_range = [name for name in every_col if '[Byonic]' in name or '[pGlyco]' in name]

    # (mask, target columns, css, summary line) -- order matters
    rules = [
        # HCD: light colors
        (b_hcd_mask, bp_range, 'background-color: #98FB98', '<Color Summary>\n%s rows will be colored light green (#98FB98).'),
        (p_hcd_mask, bp_range, 'background-color: #add8e6', '%s rows will be colored light blue (#add8e6).'),
        (both_hcd_mask, bp_range, 'background-color: #ffedcc', '%s rows will be colored light orange (#ffedcc).'),
        # ETD: normal colors
        (b_etd_mask, bp_range, 'background-color: #008000', '%s rows will be colored green (#008000).'),
        (p_etd_mask, bp_range, 'background-color: #0000FF', '%s rows will be colored blue (#0000FF).'),
        (both_etd_mask, bp_range, 'background-color: #FFA500', '%s rows will be colored orange (#FFA500).'),
        # GlycanSource B=P / B≠P: deep colors
        (b_glycansource_mask, bp_range, 'background-color: #004d00', '%s rows will be colored dark green (#004d00).'),
        (p_glycansource_mask, bp_range, 'background-color: #00008B', '%s rows will be colored dark blue (#00008B).'),
        # byos colors
        (byos_exclusiveOr_mask, bb_range, 'background-color: #FFB6C1', '%s rows will be colored light pink (#FFB6C1).'),
        (byos_and_mask, bb_range, 'background-color: #FF1493', '%s rows will be colored deep pink (#FF1493).'),
        (byos_bothsame_mask, bb_range, 'background-color: #FFFF00', '%s rows will be colored yellow (#FFFF00).'),
    ]

    row_counts = []
    for mask, target_cols, css, summary in rules:
        styles.loc[mask, target_cols] = css
        matched = len(styles.loc[mask, target_cols])
        row_counts.append(matched)
        print(summary%matched)

    # rows left uncoloured: the deep colours (rules 7-8) only repaint rows already counted in rules 1-6
    bp_white = len(styles) - sum(row_counts[:6])
    bb_white = len(styles) - sum(row_counts[8:])
    print('%s rows will be colorless in byonic & pglyco data.'%bp_white)
    print('%s rows will be colorless in byos data (absent data in certain scans).'%bb_white)
    return styles
def move_df(df, move_col, insert_left_col):
    """Relocate column ``move_col`` so it sits immediately right of ``insert_left_col``.

    ``df`` is modified in place; nothing is returned.
    """
    # keep the column's data before removing it from the frame
    moved_values = df[move_col]
    df.drop(columns=[move_col], inplace=True)
    # the target position is looked up only after the drop, so it reflects the new layout
    target_pos = df.columns.get_loc(insert_left_col) + 1
    df.insert(target_pos, move_col, moved_values)


# Load the sorted "_UniquePep" workbook as the starting point; empty cells become the string 'N/A'.
id_df = pd.read_excel('20210617_BBP_hcdethcd_UniquePep.xlsx', header = 0)
id_df = id_df.fillna('N/A')
# Replace the literal text '_x000D_' in column headers with '\r' (the lookups below use '...\r\n...' column names).
fixed_colname = [i.replace('_x000D_', '\r') if '_x000D_' in i else i for i in id_df.columns]
id_df.columns = fixed_colname
# Quantification only applies to single sites: keep rows whose Byonic N-site is a Python int.
# NOTE: the -1 sentinel is an int as well, so those rows are kept here and filtered out later (quant_bb).
quant = id_df[id_df['N-site(SequonBased)[Byonic]'].apply(lambda x: isinstance(x, int))]
# Rows to drop: Byonic ethcd rows that fail the score/PEP2D threshold (Score <= 200 or |PEP 2D| >= 0.001) AND have no pair (-1 or 'N/A').
drop_ind = quant.loc[(quant['Fragment\r\nType[Byonic]'] == 'ethcd') & ((quant['Score[Byonic]'] <= 200)|(quant['PEP\r\n2D[Byonic]'].abs() >= 0.001)) & ((quant['Pair[Byonic]'] == -1)|(quant['Pair[Byonic]'] == 'N/A'))].index.tolist()
ind = [i for i in quant.index.tolist() if i not in drop_ind]
quant = quant.loc[ind] # hcd rows all stay; byonic etd rows stay if they pass score & pep2d, otherwise only when they have a pair
# Normalization denominators: column totals of xicauc / int / mono / iso, ignoring the -1 "absent" sentinel.
total_xicauc = quant.loc[quant['XIC\r\nAUC[Byos]'] != -1, 'XIC\r\nAUC[Byos]'].sum()
print('total_xicauc:%s'%total_xicauc)
total_int = quant.loc[quant['Apex Int.\r\n(Posit)[Byos]'] != -1, 'Apex Int.\r\n(Posit)[Byos]'].sum()
print('total_int:%s'%total_int)
total_mono = quant.loc[quant['MonoArea[pGlyco]'] != -1, 'MonoArea[pGlyco]'].sum()
print('total_mono:%s'%total_mono)
total_iso = quant.loc[quant['IsotopeArea[pGlyco]'] != -1, 'IsotopeArea[pGlyco]'].sum()
print('total_iso:%s'%total_iso)
# Avoid summing values from below-threshold rows. threshold_masks_colorind() publishes deepgreen_ind / deepblue_ind as globals:
# deepgreen rows get their pGlyco areas set to -1, deepblue rows get their Byos values set to -1
# (-1 so the "x != -1" filters in the lambdas below skip them).
threshold_masks_colorind(quant)
quant.loc[deepgreen_ind, ['MonoArea[pGlyco]', 'IsotopeArea[pGlyco]']] = -1
quant.loc[deepblue_ind, ['XIC\r\nAUC[Byos]', 'Apex Int.\r\n(Posit)[Byos]']] = -1
# Convert every 'N/A' Byonic glycan to 'Unoccupied'.
quant.loc[(quant['Glycans[Byonic]'] == 'N/A'), ['Glycans[Byonic]']] = 'Unoccupied'
# Add 8 columns via groupby().transform(): per-group sums (e_-h_) and sums divided by the column total (a_-d_).
# Byos columns are grouped by (Byonic N-site, Byonic glycan); pGlyco columns by (pGlyco site, pGlyco glycan).
# Rows in a group may have different sequences. The a-h prefixes exist only to control column sort order later.
quant['e_sum_XIC\r\nAUC[Byos]'] = quant.groupby(['N-site(SequonBased)[Byonic]', 'Glycans[Byonic]'])['XIC\r\nAUC[Byos]'].transform(lambda x: x[x != -1].sum())
quant['f_sum_Apex Int.\r\n(Posit)[Byos]'] = quant.groupby(['N-site(SequonBased)[Byonic]', 'Glycans[Byonic]'])['Apex Int.\r\n(Posit)[Byos]'].transform(lambda x: x[x != -1].sum())
quant['g_sum_MonoArea[pGlyco]'] = quant.groupby(['ProSites[pGlyco]', 'GlycanComposition_ByonicStyle[pGlyco]'])['MonoArea[pGlyco]'].transform(lambda x: x[x != -1].sum())
quant['h_sum_IsotopeArea[pGlyco]'] = quant.groupby(['ProSites[pGlyco]', 'GlycanComposition_ByonicStyle[pGlyco]'])['IsotopeArea[pGlyco]'].transform(lambda x: x[x != -1].sum())
quant['a_norm_XIC\r\nAUC[Byos]'] = quant.groupby(['N-site(SequonBased)[Byonic]', 'Glycans[Byonic]'])['XIC\r\nAUC[Byos]'].transform(lambda x: x[x != -1].sum()/total_xicauc)
quant['b_norm_Apex Int.\r\n(Posit)[Byos]'] = quant.groupby(['N-site(SequonBased)[Byonic]', 'Glycans[Byonic]'])['Apex Int.\r\n(Posit)[Byos]'].transform(lambda x: x[x != -1].sum()/total_int)
quant['c_norm_MonoArea[pGlyco]'] = quant.groupby(['ProSites[pGlyco]', 'GlycanComposition_ByonicStyle[pGlyco]'])['MonoArea[pGlyco]'].transform(lambda x: x[x != -1].sum()/total_mono)
quant['d_norm_IsotopeArea[pGlyco]'] = quant.groupby(['ProSites[pGlyco]', 'GlycanComposition_ByonicStyle[pGlyco]'])['IsotopeArea[pGlyco]'].transform(lambda x: x[x != -1].sum()/total_iso)
# A genuine 0 can come out of the calculation, so rows whose source value is absent (-1) get -1 written back into their sum_/norm_ columns.
# Only MonoArea and XIC AUC are checked; the sibling IsotopeArea / Apex Int. columns are overwritten along with them.
quant.loc[quant['MonoArea[pGlyco]'] == -1, ['g_sum_MonoArea[pGlyco]', 'h_sum_IsotopeArea[pGlyco]', 'c_norm_MonoArea[pGlyco]', 'd_norm_IsotopeArea[pGlyco]']] = -1
quant.loc[quant['XIC\r\nAUC[Byos]'] == -1, ['e_sum_XIC\r\nAUC[Byos]', 'f_sum_Apex Int.\r\n(Posit)[Byos]', 'a_norm_XIC\r\nAUC[Byos]', 'b_norm_Apex Int.\r\n(Posit)[Byos]']] = -1 
# Keep only the needed columns, split into a Byonic/Byos frame and a pGlyco frame, drop rows whose site is -1,
# then index both by glycan so they can be outer-joined back together on the glycan.
quant_bb = quant[['N-site(SequonBased)[Byonic]', 'Glycans[Byonic]', 'MS2 Search\r\nAlias name[Byos]', 'e_sum_XIC\r\nAUC[Byos]', 'f_sum_Apex Int.\r\n(Posit)[Byos]', 'a_norm_XIC\r\nAUC[Byos]', 'b_norm_Apex Int.\r\n(Posit)[Byos]']]
quant_p = quant[['ProSites[pGlyco]', 'GlycanComposition_ByonicStyle[pGlyco]', 'g_sum_MonoArea[pGlyco]', 'h_sum_IsotopeArea[pGlyco]', 'c_norm_MonoArea[pGlyco]', 'd_norm_IsotopeArea[pGlyco]']]
quant_bb = quant_bb.loc[quant_bb['N-site(SequonBased)[Byonic]'] != -1]
print('cleaned up quant_bb len: %s'%len(quant_bb))
quant_p = quant_p.loc[quant_p['ProSites[pGlyco]'] != -1]
print('cleaned up quant_p len: %s'%len(quant_p))
quant_bb = quant_bb.set_index('Glycans[Byonic]')
quant_p = quant_p.set_index('GlycanComposition_ByonicStyle[pGlyco]')
# Outer-align the two frames on their glycan row labels, place them side by side, and turn the glycan index back into a column.
a1, a2 = quant_bb.align(quant_p, join = 'outer', axis = 0) 
quant_glycanid = pd.concat([a1,a2], axis = 1)
quant_glycanid.index.name = 'Glycans ↓'
quant_glycanid.reset_index(level=0, inplace=True)
# Add the N(x) column: x is the first HexNAc(<digits>) count found in the glycan string; 'Unoccupied' when none is found.
nx = ['Unoccupied' if lst == [] else 'N(%s)'%(lst[0]) for lst in quant_glycanid['Glycans ↓'].str.findall(r'HexNAc\((\d+)\)').tolist()]
quant_glycanid.insert(0 , 'N(x) ↓', nx , True)
# Put the pGlyco site column right after the Byonic site column, then drop the Byos alias column.
move_df(quant_glycanid, 'ProSites[pGlyco]', 'N-site(SequonBased)[Byonic]')
quant_glycanid = quant_glycanid.drop('MS2 Search\r\nAlias name[Byos]', axis = 1)
# Cells left NaN by the outer join become the -1 "absent" sentinel.
quant_glycanid = quant_glycanid.fillna(-1)
# Row count before and after de-duplication (duplicates must be dropped after splitting & re-concatenating the frame).
print(len(quant_glycanid))
quant_glycanid = quant_glycanid.drop_duplicates() # remember to drop duplicates after splitting & reconcat df
print(len(quant_glycanid))
# quant_glycanid.to_excel('quant_glycanid.xlsx', index = False)

# Generate 4 versions of the quantification table:
#   onlyb           - Byonic sites with Byos values (xicauc/int)
#   onlyp           - pGlyco sites with pGlyco values (mono/isotope)
#   bp_union        - union of the Byonic and pGlyco N-sites
#   bp_intersection - only N-sites present in both onlyb and onlyp
def _ordered_nx_labels(nx_values):
    """Return the distinct N(x) labels of ``nx_values`` in numeric order.

    Plain string sorting would put 'N(10)' before 'N(2)', so each 'N(k)' label is
    converted to the integer k for sorting and formatted back afterwards. String
    labels such as 'Unoccupied' sort after all numbers.
    """
    keys = list(set(nx_values.tolist()))
    keys = [int(re.findall(r'[0-9]+', i)[0]) if type(i) == str and i != 'Unoccupied' else i for i in keys]
    keys.sort(key=lambda v: (isinstance(v, str), v))
    return ['N(%s)'%(str(i)) if type(i) == int and i != -1 else i for i in keys]

## onlyb
onlyb = quant_glycanid.loc[(quant_glycanid['N-site(SequonBased)[Byonic]'] != -1), [col for col in quant_glycanid.columns.tolist() if 'pGlyco' not in col]]
onlyb = onlyb.drop_duplicates()
# clean off rows in which all four Byos values are absent (-1)
onlyb = onlyb.loc[~((onlyb['e_sum_XIC\r\nAUC[Byos]']==-1)&(onlyb['f_sum_Apex Int.\r\n(Posit)[Byos]']==-1)&(onlyb['a_norm_XIC\r\nAUC[Byos]']==-1)&(onlyb['b_norm_Apex Int.\r\n(Posit)[Byos]']==-1))]
# onlyb.to_excel('onlyb.xlsx', index = False)
b_new_nx = _ordered_nx_labels(onlyb['N(x) ↓'])

## onlyp
onlyp = quant_glycanid.loc[(quant_glycanid['ProSites[pGlyco]'] != -1), [col for col in quant_glycanid.columns.tolist() if 'Byonic' not in col and 'Byos' not in col]]
onlyp = onlyp.drop_duplicates()
# clean off rows in which all four pGlyco values are absent (-1)
onlyp = onlyp.loc[~((onlyp['g_sum_MonoArea[pGlyco]']==-1)&(onlyp['h_sum_IsotopeArea[pGlyco]']==-1)&(onlyp['c_norm_MonoArea[pGlyco]']==-1)&(onlyp['d_norm_IsotopeArea[pGlyco]']==-1))]
# onlyp.to_excel('onlyp.xlsx', index = False)
p_new_nx = _ordered_nx_labels(onlyp['N(x) ↓'])

## bp_union
# give both site columns the same name, align the column sets, and stack the two frames
onlyb_union = onlyb.rename(columns={'N-site(SequonBased)[Byonic]':'N-site(Byonic ∪ pGlyco) →'})
onlyp_union = onlyp.rename(columns={'ProSites[pGlyco]':'N-site(Byonic ∪ pGlyco) →'})
a1, a2 = onlyb_union.drop('N(x) ↓', axis = 1).align(onlyp_union.drop('N(x) ↓', axis = 1), join='outer', axis = 1) # align col only
bp_union = pd.concat([a1,a2])
bp_union = bp_union.sort_values('Glycans ↓').reset_index(drop = True).fillna(-1)
# add N(x) col (first HexNAc count in the glycan string, 'Unoccupied' when there is none)
nx = ['Unoccupied' if lst == [] else 'N(%s)'%(lst[0]) for lst in bp_union['Glycans ↓'].str.findall(r'HexNAc\((\d+)\)').tolist()]
bp_union.insert(0 , 'N(x) ↓', nx , True)
# bp_union.to_excel('bp_union.xlsx', index = False)
bp_union_new_nx = _ordered_nx_labels(bp_union['N(x) ↓'])

## bp_intersection
# NOTE: despite its name, bpsite_union_lst holds the sites common to onlyb and onlyp (an intersection).
bpsite_union_lst = list(set(onlyb['N-site(SequonBased)[Byonic]'].tolist()).intersection(set(onlyp['ProSites[pGlyco]'].tolist())))
# bp_union already carries the 'N(x) ↓' column, so the subset inherits it and nothing has to be recomputed
bp_intersection = bp_union.loc[bp_union['N-site(Byonic ∪ pGlyco) →'].isin(bpsite_union_lst)]
# bp_intersection.to_excel('bp_intersection.xlsx', index = False)
bp_intersection_new_nx = _ordered_nx_labels(bp_intersection['N(x) ↓'])

## onlyb_top10
# Per Byonic site keep the 10 largest normalized values of each Byos column; every other value is blanked to -1.
onlyb_top10 = onlyb.copy()
xicauc_top10 = (
    onlyb_top10.groupby('N-site(SequonBased)[Byonic]')['a_norm_XIC\r\nAUC[Byos]']
    .nlargest(10)
    .droplevel(level = 0)
    .index.tolist()
)
int_top10 = (
    onlyb_top10.groupby('N-site(SequonBased)[Byonic]')['b_norm_Apex Int.\r\n(Posit)[Byos]']
    .nlargest(10)
    .droplevel(level = 0)
    .index.tolist()
)
xicauc_losers = [row for row in onlyb_top10.index.tolist() if row not in set(xicauc_top10)]
int_losers = [row for row in onlyb_top10.index.tolist() if row not in set(int_top10)]
onlyb_top10.loc[xicauc_losers, 'a_norm_XIC\r\nAUC[Byos]'] = -1
onlyb_top10.loc[int_losers, 'b_norm_Apex Int.\r\n(Posit)[Byos]'] = -1
# keep rows that still hold at least one top-10 value, then drop the columns not needed in the top-10 view
b_row_survives = (onlyb_top10['a_norm_XIC\r\nAUC[Byos]'] != -1)|(onlyb_top10['b_norm_Apex Int.\r\n(Posit)[Byos]'] != -1)
onlyb_top10 = onlyb_top10.loc[b_row_survives].reset_index(drop=True)
onlyb_top10 = onlyb_top10.drop(['e_sum_XIC\r\nAUC[Byos]', 'f_sum_Apex Int.\r\n(Posit)[Byos]', 'N(x) ↓'], axis = 1)
# onlyb_top10.to_excel('onlyb_top10.xlsx', index = False)

## onlyp_top10
# Same procedure for the pGlyco columns, grouped by pGlyco site.
onlyp_top10 = onlyp.copy()
mono_top10 = (
    onlyp_top10.groupby('ProSites[pGlyco]')['c_norm_MonoArea[pGlyco]']
    .nlargest(10)
    .droplevel(level = 0)
    .index.tolist()
)
iso_top10 = (
    onlyp_top10.groupby('ProSites[pGlyco]')['d_norm_IsotopeArea[pGlyco]']
    .nlargest(10)
    .droplevel(level = 0)
    .index.tolist()
)
mono_losers = [row for row in onlyp_top10.index.tolist() if row not in set(mono_top10)]
iso_losers = [row for row in onlyp_top10.index.tolist() if row not in set(iso_top10)]
onlyp_top10.loc[mono_losers, 'c_norm_MonoArea[pGlyco]'] = -1
onlyp_top10.loc[iso_losers, 'd_norm_IsotopeArea[pGlyco]'] = -1
p_row_survives = (onlyp_top10['c_norm_MonoArea[pGlyco]'] != -1)|(onlyp_top10['d_norm_IsotopeArea[pGlyco]'] != -1)
onlyp_top10 = onlyp_top10.loc[p_row_survives].reset_index(drop=True)
onlyp_top10 = onlyp_top10.drop(['g_sum_MonoArea[pGlyco]', 'h_sum_IsotopeArea[pGlyco]', 'N(x) ↓'], axis = 1)
# onlyp_top10.to_excel('onlyp_top10.xlsx', index = False)

## bp_top10
# Index both top-10 frames by glycan, outer-align their rows and put them side by side.
onlyb_top10 = onlyb_top10.set_index('Glycans ↓')
onlyp_top10 = onlyp_top10.set_index('Glycans ↓')
a1, a2 = onlyb_top10.align(onlyp_top10, join='outer', axis = 0) # align rows only
bptop10 = pd.concat([a1,a2], axis = 1).reset_index(level=0).fillna(-1)
# add N(x) col (first HexNAc count in the glycan string, 'Unoccupied' when there is none)
hexnac_hits = bptop10['Glycans ↓'].str.findall(r'HexNAc\((\d+)\)').tolist()
nx = ['Unoccupied' if lst == [] else 'N(%s)'%(lst[0]) for lst in hexnac_hits]
bptop10.insert(0 , 'N(x) ↓', nx , True)
# bptop10.to_excel('bptop10.xlsx', index = False)

print('----- Start summary table construction. -----\n')

def _build_site_pivot(table, site_col, agg_spec, nx_order, site_axis_name, quant_labels):
    """Pivot a long quantification table into the (N(x), glycan) x (site, quantity) layout.

    Parameters
    ----------
    table : pandas.DataFrame
        Long table holding 'N(x) ↓', 'Glycans ↓', ``site_col`` and the columns named in ``agg_spec``.
    site_col : str
        Column holding the N-site; its values become the outer column level.
    agg_spec : dict
        Column name -> aggregation function, passed to ``groupby(...).agg``.
    nx_order : list
        N(x) labels in the desired row order (fixes the str '10' sorted before '2' problem).
    site_axis_name : str
        Name given to the outer column level.
    quant_labels : list of str
        Final labels of the inner column level, given in the sorted order of the
        prefixed column names (the a-h prefixes exist only to control that order).

    Returns
    -------
    pandas.DataFrame
    """
    pivot = table.groupby(['N(x) ↓', 'Glycans ↓', site_col]).agg(agg_spec)
    # impose the numeric N(x) order on the outer row level
    pivot = pivot.reindex(nx_order, level = 0)
    # move the sites from the row index to the columns
    pivot = pivot.unstack(level=-1)
    # put the site level above the quantity level
    pivot.columns = pivot.columns.swaplevel(0, 1)
    # sorting is necessary for the right table structure (one contiguous group of quantities per site)
    pivot = pivot.sort_index(axis=1, level=[0, 1])
    pivot.columns = pivot.columns.set_names([site_axis_name, 'Quant. →'])
    # strip the a-h sorting prefixes; set_levels(..., inplace=True) is deprecated and
    # removed in pandas 2, so the relabelled index is assigned back instead
    pivot.columns = pivot.columns.set_levels(quant_labels, level = 1)
    return pivot

# start grouping: N(x) -> glycans -> sites, then aggregate the quantification columns
# bp tables: mean over the values that are present (the -1 sentinel is excluded)
_bp_agg = {'a_norm_XIC\r\nAUC[Byos]': lambda x: x[x!= -1].mean(), 'b_norm_Apex Int.\r\n(Posit)[Byos]': lambda x: x[x!= -1].mean(),
           'c_norm_MonoArea[pGlyco]': lambda x: x[x!= -1].mean(), 'd_norm_IsotopeArea[pGlyco]': lambda x: x[x!= -1].mean()}
_bp_labels = ['norm_XIC\r\nAUC[Byos]', 'norm_Apex Int.\r\n(Posit)[Byos]', 'norm_MonoArea[pGlyco]', 'norm_IsotopeArea[pGlyco]']

# bp_intersection
bp_intersection = _build_site_pivot(bp_intersection, 'N-site(Byonic ∪ pGlyco) →', _bp_agg, bp_intersection_new_nx,
                                    'N-site(Byonic ∪ pGlyco) →', _bp_labels)

# bp_union
bp_union = _build_site_pivot(bp_union, 'N-site(Byonic ∪ pGlyco) →', _bp_agg, bp_union_new_nx,
                             'N-site(Byonic ∪ pGlyco) →', _bp_labels)

# onlyb (plain mean, -1 values included as in the original computation)
onlyb = _build_site_pivot(
    onlyb, 'N-site(SequonBased)[Byonic]',
    {'e_sum_XIC\r\nAUC[Byos]': lambda x: x.mean(), 'f_sum_Apex Int.\r\n(Posit)[Byos]': lambda x: x.mean(),
     'a_norm_XIC\r\nAUC[Byos]': lambda x: x.mean(), 'b_norm_Apex Int.\r\n(Posit)[Byos]': lambda x: x.mean()},
    b_new_nx, 'N-site(SequonBased)[Byonic] →',
    ['norm_XIC\r\nAUC[Byos]', 'norm_Apex Int.\r\n(Posit)[Byos]', 'sum_XIC\r\nAUC[Byos]', 'sum_Apex Int.\r\n(Posit)[Byos]'])

# onlyp (plain mean, -1 values included as in the original computation)
onlyp = _build_site_pivot(
    onlyp, 'ProSites[pGlyco]',
    {'g_sum_MonoArea[pGlyco]': lambda x: x.mean(), 'h_sum_IsotopeArea[pGlyco]': lambda x: x.mean(),
     'c_norm_MonoArea[pGlyco]': lambda x: x.mean(), 'd_norm_IsotopeArea[pGlyco]': lambda x: x.mean()},
    p_new_nx, 'ProSites[pGlyco] →',
    ['norm_MonoArea[pGlyco]', 'norm_IsotopeArea[pGlyco]', 'sum_MonoArea[pGlyco]', 'sum_IsotopeArea[pGlyco]'])

# Four workbooks are written below, so the banner reports 4 (it previously said 3).
print('\n----- Exporting 4 "_Quant" files... This may take some time, please wait. -----\n')

# (table, colormap, output file, message printed once the file is written)
_quant_exports = [
    (bp_intersection, 'Reds', '20210622_bp_intersection_colored.xlsx',
     '\n----- "_QuantBPintersection" file exported (Intersection of Byonic & pGlyco N-glycosylation sites). -----\n'),
    (bp_union, 'Reds', '20210622_bp_union_colored.xlsx',
     '\n----- "_QuantBPunion" file exported (Union of Byonic & pGlyco N-glycosylation sites). -----\n'),
    (onlyb, 'Greens', '20210622_onlyb_colored.xlsx',
     '\n----- "_QuantB" file exported. -----\n'),
    (onlyp, 'Blues', '20210622_onlyp_colored.xlsx',
     '\n----- "_QuantP" file exported. -----\n'),
]
for _table, _cmap, _path, _done_msg in _quant_exports:
    # Turn the -1 "absent" sentinel into NaN so it is left out of the colour gradient.
    # Done in place so the notebook-level tables keep the NaNs afterwards.
    # (Previously the onlyp step cleaned onlyb a second time, leaving onlyp's -1 values in its gradient.)
    _table.replace(to_replace = -1, value = np.nan, inplace = True)
    # The null colour is passed positionally: the keyword was renamed from
    # `null_color` to `color` across pandas versions, the position did not change.
    _table.style.background_gradient(cmap = _cmap).highlight_null('white').to_excel(_path)
    print(_done_msg)

total_xicauc:42939314900.0
total_int:2554942900.0
total_mono:4255944683
total_iso:18890600000
cleaned up quant_bb len: 685
cleaned up quant_p len: 137
3296
1078
----- Start summary table construction. -----



  bp_intersection.columns.set_levels(['norm_XIC\r\nAUC[Byos]', 'norm_Apex Int.\r\n(Posit)[Byos]', 'norm_MonoArea[pGlyco]', 'norm_IsotopeArea[pGlyco]'], level = 1, inplace=True)
  bp_union.columns.set_levels(['norm_XIC\r\nAUC[Byos]', 'norm_Apex Int.\r\n(Posit)[Byos]', 'norm_MonoArea[pGlyco]', 'norm_IsotopeArea[pGlyco]'], level = 1, inplace=True)
  onlyb.columns.set_levels(['norm_XIC\r\nAUC[Byos]', 'norm_Apex Int.\r\n(Posit)[Byos]', 'sum_XIC\r\nAUC[Byos]', 'sum_Apex Int.\r\n(Posit)[Byos]'], level = 1, inplace=True)
  onlyp.columns.set_levels(['norm_MonoArea[pGlyco]', 'norm_IsotopeArea[pGlyco]', 'sum_MonoArea[pGlyco]', 'sum_IsotopeArea[pGlyco]'], level = 1, inplace=True)



----- Exporting 3 "_Quant" files... This may take some time, please wait. -----


----- "_QuantBPintersection" file exported (Intersection of Byonic & pGlyco N-glycosylation sites). -----



  smin = np.nanmin(s.to_numpy()) if vmin is None else vmin
  smax = np.nanmax(s.to_numpy()) if vmax is None else vmax



----- "_QuantBPunion" file exported (Union of Byonic & pGlyco N-glycosylation sites). -----


----- "_QuantB" file exported. -----


----- "_QuantP" file exported. -----



In [17]:
import pandas as pd
# Sanity check on the older single-site export: how many rows the Byonic-ETD drop rule removes,
# and how many of those are also pGlyco ethcd rows.
old = pd.read_excel('quant_singlesite.xlsx', header = 0).fillna('N/A')
fixed_colname = [name.replace('_x000D_', '\r') if '_x000D_' in name else name for name in old.columns]
old.columns = fixed_colname
print(len(old))
# drop rule: Byonic ethcd row that fails the score/PEP2D threshold and has no pair (-1 or 'N/A')
byonic_etd_unpaired_fail = (
    (old['Fragment\r\nType[Byonic]'] == 'ethcd')
    & ((old['Score[Byonic]'] <= 200)|(old['PEP\r\n2D[Byonic]'].abs() >= 0.001))
    & ((old['Pair[Byonic]'] == -1)|(old['Pair[Byonic]'] == 'N/A'))
)
drop = old.loc[byonic_etd_unpaired_fail]
print(len(drop))
p_etd_drop = old.loc[(old['FragmentType[pGlyco]'] == 'ethcd') & byonic_etd_unpaired_fail]
print(len(p_etd_drop))

1603
873
1
