In [111]:
# %%capture cap --no-stderr
# NEW COLOR CODE
# HCD: only Byonic passes –>light green, only pGlyco passes -> light blue, both pass -> light orange 
# Byonic的 EThcD，Score/PEP2D不設門檻(全pass)
# EThcD: only Byonic EThcD passes –>  green, only pGlyco EThcD passes -> blue, both pass ->  orange 
# 不管 HCD還是 EThcD:B+P 跟 B/P如果是light blue/blue，改deep blue，反之 B+P 跟 B/P如果是 light green/green，改deep green
# add n-site column
# fixed sequence -> puresequence is between 2 dots
# add simplified & quantification (for plotting) version
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
from matplotlib import ticker
from IPython.display import display, HTML
import re # finding specific patterns in str
import textwrap # split text into equal parts
import collections # return repeated items in list
from collections import OrderedDict
import time
import sys
import ast # convert str back to tuple/list/int, etc

# export print comments to text file
# stdout_obj = sys.stdout # store original stdout
# stdout_obj = open("runlog.txt", "w")

# record starting time
start_time = time.time()

# read in pglyco as df
pglycofile = 'pglyco_Fcwf_T_Chy_Mammalian'
pglyco_df = pd.read_excel('%s.xlsx'%pglycofile, header = 0)
pglyco_df = pglyco_df.fillna('N/A')
pglyco_df = pglyco_df.sort_values(by=['Scan'])
pglyco_df = pglyco_df.reset_index(drop = True)
# display(HTML(pglyco_df.to_html()))

## preprocess the pglycofile first
# output column name
print('Original pglyco columns:\n%s\n'%pglyco_df.columns)

# replace _x000D_ w/ \r if exists
fixed_colname = [i.replace('_x000D_', '\r') if '_x000D_' in i else i for i in pglyco_df.columns]
pglyco_df.columns = fixed_colname
print('Fixed pglyco columns:\n%s\n'%pglyco_df.columns)

# record original data size
print('Original pglyco data size:\nrow: %s\ncol: %s\n'%(pglyco_df.shape[0], pglyco_df.shape[1]))

# change J back to N as column named peptide(J-->N)
pglyco_df['Peptide'] = pglyco_df['Peptide'].str.replace('J','N')

# analyze sequon in pglyco file
pglyco_sequon = pglyco_df['Peptide'].str.findall('(N[ARNDBCEQZGHILKMFSTWYV]T)|(N[ARNDBCEQZGHILKMFSTWYV]S)').tolist()
pglyco_sequon_lst = []
for t in pglyco_sequon:
    t = str(t)
    res = re.findall('[ARNDBCEQZGHILKMFSTWYVP]', t)
    res = ''.join(res)
    res = textwrap.wrap(res, 3)
    if res == []:
        pglyco_sequon_lst.append('N/A')
    elif len(res) == 1:
        pglyco_sequon_lst.append(res[0])
    else:
        pglyco_sequon_lst.append(res)
        
# print('pglyco_sequon: %s'%pglyco_sequon_lst)
pglyco_sequon_lst = [tuple(i) if type(i) == list else i for i in pglyco_sequon_lst]
pglyco_df.insert(pglyco_df.columns.get_loc('Peptide') + 1 , 'Sequon', pglyco_sequon_lst , True)
# display(HTML(pglyco_df.to_html()))

# replace H, N, A, F, G symbols w/ Hex, HexNAc, NeuAc, Fuc, NeuGc
byonicstyle = []
if 'Glycan(H,N,A,G,F)' in pglyco_df.columns:
    glycan_num = pglyco_df['Glycan(H,N,A,G,F)'].tolist()
    for i in glycan_num:
        i = i.split(' ')
#         print(i)
        new_order = [1, 0, 4, 2, 3]
        i = [i[x] for x in new_order]
#         print('new: %s'%i)
        if i[0] != '0':
            n = 'HexNAc(%s)'%(i[0])
        else :
            n = ''
        if i[1] != '0':
            h = 'Hex(%s)'%(i[1])
        else:
            h = ''
        if i[2] != '0':
            f = 'Fuc(%s)'%(i[2])
        else:
            f = ''
        if i[3] != '0':
            a = 'NeuAc(%s)'%(i[3])
        else:
            a = ''
        if i[4] != '0':
            g = 'NeuGc(%s)'%(i[4])
        else:
            g = ''
        each_byonicstyle = n + h + f + a + g
        byonicstyle.append(each_byonicstyle)
elif 'Glycan(H,N,A,F)' in pglyco_df.columns:
    glycan_num = pglyco_df['Glycan(H,N,A,F)'].tolist()
    for i in glycan_num:
        i = i.split(' ')
        new_order = [1, 0, 3, 2]
        i = [i[x] for x in new_order]
        if i[0] != '0':
            n = 'HexNAc(%s)'%(i[0])
        else :
            n = ''
        if i[1] != '0':
            h = 'Hex(%s)'%(i[1])
        else:
            h = ''
        if i[2] != '0':
            f = 'Fuc(%s)'%(i[2])
        else:
            f = ''
        if i[3] != '0':
            a = 'NeuAc(%s)'%(i[3])
        else:
            a = ''
        each_byonicstyle = n + h + f + a
        byonicstyle.append(each_byonicstyle)
else:
    sys.exit('Please check if there are glycans other than H, N, A, G, F or if column name has changed. This app will stop.')
# print(byonicstyle)
pglyco_df.insert(pglyco_df.columns.get_loc('GlycanComposition') + 1 , 'GlycanComposition_ByonicStyle', byonicstyle , True)
# display(HTML(pglyco_df.to_html()))

# if the etdscan is not blank, duplicate the row and change the duplicated scan to etdscan (micmic byonic format)
pglyco_df.insert(pglyco_df.columns.get_loc('Scan') + 1 , 'FragmentType', 'hcd' , True) # insert 'fragment type' col
row_to_duplicate = pglyco_df[pglyco_df['ETDScan'] != 'N/A'].copy() # assume the missing ETDScan in pglyco is represented as blank & later filled w/ N/A
row_to_duplicate['FragmentType'] = 'ethcd'
row_to_duplicate['Scan'] = row_to_duplicate['ETDScan']
pglyco_df = pd.concat([pglyco_df, row_to_duplicate]) # duplicate w/ index
pglyco_df = pglyco_df.sort_values(by = ['Scan']) # sort by scan directly, the duplicated ethcd rows can be separated from hcd rows
# display(HTML(pglyco_df.to_html()))
print('----- pGlyco data preprocessing completed. -----\n')

## import byonic raw excel file (contain all the info.) to compare
byonicfile = 'byonic_Fcwf_T_Chy_Mammalian'
byonic_df = pd.read_excel('%s.xlsx'%byonicfile, header = 0)
byonic_df = byonic_df.fillna('N/A')
print('Original byonic columns:\n%s\n'%byonic_df.columns)
# replace _x000D_ w/ \r if exists
fixed_colname = [i.replace('_x000D_', '\r') if '_x000D_' in i else i for i in byonic_df.columns]
byonic_df.columns = fixed_colname
print('Fixed byonic columns:\n%s\n'%byonic_df.columns)
# record original data size
print('Original byonic data size:\nrow: %s\ncol: %s\n'%(byonic_df.shape[0], byonic_df.shape[1]))
# extract 'scan' from 'scan #' in byonic file & add a 'Scan' column
byonic_scan = byonic_df['Scan #'].tolist()
byonic_scan_lst = []
for scan in byonic_scan:
    scan = scan.split(' ')[-1].split('=')[-1]
    scan = int(scan)
#     print(scan)
    byonic_scan_lst.append(scan)
byonic_df.insert(byonic_df.columns.get_loc('Scan\r\nTime') + 1 , 'Scan', byonic_scan_lst , True)
byonic_df = byonic_df.sort_values(by = ['Scan'])
byonic_df = byonic_df.reset_index(drop = True)
# add 'PureSequence' column to byonic file
if 'Sequence\r\n(unformatted)' in byonic_df.columns: # deal w/ dif byonic version
    byonic_seq = byonic_df['Sequence\r\n(unformatted)'].str[2:-2].str.findall('[ARNDBCEQZGHILKMFSTWYVP]').tolist()
    byonic_seq = [''.join(each_pure) for each_pure in byonic_seq]
    byonic_df.insert(byonic_df.columns.get_loc('Sequence\r\n(unformatted)') + 1 , 'PureSequence', byonic_seq , True)
    byonic_seq_forSequonNsite = byonic_df['Sequence\r\n(unformatted)'].str.findall('[ARNDBCEQZGHILKMFSTWYVP]').tolist()
    byonic_seq_forSequonNsite = [''.join(each_pure) for each_pure in byonic_seq_forSequonNsite]
    byonic_seq_forSequonNsite = pd.Series(byonic_seq_forSequonNsite) # preserve aa outside dots
elif 'Sequence' in byonic_df.columns:
    byonic_seq = byonic_df['Sequence'].str[2:-2].str.findall('[ARNDBCEQZGHILKMFSTWYVP]').tolist()  
    byonic_seq = [''.join(each_pure) for each_pure in byonic_seq]
    byonic_df.insert(byonic_df.columns.get_loc('Sequence') + 1 , 'PureSequence', byonic_seq , True)
    byonic_seq_forSequonNsite = byonic_df['Sequence'].str.findall('[ARNDBCEQZGHILKMFSTWYVP]').tolist()
    byonic_seq_forSequonNsite = [''.join(each_pure) for each_pure in byonic_seq_forSequonNsite]
    byonic_seq_forSequonNsite = pd.Series(byonic_seq_forSequonNsite)
else:
    sys.exit('Please check if sequence column name has changed. This app will stop.')  
# add 'Sequon' & 'N-site' column to byonic file
byonic_sequon_lst = []
n_site_lst = []
pureseq_cnt = 0 # this is byonic_seq_forSequonNsite
pos = byonic_df['Pos.'].tolist()
pos_cnt = 0
byonic_sequon = byonic_seq_forSequonNsite.str.findall('(N[ARNDBCEQZGHILKMFSTWYV]T)|(N[ARNDBCEQZGHILKMFSTWYV]S)').tolist()
for t in byonic_sequon:
#     print(t)
    t = str(t)
    res = re.findall('[ARNDBCEQZGHILKMFSTWYVP]', t)
    res = ''.join(res)
    res = textwrap.wrap(res, 3)
    if res == []:
        byonic_sequon_lst.append('N/A')
        n_site_lst.append('N/A')
        pureseq_cnt += 1
        pos_cnt += 1
    elif len(res) == 1: # single sequon -> single n-site
        byonic_sequon_lst.append(res[0])
        regex = re.compile(r'(' + '|'.join(res) + r')')
        each_n_site_pos = [m.start() for m in regex.finditer(byonic_seq_forSequonNsite[pureseq_cnt])][0]
        each_n_site = pos[pos_cnt] -1 + each_n_site_pos # -1 to fix the 'dots problem'
        n_site_lst.append(each_n_site)
        pureseq_cnt += 1
        pos_cnt += 1
    else: # non-single sequon
        byonic_sequon_lst.append(res)
        regex = re.compile(r'(' + '|'.join(res) + r')')
        each_n_site_pos = [m.start() for m in regex.finditer(byonic_seq_forSequonNsite[pureseq_cnt])]
        each_n_site = list(pos[pos_cnt] -1 + np.array(each_n_site_pos))
        n_site_lst.append(each_n_site)
        pureseq_cnt += 1
        pos_cnt += 1
# print(n_site_lst)
# convert lists within list to tuple for later usage ('cos lists are unhashable, which may cause some problems later)
byonic_sequon_lst = [tuple(i) if type(i) == list else i for i in byonic_sequon_lst]
n_site_lst = [tuple(i) if type(i) == list else i for i in n_site_lst]
byonic_df.insert(byonic_df.columns.get_loc('PureSequence') + 1 , 'Sequon', byonic_sequon_lst , True)
byonic_df.insert(0, 'N-site(SequonBased)', n_site_lst , True)
# add N/A col 'Pair' for later modification (pair: cal.m/z same, cal.m same, pureseq same, scan difference <= 5, hcd before ethcd)
byonic_df.insert(byonic_df.columns.get_loc('Scan\r\nTime') + 1 , 'Pair', 'N/A' , True)
# potential_pair = byonic_df[['Calc.\r\nm/z', 'Calc.\r\nMH', 'PureSequence']][byonic_df.duplicated(subset=['Calc.\r\nm/z', 'Calc.\r\nMH', 'PureSequence'], keep=False)]
potential_pair = byonic_df[byonic_df.duplicated(subset=['Calc.\r\nm/z', 'Calc.M', 'PureSequence'], keep=False)].sort_values(['Calc.\r\nm/z'])
# print(potential_pair)
mz_gp = sorted(list(set(potential_pair['Calc.\r\nm/z'].tolist())))
pair_cnt = 0
all_hcd_ind = []
all_etd_ind = []
# mh_gp = sorted(list(set(potential_pair['Calc.\r\nMH'].tolist())))
# pureseq_gp = sorted(list(set(potential_pair['PureSequence'].tolist())))
# gp_cnt = 0
for i in mz_gp:
    gp = potential_pair[potential_pair['Calc.\r\nm/z'] == i][potential_pair.duplicated(subset=['Calc.M', 'PureSequence'], keep=False)]
    gp = gp.sort_values(['Scan'])
#     print('gp:\n%s\n'%gp[['Scan', 'Fragment\r\nType']])
    # skip the gp w/o ethcd & find pairs: last criterion -> scan dif <= 5
    if 'hcd' in gp['Fragment\r\nType'].tolist() and 'ethcd' in gp['Fragment\r\nType'].tolist():
        # from hcd find the nearest ethcd below
        pair_candidate = gp[(gp['Fragment\r\nType'] == 'hcd') & (gp['Fragment\r\nType'].shift(-1) == 'ethcd') & (gp['Scan'].shift(-1) - gp['Scan'] <= 5)]
#         print('pair_candidate:\n%s\n'%pair_candidate[['Scan', 'Fragment\r\nType']])
        hcd_iloc = [gp.index.get_loc(ind) for ind in pair_candidate.index] # get the positions of the hcd in pairs in gp df
# #         print('hcd_ind:\n%s\n'%hcd_loc)
#         pair_lst = ['pair%s'%(i+1+pair_cnt) for i in range(len(hcd_iloc))]
# #         print('pair_lst:%s'%pair_lst)
#         gp['Pair'].iloc[hcd_iloc] = pair_lst
        etd_iloc = list(np.array(hcd_iloc) + 1)
#         gp['Pair'].iloc[etd_iloc] = pair_lst
#         pair_cnt += len(pair_lst) 
# #         print('NEW_gp:\n%s\n'%gp[['Scan', 'Fragment\r\nType', 'Pair']])
#         byonic_df.loc[gp.index.tolist()] = gp
        hcd_ind = pair_candidate.index.tolist()
#         print('hcd_ind:\n%s\n'%hcd_ind)
        etd_ind = [gp.index[i] for i in etd_iloc] 
#         print('etd_ind:\n%s\n'%etd_ind)
        all_hcd_ind.extend(hcd_ind)
        all_etd_ind.extend(etd_ind)
    else:
        pass
all_hcd_ind.sort()
all_etd_ind.sort()
# print('all_hcd_ind:\n%s\n'%all_hcd_ind)
# print('all_etd_ind:\n%s\n'%all_etd_ind)
byonic_df['Pair'].iloc[all_hcd_ind] = ['pair%s'%(i+1) for i in range(len(all_hcd_ind))]
byonic_df['Pair'].iloc[all_etd_ind] = ['pair%s'%(i+1) for i in range(len(all_etd_ind))]
# display(HTML(byonic_df.to_html()))
print('----- Byonic data preprocessing completed. -----\n')

# read in byos file
byosfile = 'UU4__Fcwf' # 2 in 1 out (fcwf)
byos_df = pd.read_excel('%s.xlsx'%byosfile, header = 0)
# output column name
print('Original byos columns:\n%s\n'%byos_df.columns)
# replace _x000D_ w/ \r if exists
fixed_colname = [i.replace('_x000D_', '\r') if '_x000D_' in i else i for i in byos_df.columns]
byos_df.columns = fixed_colname
print('Fixed byos columns:\n%s\n'%byos_df.columns)
# extract needed columns: Scan Number(s)(Posit), MS Alias name, XIC area summed, XIC AUC, Apex Int.(Posit), Calc.M, Sequence
byos_df = byos_df[['Scan Number(s)\r\n(Posit)', 'Glycans', 'MS2 Search\r\nAlias name', 'XIC area\r\nsummed', 'XIC\r\nAUC', 'Apex Int.\r\n(Posit)', 'Calc.M', 'Sequence']]
byos_df = byos_df.fillna('N/A')
# output extracted column name
print('Extracted byos columns:\n%s\n'%byos_df.columns)
# drop the row w/ multiple scan numbers (data type would be str)
# print('before:\n%s\n'%len(byos_df))  
byos_df = byos_df[byos_df['Scan Number(s)\r\n(Posit)'].apply(lambda x: isinstance(x, int))]
# print('after:\n%s\n'%len(byos_df)) 
# sort byos by scan
byos_df = byos_df.sort_values(by=['Scan Number(s)\r\n(Posit)'])
byos_df = byos_df.reset_index(drop = True)
# add 'PureSequence' col
byos_df['Sequence'] = byos_df['Sequence'].str.upper()
byos_df['PureSequence'] = byos_df['Sequence'].str[2:-2]
# display(HTML(byos_df.to_html()))
print('----- Byos data preprocessing completed. -----\n')

## combine scans from the two files
print('----- Start combining pGlyco & Byonic & Byos. -----\n')
# pglyco_scan = pglyco_df['Scan'].tolist()
# sorted_byonic_scan = byonic_df['Scan'].tolist()
# byos_scan = byos_df['Scan Number(s)\r\n(Posit)'].tolist() 
# byonic_repeated_scan = [item for item, count in collections.Counter(sorted_byonic_scan).items() if count > 1]
# pglyco_repeated_scan = [item for item, count in collections.Counter(pglyco_scan).items() if count > 1]
# byos_repeated_scan = [item for item, count in collections.Counter(byos_scan).items() if count > 1]
# byonic_repeated_count = [count for item, count in collections.Counter(sorted_byonic_scan).items() if count > 1]
# pglyco_repeated_count = [count for item, count in collections.Counter(pglyco_scan).items() if count > 1]
# byos_repeated_count = [count for item, count in collections.Counter(byos_scan).items() if count > 1]
# byonic_repeated_dict = dict(zip(byonic_repeated_scan, byonic_repeated_count))
# pglyco_repeated_dict = dict(zip(pglyco_repeated_scan, pglyco_repeated_count))
# byos_repeated_dict = dict(zip(byos_repeated_scan, byos_repeated_count))
# print('repeated scans in byonic file {scan:times}: %s'%(byonic_repeated_dict))
# print('repeated scans in pglyco file {scan:times}: %s'%(pglyco_repeated_dict))
# print('repeated scans in byos file {scan:times}: %s'%(byos_repeated_dict))

# combined data based on 'Scan' in byonic & pglyco
byonic_scanasid = byonic_df.copy()
new_byonic_col = [n + '[Byonic]' if n != 'Scan' else n for n in byonic_scanasid.columns]
# print('\nnew_byonic_col:\n%s\n'%new_byonic_col)
byonic_scanasid.columns = new_byonic_col
pglyco_scanasid = pglyco_df.copy()
new_pglyco_col = [n + '[pGlyco]' if n != 'Scan' else n for n in pglyco_scanasid.columns]
pglyco_scanasid.columns = new_pglyco_col
# print('new_pglyco_col:\n%s\n'%new_pglyco_col)
byos_scanasid = byos_df.copy()
new_byos_col = [n + '[Byos]' if n != 'Scan Number(s)\r\n(Posit)' else n for n in byos_scanasid.columns]
byos_scanasid.columns = new_byos_col
# print('new_byos_col:\n%s\n'%new_byos_col)
byonic_scanasid = byonic_scanasid.set_index('Scan')
pglyco_scanasid = pglyco_scanasid.set_index('Scan')
byos_scanasid = byos_scanasid.set_index('Scan Number(s)\r\n(Posit)')
# display(HTML(byos_scanasid.to_html()))
# align scan & concat (all align on row to make row number all the same)
a1, a2 = byonic_scanasid.align(pglyco_scanasid, join = 'outer', axis = 0) # row: a1 = a2
a1, a3 = a1.align(byos_scanasid, join = 'outer', axis = 0) # row: a1 = a2 = a3
a2, a3 = a2.align(byos_scanasid, join = 'outer', axis = 0) # row: a2 = a1 = a3
all_combined_df = pd.concat([a1,a3,a2], axis = 1)
all_combined_df.index.name = 'Scan'
all_combined_df.reset_index(level=0, inplace=True)
# change all nan to blank -1
all_combined_df = all_combined_df.fillna(-1)
# all_combined_df.to_excel('test.xlsx', index = False)  
# move pglyco column 'Prosite' (already n-stie) to the right side of N-site[Byonic]
move_df = all_combined_df['ProSites[pGlyco]']
all_combined_df.drop(labels=['ProSites[pGlyco]'], axis=1, inplace = True)
all_combined_df.insert(all_combined_df.columns.get_loc('N-site(SequonBased)[Byonic]') + 1,'ProSites[pGlyco]', move_df)
## result post-processing
# glycan comprison: only present in byonic -> b, only present in pglyco -> p, both the same -> b+p, not the same -> b/p
conditions = [
    (all_combined_df['Glycans[Byonic]'] != -1) & (all_combined_df['GlycanComposition_ByonicStyle[pGlyco]'] == -1),
    (all_combined_df['Glycans[Byonic]'] == -1) & (all_combined_df['GlycanComposition_ByonicStyle[pGlyco]'] != -1),
    (all_combined_df['Glycans[Byonic]'] != -1) & (all_combined_df['GlycanComposition_ByonicStyle[pGlyco]'] != -1) & (all_combined_df['Glycans[Byonic]'] == all_combined_df['GlycanComposition_ByonicStyle[pGlyco]']),
    (all_combined_df['Glycans[Byonic]'] != -1) & (all_combined_df['GlycanComposition_ByonicStyle[pGlyco]'] != -1) & (all_combined_df['Glycans[Byonic]'] != all_combined_df['GlycanComposition_ByonicStyle[pGlyco]'])]
choices = ['B', 'P', 'B+P', 'B/P'] 
glycan_source = np.select(conditions, choices, -1) 
all_combined_df.insert(all_combined_df.columns.get_loc('Scan') + 1 , 'GlycanSource', glycan_source , True)

print('\nCombined data shape:\nrow --> %s, column --> %s'%(all_combined_df.shape[0], all_combined_df.shape[1]))

## style apply for excel export
# color the rows below the threshold (threshold [byonic: score > 200 & pep2d < 0.001; pglyco: PepScore>5 & GlyScore>4])
# color the background to separate byonic data from pglyco data
# color code => #ffedcc -> light orange for byonic; #add8e6 -> light blue for pglyco; #FFB6C1 -> light pink for byos w/ dif calm ^ seq from byonic; #FF1493 -> deep pink for byos w/ dif calm & seq from byonic; #FFFF00 -> yellow for byos & byonic all the same

# define masks
# comparison between byonic & pglyco
# HCD (but not B+P OR B/P)
b_hcd_mask = (all_combined_df['Fragment\r\nType[Byonic]'] == 'hcd') & (all_combined_df['Score[Byonic]'] > 200) & (all_combined_df['PEP\r\n2D[Byonic]'].abs() < 0.001) & ((all_combined_df['PepScore[pGlyco]'] <= 5) | (all_combined_df['GlyScore[pGlyco]'] <= 4))
p_hcd_mask = (all_combined_df['FragmentType[pGlyco]'] == 'hcd') & ((all_combined_df['Score[Byonic]'] <= 200) | (all_combined_df['PEP\r\n2D[Byonic]'].abs() >= 0.001)) & (all_combined_df['PepScore[pGlyco]'] > 5) & (all_combined_df['GlyScore[pGlyco]'] > 4)    
both_hcd_mask = (all_combined_df['Fragment\r\nType[Byonic]'] == 'hcd') & (all_combined_df['FragmentType[pGlyco]'] == 'hcd') & (all_combined_df['Score[Byonic]'] > 200) & (all_combined_df['PEP\r\n2D[Byonic]'].abs() < 0.001) & (all_combined_df['PepScore[pGlyco]'] > 5) & (all_combined_df['GlyScore[pGlyco]'] > 4)
# ETD (but not B+P OR B/P): remember byonic etd does not need threshold, so we only need ot make sure that the row only contain byonic data, which means pglyco data will be -1
b_etd_mask = (all_combined_df['Fragment\r\nType[Byonic]'] == 'ethcd') & (all_combined_df['Score[Byonic]'] != -1) & (all_combined_df['PEP\r\n2D[Byonic]'] != -1) & ((all_combined_df['PepScore[pGlyco]'] <= 5) | (all_combined_df['GlyScore[pGlyco]'] <= 4)) 
p_etd_mask = (all_combined_df['FragmentType[pGlyco]'] == 'ethcd') & (all_combined_df['Score[Byonic]'] == -1) & (all_combined_df['PEP\r\n2D[Byonic]'] == -1) & (all_combined_df['PepScore[pGlyco]'] > 5) & (all_combined_df['GlyScore[pGlyco]'] > 4)    
both_etd_mask = (all_combined_df['Fragment\r\nType[Byonic]'] == 'ethcd') & (all_combined_df['FragmentType[pGlyco]'] == 'ethcd') & (all_combined_df['Score[Byonic]'] != -1) & (all_combined_df['PEP\r\n2D[Byonic]'] != -1) & (all_combined_df['PepScore[pGlyco]'] > 5) & (all_combined_df['GlyScore[pGlyco]'] > 4)
# (HCD OR ETD) & (B+P OR B/P) & passes threshold
b_glycansource_mask = ((all_combined_df['GlycanSource'] == 'B+P') ^ (all_combined_df['GlycanSource'] == 'B/P')) & ((b_hcd_mask) ^ (b_etd_mask)) # hcd exclusive or etd
p_glycansource_mask = ((all_combined_df['GlycanSource'] == 'B+P') ^ (all_combined_df['GlycanSource'] == 'B/P')) & ((p_hcd_mask) ^ (p_etd_mask)) # hcd exclusive or etd    
# byonic & pglyco coloring range
bp_range = byonic_scanasid.columns.tolist() + pglyco_scanasid.columns.tolist()
# comparison between byonic & byos
byos_exclusiveOr_mask = (all_combined_df['Calc.M[Byos]'] != -1) & (all_combined_df['Calc.M[Byonic]'] != -1) & ((all_combined_df['Calc.M[Byos]'] != all_combined_df['Calc.M[Byonic]'])^(all_combined_df['PureSequence[Byos]'] != all_combined_df['PureSequence[Byonic]']))
byos_and_mask = (all_combined_df['Calc.M[Byos]'] != -1) & (all_combined_df['Calc.M[Byonic]'] != -1) & (all_combined_df['Calc.M[Byos]'] != all_combined_df['Calc.M[Byonic]']) & (all_combined_df['PureSequence[Byos]'] != all_combined_df['PureSequence[Byonic]'])
byos_bothsame_mask = (all_combined_df['Calc.M[Byos]'] != -1) & (all_combined_df['Calc.M[Byonic]'] != -1) & (all_combined_df['Calc.M[Byos]'] == all_combined_df['Calc.M[Byonic]']) & (all_combined_df['PureSequence[Byos]'] == all_combined_df['PureSequence[Byonic]'])
# record all_combined_df color indices
lightgreen_ind = all_combined_df.loc[b_hcd_mask].index.tolist()
lightblue_ind = all_combined_df.loc[p_hcd_mask].index.tolist()
lightorange_ind = all_combined_df.loc[both_hcd_mask].index.tolist()
normalgreen_ind = all_combined_df.loc[b_etd_mask].index.tolist()
normalblue_ind = all_combined_df.loc[p_etd_mask].index.tolist()
normalorange_ind = all_combined_df.loc[both_etd_mask].index.tolist()
lightpink_ind = all_combined_df.loc[byos_exclusiveOr_mask].index.tolist()
deeppink_ind = all_combined_df.loc[byos_and_mask].index.tolist()
yellow_ind = all_combined_df.loc[byos_bothsame_mask].index.tolist()
# count PSM for both byonic & pglyco separately, then add 'PSM[...]' col to each (skip the rows below threshold)
byonicbyos_nsite = [site for site in all_combined_df['N-site(SequonBased)[Byonic]'].tolist() if site != 'N/A' if site != -1] # contains list & int
# print('byonicbyos nsite len before sort:\n%s\n'%len(byonicbyos_nsite))
pglyco_nsite = [int(site) for site in all_combined_df['ProSites[pGlyco]'].tolist() if site != 'N/A' if site != -1] # mainly int (could have list in the future) 
# print('pglyco_nsite len before sort:\n%s\n'%len(pglyco_nsite))
# byonicbyos_nsite.sort(key=lambda v: (isinstance(v, list), v))
# byonicbyos_nsite = sorted([list(y) for y in set([tuple(x) for x in [site for site in byonicbyos_nsite if type(site) == list]])]) # unique values
byonicbyos_nsite_sortedunique = sorted(list(set([site for site in byonicbyos_nsite if type(site) == int]))) + sorted(list(set(sorted([site for site in byonicbyos_nsite if type(site) == tuple]))))
# print('byonicbyos_nsite_sortedunique:\n%s\n'%byonicbyos_nsite_sortedunique)
# print('byonicbyos nsite len after sort:\n%s\n'%len(byonicbyos_nsite_sortedunique))
# pglyco_nsite.sort(key=lambda v: (isinstance(v, list), v))
# pglyco_nsite = sorted([list(y) for y in set([tuple(x) for x in [site for site in pglyco_nsite if type(site) == list]])]) # unique values
pglyco_nsite_sortedunique = sorted(list(set([site for site in pglyco_nsite if type(site) == int]))) + sorted(list(set(sorted([site for site in pglyco_nsite if type(site) == tuple]))))
# print('pglyco_nsite_sortedunique:\n%s\n'%pglyco_nsite_sortedunique)
# print('pglyco_nsite len after sort:\n%s\n'%len(pglyco_nsite_sortedunique))
# analyze PSM (need to pass threshold)
byonic_colored_id = lightgreen_ind + normalgreen_ind + lightorange_ind + normalorange_ind # byonic pass threshold
byonic_belowthreshold_id = [i for i in all_combined_df.index.tolist() if i not in byonic_colored_id]
pglyco_colored_id = lightblue_ind + normalblue_ind + lightorange_ind + normalorange_ind # pglyco pass threshold
pglyco_belowthreshold_id = [i for i in all_combined_df.index.tolist() if i not in pglyco_colored_id]
# using groupby size function to count psm & add psm columns
# byonic & byos
all_combined_df = all_combined_df.astype({'N-site(SequonBased)[Byonic]': 'str'}) # convert list & tuple to str for later groupby function
byonic_psm = all_combined_df.groupby(['N-site(SequonBased)[Byonic]', 'Calc.\r\nm/z[Byonic]','Glycans[Byonic]'])['N-site(SequonBased)[Byonic]'].transform('size') # count rows with same n-site & glycan & calc.m/z, then add 'PSM' col to record the count
all_combined_df.insert(all_combined_df.columns.get_loc('N-site(SequonBased)[Byonic]') + 1 , 'PSM[Byonic]', byonic_psm , True)
all_combined_df.loc[(all_combined_df['N-site(SequonBased)[Byonic]'] == 'N/A'), 'PSM[Byonic]'] = 'N/A' # if n-site is 'N/A', do not count psm
all_combined_df.loc[byonic_belowthreshold_id, 'PSM[Byonic]'] = 'N/A' # do not count psm for rows below threshold
all_combined_df.loc[(all_combined_df['N-site(SequonBased)[Byonic]'] == '-1'), 'PSM[Byonic]'] = -1 # if site is '-1', then psm set to -1
# pglyco
all_combined_df = all_combined_df.astype({'ProSites[pGlyco]': 'int'})
all_combined_df = all_combined_df.astype({'ProSites[pGlyco]': 'str'}) # convert list & tuple to str for later groupby function
pglyco_psm = all_combined_df.groupby(['ProSites[pGlyco]', 'PrecursorMZ[pGlyco]','GlycanComposition_ByonicStyle[pGlyco]'])['ProSites[pGlyco]'].transform('size') # count rows with same n-site & glycan & calc.m/z, then add 'PSM' col to record the count
all_combined_df.insert(all_combined_df.columns.get_loc('ProSites[pGlyco]') + 1 , 'PSM[pGlyco]', pglyco_psm , True)
all_combined_df.loc[(all_combined_df['ProSites[pGlyco]'] == 'N/A'), 'PSM[pGlyco]'] = 'N/A' # if n-site is 'N/A', do not count psm
all_combined_df.loc[pglyco_belowthreshold_id, 'PSM[pGlyco]'] = 'N/A' # do not count psm for rows below threshold
all_combined_df.loc[(all_combined_df['ProSites[pGlyco]'] == '-1'), 'PSM[pGlyco]'] = -1 # if site is '-1', then psm set to -1

def bg_color(x):
    # byonic & pglyco colors
    # HCD: light colors
    c1 = 'background-color: #98FB98' # light green
    c2 = 'background-color: #add8e6' # light blue
    c3 = 'background-color: #ffedcc' # light orange
    # ETD: normal colors
    c4 = 'background-color: #008000' # normal green
    c5 = 'background-color: #0000FF' # normal blue
    c6 = 'background-color: #FFA500' # normal orange
    # GlycanSource B+P, B/P: deep colors
    c7 = 'background-color: #004d00' # dark green
    c8 = 'background-color: #00008B' # dark blue
    # byos colors
    c9 = 'background-color: #FFB6C1' # light pink
    c10 = 'background-color: #FF1493' # deep pink
    c11 = 'background-color: #FFFF00' # yellow
    c = '' 
    #     # byonic & byos coloring range
#     bb_range = byos_scanasid.columns.tolist()
    #DataFrame with same index and columns names as original filled empty strings
    df1 =  pd.DataFrame(c, index=x.index, columns=x.columns)    
    # byonic & byos coloring range
    bb_range = [col for col in df1.columns.tolist() if '[Byos]' in col]
    # byonic & pglyco coloring range
    bp_range = [col for col in df1.columns.tolist() if '[Byonic]' in col or '[pGlyco]' in col]
    # modify values of df1 column by boolean mask
    # HCD: light colors (c1-c3)
    df1.loc[b_hcd_mask, bp_range] = c1
    print('<Color Summary>\n%s rows will be colored light green (#98FB98).'%len(df1.loc[b_hcd_mask, bp_range]))
    df1.loc[p_hcd_mask, bp_range] = c2 
    print('%s rows will be colored light blue (#add8e6).'%len(df1.loc[p_hcd_mask, bp_range]))
    df1.loc[both_hcd_mask, bp_range] = c3
    print('%s rows will be colored light orange (#ffedcc).'%len(df1.loc[both_hcd_mask, bp_range]))
    # ETD: normal colors (c4-c6)
    df1.loc[b_etd_mask, bp_range] = c4
    print('%s rows will be colored green (#008000).'%len(df1.loc[b_etd_mask, bp_range]))
    df1.loc[p_etd_mask, bp_range] = c5
    print('%s rows will be colored blue (#0000FF).'%len(df1.loc[p_etd_mask, bp_range]))
    df1.loc[both_etd_mask, bp_range] = c6
    print('%s rows will be colored orange (#FFA500).'%len(df1.loc[both_etd_mask, bp_range]))
    # GlycanSource B+P, B/P: deep colors (c7-c8)
    df1.loc[b_glycansource_mask, bp_range] = c7
    print('%s rows will be colored dark green (#004d00).'%len(df1.loc[b_glycansource_mask, bp_range]))
    df1.loc[p_glycansource_mask, bp_range] = c8
    print('%s rows will be colored dark blue (#00008B).'%len(df1.loc[p_glycansource_mask, bp_range]))
    # byos colors (c9-c11)
    df1.loc[byos_exclusiveOr_mask, bb_range] = c9
    print('%s rows will be colored light pink (#FFB6C1).'%len(df1.loc[byos_exclusiveOr_mask, bb_range]))
    df1.loc[byos_and_mask, bb_range] = c10
    print('%s rows will be colored deep pink (#FF1493).'%len(df1.loc[byos_and_mask, bb_range]))
    df1.loc[byos_bothsame_mask, bb_range] = c11
    print('%s rows will be colored yellow (#FFFF00).'%len(df1.loc[byos_bothsame_mask, bb_range]))
    return df1

print('\n----- Exporting "_All" file... This may take some time, please wait. -----\n')
all_combined_df.style.apply(bg_color, axis=None).to_excel('20210608_BBP_hcdethcd_fixedPureSequenceSequonNSiteTuplePSMFixedpColor_All.xlsx', index = False)  
print('\n----- "_All" file exported. -----\n')

# start preparing simplified version
print('----- Start preparing simplified version. -----\n')
# extract rows included in colored indices list separately
byonicbyos_col_range = ['Scan'] + [col for col in all_combined_df.columns.tolist() if '[Byonic]' in col or '[Byos]' in col]
pglyco_col_range = ['Scan'] + [col for col in all_combined_df.columns.tolist() if '[pGlyco]' in col]
simple_byonicbyos = all_combined_df.loc[byonic_colored_id, byonicbyos_col_range] # extract colored (pass threshold) rows
simple_byonicbyos = simple_byonicbyos[(simple_byonicbyos['N-site(SequonBased)[Byonic]'] != 'N/A')&(simple_byonicbyos['N-site(SequonBased)[Byonic]'] != '-1')] # for simplified, n-site must exist
simple_pglyco = all_combined_df.loc[pglyco_colored_id, pglyco_col_range] # extract colored (pass threshold) rows
simple_pglyco = simple_pglyco[(simple_pglyco['ProSites[pGlyco]'] != 'N/A')&(simple_pglyco['ProSites[pGlyco]'] != '-1')] # for simplified, n-site must exist
print('simple_byonicbyos:\n%s\n'%len(simple_byonicbyos))
print('simple_pglyco:\n%s\n'%len(simple_pglyco))
# get unique calc.m/z by groupby & idxmax function to find highest score within each gp and preserve (remember to split the simple_df into byonicbyos & pglyco parts to avoid dropping non-target rows)
simple_byonicbyos = simple_byonicbyos.groupby(['N-site(SequonBased)[Byonic]', 'Calc.\r\nm/z[Byonic]','Glycans[Byonic]'], as_index=False).apply(lambda x: x.loc[x['Score[Byonic]'].idxmax()]).reset_index(drop=True)
simple_pglyco = simple_pglyco.groupby(['ProSites[pGlyco]', 'PrecursorMZ[pGlyco]','GlycanComposition_ByonicStyle[pGlyco]'], as_index=False).apply(lambda x: x.loc[x['GlyScore[pGlyco]'].idxmax()]).reset_index(drop=True)
print('simple_byonicbyos:\n%s\n'%len(simple_byonicbyos))
print('simple_pglyco:\n%s\n'%len(simple_pglyco))
# set scan to index & concat simple_byonicbyos & simple_pglyco by scan
simple_byonicbyos = simple_byonicbyos.set_index('Scan')
simple_pglyco = simple_pglyco.set_index('Scan')
a1, a2 = simple_byonicbyos.align(simple_pglyco, join = 'outer', axis = 0) # row: a1 = a2
simple_df = pd.concat([a1,a2], axis = 1)
simple_df.index.name = 'Scan'
simple_df.reset_index(level=0, inplace=True)
# change all nan to blank -1
simple_df = simple_df.fillna(-1)
# ast.literaleval byonicbyos n-site & order simple_df by n-site (small -> large)
simple_df['N-site(SequonBased)[Byonic]'] = [ast.literal_eval(i) if type(i) == str else i for i in simple_df['N-site(SequonBased)[Byonic]'].tolist()]
simple_df['ProSites[pGlyco]'] = [ast.literal_eval(i) if type(i) == str else i for i in simple_df['ProSites[pGlyco]'].tolist()]
simple_df_byonicnsite = simple_df['N-site(SequonBased)[Byonic]'].tolist()
simple_df_byonicnsite = sorted(list(set([site for site in simple_df_byonicnsite if type(site) == int]))) + sorted(list(set(sorted([site for site in simple_df_byonicnsite if type(site) == tuple]))))
simple_df = simple_df.set_index('N-site(SequonBased)[Byonic]')
simple_df = simple_df.loc[simple_df_byonicnsite]
simple_df.index.name = 'N-site(SequonBased)[Byonic]'
simple_df.reset_index(level=0, inplace=True)
# add GlycanSource column
# glycan comprison: only present in byonic -> b, only present in pglyco -> p, both the same -> b+p, not the same -> b/p
conditions = [
    (simple_df['Glycans[Byonic]'] != -1) & (simple_df['GlycanComposition_ByonicStyle[pGlyco]'] == -1),
    (simple_df['Glycans[Byonic]'] == -1) & (simple_df['GlycanComposition_ByonicStyle[pGlyco]'] != -1),
    (simple_df['Glycans[Byonic]'] != -1) & (simple_df['GlycanComposition_ByonicStyle[pGlyco]'] != -1) & (simple_df['Glycans[Byonic]'] == simple_df['GlycanComposition_ByonicStyle[pGlyco]']),
    (simple_df['Glycans[Byonic]'] != -1) & (simple_df['GlycanComposition_ByonicStyle[pGlyco]'] != -1) & (simple_df['Glycans[Byonic]'] != simple_df['GlycanComposition_ByonicStyle[pGlyco]'])]
choices = ['B', 'P', 'B+P', 'B/P'] 
glycan_source = np.select(conditions, choices, -1) 
simple_df.insert(simple_df.columns.get_loc('Scan') + 1 , 'GlycanSource', glycan_source , True)
# move pglyco n-site to be next to byonic n-site for comparison
move_df = simple_df['ProSites[pGlyco]']
simple_df.drop(labels=['ProSites[pGlyco]'], axis=1, inplace = True)
simple_df.insert(simple_df.columns.get_loc('N-site(SequonBased)[Byonic]') + 1,'ProSites[pGlyco]', move_df)
# define masks & apply color again
# comparison between byonic & pglyco
# HCD (but not B+P OR B/P)
b_hcd_mask = (simple_df['Fragment\r\nType[Byonic]'] == 'hcd') & (simple_df['Score[Byonic]'] > 200) & (simple_df['PEP\r\n2D[Byonic]'].abs() < 0.001) & ((simple_df['PepScore[pGlyco]'] <= 5) | (simple_df['GlyScore[pGlyco]'] <= 4))
p_hcd_mask = (simple_df['FragmentType[pGlyco]'] == 'hcd') & ((simple_df['Score[Byonic]'] <= 200) | (simple_df['PEP\r\n2D[Byonic]'].abs() >= 0.001)) & (simple_df['PepScore[pGlyco]'] > 5) & (simple_df['GlyScore[pGlyco]'] > 4)    
both_hcd_mask = (simple_df['Fragment\r\nType[Byonic]'] == 'hcd') & (simple_df['FragmentType[pGlyco]'] == 'hcd') & (simple_df['Score[Byonic]'] > 200) & (simple_df['PEP\r\n2D[Byonic]'].abs() < 0.001) & (simple_df['PepScore[pGlyco]'] > 5) & (simple_df['GlyScore[pGlyco]'] > 4)
# ETD (but not B+P OR B/P): remember byonic etd does not need threshold, so we only need ot make sure that the row only contain byonic data, which means pglyco data will be -1
b_etd_mask = (simple_df['Fragment\r\nType[Byonic]'] == 'ethcd') & (simple_df['Score[Byonic]'] != -1) & (simple_df['PEP\r\n2D[Byonic]'] != -1) & ((simple_df['PepScore[pGlyco]'] <= 5) | (simple_df['GlyScore[pGlyco]'] <= 4)) 
p_etd_mask = (simple_df['FragmentType[pGlyco]'] == 'ethcd') & (simple_df['Score[Byonic]'] == -1) & (simple_df['PEP\r\n2D[Byonic]'] == -1) & (simple_df['PepScore[pGlyco]'] > 5) & (simple_df['GlyScore[pGlyco]'] > 4)    
both_etd_mask = (simple_df['Fragment\r\nType[Byonic]'] == 'ethcd') & (simple_df['FragmentType[pGlyco]'] == 'ethcd') & (simple_df['Score[Byonic]'] != -1) & (simple_df['PEP\r\n2D[Byonic]'] != -1) & (simple_df['PepScore[pGlyco]'] > 5) & (simple_df['GlyScore[pGlyco]'] > 4)
# (HCD OR ETD) & (B+P OR B/P) & passes threshold
b_glycansource_mask = ((simple_df['GlycanSource'] == 'B+P') ^ (simple_df['GlycanSource'] == 'B/P')) & ((b_hcd_mask) ^ (b_etd_mask)) # hcd exclusive or etd
p_glycansource_mask = ((simple_df['GlycanSource'] == 'B+P') ^ (simple_df['GlycanSource'] == 'B/P')) & ((p_hcd_mask) ^ (p_etd_mask)) # hcd exclusive or etd    
# comparison between byonic & byos
byos_exclusiveOr_mask = (simple_df['Calc.M[Byos]'] != -1) & (simple_df['Calc.M[Byonic]'] != -1) & ((simple_df['Calc.M[Byos]'] != simple_df['Calc.M[Byonic]'])^(simple_df['PureSequence[Byos]'] != simple_df['PureSequence[Byonic]']))
byos_and_mask = (simple_df['Calc.M[Byos]'] != -1) & (simple_df['Calc.M[Byonic]'] != -1) & (simple_df['Calc.M[Byos]'] != simple_df['Calc.M[Byonic]']) & (simple_df['PureSequence[Byos]'] != simple_df['PureSequence[Byonic]'])
byos_bothsame_mask = (simple_df['Calc.M[Byos]'] != -1) & (simple_df['Calc.M[Byonic]'] != -1) & (simple_df['Calc.M[Byos]'] == simple_df['Calc.M[Byonic]']) & (simple_df['PureSequence[Byos]'] == simple_df['PureSequence[Byonic]'])

print('\n----- Exporting "_ID" file... This may take some time, please wait. -----\n')
simple_df.style.apply(bg_color, axis=None).to_excel('20210608_BBP_hcdethcd_fixedPureSequenceSequonNSiteTuplePSM_ID.xlsx', index = False)  
print('\n----- "_ID" file exported. -----\n')















# # split byonic & pglyco to do the highest score selection separately, preserve scan for later locating ('cos byonic & pglyco could have dif site number in the same row)
# byonic_col = ['Scan'] + [col for col in simple_df.columns.tolist() if '[Byonic]' in col]
# byos_col = ['Scan'] + [col for col in simple_df.columns.tolist() if '[Byos]' in col]
# pglyco_col = ['Scan'] + [col for col in simple_df.columns.tolist() if '[pGlyco]' in col]
# simple_byonic = simple_df[byonic_col]
# simple_byos = simple_df[byos_col]
# simple_pglyco = simple_df[pglyco_col]
# # get unique val in byonic
# byonicsite_lst = simple_byonic['N-site(SequonBased)[Byonic]'].tolist()
# byonicsite_lst.sort(key=lambda v: (isinstance(v, list), v))
# single_site_set = sorted(list(set([site for site in byonicsite_lst if type(site) == int])))
# multi_site_set = sorted([list(y) for y in set([tuple(x) for x in [site for site in byonicsite_lst if type(site) == list]])]) 
# concat_df = []
# for site in single_site_set:
#     print(site)
#     eachsite_gp = simple_byonic[simple_byonic['N-site(SequonBased)[Byonic]'] == site]
#     print(len(eachsite_gp))
#     eachsite_gp = eachsite_gp.groupby(['Calc.\r\nm/z[Byonic]','Glycans[Byonic]'], as_index=False).apply(lambda x: x.loc[x['Score[Byonic]'].idxmax()]).reset_index(drop=True)
#     print(eachsite_gp)
#     print('after group: %s'%len(eachsite_gp))
#     concat_df.append(eachsite_gp)
# for site in multi_site_set:
#     print(site)
#     eachsite_gp = simple_byonic[simple_byonic['N-site(SequonBased)[Byonic]'] == site]
#     print(len(eachsite_gp))
#     eachsite_gp = eachsite_gp.groupby(['Calc.\r\nm/z[Byonic]','Glycans[Byonic]'], as_index=False).apply(lambda x: x.loc[x['Score[Byonic]'].idxmax()]).reset_index(drop=True)
#     print(eachsite_gp)
#     print('after group: %s'%len(eachsite_gp))
#     concat_df.append(eachsite_gp)    
# get unique val in pglyco
# byonicsite_lst = simple_byonic['N-site(SequonBased)[Byonic]'].tolist()
# byonicsite_lst.sort(key=lambda v: (isinstance(v, list), v))
# single_site_set = sorted(list(set([site for site in byonicsite_lst if type(site) == int])))
# multi_site_set = sorted([list(y) for y in set([tuple(x) for x in [site for site in byonicsite_lst if type(site) == list]])]) 
# concat_df = []
# for site in single_site_set:
#     print(site)
#     eachsite_gp = simple_byonic[simple_byonic['N-site(SequonBased)[Byonic]'] == site]
#     print(len(eachsite_gp))
#     eachsite_gp = eachsite_gp.groupby(['Calc.\r\nm/z[Byonic]','Glycans[Byonic]'], as_index=False).apply(lambda x: x.loc[x['Score[Byonic]'].idxmax()]).reset_index(drop=True)
#     print(eachsite_gp)
#     print('after group: %s'%len(eachsite_gp))
#     concat_df.append(eachsite_gp)

# simple_byonic_unique = pd.concat(concat_df)
# simple_byonic_unique.to_excel('20210605simple_byonic.xlsx', index = False)



# # analyze glycansource again after combining sorted byonic & pglyco
# # print("\nTask completed.\nExecution time: %.2f seconds"%(time.time() - start_time))

Original pglyco columns:
Index(['GlySpec', 'PepSpec', 'RawName', 'Scan', 'RT', 'PrecursorMH',
       'PrecursorMZ', 'Charge', 'Rank', 'Peptide', 'Mod', 'PeptideMH',
       'Glycan(H,N,A,G,F)', 'GlycanComposition', 'PlausibleStruct', 'GlyID',
       'GlyFrag', 'GlyMass', 'GlySite', 'TotalScore', 'PepScore', 'GlyScore',
       'CoreMatched', 'MassDeviation', 'PPM', 'GlyIonRatio', 'byIonRatio',
       'czIonRatio', 'GlyDecoy', 'PepDecoy', 'IsSmallGlycan', 'GlycanPEP',
       'GlycanFDR', 'PeptidePEP', 'PeptideFDR', 'TotalFDR', 'Proteins',
       'Genes', 'ProSites', 'MonoArea', 'IsotopeArea', 'ETDScan',
       'LocalizedSiteGroups', 'LocalizedScore', 'LocalizedIonRatio',
       'PreLocalizedScore'],
      dtype='object')

Fixed pglyco columns:
Index(['GlySpec', 'PepSpec', 'RawName', 'Scan', 'RT', 'PrecursorMH',
       'PrecursorMZ', 'Charge', 'Rank', 'Peptide', 'Mod', 'PeptideMH',
       'Glycan(H,N,A,G,F)', 'GlycanComposition', 'PlausibleStruct', 'GlyID',
       'GlyFrag', 'GlyMass', 'Gl

  gp = potential_pair[potential_pair['Calc.\r\nm/z'] == i][potential_pair.duplicated(subset=['Calc.M', 'PureSequence'], keep=False)]
  gp = potential_pair[potential_pair['Calc.\r\nm/z'] == i][potential_pair.duplicated(subset=['Calc.M', 'PureSequence'], keep=False)]
  gp = potential_pair[potential_pair['Calc.\r\nm/z'] == i][potential_pair.duplicated(subset=['Calc.M', 'PureSequence'], keep=False)]
  gp = potential_pair[potential_pair['Calc.\r\nm/z'] == i][potential_pair.duplicated(subset=['Calc.M', 'PureSequence'], keep=False)]
  gp = potential_pair[potential_pair['Calc.\r\nm/z'] == i][potential_pair.duplicated(subset=['Calc.M', 'PureSequence'], keep=False)]
  gp = potential_pair[potential_pair['Calc.\r\nm/z'] == i][potential_pair.duplicated(subset=['Calc.M', 'PureSequence'], keep=False)]
  gp = potential_pair[potential_pair['Calc.\r\nm/z'] == i][potential_pair.duplicated(subset=['Calc.M', 'PureSequence'], keep=False)]
  gp = potential_pair[potential_pair['Calc.\r\nm/z'] == i][potential_

----- Byonic data preprocessing completed. -----

Original byos columns:
Index(['Row#', 'PID', 'Protein_x000D_\nname', 'Sequence_x000D_\n(unformatted)',
       'Scan Number(s)_x000D_\n(Posit)', 'Sequence', 'XIC area_x000D_\nsummed',
       'XIC_x000D_\nAUC', 'XIC_x000D_\nRatio%', 'Apex Int._x000D_\n(Posit)',
       '_grp_x000D_\nnum', 'Feature_x000D_\nnumber', 'iso',
       'Sample-charge_x000D_\nID', 'Apex Time_x000D_\n(Posit)',
       'MS_x000D_\nAlias name', 'MS2 Search_x000D_\nAlias name', 'Score',
       'PEP_x000D_\n2D', 'Glycans', 'z', 'Calc._x000D_\nm/z', 'Calc.M',
       'Scan Time(s)_x000D_\n(Posit)', 'Fragment_x000D_\ntype(s)', 'Validate',
       'Comment', 'Labels', 'Mod._x000D_\nNames', '_mod_x000D_\nids',
       'Mod._x000D_\nSummary', 'Mod._x000D_\nAAs', 'Var. Pos._x000D_\nProtein',
       'Var. Pos._x000D_\nPeptide', 'Start_x000D_\nAA', 'End_x000D_\nAA',
       'PEP_x000D_\n1D', 'Delta_x000D_\nScore', 'Score_x000D_\nDelta',
       'Delta Mod._x000D_\nScore', 'Obs._x000D

  return array(a, dtype, copy=False, order=order)


5 rows will be colored blue (#0000FF).
0 rows will be colored orange (#FFA500).
0 rows will be colored dark green (#004d00).
0 rows will be colored dark blue (#00008B).
0 rows will be colored light pink (#FFB6C1).
0 rows will be colored deep pink (#FF1493).
2200 rows will be colored yellow (#FFFF00).

----- "_ID" file exported. -----



In [None]:
with open('runlog.txt', 'w') as f:
    f.write(cap.stdout)