In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
from matplotlib import ticker
from matplotlib.ticker import ScalarFormatter
from IPython.display import display, HTML
import re # finding specific patterns in str
import textwrap # split text into equal parts
import collections # return repeated items in list
from collections import OrderedDict
import time
from datetime import datetime # attach current date to export filename
import sys
import ast # convert str back to tuple/list/int, etc
import matplotlib.cm as cm
import matplotlib.colors as mcolors
import string

def findNPosBySequon(seq):
    """Locate every N-glycosylation sequon (N-X-S/T) in a protein sequence.

    The middle residue X may be any letter in ``ARNDBCEQZGHILKMFSTWYV``
    (proline is not part of that set). Matching is case-insensitive.

    Overlapping sequons are all reported: in ``'NNTS'`` both ``NNT`` (site 1)
    and ``NTS`` (site 2) are returned. Scanning with a plain three-letter
    pattern consumes each match, so a sequon starting inside the previous one
    would be skipped; the pattern is therefore wrapped in a zero-width
    lookahead and the sequon is read from the capture group.

    Parameters
    ----------
    seq : str
        Protein sequence in one-letter code.

    Returns
    -------
    list of (int, str)
        ``(site, sequon)`` pairs in sequence order. ``site`` is the 1-based
        position of the asparagine and ``sequon`` is the three-letter slice of
        ``seq`` starting there, with its original casing.
    """
    pattern = r'(?i)(?=(N[ARNDBCEQZGHILKMFSTWYV][ST]))'  # case insensitive, overlap-safe
    return [(m.start() + 1, m.group(1)) for m in re.finditer(pattern, seq)]  # [(site, 'sequon'), ...]
def findUnionOnly(val):
    """Return the CSS declarations used to highlight a table cell.

    ``val`` is the cell value handed in by the styler; it is not inspected,
    so every cell this function is applied to receives the same style.
    """
    css = {
        'color': 'red',
        'font-weight': 'bold',
        'background-color': 'WhiteSmoke',
    }
    return '; '.join(f'{prop}: {setting}' for prop, setting in css.items())
           
# INPUT FILES
# Base names (without the '.xlsx' extension) of the two workbooks to load.
union, inter = 'bp_union', 'bp_inter'
# The first sheet row (header=0) supplies the column labels of each frame.
df_union = pd.read_excel(f'{union}.xlsx', header=0)
df_inter = pd.read_excel(f'{inter}.xlsx', header=0)
## INPUT TARGET PROTEIN SEQUENCE
# This string is passed to findNPosBySequon(seq) when the table is built; the
# 1-based positions it reports become the 'N-site' values of that table.
# NOTE(review): presumably the one-letter amino-acid sequence of the analysed
# construct - confirm against the sample sheet before reuse.
seq = 'MDAMKRGLCCVLLLCGAVFVSPSASDVTRCQSTINFRRFFSKFNVQAPAVVVLGGYLPSMNSSSWYCGTGLETASGVHGIFLSYIDSGQGFEIGISQEPFDPSGYQLYLHKATNGNQDAIARLRICQFPNNKTLGPSVNDVTTGRNCLFNKAIPAYMQDGKNIIVGITWDNDRVTVFADKIYHFYLKNEWSRVATRCYNKRSCAMQYVYTPTYYMLNVTSAGEDGIYYSLCTANCIGYAVNVFATDSNGHIPEGFSFNNWFLLSNDSTLLHGKVVSNQPLLVNCLLAIPKIYGLGQFFSFNQTMDGACNGVAAQRAPEALRFNINDTSVILAEGSIVLHTALGTNLSFVCSNSSDPHLSTFAIPLGATQVPYYCFLKVDTYNSTVYKFLAVLPPTVREIVITKYGDVYVNGFGYLHLGLLDAVTINFTGHGTDDDVSGFWTIASTNFVDALIEVQGTAIERILYCDDPVSQLKCSQVAFDLDDGFYPISSRNLLSHEQPISFVTLPSFNDHSFVNITVSASFGGHSGANVIASDTTINGFSSFCVDTRQFTISLFYNVTNIYGYVSTSQGSNCPFTLQSVNDYLSFSKFCVSTSLLASACTIDLFGYPDFGSGVKLTSLYFQFTKGELITGTPKPLQGVTDVSFMTLDVCTKYTIYGFKGEGVITLTNSSFLAGVYYTSDSGQLLAFKNVTSGAVYSVTPCSFSEQAAYVDDDIVGVISSLSNSTFNSTRELPGFFYHSNDGSNCTEPVLVYSNIGVCKSGSIGYVPSQSGQVKIAPTVTGNISIPTNFSMSIRTEYLQLYNTPVSVDCATYVCNGNSRCKQLLTQYTAACKTIESALQLSARLESVEVNSMLTISEEALQLATVSSFNGDGYNFTNVLGVSVYDPASGRVVQKRSFIEDLLFNKVVTNGLGTVDEDYKRCSKGRSVADLVCAQYYSGVMVLPGVVDAEKLHMYSASLIGGMVLGGFTAAAALPFSYAVQARLNYLALQTDVLQRNQQLLAESFNSAIGNITSAFESVKEAISQTSKGLNTVAHALTKVQEVVNSQGAALTQLTVQLQHNFQAISSSIDDIYSRLDSLSADVQVDRLITGRLSALNAFVAQTLTKYTEVQASRKLAQQKVNECVKSQSQRYGFCGGDGEHIFSLVQAAPQGLLFLHTVLVPGDFVNVIAIAGLCVNDEIALTLREPGLVLFTHELQNYTATEYFVSSRRMFEPRKPTFSDFVQIESCVVTYVNLTRDQLPDVIPDYIDVNKTLDEILASLPNRTGPSLPLDVFNATYLNLTGEIADLEQRSESLRNTTEELQSLIYNINNTLVDLEWLNRVETYIKWPGSGGYIPEAPRDGQAYVRKDGEWVLLSTFLKGQDNSADIQHSGGRSSLEGPRFEGKPIPNPLLGLDSTRTGHHHHHH' 
# FILE PREPROCESSING
# Column labels that contain the literal text '_x000D_' get it replaced by a
# real carriage return, so the '\r\n' spelling used for column lookups further
# down matches (presumably an artefact of line breaks in the Excel header
# cells - confirm). Both frames get the same treatment exactly once.
# Non-string labels (e.g. a numeric header cell) are passed through unchanged
# instead of raising TypeError on the substring test.
for _frame in (df_union, df_inter):
    fixed_colname = [c.replace('_x000D_', '\r') if isinstance(c, str) else c for c in _frame.columns]
    _frame.columns = fixed_colname
# CONSTRUCT MAJOR GLYCOFORM TABLE
# Skeleton of the table: one numbered row per sequon found in `seq`, followed
# by a single summary row labelled 'Intersection/Union' whose site and sequon
# cells are empty strings. The major-glycoform columns are merged in later.
result = findNPosBySequon(seq)
nsite = [site for site, _motif in result] + ['']
sequon = [motif for _site, motif in result] + ['']
no = list(range(1, len(result) + 1)) + ['Intersection/Union']
table = pd.DataFrame(list(zip(no, nsite, sequon)), columns=['No.', 'N-site', 'Sequon'])
# rename col & extract xicauc max and its glycoform
def _major_glycoform(df, site_col, glycoform_col, auc_col='a_norm_XIC\r\nAUC[Byos]'):
    """Return, for each N-site, the glycoform of the row with the largest AUC.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding at least ``site_col``, ``glycoform_col`` and ``auc_col``.
    site_col : str
        Column the rows are grouped by.
    glycoform_col : str
        Column whose value is reported for the winning row of each group.
    auc_col : str
        Column that is maximised within each group.

    Returns
    -------
    pandas.DataFrame
        One row per site, indexed by ``site_col`` (sorted by the group key),
        with ``glycoform_col`` as its only column.

    Notes
    -----
    Rows whose AUC is missing are ignored, so a site with no AUC value at all
    is left out instead of raising on a NaN row label. The winning rows are
    picked through ``groupby()[auc_col].idxmax()`` rather than
    ``groupby().apply``, so the lookup does not depend on the grouping column
    being passed into the applied function.
    """
    scored = df.dropna(subset=[auc_col])
    top_rows = scored.groupby(site_col)[auc_col].idxmax()
    return scored.loc[top_rows, [site_col, glycoform_col]].set_index(site_col)

df_union = df_union.rename(columns={'Glycans ↓':'Major Glycoform(union)'})
df_inter = df_inter.rename(columns={'N-site(Byonic ∪ pGlyco) →':'N-site(Byonic ∩ pGlyco) →', 'Glycans ↓':'Major Glycoform(inter)'})
df_union = _major_glycoform(df_union, 'N-site(Byonic ∪ pGlyco) →', 'Major Glycoform(union)')
df_inter = _major_glycoform(df_inter, 'N-site(Byonic ∩ pGlyco) →', 'Major Glycoform(inter)')
# Align both results on the site index; a site missing from one frame gets NaN there.
combined = pd.concat([df_inter, df_union], axis=1)
combined.index.name = 'N-site'
combined = combined.reset_index()
# Flag every site whose two major glycoforms differ. A site without an
# intersection entry is flagged as well, because NaN never compares equal.
combined.loc[(combined['Major Glycoform(inter)']!=combined['Major Glycoform(union)']), 'compare'] = 'color'
# Flagged sites that have no intersection glycoform at all.
union_only = combined['Major Glycoform(inter)'].isnull() & (combined['compare'] == 'color')
dif_num = int(union_only.sum())
union_num = len(combined)
inter_num = union_num - dif_num
# map the result to the empty df
table = table.merge(combined, on=['N-site'], how='outer')
# Address the summary row by its label: an outer merge orders rows by the join
# key, so relying on it being the last row is an implicit assumption.
summary_row = table['No.'] == 'Intersection/Union'
table.loc[summary_row, 'Major Glycoform(union)'] = f'{inter_num}/{union_num}'
# style apply to the rows flagged 'color' in the helper column
highlight_rows = table['compare'] == 'color'
# Drop the helper column before the Styler is created, so the styled data never
# contains it and Styler.data does not have to be patched afterwards.
table = table.drop('compare', axis=1).fillna('').style
# Styler.applymap is deprecated in favour of Styler.map (pandas 2.1); use
# whichever this pandas version provides.
_style_cells = table.map if hasattr(table, 'map') else table.applymap
table = _style_cells(findUnionOnly, subset=pd.IndexSlice[highlight_rows, 'Major Glycoform(union)'])
# table.to_excel('MajorGlycoformTableTest.xlsx', index = False)