In [4]:
import pandas as pd 
import pathlib
import ntpath
from commons.data_processing import *
from commons.DataProcessors.pd_processor import PDProcessor

In [5]:
# Locate the .xlsx pd data files for each sample set.
std_dir = r".\N_Glycosylation_Results\StandardProteins"
mt10_dir = r".\N_Glycosylation_Results\BCaP_MT10"

std_files = get_files(std_dir, exts=['.xlsx'])
mt10_files = get_files(mt10_dir, exts=['.xlsx'])


In [12]:
# Load every standard-protein result file and merge them into one PDProcessor.

# accumulator; stays None until the first file has been read
read_data = None

# filename token -> dilution-factor label
dilutions = {"1": "1x", "05": "2x", "025": "4x", "0125": "8x", "00625": "16x"}

for xlsx_path in std_files:
    # file stem: no directory, no extension
    stem = ntpath.splitext(ntpath.basename(xlsx_path))[0]

    # the text before the first "-" ends in <dilution>_<temperature>_<run>
    tokens = stem.split("-")[0].split("_")
    dil, temp, run = tokens[-3:]

    # the sample group is the name of the folder that holds the file
    group = ntpath.dirname(xlsx_path).split('\\')[-1]

    # PDProcessor reads the file; the sample name is the three tokens re-joined
    data = PDProcessor([xlsx_path], sample_name="_".join(tokens[-3:]))

    # attach the metadata parsed from the path as extra columns
    special_columns = (
        ("dilution", dilutions[dil]),
        ("temperature", temp),
        ("run", run),
        ("sample", group),
    )
    for column, value in special_columns:
        data.add_special_column(column, value)

    # first file seeds the accumulator; later files are joined onto it
    # (the join itself is handled by the PDProcessor class)
    if read_data is not None:
        read_data.join_processors(data)
    else:
        read_data = data





In [13]:
# Read the BCaP_MT10 result files and join them onto the existing `read_data`.
# NOTE(review): `read_data` is not reset in this cell, so re-running it on its
# own presumably joins the MT10 files a second time - confirm against
# PDProcessor.join_processors.
for xlsx_path in mt10_files:
    # file stem: no directory, no extension
    stem = ntpath.splitext(ntpath.basename(xlsx_path))[0]

    # here the trailing tokens are ordered <temperature>_<dilution>_<run>
    tokens = stem.split("-")[0].split("_")
    temp, dil, run = tokens[-3:]

    # the sample group is the name of the folder that holds the file
    group = ntpath.dirname(xlsx_path).split('\\')[-1]

    # PDProcessor reads the file; the sample name is the three tokens re-joined
    data = PDProcessor([xlsx_path], sample_name="_".join(tokens[-3:]))

    # NOTE(review): the dilution token is stored as-is here, whereas the
    # standards loop translates it through the `dilutions` map - confirm the
    # two sample sets end up with comparable dilution labels.
    special_columns = (
        ("dilution", dil),
        ("temperature", temp),
        ("run", run),
        ("sample", group),
    )
    for column, value in special_columns:
        data.add_special_column(column, value)

    # seed the accumulator if nothing was loaded before, otherwise join
    # (the join itself is handled by the PDProcessor class)
    if read_data is not None:
        read_data.join_processors(data)
    else:
        read_data = data

read_data.alias_engine("")

In [7]:
# Derive a float concentration and a dilution label for every peptide row.
# NOTE(review): the loading cells visible in this notebook add the columns
# dilution / temperature / run / sample but no `concentration` column, so this
# cell presumably relies on an earlier revision of the loaders or on a column
# supplied by PDProcessor - confirm it still runs after Restart & Run All.

# extract peptide contents from PDProcessor object
peptides = read_data.peptides

# map the concentration values to floats
# (keys are the filename tokens; anything not listed maps to NaN)
conc_map = {"1": 1.0, "05": 0.5, "025": 0.25, "0125": 0.125, "00625": 0.0625}
peptides.loc[:, 'new_concentration'] = peptides.concentration.map(conc_map)

# map concentration to dilution
# e.g. 0.25 -> '4x'. Note that `dilutions` is rebound here to a list; the
# standards loading cell defines a dict under the same name.
vals = list(conc_map.values())
dilutions = [f'{int(1/x)}x' for x in vals]
dil_map = dict(zip(vals, dilutions))
peptides.loc[:, 'dilution'] = peptides.new_concentration.map(dil_map)

In [5]:
# fix column names: discard the raw 'concentration' column and let the
# float-valued 'new_concentration' column take over its name
peptides.drop(columns='concentration', inplace=True)

# Rename by label instead of by position: the previous `cols[-2]` lookup only
# hit 'new_concentration' when it happened to be the second-to-last column,
# and would silently rename a different column otherwise (for example when
# 'dilution' already existed before 'new_concentration' was appended).
peptides.rename(columns={'new_concentration': 'concentration'}, inplace=True)

In [24]:
# define functions to match glycan composition to type
def categorize_glycan(glycan):
    """Classify a glycan composition string into a glycan type.

    Parameters
    ----------
    glycan : str
        Composition written as ``Name(count)Name(count)...``,
        e.g. ``"HexNAc(5)Hex(6)NeuAc(1)"``.

    Returns
    -------
    str
        One of ``"Sialylated"``, ``"Fucosylated"``, ``"Complex"``,
        ``"High Mannose"`` or ``"Paucimannose"``.

    Raises
    ------
    KeyError
        If the composition has neither NeuAc nor NeuGc and no HexNAc entry.
    ValueError
        If a ``Name(count)`` token cannot be split into a name and an integer.
    """
    # "HexNAc(5)Hex(6)" -> "HexNAc 5,Hex 6," -> ["HexNAc 5", "Hex 6"]
    tokens = glycan.replace(")", ",").replace("(", " ").split(",")[:-1]
    counts = {}
    for token in tokens:
        residue, number = token.split(" ")
        counts[residue] = int(number)

    # any sialic acid wins over every other rule
    if "NeuAc" in counts or "NeuGc" in counts:
        return "Sialylated"

    # every remaining rule depends on the HexNAc count
    hexnac = counts["HexNAc"]
    hexose = counts.get("Hex")  # None when the composition has no Hex entry

    if "Fuc" in counts:
        if hexnac > 2:
            return "Fucosylated"
        if hexnac == 2 and hexose is not None and hexose > 4:
            return "Complex"
        return "Paucimannose"

    if hexnac > 2:
        return "Complex"

    # two or fewer HexNAc: 5-9 hexoses is high mannose, anything else is not
    if hexose is not None and 4 < hexose <= 9:
        return "High Mannose"
    return "Paucimannose"

def determine_degree_sial(glycan):
    """Name the degree of sialylation of a glycan composition string.

    Parameters
    ----------
    glycan : str
        Composition written as ``Name(count)Name(count)...``,
        e.g. ``"HexNAc(5)Hex(6)NeuAc(1)"``.

    Returns
    -------
    int or str
        The integer ``0`` when the composition has neither NeuAc nor NeuGc;
        otherwise the label for the summed NeuAc + NeuGc count, from
        ``'Monosialylated'`` (1) up to ``'Pentasialylated'`` (5).

    Raises
    ------
    KeyError
        If a sialic acid is present but the summed count is not in 1..5.
    ValueError
        If a ``Name(count)`` token cannot be split into a name and an integer.
    """
    # "HexNAc(5)NeuAc(1)" -> "HexNAc 5,NeuAc 1," -> ["HexNAc 5", "NeuAc 1"]
    pieces = glycan.replace(')', ',').replace('(', ' ').split(',')[:-1]
    composition = {}
    for piece in pieces:
        residue, number = piece.split(' ')
        composition[residue] = int(number)

    if 'NeuAc' in composition or 'NeuGc' in composition:
        degree_map = {
            1: 'Monosialylated',
            2: 'Disialylated',
            3: 'Trisialylated',
            4: 'Tetrasialylated',
            5: 'Pentasialylated',
        }
        # both sialic acid species count towards the degree
        total = composition.get('NeuAc', 0) + composition.get('NeuGc', 0)
        return degree_map[total]

    # no sialic acid at all
    return 0

In [31]:
# Build the table of confidently identified glycopeptides.

# score cut-offs for a confident identification
MIN_BYONIC_SCORE = 200
MIN_DELTA_MOD_SCORE = 10

# extract peptide contents from PDProcessor object
peptides = read_data.peptides

# keep only rows that carry a glycan composition and pass both score cut-offs
has_glycan = peptides.glycan_composition.notna()
high_scoring = (
    (peptides.byonic_score >= MIN_BYONIC_SCORE)
    & (peptides.delta_mod_score >= MIN_DELTA_MOD_SCORE)
)

# .copy() makes the filtered result an independent frame, so the column
# assignments below do not raise SettingWithCopyWarning
peptides = peptides.loc[has_glycan & high_scoring, :].copy()

# add new column
peptides["glycan"] = peptides.glycan_composition

# map glycan type and sialylation info
peptides["glycan_type"] = peptides.glycan.map(categorize_glycan)
peptides["degree_sial"] = peptides.glycan.map(determine_degree_sial)

# unique glycopeptide id: "<sequence>_<glycan>"
# (vectorized concat; a row-wise apply breaks when the frame is empty)
peptides["pep_mods"] = peptides["sequence"] + "_" + peptides["glycan"]

# sort on the dilution label, then renumber the rows
# NOTE(review): 'dilution' holds strings such as '16x', so this ordering is
# lexicographic rather than numeric - confirm that is acceptable downstream.
peptides = (
    peptides
    .sort_values('dilution', ascending=False)
    .reset_index(drop=True)
)

peptides.shape

(10889, 47)

In [32]:
# Reduce the dataset to glycopeptides that occur at least twice within the
# same temperature / dilution group (intended as "found in 2 of 3 runs").
# NOTE(review): the count is over rows, not over distinct runs, and the groups
# are not split by `sample` - confirm that a peptide cannot appear twice in a
# single run and that pooling samples is intended.

# Collect the surviving rows of each group and concatenate once at the end;
# growing a DataFrame with pd.concat inside the loop re-copies every
# previously kept row on each iteration.
kept_groups = []

# for each temperature
for temp_label, temperature in iterate_contents('temperature', peptides, get_item=True):
    # for each dilution within that temperature
    for dil_label, concentration in iterate_contents('dilution', temperature, get_item=True):
        # count occurrences of each glycopeptide id in this group
        counts = concentration.pep_mods.value_counts()

        # keep only the ones with occurrence >= 2
        repeated_ids = counts[counts >= 2].index
        kept_groups.append(concentration[concentration.pep_mods.isin(repeated_ids)])

# pd.concat raises on an empty list, so fall back to an empty frame
if kept_groups:
    final_results = pd.concat(kept_groups, ignore_index=True)
else:
    final_results = pd.DataFrame()

print(f'Original dataframe has been reduced from {len(peptides)} rows to {len(final_results)} rows')

Original dataframe has been reduced from 10889 rows to 9715 rows


In [33]:
# Make sure the output folder exists (no error if it is already there).
export_loc = pathlib.PurePath('.') / 'Working_Datafiles'
pathlib.Path(export_loc).mkdir(parents=True, exist_ok=True)

# Write the filtered glycopeptide table as the working CSV, without the index.
export_filename = export_loc / 'All_N_Glycopeptides.csv'
final_results.to_csv(str(export_filename), index=False)

In [10]:
# Spot-check one glycopeptide: every row for QNGTLSK carrying
# HexNAc(5)Hex(6)NeuAc(1) at concentration 1.0.
is_target = (
    peptides.concentration.eq(1.0)
    & peptides.glycan_type.eq('Sialylated')
    & peptides.degree_sial.eq('Monosialylated')
    & peptides.sequence.eq('QNGTLSK')
    & peptides.glycan.eq('HexNAc(5)Hex(6)NeuAc(1)')
)
peptides.loc[is_target, :]

Unnamed: 0,accession,description,checked,confidence,annotated_sequence,modifications,master_protein_accessions,rt_min,mz_da,charge,...,sequence,data_source,temperature,run,concentration,dilution,glycan,glycan_type,degree_sial,pep_mods
30,Q3SZR3,Alpha-1-acid glycoprotein OS=Bos taurus OX=991...,True,High,[R].QNGTLSK.[V],1xHexNAc(5)Hex(6)NeuAc(1) [N2],Q3SZR3,26.5127,1513.61,2,...,QNGTLSK,1_45C_Run1,45C,Run1,1.0,1x,HexNAc(5)Hex(6)NeuAc(1),Sialylated,Monosialylated,QNGTLSK_1xHexNAc(5)Hex(6)NeuAc(1) [N2]
124,Q3SZR3,Alpha-1-acid glycoprotein OS=Bos taurus OX=991...,True,High,[R].QNGTLSK.[V],1xHexNAc(5)Hex(6)NeuAc(1) [N2],Q3SZR3,26.2705,1513.61,2,...,QNGTLSK,1_45C_Run2,45C,Run2,1.0,1x,HexNAc(5)Hex(6)NeuAc(1),Sialylated,Monosialylated,QNGTLSK_1xHexNAc(5)Hex(6)NeuAc(1) [N2]
174,Q3SZR3,Alpha-1-acid glycoprotein OS=Bos taurus OX=991...,True,High,[R].QNGTLSK.[V],1xHexNAc(5)Hex(6)NeuAc(1) [N2],Q3SZR3,24.5519,1513.61,2,...,QNGTLSK,1_30C_Run3,30C,Run3,1.0,1x,HexNAc(5)Hex(6)NeuAc(1),Sialylated,Monosialylated,QNGTLSK_1xHexNAc(5)Hex(6)NeuAc(1) [N2]
234,Q3SZR3,Alpha-1-acid glycoprotein OS=Bos taurus OX=991...,True,High,[R].QNGTLSK.[V],1xHexNAc(5)Hex(6)NeuAc(1) [N2],Q3SZR3,24.9925,1513.61,2,...,QNGTLSK,1_30C_Run1,30C,Run1,1.0,1x,HexNAc(5)Hex(6)NeuAc(1),Sialylated,Monosialylated,QNGTLSK_1xHexNAc(5)Hex(6)NeuAc(1) [N2]
533,Q3SZR3,Alpha-1-acid glycoprotein OS=Bos taurus OX=991...,True,High,[R].QNGTLSK.[V],1xHexNAc(5)Hex(6)NeuAc(1) [N2],Q3SZR3,26.1591,1513.61,2,...,QNGTLSK,1_45C_Run3,45C,Run3,1.0,1x,HexNAc(5)Hex(6)NeuAc(1),Sialylated,Monosialylated,QNGTLSK_1xHexNAc(5)Hex(6)NeuAc(1) [N2]
