In [1]:
import pandas as pd 
import pathlib
import ntpath
from Commons.data_processing import *
from Commons.DataProcessors.pd_processor import PDProcessor

In [2]:
# ask get_files for the .xlsx result files in the results folder
results_dir = r".\N_Glycosylation_Results"
files = get_files(results_dir, exts=['.xlsx'])

In [3]:
# read each data file and merge everything into a single PDProcessor

# accumulator: the first PDProcessor read becomes the main object,
# every later one is joined into it
read_data = None

for file in files:
    # file name without directory or extension
    # (ntpath understands both "\" and "/" separators)
    base_name, _ = ntpath.splitext(ntpath.basename(file))

    # the text before the first "-" must end in
    # <concentration>_<temperature>_<run>
    div = base_name.split("-")[0].split("_")
    if len(div) < 3:
        raise ValueError(
            "Cannot extract concentration, temperature and run "
            f"from file name {base_name!r}"
        )
    conc, temp, run = div[-3:]

    # use PDProcessor class to read data
    data = PDProcessor([file], sample_name="_".join(div[-3:]))

    # create special columns with our extracted info
    data.add_special_column("concentration", conc)
    data.add_special_column("temperature", temp)
    data.add_special_column("run", run)

    # concatenate data (joining is handled by the PDProcessor class)
    if read_data is None:
        read_data = data
    else:
        read_data.join_processors(data)

# fail with an explicit message instead of an AttributeError on None
# when the file list was empty
if read_data is None:
    raise FileNotFoundError(
        "No data files were found, so there is nothing to process"
    )

read_data.alias_engine("")

In [4]:
# pull the peptide table out of the PDProcessor object
peptides = read_data.peptides

# concentration labels -> numeric concentration values
conc_map = {"1": 1.0, "05": 0.5, "025": 0.25, "0125": 0.125, "00625": 0.0625}
numeric_concentration = peptides.concentration.map(conc_map)
peptides.loc[:, 'new_concentration'] = numeric_concentration

# numeric concentration -> dilution label, e.g. 0.25 -> "4x"
dil_map = {value: f'{int(1/value)}x' for value in conc_map.values()}
dilution_labels = peptides.new_concentration.map(dil_map)
peptides.loc[:, 'dilution'] = dilution_labels

In [5]:
# fix column names: the numeric 'new_concentration' column takes over the
# name of the original string-valued 'concentration' column.
# The rename is done by label rather than by column position, so it stays
# correct whatever the column order is, and the guard makes the cell a no-op
# when it is run a second time (otherwise the already-renamed numeric column
# would be dropped).
if 'new_concentration' in peptides.columns:
    # in-place on purpose: the same DataFrame object keeps being used
    peptides.drop(columns='concentration', inplace=True)
    peptides.rename(columns={'new_concentration': 'concentration'}, inplace=True)

In [6]:
# define functions to match glycan composition to type
def categorize_glycan(glycan):
    """Classify a glycan composition string into a broad glycan type.

    Parameters
    ----------
    glycan : str
        Composition written as consecutive ``Name(count)`` tokens,
        e.g. ``"HexNAc(2)Hex(7)"``.

    Returns
    -------
    str
        One of ``"Sialylated"``, ``"Fucosylated"``, ``"Complex"``,
        ``"High Mannose"`` or ``"Paucimannose"``. Every input that parses
        gets a label; the function never returns ``None``.

    Raises
    ------
    KeyError
        If the composition is not sialylated and has no ``HexNAc`` entry.
    ValueError
        If a token cannot be split into a name and an integer count.
    """
    # "HexNAc(2)Hex(7)" -> "HexNAc 2,Hex 7," -> ["HexNAc 2", "Hex 7"]
    tokens = glycan.replace(")", ",").replace("(", " ").split(",")[:-1]
    d = {k: int(v) for k, v in (token.split(" ") for token in tokens)}

    # any sialic acid wins over every other criterion
    if "NeuAc" in d or "NeuGc" in d:
        return "Sialylated"

    hexnac = d["HexNAc"]
    hexose = d.get("Hex", 0)  # an absent Hex behaves like a count of zero

    if "Fuc" in d:
        if hexnac > 2:
            return "Fucosylated"
        if hexnac == 2 and hexose > 4:
            return "Complex"
        # Remaining fucosylated compositions (HexNAc == 2 with Hex <= 4, or
        # HexNAc < 2) previously fell through and returned None; they get
        # the same default label as the non-fucosylated branch.
        return "Paucimannose"

    if hexnac > 2:
        return "Complex"
    if 4 < hexose <= 9:
        return "High Mannose"
    return "Paucimannose"

def determine_degree_sial(glycan):
    """Report how many sialic acid residues a glycan composition carries.

    ``glycan`` is a composition string of consecutive ``Name(count)``
    tokens, e.g. ``"HexNAc(4)Hex(5)NeuAc(2)"``. The ``NeuAc`` and ``NeuGc``
    counts are added together and translated into a label such as
    ``'Disialylated'``. When neither residue is listed the integer ``0``
    is returned. A combined count outside 1-5 raises ``KeyError``.
    """
    labels = {
        1: 'Monosialylated',
        2: 'Disialylated',
        3: 'Trisialylated',
        4: 'Tetrasialylated',
        5: 'Pentasialylated',
    }

    # "A(1)B(2)" -> "A 1,B 2," -> ["A 1", "B 2"] -> {"A": 1, "B": 2}
    pieces = glycan.replace(')', ',').replace('(', ' ').split(',')[:-1]
    counts = {}
    for piece in pieces:
        name, number = piece.split(' ')
        counts[name] = int(number)

    if 'NeuAc' in counts or 'NeuGc' in counts:
        n_sialic = counts.get('NeuAc', 0) + counts.get('NeuGc', 0)
        return labels[n_sialic]

    return 0

In [7]:
# minimum scores an identification must reach to be kept
MIN_BYONIC_SCORE = 300
MIN_DELTA_MOD_SCORE = 10

# keep only rows that contain a glycan and pass both score thresholds
has_glycan = peptides.glycan_composition.notna()
is_high_scoring = (
    (peptides.byonic_score >= MIN_BYONIC_SCORE) &
    (peptides.delta_mod_score >= MIN_DELTA_MOD_SCORE)
)

# .copy() detaches the filtered rows from the source frame, so the column
# assignments below write to an independent DataFrame instead of a slice
# (no SettingWithCopyWarning, no risk of writing to a temporary)
peptides = peptides.loc[has_glycan & is_high_scoring].copy()

# add new column
peptides['glycan'] = peptides.glycan_composition

# map glycan type and sialylation info
peptides['glycan_type'] = peptides.glycan.map(categorize_glycan)
peptides['degree_sial'] = peptides.glycan.map(determine_degree_sial)

# unique glycopeptide id: "<sequence>_<modifications>"
# (vectorised string concatenation; also works when no rows are left)
peptides['pep_mods'] = peptides['sequence'] + "_" + peptides['modifications']

# sort according to concentration (highest first) and renumber rows from 0
peptides = (
    peptides
    .sort_values('concentration', ascending=False)
    .reset_index(drop=True)
)

peptides.head(3)


Unnamed: 0,accession,description,checked,confidence,annotated_sequence,modifications,master_protein_accessions,num_psms,mz_da,charge,...,sequence,data_source,temperature,run,concentration,dilution,glycan,glycan_type,degree_sial,pep_mods
0,P61823,Ribonuclease pancreatic OS=Bos taurus OX=9913 ...,True,High,[R].NLTK.[D],1xHexNAc(2)Hex(4) [N1],P61823,7,765.334,2,...,NLTK,1_60C_Run3,60C,Run3,1.0,1x,HexNAc(2)Hex(4),Paucimannose,0,NLTK_1xHexNAc(2)Hex(4) [N1]
1,P61823,Ribonuclease pancreatic OS=Bos taurus OX=9913 ...,True,High,[R].NLTK.[D],1xHexNAc(4)Hex(5)Fuc(1) [N1],P61823,4,1122.47,2,...,NLTK,1_30C_Run3,30C,Run3,1.0,1x,HexNAc(4)Hex(5)Fuc(1),Fucosylated,0,NLTK_1xHexNAc(4)Hex(5)Fuc(1) [N1]
2,P61823,Ribonuclease pancreatic OS=Bos taurus OX=9913 ...,True,High,[R].NLTKDR.[C],1xHexNAc(2)Hex(7) [N1],P61823,3,1143.98,2,...,NLTKDR,1_30C_Run3,30C,Run3,1.0,1x,HexNAc(2)Hex(7),High Mannose,0,NLTKDR_1xHexNAc(2)Hex(7) [N1]


In [8]:
# further reduce the dataset to glycopeptides that occur at least
# MIN_OCCURRENCES times within one temperature/concentration condition
# (the stated goal is "found in 2/3 of runs")
# NOTE(review): rows are counted, not distinct runs. If the same pep_mods can
# appear more than once within a single run, counting unique 'run' values per
# pep_mods may be what is intended -- TODO confirm.
MIN_OCCURRENCES = 2

# collect the surviving rows per condition and concatenate once at the end
# (growing a DataFrame with pd.concat inside the loop copies all previously
# collected rows on every iteration)
kept_frames = []

# for each temperature
for t, temperature_group in iterate_contents('temperature', peptides, get_item=True):
    # for each concentration
    for c, concentration_group in iterate_contents('concentration', temperature_group, get_item=True):
        # count occurrences of each glycopeptide id
        counts = concentration_group.pep_mods.value_counts()

        # keep only the ones with occurrence >= MIN_OCCURRENCES
        valid_number = counts[counts >= MIN_OCCURRENCES].index
        valid_ids = concentration_group[concentration_group.pep_mods.isin(valid_number)]

        kept_frames.append(valid_ids)

if kept_frames:
    final_results = pd.concat(kept_frames).reset_index(drop=True)
else:
    # nothing to iterate over: keep the column layout, with zero rows
    final_results = peptides.iloc[0:0].copy()

print(f'Original dataframe has been reduced from {len(peptides)} rows to {len(final_results)} rows')

Original dataframe has been reduced from 2470 rows to 2253 rows


In [9]:
# make sure the output folder exists
export_loc = pathlib.Path('.', 'Working_Datafiles')
export_loc.mkdir(parents=True, exist_ok=True)

# write the final dataframe into that folder as the working document
export_filename = export_loc / 'All_N_Glycopeptides.csv'
final_results.to_csv(str(export_filename), index=False)