In [1]:
import numpy as np 
import pandas as pd
import altair as alt 
import re 
import os 
import ntpath
import sys
sys.path.append(r'../.')
from commons.data_processing import *
from modlamp.descriptors import PeptideDescriptor

In [2]:
# Gather every PEAKS '-peptides.csv' export under the PANC1 folder, then keep
# only the paths that mention the 45C runs.
# 45C chosen because they provided best overall performance in peptide
# and protein identification
# NOTE(review): the filter is a plain substring test on the whole path, so a
# '45' anywhere in a parent folder name would also match -- confirm that is
# acceptable for this directory layout.
files = get_files(
    r'E:\Research\PEAKS\20200316_PANC1_Glyco_Columns\PANC1',
    exts=['-peptides.csv'],
)
files = [path for path in files if '45' in path]

In [3]:
# display the filtered paths to confirm which runs were selected
files

['E:\\Research\\PEAKS\\20200316_PANC1_Glyco_Columns\\PANC1\\BP_45\\protein-peptides.csv',
 'E:\\Research\\PEAKS\\20200316_PANC1_Glyco_Columns\\PANC1\\C18_45\\protein-peptides.csv',
 'E:\\Research\\PEAKS\\20200316_PANC1_Glyco_Columns\\PANC1\\PGC_45\\protein-peptides.csv']

In [4]:
# Read each selected export into its own frame and tag every row with the run
# it came from (the name of the folder holding the file, e.g. 'BP_45').
frames = []
for file in files:
    sub = pd.read_csv(file)
    # ntpath accepts both '\\' and '/' separators, so the folder name is
    # recovered without hand-splitting on a hard-coded backslash
    source = ntpath.basename(ntpath.dirname(file))
    # a scalar is broadcast to every row; no need to build a list of copies
    sub['source'] = source
    frames.append(sub)

# Concatenate once instead of re-copying the growing frame on every pass.
# ignore_index=True gives the combined frame unique row labels (each CSV
# otherwise restarts at 0). pd.concat raises on an empty list, so fall back
# to an empty frame when no file matched.
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

In [5]:
# define function to clean peptide sequence
def clean(s):
    """Strip flanking residues and mass-shift tags from a peptide string.

    Three kinds of text are removed, in this order:

    1. a leading ``X.`` (any single character followed by a dot),
    2. every parenthesised signed tag that contains a dot, e.g. ``(+57.02)``,
       ``(-17.03)`` or ``(+.98)``,
    3. a trailing ``.X`` (a dot followed by any single character).

    Parameters
    ----------
    s : str
        Peptide string, e.g. ``'K.AC(+57.02)DEM(+15.99)K.R'``.

    Returns
    -------
    str
        The string with those parts removed, e.g. ``'ACDEMK'``.

    Raises
    ------
    AssertionError
        If ``s`` is not a ``str`` (for example a float NaN standing in for a
        missing value).
    """
    err_doc = f'Not all data entered as string: error in {s} type is {type(s)}'
    assert isinstance(s, str), err_doc
    regs = [
        r'^.\.',
        # [^()]* keeps each match inside ONE pair of parentheses. A greedy
        # '.+' here would run from the first '(' to the last ')' and delete
        # the residues sitting between two tags.
        r'\([+-][^()]*\.[^()]*\)',
        r'\..$',
    ]
    for reg in regs:
        s = re.sub(reg, '', s)
    return s

# define func to calculate GRAVY
def pour(s):
    """Return the 'gravy' descriptor value computed for sequence ``s``.

    A ``PeptideDescriptor`` is built for ``s`` with the ``'gravy'`` scale,
    ``calculate_global()`` is run on it, and element ``[0][0]`` of its
    ``descriptor`` attribute is returned.

    Parameters
    ----------
    s : str
        Peptide sequence to score.

    Returns
    -------
    The value stored at ``descriptor[0][0]`` after the calculation.

    Raises
    ------
    AssertionError
        If ``s`` is not a ``str``.
    """
    assert isinstance(s, str), (
        f'Not all data entered as string: error in {s} type is {type(s)}'
    )
    gravy = PeptideDescriptor(s, 'gravy')
    gravy.calculate_global()
    first_row = gravy.descriptor[0]
    return first_row[0]

In [6]:
# replace each raw peptide string with its cleaned sequence
df.loc[:, 'Peptide'] = df['Peptide'].map(clean)

# score every cleaned sequence and store the result in a new GRAVY column
df['GRAVY'] = df['Peptide'].map(pour)

# floor each RT value down to the start of its 10-unit bin (0, 10, 20, ...)
df['time_bin'] = df['RT'].floordiv(10).mul(10)

In [7]:
# add second data point to make things look nice
# Every row is emitted twice: once at its bin start and once 4 units later,
# carrying the same source and GRAVY value both times.
sdf = df[['source', 'time_bin', 'GRAVY']]
source, times, scores = [], [], []
for run, bin_start, gravy in sdf.itertuples(index=False):
    source += [run, run]
    times += [bin_start, bin_start + 4]
    scores += [gravy, gravy]

df2 = pd.DataFrame({'source': source, 'time_bin': times, 'GRAVY': scores})

In [10]:
# Colour scale: one entry per run, in order of first appearance in df2.
domain = df2.source.unique().tolist()
# alternative colour order: ['#43648c', '#86858f', '#cc7833']
_range = ['#86858f', '#43648c', '#cc7833']

# Line layer: average GRAVY at each time bin, one line per run.
chart = (
    alt.Chart(df2, title='GRAVY vs. Retention Time')
    .mark_line()
    .encode(
        x=alt.X('time_bin:Q', title='Time (min)', axis=alt.Axis(tickCount=8)),
        y=alt.Y('average(GRAVY):Q', title='GRAVY', axis=alt.Axis(tickCount=5)),
        color=alt.Color('source:O', scale=alt.Scale(domain=domain, range=_range)),
    )
    .properties(height=500, width=700)
)

# Band layer: error band (extent='ci') of GRAVY around each line.
band = (
    alt.Chart(df2)
    .mark_errorband(extent='ci')
    .encode(
        x=alt.X('time_bin:Q', title='Time (min)'),
        y=alt.Y('GRAVY:Q', title='GRAVY'),
        color='source:O',
    )
)

# Draw the band underneath the line, enlarge the fonts and write the figure
# to disk.
layered = band + chart
layered = layered.configure_axis(labelFontSize=16, titleFontSize=16)
layered = layered.configure_title(fontSize=18)
layered.save('GRAVY_over_time.svg')