In [None]:
import numpy as np
import pandas as pd 
import altair as alt
import copy
import re 
import ntpath 
from commons import data_processing
from commons.DataProcessors import msfragger
from commons.APICallers import uniprot

# Turn off Altair's row-count limit so the full PSM tables can be passed to charts.
alt.data_transformers.disable_max_rows()

In [None]:
# Collect every 'psm.tsv' file found under the fractions directory.
files = data_processing.get_files(r'./data/fractions/', exts=['psm.tsv'])

# Build one msfragger processor per file, tag it with the column/fraction
# read from its path, and merge everything into a single processor.
msf = None
for file in files:
    m = msfragger.msf_processor([file])

    # The two innermost directory names are read as <column>/<fraction>.
    # Splitting on both separators handles Windows and POSIX style paths.
    dirname = ntpath.dirname(file)
    spl = re.split(r'[\\\/]', dirname)
    column, fraction = spl[-2:]
    m.add_special_column('fraction', fraction)
    m.add_special_column('column', column)

    if msf is None:
        msf = m
    else:
        # presumably merges `m` into `msf` in place (the return value is unused) — confirm
        msf.join_processors(m)

# Fail with a clear message rather than an AttributeError on None below.
if msf is None:
    raise FileNotFoundError("No 'psm.tsv' files found under ./data/fractions/")

# grab data
df = msf.data
msf

In [None]:
# Rows without a modified sequence fall back to the plain peptide sequence.
# A boolean mask is used instead of index labels: if the joined frame carries
# repeated index labels, label-based selection would also hit rows that
# already have a modified_peptide value.
no_mods = df["modified_peptide"].isna()
df.loc[no_mods, "modified_peptide"] = df.loc[no_mods, "peptide"]
df.head(2)

In [None]:
# Average the numeric PSM measurements of each peptide within every
# column/fraction. numeric_only=True keeps string columns such as protein_id
# out of the mean; pandas >= 2.0 raises a TypeError for them otherwise.
avg = df.groupby(['column', 'fraction', 'modified_peptide', 'peptide']).mean(numeric_only=True)
# Retention divided by 60 for plotting in minutes (assumes seconds — TODO confirm).
avg.loc[:, 'retention_min'] = avg.retention / 60

In [None]:
# Fixed colour for each value of the `column` field, shared by every chart.
_column_palette = {"PGC": "#D56E3B", "RPLC": "#58728C"}
column_colors = alt.Color(
    'column:N',
    scale=alt.Scale(
        domain=list(_column_palette),
        range=list(_column_palette.values()),
    ),
)

# Common axis styling: bold 14pt horizontal labels that are not flush-aligned.
axis_params = alt.Axis(
    labelAngle=0,
    labelFlush=False,
    labelFontSize=14,
    labelFontWeight=600,
)

In [None]:
# Not referenced by the chart below; kept because it is a notebook-level name.
overlap = 1

# Density of retention_min, estimated separately for each fraction/column pair.
ridge = (
    alt.Chart(avg.reset_index(), height=40)
    .mark_area(opacity=1, stroke='lightgray', strokeWidth=0.5)
    .transform_density(
        'retention_min',
        groupby=['fraction', 'column',],
        as_=['retention_min', 'density'],
    )
    .encode(
        x=alt.X('retention_min:Q', title='Retention Time (min)',
                axis=alt.Axis(grid=False)),
        # The y range runs from 40 to -5 so peaks may extend slightly past the row height.
        y=alt.Y('density:Q', axis=None,
                scale=alt.Scale(range=[40, -5])),
        color=column_colors,
    )
)

# One row per fraction, one panel per column, with no spacing or view border.
ridge_grid = ridge.facet(
    row=alt.Row('fraction:N', title=None, header=alt.Header(labelAngle=0)),
    column=alt.Column('column:N', title=None),
)
ridge_grid.configure_facet(spacing=0).configure_view(stroke=None).properties(bounds='flush')

In [None]:
def _peptide_count_bars(data, y_title):
    """Bar chart of distinct modified peptides per column, one panel per fraction.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame providing the `column`, `fraction` and `modified_peptide` fields.
    y_title : str
        Title shown on the y axis.

    Returns
    -------
    alt.Chart
        Bar chart coloured with the shared `column_colors` encoding and
        split into one panel per `fraction`.
    """
    return alt.Chart(data).mark_bar().encode(
        x=alt.X('column:N', title='',
            axis=alt.Axis(labels=False, ticks=False)),
        y=alt.Y('distinct(modified_peptide)', title=y_title),
        color=column_colors,
        column=alt.Column('fraction:N')
    ).properties(
        height=250,
        width=25
    )


d = avg.reset_index()

# Every peptide row in the grouped averages.
total = _peptide_count_bars(d, 'Total Peptides')

# First row kept for each (modified_peptide, fraction) pair.
# Built but not part of the layout displayed below.
unique_frac = _peptide_count_bars(
    d.drop_duplicates(['modified_peptide', 'fraction']),
    'Unique Peptides (fraction)',
)

# keep=False drops every modified_peptide that occurs in more than one row.
unique = _peptide_count_bars(
    d.drop_duplicates(['modified_peptide'], keep=False),
    'Unique Peptides',
)

total | unique

In [None]:
# Mean of the numeric PSM columns per protein within each column/fraction.
# numeric_only=True keeps the remaining string columns (peptide sequences)
# out of the mean; pandas >= 2.0 raises a TypeError for them otherwise.
prot1 = df.groupby(['column', 'fraction', 'protein_id'], as_index=False).mean(numeric_only=True)
# Proteins that occur in exactly one row of prot1 (not used by the plot below).
prot2 = prot1.drop_duplicates('protein_id', keep=False)


# Number of distinct proteins per fraction, one series per column.
# The commas after the title arguments were missing, which made this cell
# a SyntaxError.
base = alt.Chart(prot1).encode(
    x=alt.X('fraction:N', title='Fraction',
        axis=axis_params),
    y=alt.Y('distinct(protein_id):Q', title='Unique Proteins',
        axis=axis_params),
    color=column_colors,
).properties(
    height=150,
    width=400
)

line = base.mark_line(
    opacity=0.6,
    strokeWidth=3,
    strokeDash=[5, 3],
)

dots = base.mark_circle(size=75)

line + dots

In [None]:
from modlamp.descriptors import PeptideDescriptor

def pour(seq):
    """Return the global 'gravy' descriptor value modlAMP computes for `seq`.

    Parameters
    ----------
    seq : str
        Peptide sequence handed to `PeptideDescriptor`.

    Returns
    -------
    The first element of the first row of the computed `descriptor` array.
    """
    desc = PeptideDescriptor(seq, 'gravy')
    desc.calculate_global()
    return desc.descriptor[0][0]

df.loc[:, 'gravy'] = df.peptide.map(pour)
# Re-average per column and peptide, this time without the fraction level.
# numeric_only=True keeps string columns out of the mean; pandas >= 2.0
# raises a TypeError for them otherwise.
avg = df.groupby(['column', 'modified_peptide', 'peptide']).mean(numeric_only=True)
avg.loc[:, 'retention_min'] = avg.retention / 60

In [None]:
# Flatten the grouped averages so the index levels become plottable fields.
d = avg.reset_index()

# Mean gravy within retention_min bins of width 5, coloured per column.
rt_bins = alt.Bin(step=5)
base = alt.Chart(d).encode(
    x=alt.X('retention_min:Q', bin=rt_bins),
    y=alt.Y('mean(gravy):Q'),
    color=column_colors,
)

# Smoothed confidence-interval band, and the smoothed mean line on top of it.
band = base.mark_errorband(extent='ci', interpolate='basis')
line = base.mark_line(interpolate='basis')

line + band

In [None]:
# Keep only modified peptides that occur in a single row of the grouped averages.
d = avg.reset_index().drop_duplicates('modified_peptide', keep=False)

# NOTE: with default notebook settings only the last expression of a cell is
# rendered, so this boxplot is built but not shown.
alt.Chart(d).mark_boxplot().encode(x='column:N', y='peptide_length:Q')

# Same styling as the shared axis, but with labels, ticks and grid hidden.
x_axis = axis_params.copy()
for hidden_prop in ("labels", "ticks", "grid"):
    x_axis[hidden_prop] = False

# Strip plot of peptide length: each point gets a random horizontal offset.
jitter_points = (
    alt.Chart(d, width=100)
    .mark_circle(size=8, opacity=0.2)
    .encode(
        x=alt.X('jitter:Q', title='', axis=x_axis),
        y=alt.Y('peptide_length:Q', title='Peptide Length', axis=axis_params),
        column=alt.Column('column:N'),
        color=column_colors,
    )
    .transform_calculate(
        # Generate Gaussian jitter with a Box-Muller transform
        jitter='sqrt(-2*log(random()))*cos(2*PI*random())'
    )
)
jitter_points.configure_facet(spacing=0).configure_view(stroke=None)

In [None]:
import seaborn as sns
# Box plot of peptide length, one panel per column, reusing the frame `d`
# prepared in the previous cell.
alt.Chart(d, width=100).mark_boxplot(opacity=0.5, size=50).encode(
    # Axis title typo ('Peptie Length') corrected to match the jitter plot.
    y=alt.Y('peptide_length:Q', title='Peptide Length',
        axis=axis_params),
    column=alt.Column('column:N'),
    color=column_colors
).configure_facet(
    spacing=0
).configure_view(
    stroke=None
)

In [None]:
from importlib import reload
# Re-import the helper module so local edits to it are used without a kernel restart.
reload(uniprot)

accs = df.protein_id.unique()
print(len(accs))

# Query UniProt in batches and merge the per-batch results into one mapping.
# The loop stops at len(accs): the previous upper bound of len(accs)+1000
# sent extra requests with empty accession batches.
batch_size = 500
loc_map = {}
for start in range(0, len(accs), batch_size):
    sent = accs[start:start + batch_size]
    call = uniprot.send_accessions(sent)
    resp = uniprot.parse_response(call, wanted_value='sc_location')
    loc_map.update(resp)

# Proteins absent from loc_map end up as NaN in sub_location.
df.loc[:, 'sub_location'] = df.protein_id.map(loc_map)


In [None]:
# Re-applies the protein_id -> loc_map lookup (same statement as the end of the
# previous cell), which lets the column be rebuilt without re-querying UniProt.
df.loc[:, 'sub_location'] = df.protein_id.map(loc_map)

In [None]:
# For rows that have a location, keep only the text before the first comma.
has_location = ~df.sub_location.isna()
df.loc[has_location, 'sub_location'] = (
    df.loc[has_location, 'sub_location'].map(lambda loc: loc.split(',')[0])
)

In [None]:
# Rows whose modified_peptide occurs exactly once in df.
single_peptides = df.drop_duplicates('modified_peptide', keep=False)

# One row per (column, sub_location, protein_id); the first three columns are the keys.
per_protein = single_peptides.groupby(
    ['column', 'sub_location', 'protein_id'], as_index=False
).count()

# Counting again turns protein_id into the number of proteins per column/location.
per_location = per_protein.iloc[:, :3].groupby(
    ['column', 'sub_location'], as_index=False
).count()

# Only column/location pairs with at least 100 proteins are plotted.
reduced = per_location[per_location.protein_id >= 100]

# One pie per sub_location, sliced by column, each with its own theta scale.
location_pies = alt.Chart(reduced).mark_arc().encode(
    theta=alt.Theta('protein_id:Q'),
    color=column_colors,
    facet=alt.Facet('sub_location:N', columns=5),
)
location_pies.resolve_scale(theta='independent').properties(width=75, height=75)


In [None]:
# Rows whose (sub_location, protein_id) pair occurs exactly once in df.
single_rows = df.drop_duplicates(['sub_location', 'protein_id'], keep=False)

# One row per (column, sub_location, protein_id), then count those rows per
# column/location so protein_id holds the number of proteins.
per_protein = single_rows.groupby(
    ['column', 'sub_location', 'protein_id'], as_index=False
).count()
reduced = per_protein.groupby(['column', 'sub_location'], as_index=False).count()

# Keep locations whose protein count summed over both columns is at least 10.
location_totals = reduced.groupby('sub_location', as_index=False).sum()
valid = location_totals[location_totals.protein_id >= 10]

reduced = reduced[reduced.sub_location.isin(valid.sub_location)]

# One pie per sub_location, sliced by column, each with its own theta scale.
location_pies = alt.Chart(reduced).mark_arc().encode(
    theta=alt.Theta('protein_id:Q'),
    color=column_colors,
    facet=alt.Facet('sub_location:N', columns=5),
)
location_pies.resolve_scale(theta='independent').properties(width=75, height=75)

In [None]:
# List the columns currently present on the PSM frame.
df.columns