In [1]:
import numpy as np
import pandas as pd 
import altair as alt
import re 
import ntpath 
from commons import data_processing
from commons.DataProcessors import msfragger
from commons.APICallers import uniprot

# Turn off Altair's row-count limit on chart datasets so the full PSM tables
# used below can be passed to alt.Chart directly.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [2]:
files = data_processing.get_files(r'./data/fractions/', exts=['psm.tsv'])

# Build one msfragger processor per psm.tsv file and fold them all into a
# single processor (`msf`), tagging each one with where it came from on disk.
msf = None
for file in files:
    m = msfragger.msf_processor([file])

    # The two innermost directory names of the path are <column>/<fraction>
    # (e.g. .../PGC/F1/psm.tsv); split on either slash style.
    column, fraction = re.split(r'[\\\/]', ntpath.dirname(file))[-2:]
    m.add_special_column('fraction', fraction)
    m.add_special_column('column', column)

    # The first processor becomes the accumulator; later ones are joined into it.
    if msf is not None:
        msf.join_processors(m)
    else:
        msf = m

# Combined table exposed by the processor; `msf` is left as the last
# expression so the notebook shows which files it was built from.
df = msf.data
msf

MSF Processor constructed on the following data:
./data/fractions/PGC\F1\psm.tsv
./data/fractions/PGC\F2\psm.tsv
./data/fractions/PGC\F3\psm.tsv
./data/fractions/PGC\F4\psm.tsv
./data/fractions/PGC\F5\psm.tsv
./data/fractions/PGC\F6\psm.tsv
./data/fractions/PGC\F7\psm.tsv
./data/fractions/PGC\F8\psm.tsv
./data/fractions/RPLC\F1\psm.tsv
./data/fractions/RPLC\F2\psm.tsv
./data/fractions/RPLC\F3\psm.tsv
./data/fractions/RPLC\F4\psm.tsv
./data/fractions/RPLC\F5\psm.tsv
./data/fractions/RPLC\F6\psm.tsv
./data/fractions/RPLC\F7\psm.tsv
./data/fractions/RPLC\F8\psm.tsv

In [3]:
# Rows with a missing `modified_peptide` (presumably peptides carrying no
# modification) fall back to the plain `peptide` sequence, so the column is
# always populated when it is later used as a grouping key.
#
# A boolean mask is used instead of index labels: if the joined table carries
# repeated index labels, label-based `.loc` would also select (and overwrite)
# rows that do have a modification. `.to_numpy()` makes the assignment purely
# positional so no index alignment is attempted on the right-hand side.
no_mods = df["modified_peptide"].isna()
df.loc[no_mods, "modified_peptide"] = df.loc[no_mods, "peptide"].to_numpy()
df.head(2)

Unnamed: 0,spectrum,spectrum_file,peptide,modified_peptide,prev_aa,next_aa,peptide_length,charge,retention,observed_mass,...,is_unique,protein,protein_id,entry_name,gene,protein_description,mapped_genes,mapped_proteins,fraction,column
0,20221109_GD_PGC_F1_run1.02562.02562.2,D:\MSFragger\20221109\MSFragger_PGC_Frations\F...,ATGAATPKK,ATGAATPKK,K,S,9,2,758.9805,843.4767,...,True,sp|P10412|H14_HUMAN,P10412,H14_HUMAN,HIST1H1E,Histone H1.4,,,F1,PGC
1,20221109_GD_PGC_F1_run1.02591.02591.2,D:\MSFragger\20221109\MSFragger_PGC_Frations\F...,ATTKPPPAK,ATTKPPPAK,K,K,9,2,768.113,909.5245,...,True,sp|Q14978|NOLC1_HUMAN,Q14978,NOLC1_HUMAN,NOLC1,Nucleolar and coiled-body phosphoprotein 1,,,F1,PGC


In [4]:
# Average every numeric measurement per (column, fraction, peptide form).
# `numeric_only=True` is passed explicitly: the table also holds text columns
# (spectrum, protein, gene, ...), and on pandas >= 2.0 `.mean()` raises a
# TypeError on those instead of silently dropping them.
avg = df.groupby(['column', 'fraction', 'modified_peptide', 'peptide']).mean(numeric_only=True)
# presumably `retention` is in seconds, so dividing by 60 gives minutes -- TODO confirm units
avg.loc[:, 'retention_min'] = avg['retention'] / 60

In [5]:
# Shared colour encoding so that every chart using it paints the two
# `column` categories with the same fixed colours.
column_colors = alt.Color(
    'column:N',
    scale=alt.Scale(domain=["PGC", "RPLC"], range=["#D56E3B", "#58728C"]),
)

In [None]:
overlap = 1

# One filled density curve of retention time per (fraction, column) group.
density_areas = (
    alt.Chart(avg.reset_index(), height=40)
    .mark_area(opacity=1, stroke='lightgray', strokeWidth=0.5)
    .transform_density(
        'retention_min',
        as_=['retention_min', 'density'],
        groupby=['fraction', 'column'],
    )
    .encode(
        x=alt.X('retention_min:Q', title='Retention Time (min)',
                axis=alt.Axis(grid=False)),
        # The y range runs past the top of the 40px panel (to -5) so each
        # curve may spill slightly into the row above it.
        y=alt.Y('density:Q', axis=None, scale=alt.Scale(range=[40, -5])),
        color=column_colors,
    )
)

# Lay the curves out as a grid: one row per fraction, one column per column
# type, with no spacing or panel outline between facets.
density_areas.facet(
    row=alt.Row('fraction:N', title=None, header=alt.Header(labelAngle=0)),
    column=alt.Column('column:N', title=None),
).configure_facet(
    spacing=0
).configure_view(
    stroke=None
).properties(
    bounds='flush'
)

In [None]:
def peptide_count_bars(data):
    """Bar chart of distinct ``modified_peptide`` counts, faceted by fraction.

    Within each fraction facet there is one bar per ``column`` value; bar
    height is the number of distinct ``modified_peptide`` entries in ``data``
    for that (fraction, column) pair.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide ``column``, ``fraction`` and ``modified_peptide`` columns.

    Returns
    -------
    altair.Chart
        A 25x250 px bar chart per facet, coloured with ``column_colors``.
    """
    return alt.Chart(data).mark_bar().encode(
        x=alt.X('column:N', title='',
                axis=alt.Axis(labels=False, ticks=False)),
        y=alt.Y('distinct(modified_peptide)', title='Unique Peptides'),
        color=column_colors,
        column=alt.Column('fraction:N'),
    ).properties(
        height=250,
        width=25
    )


d = avg.reset_index()

# Every peptide form seen in each (fraction, column) pair.
total = peptide_count_bars(d)

# Only the first row per (modified_peptide, fraction) is kept, so a peptide
# is counted for a single column type within each fraction.
unique_frac = peptide_count_bars(d.drop_duplicates(['modified_peptide', 'fraction']))

# Only the first row per modified_peptide is kept across the whole table, so
# each peptide form is counted exactly once overall.
unique = peptide_count_bars(d.drop_duplicates(['modified_peptide']))

total | unique_frac | unique

In [None]:
from modlamp.descriptors import PeptideDescriptor

def pour(seq):
    """Return modlamp's global 'gravy' descriptor value for one sequence.

    Parameters
    ----------
    seq : str
        Amino-acid sequence handed to ``PeptideDescriptor``.

    Returns
    -------
    The first entry of the first row of ``desc.descriptor`` after
    ``calculate_global()``, i.e. a single value for the whole sequence.
    """
    desc = PeptideDescriptor(seq, 'gravy')
    desc.calculate_global()
    return desc.descriptor[0][0]

# Score the unmodified sequence of every row.
df.loc[:, 'gravy'] = df.peptide.map(pour)

# NOTE: this rebinds `avg` with different grouping keys than the earlier
# aggregation (no `fraction` level here), so cells that expect a `fraction`
# column must be run before this one.
# `numeric_only=True` keeps `.mean()` from raising a TypeError on the table's
# text columns under pandas >= 2.0.
avg = df.groupby(['column', 'modified_peptide', 'peptide']).mean(numeric_only=True)
# presumably `retention` is in seconds, so dividing by 60 gives minutes -- TODO confirm units
avg.loc[:, 'retention_min'] = avg.retention/60

In [None]:
d = avg.reset_index()

# Shared encoding for both layers: retention time in 5-unit-wide bins on x,
# mean gravy value of the peptides in each bin on y, one series per column.
binned_retention = alt.X('retention_min:Q', bin=alt.Bin(step=5))
base = alt.Chart(d).encode(
    x=binned_retention,
    y=alt.Y('mean(gravy):Q'),
    color='column:N',
)

# Smoothed mean line layered with its confidence-interval band.
trend_line = base.mark_line(interpolate='basis')
ci_band = base.mark_errorband(extent='ci', interpolate='basis')
trend_line + ci_band

In [None]:
# Keep the first row per modified peptide so each peptide form contributes a
# single length value, then compare the length distributions by column type.
d = avg.reset_index().drop_duplicates('modified_peptide')
alt.Chart(d).mark_boxplot().encode(x='column:N', y='peptide_length:Q')

# import seaborn as sns 
# sns.violinplot(x='column', y='gravy', data=avg.reset_index())

In [14]:
from importlib import reload
reload(uniprot)  # re-import the helper module so local edits to it take effect

# Number of accessions sent to the API per request.
BATCH_SIZE = 1000

accs = df.protein_id.unique()
print(len(accs))
loc_map = {}
# Iterate only up to len(accs): a larger upper bound would produce a trailing
# empty slice and therefore a request that carries no accessions at all.
for i in range(0, len(accs), BATCH_SIZE):
    sent = accs[i:i+BATCH_SIZE]
    call = uniprot.send_accessions(sent)
    resp = uniprot.parse_response(call, wanted_value='gene')
    loc_map.update(resp)

# NOTE(review): the response is parsed with wanted_value='gene' but stored in a
# column named 'sub_location' -- confirm which field is actually intended.
df.loc[:, 'sub_location'] = df.protein_id.map(loc_map)


5750


ChunkedEncodingError: ("Connection broken: InvalidChunkLength(got length b'', 0 bytes read)", InvalidChunkLength(got length b'', 0 bytes read))

In [11]:
# Put the distinct protein accessions on the system clipboard, one per line
# with no index or header (presumably to paste into an external ID-lookup
# form -- confirm).
distinct_protein_ids = df['protein_id'].drop_duplicates()
distinct_protein_ids.to_clipboard(index=False, header=False)

In [12]:
# Tab-separated export downloaded by hand. NOTE(review): this is an absolute,
# machine-specific path, so the cell only runs where that file exists.
mapper = r"C:\Users\graha\Downloads\uniprot-download_true_fields_accession_2Cprotein_name_2Cgene_names_2-2022.11.22-00.58.09.52.tsv"
# NOTE: `loc_map` is a DataFrame here, whereas the API-based cell builds a dict
# under the same name.
loc_map = pd.read_csv(mapper, sep='\t')
loc_map

Unnamed: 0,From,Entry,Protein names,Gene Names,Length,Subcellular location [CC]
0,P10412,P10412,Histone H1.4 (Histone H1b) (Histone H1s-4),H1-4 H1F4 HIST1H1E,219,SUBCELLULAR LOCATION: Nucleus. Chromosome. Not...
1,Q14978,Q14978,Nucleolar and coiled-body phosphoprotein 1 (14...,NOLC1 KIAA0035 NS5ATP13,699,"SUBCELLULAR LOCATION: Nucleus, nucleolus {ECO:..."
2,P61927,P61927,60S ribosomal protein L37 (G1.16) (Large ribos...,RPL37,97,SUBCELLULAR LOCATION: Cytoplasm {ECO:0000269|P...
3,Q86VM9,Q86VM9,Zinc finger CCCH domain-containing protein 18 ...,ZC3H18 NHN1,953,SUBCELLULAR LOCATION: Nucleus {ECO:0000250}.
4,Q9Y6Y8,Q9Y6Y8,SEC23-interacting protein (p125),SEC23IP MSTP053,1000,"SUBCELLULAR LOCATION: Cytoplasmic vesicle, COP..."
...,...,...,...,...,...,...
5743,O43505,O43505,"Beta-1,4-glucuronyltransferase 1 (EC 2.4.1.-) ...",B4GAT1 B3GNT1 B3GNT6,415,SUBCELLULAR LOCATION: Golgi apparatus membrane...
5744,Q9GZV4,Q9GZV4,Eukaryotic translation initiation factor 5A-2 ...,EIF5A2,153,SUBCELLULAR LOCATION: Cytoplasm {ECO:0000250}....
5745,Q8IY31,Q8IY31,Intraflagellar transport protein 20 homolog (h...,IFT20,132,"SUBCELLULAR LOCATION: Golgi apparatus, cis-Gol..."
5746,Q9NVQ4,Q9NVQ4,Fas apoptotic inhibitory molecule 1,FAIM FAIM1,179,SUBCELLULAR LOCATION: Cytoplasm {ECO:0000250}.
