In [None]:
import numpy as np
import pandas as pd 
import altair as alt
import copy
import re 
import ntpath 
from commons import data_processing
from commons.DataProcessors import msfragger
from commons.APICallers import uniprot

# Switch off Altair's max-rows check so charts are not rejected for exceeding the default row limit.
alt.data_transformers.disable_max_rows()

In [None]:
# Shared colour encoding keyed on the nominal `column` field:
# one fixed colour for each of the three expected values.
column_colors = alt.Color(
    'column:N',
    scale=alt.Scale(
        domain=["PGC", "RPLC", "both"],
        range=["#D56E3B", "#58728C", "#AAAAAA"],
    ),
)

# Default axis styling reused by the charts below: 14pt, weight-600, un-rotated labels.
axis_params = alt.Axis(labelFontSize=14, labelFontWeight=600, labelAngle=0, labelFlush=False)

In [None]:
# PaxDb integrated whole-organism abundance table
# source: https://pax-db.org/dataset/9606/1502934799/
pax = pd.read_csv(
    './data/paxdb/9606-WHOLE_ORGANISM-integrated.txt',
    delimiter='\t',
    skiprows=range(10),  # the first ten lines of the file are not part of the table
)
# drop any '#' from the header names so the columns can be used as attributes
pax = pax.rename(columns=lambda name: name.replace('#', ''))

# read in uniprot mappings from paxdb
# https://pax-db.org/download
pax_map = pd.read_csv('./data/paxdb/paxdb-uniprot-links-v4.2.tsv', delimiter='\t', header=None)
pax_map.columns = ['internal_id', 'uniprot_id']

# filter mappings to only human proteins
# na=False: rows with a missing uniprot id are excluded instead of producing a
#           NaN in the mask (which makes boolean indexing raise)
# regex=False: 'HUMAN' is a literal substring, not a pattern
pax_map_human = pax_map[pax_map.uniprot_id.str.contains('HUMAN', na=False, regex=False)]

# internal id -> uniprot entry name; on duplicate internal ids the last row wins
pax_human_lookup = dict(zip(pax_map_human.internal_id.tolist(), pax_map_human.uniprot_id.tolist()))


In [None]:
# attach the uniprot entry for every pax db row (NaN where the lookup has no key)
pax["uniprot_id"] = pax.string_external_id.map(pax_human_lookup)
# rank is the 1-based row position of the table as loaded
pax["rank"] = range(1, len(pax) + 1)
pax

In [None]:
# make a test chart to see if the data is how we expect:
# log-transformed abundance plotted against the pax db rank
alt.Chart(pax).mark_circle().encode(
    x=alt.X('rank:Q', title='paxdb rank',
        axis=axis_params),
    y=alt.Y('log_abundance:Q', title='paxdb abundance',
        axis=axis_params),
).transform_calculate(
    # `log` in a Vega expression is the natural logarithm
    log_abundance='log(datum.abundance)'
)

In [None]:
# now we read in experimental data
files = data_processing.get_files(r'./data/fractions/', exts=['psm.tsv'])

# use msfragger module to process data: one processor per file,
# all of them joined into the first one
msf = None
for file in files:
    m = msfragger.msf_processor([file])

    # extract fraction and column info from the two innermost directories
    # (<column>/<fraction>/<file>); split on both '\' and '/' separators
    dirname = ntpath.dirname(file)
    spl = re.split(r'[\\\/]', dirname)
    column, fraction = spl[-2:]
    m.add_special_column('fraction', fraction)
    m.add_special_column('column', column)

    if msf is None:
        msf = m
    else:
        msf.join_processors(m)

# fail with a clear message instead of an AttributeError on `None.data`
if msf is None:
    raise FileNotFoundError("no 'psm.tsv' files found under './data/fractions/'")

# grab data
df = msf.data
msf

In [None]:
# combine based on which column identified the proteins
# NOTE: groupby drops rows that have NaN in any of the key columns by default
protein_keys = ['column', 'entry_name', 'protein_id', 'gene']
exp_proteins = df.groupby(protein_keys, as_index=False).count()

# find all proteins identified in both columns: count each (column, protein_id)
# pair once, so a protein listed twice within one column is not mistaken for 'both'
counts = exp_proteins.drop_duplicates(['column', 'protein_id']).protein_id.value_counts()
pairs = counts[counts > 1].keys()
print(f'{len(counts)} total proteins, {len(pairs)} identified in both columns')

# change column name to 'both'
exp_proteins.loc[exp_proteins.protein_id.isin(pairs), 'column'] = 'both'

# remove those duplicates
exp_proteins = exp_proteins.drop_duplicates(['column', 'entry_name'])

# keep only the needed columns (the grouping keys), selected by name rather than position
exp_proteins = exp_proteins[protein_keys]

In [None]:
# merge to pax db based on entry name (default inner join: unmatched rows are dropped)
pax_join = pax.merge(exp_proteins, left_on='uniprot_id', right_on='entry_name')

# calculate log10 abundance
# since it includes very small values, we multiply by 1000 before taking the log
pax_join.loc[:, 'log10_abundance'] = np.log10(pax_join.abundance*1000)

# normalize to the largest log10 value, so the maximum maps to 1
pax_join.loc[:, 'norm10_abundance'] = pax_join.log10_abundance /  np.max(pax_join.log10_abundance)

# scatter of normalized abundance against rank, one facet per `column` value
abund_dots = alt.Chart(pax_join).mark_circle(
    size=10
).encode(
    x=alt.X('rank:Q', title='paxdb rank',
        axis=axis_params),
    y=alt.Y('norm10_abundance:Q', title='paxdb abundance, normalized',
        axis=axis_params),
    color=column_colors,
    column='column:N'
).properties(
    width=100,
    height=300
)

In [None]:
# box plot of normalized abundance, one narrow facet per `column` value;
# facet spacing and the view outline are removed so the boxes sit side by side.
# (The former `log_abundance` calculate transform was dropped: no encoding used it.)
abund_box = alt.Chart(pax_join, width=53).mark_boxplot(
    size=40
).encode(
    y=alt.Y('norm10_abundance:Q', title='paxdb abundance, normalized',
        axis=axis_params),
    column='column:N',
    color=column_colors
).configure_facet(
    spacing=0
).configure_view(
    stroke=None
)

In [None]:
# render the faceted scatter chart as this cell's output
abund_dots

In [None]:
# render the faceted box plot as this cell's output
abund_box

In [None]:
# UniProt export with GO biological-process annotations; rows with any missing value are dropped
uni_data = pd.read_csv('./data/Uniprot/Uniprot_GO_BP.tsv', delimiter='\t')
uni_data = uni_data.dropna()

# normalize header names: lower-case, spaces -> underscores, parentheses removed
columns = [c.lower().replace(' ', '_') for c in uni_data.columns]
# raw string: '\(' and '\)' are invalid escapes in a plain string literal
columns = [re.sub(r'[\(\)]', '', c) for c in columns]
uni_data.columns = columns

# one row per GO term: split the '; '-separated annotation and explode the resulting lists
uni_data.loc[:, 'spl_go'] = uni_data.gene_ontology_biological_process.str.split('; ')
uni_data = uni_data.explode('spl_go')

uni_data

In [None]:
# attach the GO annotations to the abundance table
uni_join = pax_join.merge(uni_data, on='entry_name')

# filter on `spl_go` counts with threshold 50 / 'greater_equal' (see data_processing.get_valid_counts)
uni_select = data_processing.get_valid_counts(uni_join, 'spl_go', 50, filter='greater_equal')
# strip the trailing ' [<word>:<word>]' tag from each term
uni_select.loc[:, 'clean_go'] = uni_select.spl_go.map(lambda x: re.sub(r' \[\w*\:\w*\]', '', x))
# facet order: terms sorted by how often they occur
order = uni_select.clean_go.value_counts().keys().tolist()

# numeric_only=True: the frame also holds string columns, which make a plain
# .mean() raise TypeError on pandas >= 2.0
test = uni_select.groupby(['column', 'clean_go'], as_index=False).mean(numeric_only=True)

# mean normalized abundance per column over all selected terms
base = alt.Chart(uni_select).encode(
    x=alt.X('column:N'),
    y=alt.Y('mean(norm10_abundance):Q'),
    color=column_colors
)

In [None]:
# Hand-picked GO biological-process terms, matched exactly (string equality)
# against `clean_go`; a misspelled term silently matches nothing.
pgc_good = [
    "mRNA splicing, via spliceosome",
    "translation",
    "proteolysis",
    # the GO term name is hyphenated; the unhyphenated spelling cannot match
    "vesicle-mediated transport",
    "cytoplasmic translation",
    "protein localization",
    # NOTE(review): this does not look like an exact GO term name
    # ("regulation of gene expression"?) - confirm against the values in clean_go
    "regulation of expression",
    "lipid metabolic process",
]

rplc_good = [
    "positive regulation of NF-kappaB transcription factor activity",
    "protein import into nucleus",
    "regulation of cell shape",
    "protein-containing complex assembly",
    "signal transduction",
]

# `axis_params` is already defined near the top of the notebook with exactly
# these settings, so it is reused here rather than declared a second time.
# The x axis gets its own copy with tilted labels; the shared object is left untouched.
mod_x = axis_params.copy()
mod_x['labelAngle'] = -45

# Shared encoding for the per-term charts: only rows whose term is in one of the
# two hand-picked lists, mean normalized abundance on a fixed 0-1 scale.
base = alt.Chart(
    uni_select[uni_select.clean_go.isin(pgc_good + rplc_good)]
).encode(
    x=alt.X('column:N', axis=mod_x),
    y=alt.Y(
        'mean(norm10_abundance):Q',
        axis=axis_params,
        scale=alt.Scale(domain=[0, 1]),
    ),
    color=column_colors,
).properties(width=100, height=100)

# Mean with standard-deviation error bars, one facet per GO term.
# Jupyter only renders the last expression of a cell, so this chart is
# displayed explicitly; as a bare expression it was built and then discarded.
alt.layer(base.mark_circle(size=50), base.mark_errorbar(extent='stdev')).facet(
    column=alt.Column('clean_go:N', sort=order)
).configure_facet(spacing=50).display()

# Box plots of the same selection; as the last expression this is the cell output.
base.mark_boxplot(size=25).facet(
    column=alt.Column('clean_go:N', sort=order)
)