In [1]:
import numpy as np
import pandas as pd 
import altair as alt
import copy
import re 
import ntpath 
from commons import data_processing
from commons.DataProcessors import msfragger
from commons.APICallers import uniprot

# lift Altair's default row limit so the larger tables used below can be passed to charts
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [122]:
# one fixed colour per identification group, shared by every chart below
column_palette = {
    "PGC": "#D56E3B",
    "RPLC": "#58728C",
    "both": "#AAAAAA",
}
column_scale = alt.Scale(
    domain=list(column_palette.keys()),
    range=list(column_palette.values()),
)
column_colors = alt.Color('column:N', scale=column_scale)

# common axis styling: bold, horizontal, non-flushed tick labels
axis_params = alt.Axis(
    labelAngle=0,
    labelFlush=False,
    labelFontSize=14,
    labelFontWeight=600,
)

In [111]:
# PaxDb integrated dataset
# https://pax-db.org/dataset/9606/1502934799/
# the first 10 lines are skipped; the header row that follows contains a '#'
# which is stripped from the column names
pax = pd.read_csv(
    './data/paxdb/9606-WHOLE_ORGANISM-integrated.txt',
    sep='\t',
    skiprows=10,
)
pax = pax.rename(columns=lambda name: name.replace('#', ''))

# PaxDb -> UniProt link table (the file has no header row)
# https://pax-db.org/download
# NOTE(review): the first column is labelled 'internal_id' here but is later
# looked up with string_external_id values -- confirm what the file holds
pax_map = pd.read_csv(
    './data/paxdb/paxdb-uniprot-links-v4.2.tsv',
    sep='\t',
    header=None,
    names=['internal_id', 'uniprot_id'],
)

# keep only the links whose UniProt entry contains 'HUMAN'
is_human = pax_map['uniprot_id'].str.contains('HUMAN')
pax_map_human = pax_map[is_human]
pax_human_lookup = dict(zip(pax_map_human['internal_id'], pax_map_human['uniprot_id']))


In [112]:
# attach the UniProt entry for each string_external_id and number the rows
# 1..N in their current order (used as the rank axis in the charts below)
pax = pax.assign(
    uniprot_id=pax.string_external_id.map(pax_human_lookup),
    rank=range(1, len(pax) + 1),
)
pax

Unnamed: 0,internal_id,string_external_id,abundance,uniprot_id,rank
0,4445428,9606.ENSP00000370010,37104.0,TYB4_HUMAN,1
1,4443001,9606.ENSP00000356969,30702.0,APOA2_HUMAN,2
2,4443752,9606.ENSP00000360522,23754.0,RET4_HUMAN,3
3,4437206,9606.ENSP00000295897,21929.0,ALBU_HUMAN,4
4,4434700,9606.ENSP00000259396,14374.0,A1AG1_HUMAN,5
...,...,...,...,...,...
19333,4440324,9606.ENSP00000332110,0.0,OR4N5_HUMAN,19334
19334,4439600,9606.ENSP00000323354,0.0,OR5M8_HUMAN,19335
19335,4447613,9606.ENSP00000389475,0.0,CC079_HUMAN,19336
19336,4445007,9606.ENSP00000367353,0.0,CL042_HUMAN,19337


In [113]:
# make a test chart to see if the data is how we expect
# rows with zero abundance are dropped first: log(0) evaluates to -Infinity,
# which cannot be positioned on a quantitative axis
alt.Chart(pax).mark_circle().encode(
    x=alt.X('rank:Q', title='paxdb rank',
        axis=axis_params),
    y=alt.Y('log_abundance:Q', title='paxdb abundance (ln)',
        axis=axis_params),
).transform_filter(
    'datum.abundance > 0'
).transform_calculate(
    # the Vega expression log() is the natural logarithm
    log_abundance='log(datum.abundance)'
)

In [114]:
# now we read in experimental data
files = data_processing.get_files(r'./data/fractions/', exts=['psm.tsv'])

# use msfragger module to process data: one processor per psm.tsv file,
# tagged with its column/fraction and then folded into a single processor
msf = None
for file in files:
    m = msfragger.msf_processor([file])

    # the two innermost directory names are taken as (column, fraction);
    # the split accepts both '/' and '\' separators
    dirname = ntpath.dirname(file)
    spl = re.split(r'[\\\/]', dirname)
    column, fraction = spl[-2:]
    m.add_special_column('fraction', fraction)
    m.add_special_column('column', column)

    if msf is None:
        msf = m
    else:
        # presumably merges m into msf in place -- the return value is not used
        msf.join_processors(m)

# fail with a clear message instead of an AttributeError on None below
if msf is None:
    raise FileNotFoundError("no psm.tsv files found under ./data/fractions/")

# grab data
df = msf.data
msf

MSF Processor constructed on the following data:
./data/fractions/PGC\F1\psm.tsv
./data/fractions/PGC\F2\psm.tsv
./data/fractions/PGC\F3\psm.tsv
./data/fractions/PGC\F4\psm.tsv
./data/fractions/PGC\F5\psm.tsv
./data/fractions/PGC\F6\psm.tsv
./data/fractions/PGC\F7\psm.tsv
./data/fractions/PGC\F8\psm.tsv
./data/fractions/RPLC\F1\psm.tsv
./data/fractions/RPLC\F2\psm.tsv
./data/fractions/RPLC\F3\psm.tsv
./data/fractions/RPLC\F4\psm.tsv
./data/fractions/RPLC\F5\psm.tsv
./data/fractions/RPLC\F6\psm.tsv
./data/fractions/RPLC\F7\psm.tsv
./data/fractions/RPLC\F8\psm.tsv

In [115]:
# collapse the PSM table to one row per (column, entry_name, protein_id)
key_cols = ['column', 'entry_name', 'protein_id']
exp_proteins = df.groupby(key_cols, as_index=False).count()

# a protein was identified in both if it was seen on more than one distinct
# column (counting distinct columns rather than rows, so that a protein with
# two rows on the same column is not mislabelled)
counts = exp_proteins.protein_id.value_counts()
columns_per_protein = exp_proteins.groupby('protein_id')['column'].nunique()
pairs = columns_per_protein[columns_per_protein > 1].index
print(f'{len(counts)} total proteins, {len(pairs)} identified in both columns')

# change column name to 'both'
exp_proteins.loc[exp_proteins.protein_id.isin(pairs), 'column'] = 'both'

# relabelling leaves one 'both' row per original column; keep just one
exp_proteins = exp_proteins.drop_duplicates(['column', 'entry_name'])

# keep only the key columns (selected by name, not by position)
exp_proteins = exp_proteins[key_cols]

5750


In [168]:
# merge to pax db based on entry name (default inner join: only proteins
# present in both tables are kept)
pax_join = pax.merge(exp_proteins, left_on='uniprot_id', right_on='entry_name')

# calculate log10 abundance
# abundances can be far below 1, so they are scaled by 1000 before the log;
# zero abundances have no logarithm and are masked to NaN instead of
# producing -inf (and a divide-by-zero RuntimeWarning)
positive_abundance = pax_join.abundance.where(pax_join.abundance > 0)
pax_join.loc[:, 'log10_abundance'] = np.log10(positive_abundance * 1000)

# normalize to the largest log10 value (NaN entries are skipped by max)
pax_join.loc[:, 'norm10_abundance'] = pax_join.log10_abundance / pax_join.log10_abundance.max()

# scatter of normalized abundance against paxdb rank, one facet per column value
abund_dots = alt.Chart(pax_join).mark_circle(
    size=10
).encode(
    x=alt.X('rank:Q', title='paxdb rank',
        axis=axis_params),
    y=alt.Y('norm10_abundance:Q', title='paxdb abundance, normalized',
        axis=axis_params),
    color=column_colors,
    column='column:N'
).properties(
    width=100,
    height=300
)

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [174]:
# box plot of normalized abundance, one facet per column value
# (the previous log_abundance transform was not referenced by any encoding
# and evaluated log(0) for zero-abundance rows, so it has been removed)
abund_box = alt.Chart(pax_join, width=75).mark_boxplot(
    size=50
).encode(
    y=alt.Y('norm10_abundance:Q', title='paxdb abundance, normalized',
        axis=axis_params),
    column='column:N',
    color=column_colors
).configure_facet(
    # no gap between facets
    spacing=0
).configure_view(
    # no border around each facet
    stroke=None
)

In [175]:
# display the faceted scatter of normalized abundance against paxdb rank
abund_dots

In [176]:
# display the faceted box plots of normalized abundance
abund_box