In [None]:
import numpy as np 
import pandas as pd 
import altair as alt
import re 
import ntpath 
from commons import data_processing
from commons.APICallers import uniprot
# from commons.common_objects import alt_axis
from matplotlib import pyplot as plt 
from venn import venn
from scipy.stats import gaussian_kde
import warnings

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides pandas deprecation / SettingWithCopy warnings — confirm that is intended.
warnings.filterwarnings('ignore')
# Lift Altair's row limit so large DataFrames can be passed to charts.
alt.data_transformers.disable_max_rows()

In [None]:
# Load every .tsv file found under ./data/DIA/ into a single frame.
files = data_processing.get_files('./data/DIA/', exts=['.tsv'])

# Read each file once and concatenate at the end: growing `df` with pd.concat
# inside the loop re-copies all previously loaded rows on every iteration.
frames = []
for file in files:
    # Split on either path separator; the parent directory name is stored
    # in the `column` field of every row read from this file.
    spl = re.split(r'[\\\/]', file)
    data = pd.read_csv(file, delimiter='\t')
    data.loc[:, 'column'] = spl[-2]
    frames.append(data)

# pd.concat raises on an empty list, so keep the empty-frame result when no file matched.
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
df

In [None]:
# Normalise the headers: lower-case, with dots replaced by underscores.
columns = [header.lower().replace('.', '_') for header in df.columns]
df.columns = columns

# Run names are underscore-delimited: the third-from-last token is stored as
# `sample` and the last token as `tech_rep`.
run_tokens = df['run'].map(lambda run_name: run_name.split('_'))
df.loc[:, 'sample'] = run_tokens.map(lambda tokens: tokens[-3])
df.loc[:, 'tech_rep'] = run_tokens.map(lambda tokens: tokens[-1])

df

In [None]:
# Produce one row per protein id: split the ';'-joined ids into lists, then
# explode the lists. Row counts are printed before and after for comparison.
print(len(df))
exp = (
    df
    .assign(protein_ids=df['protein_ids'].str.split(';'))
    .explode('protein_ids')
)
print(len(exp))

In [None]:
from venn import venn

# One Venn diagram per sample; each set holds the protein ids observed for one
# value of `column` within that sample.
for sample in exp['sample'].unique():
    small = exp.loc[exp['sample'] == sample]
    overlap = {
        column: set(small.loc[small['column'] == column, 'protein_ids'].tolist())
        for column in small['column'].unique()
    }
    venn(overlap)

In [None]:
# Load every .csv file found under ./data/DIA/ into a single frame.
# NOTE: this rebinds `df`, replacing the frame that was built from the .tsv files.
files = data_processing.get_files('./data/DIA/', exts=['.csv'])

# Read all files first and concatenate once, instead of re-copying the
# accumulated frame for every file.
frames = [pd.read_csv(file) for file in files]

# pd.concat raises on an empty list, so keep the empty-frame result when no file matched.
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
df

In [None]:
# Work on a copy with normalised headers (lower-case, spaces -> underscores).
columns = [c.lower().replace(' ', '_') for c in df.columns]
clean_df = df.copy()
clean_df.columns = columns

# Drop decoy entries, i.e. rows whose protein name contains 'Decoy'.
print(f'{len(clean_df)} before decoys removed')
# na=False: a missing protein name makes str.contains return NaN, and negating
# a mask that contains NaN fails; missing names are treated as "not a decoy".
# regex=False: 'Decoy' is a literal substring, not a pattern.
is_decoy = clean_df.protein_name.str.contains('Decoy', na=False, regex=False)
clean_df = clean_df[~is_decoy].reset_index(drop=True)
print(f'{len(clean_df)} after decoys removed')

In [None]:
# Replicate names are expected to hold exactly four underscore-separated tokens.
info_columns = ['column', 'sample_name', 'acquisition', 'tech_rep']
sample_info = clean_df.replicate.str.split('_', expand=True)
if sample_info.shape[1] != len(info_columns):
    raise ValueError(
        f'expected {len(info_columns)} "_"-separated fields in replicate, '
        f'got {sample_info.shape[1]}'
    )
sample_info.columns = info_columns

# Drop copies left by a previous run of this cell so that re-running it does
# not create suffixed duplicates (column_x / column_y), then attach by index.
clean_df = (
    clean_df
    .drop(columns=info_columns, errors='ignore')
    .merge(sample_info, left_index=True, right_index=True)
)

In [None]:
# Build the identifier "<column>_<sample_name>" with vectorised string
# concatenation instead of a row-wise apply (faster, and it also works on an
# empty frame). astype(str) mirrors the f-string's conversion of each value to text.
clean_df.loc[:, 'sample_id'] = clean_df['column'].astype(str) + '_' + clean_df['sample_name'].astype(str)


In [None]:
# List the column names available on the cleaned frame.
clean_df.columns

In [None]:
# Cutoffs from 0.01 to 0.99 in 99 evenly spaced steps.
xs = np.linspace(0.01, 0.99, 99)


def count_passing_per_sample(frame, cutoffs, level):
    """Count rows per sample_id whose isotope_dot_product is >= each cutoff.

    Parameters
    ----------
    frame : pd.DataFrame
        Must contain `sample_id` and `isotope_dot_product` columns.
    cutoffs : iterable of float
        Thresholds to evaluate (must not be empty).
    level : str
        Label written to the `level` column of the result.

    Returns
    -------
    pd.DataFrame
        Columns `sample_id`, `value`, `cutoff`, `level`, with the per-cutoff
        blocks stacked in the order the cutoffs were given.
    """
    per_cutoff = []
    for cutoff in cutoffs:
        passing = frame.loc[frame.isotope_dot_product >= cutoff, 'sample_id']
        tally = (
            passing.value_counts()
            .rename_axis('sample_id')
            .reset_index(name='value')
        )
        tally['cutoff'] = cutoff
        tally['level'] = level
        per_cutoff.append(tally)
    # Concatenate once; appending inside the loop re-copies all earlier rows.
    return pd.concat(per_cutoff, ignore_index=True)


# Peptide level: average the rows of each (sample, protein, peptide) group.
# numeric_only=True is needed on pandas >= 2.0, where mean() raises on the
# remaining text columns instead of silently dropping them.
grouped = clean_df.groupby(['sample_id', 'protein_accession', 'modified_sequence'], as_index=False).mean(numeric_only=True)
peptide_counts = count_passing_per_sample(grouped, xs, 'peptide')

# Protein level: same computation with peptides collapsed into their protein.
grouped = clean_df.groupby(['sample_id', 'protein_accession'], as_index=False).mean(numeric_only=True)
protein_counts = count_passing_per_sample(grouped, xs, 'protein')

# Peptide rows first, then protein rows.
idotp_frame = pd.concat([peptide_counts, protein_counts], ignore_index=True)

In [None]:
# Line chart of counts versus iDotP cutoff: one line per sample, one panel per
# level, with an independent y-axis for each panel.
idotp_encodings = dict(
    x=alt.X('cutoff:Q', title='iDotP'),
    y=alt.Y('value:Q', title='Count of Peptides'),
    color='sample_id:N',
    column='level:N',
)

alt.Chart(idotp_frame).mark_line().encode(**idotp_encodings).resolve_scale(y='independent')

In [None]:
# Average the rows of each peptide per sample/column.
# numeric_only=True is needed on pandas >= 2.0, where mean() raises on the
# remaining text columns instead of silently dropping them.
grouped = clean_df.groupby(['sample_id', 'sample_name', 'column', 'protein_accession', 'modified_sequence'], as_index=False).mean(numeric_only=True)
# Keep peptides whose mean isotope dot product is at least 0.6.
g = grouped[grouped.isotope_dot_product>=0.6]
# keep=False discards every (sample_name, protein_accession) pair that occurs
# more than once, leaving only pairs represented by a single row.
# NOTE(review): this also removes proteins with several peptides on one column — confirm that is the intended definition.
g = g.drop_duplicates(['sample_name', 'protein_accession'], keep=False)
# Distinct protein count per column, one panel per sample.
alt.Chart(g).mark_bar().encode(
    x=alt.X('column:N', title=''),
    y=alt.Y('distinct(protein_accession):Q', title='Proteins', stack=True),
    color='column',
    column='sample_name'
)

In [None]:
# One Venn panel per sample. The grid is sized from the data instead of being
# fixed at two panels, so a third sample no longer indexes past the axes array.
# Each panel is 5 x 15 inches, which reproduces the 10 x 15 figure for two samples.
sample_names = clean_df.sample_name.unique()
n_panels = max(len(sample_names), 1)
fig, axs = plt.subplots(1, n_panels, figsize=(5 * n_panels, 15))
# plt.subplots returns a bare Axes for a single panel; normalise to a 1-D array.
axs = np.atleast_1d(axs)

for ax, sample in zip(axs, sample_names):
    small = clean_df[clean_df.sample_name==sample]
    # Protein accessions observed on each column for this sample.
    overlap = {
        column: set(small[small.column==column].protein_accession.tolist())
        for column in ['PGC', 'RPLC']
    }
    venn(overlap, ax=ax)
    ax.set_title(sample)

In [None]:
# show general DIA stuff
# show if proteins identified from same peptides agree in quantitation
# show if quantitation changes when PGC peptides are accounted for

In [None]:
# Work in progress: `proteins`, `rep_frame` and `minor` are prepared here but
# are not consumed by anything in this cell yet.
proteins = pd.DataFrame(clean_df.protein_accession.drop_duplicates()).reset_index(drop=True)
rep_frame = pd.DataFrame()

# NOTE(review): iterate_contents is a project helper; with get_item=True its
# results are unpacked here as (value, sub-frame) pairs — confirm against commons.data_processing.
for sample, s_frame in data_processing.iterate_contents('sample_name', clean_df, get_item=True):
    minor = pd.DataFrame()
    for column, c_frame in data_processing.iterate_contents('column', s_frame, get_item=True):
        # Mean per (technical replicate, protein). The former inner loop over
        # ["run1", "run2"] never used its loop variable and only recomputed
        # this same aggregate twice, so it has been removed.
        # numeric_only=True is needed on pandas >= 2.0, where mean() raises on
        # text columns instead of silently dropping them.
        small = c_frame.groupby(['tech_rep', 'protein_accession'], as_index=False).mean(numeric_only=True)
# Displays the aggregate of the last (sample, column) pair only.
small

In [None]:
from itertools import combinations  # not used in this cell; left in place in case other cells rely on it

samples = ["MT10", "T10"]
columns = ["PGC", "RPLC"]
reps = ["run1", "run2"]

# Keep rows with isotope dot product >= 0.9, then total the two columns used
# below per protein and technical replicate. Summing only these columns
# avoids aggregating the unrelated text columns.
g = clean_df[clean_df.isotope_dot_product>=0.9]
g = g.groupby(['sample_name', 'column', 'protein_accession', 'tech_rep'])[['isotope_dot_product', 'total_area_ms1']].sum()
# Leave (sample_name, column, tech_rep) as a sorted index for the .loc lookups below.
g = g.reset_index('protein_accession').sort_index()

pair_frames = []
for s in samples:
    for c in columns:
        d = None
        for r in reps:
            print(s, c, r)
            small = g.loc[(s, c, r), :]
            data = (
                small[['protein_accession', 'isotope_dot_product', 'total_area_ms1']]
                .rename(columns={'total_area_ms1': f'{r}_ms1_area'})
                .reset_index(drop=True)
            )
            area = data[f'{r}_ms1_area']
            # log2 is only defined for positive areas. A zero area would give
            # -inf, which dropna() keeps and gaussian_kde cannot handle, so
            # non-positive areas are masked to NaN and removed below.
            data[f'{r}_log2_area'] = np.log2(area.where(area > 0))
            # Outer-join the replicates on protein; the shared
            # isotope_dot_product column receives the _x / _y suffixes.
            d = data if d is None else d.merge(data, on='protein_accession', how='outer')

        # Keep proteins quantified in both replicates.
        d = d.dropna()
        x, y = d.run1_log2_area.to_numpy(), d.run2_log2_area.to_numpy()
        xy = np.vstack((x, y))
        # Point density of the (run1, run2) cloud, evaluated at the points themselves.
        density = gaussian_kde(xy)(xy)

        pair_frames.append(d.assign(sample=s, column=c, density=density))

# Concatenate once instead of re-copying `p` on every iteration.
p = pd.concat(pair_frames, ignore_index=True)
p

In [None]:
# Summary statistics of the replicate comparison frame.
p.describe()

In [None]:
# Refresh the log2 area column of each run from its raw MS1 area.
for run_label in ('run1', 'run2'):
    p.loc[:, f'{run_label}_log2_area'] = np.log2(p[f'{run_label}_ms1_area'])


def _log2_area_scale():
    """Scale for both axes: fixed 20-40 domain, out-of-range points clamped."""
    return alt.Scale(domain=[20,40], clamp=True)


# Run 1 vs run 2 scatter coloured by density, one panel per sample (columns)
# and per chromatography column (rows), each with its own colour scale.
alt.Chart(p).mark_circle(size=5, opacity=0.5).encode(
    x=alt.X('run1_log2_area:Q', title='Run 1', scale=_log2_area_scale()),
    y=alt.Y('run2_log2_area:Q', title='Run 2', scale=_log2_area_scale()),
    color=alt.Color('density:Q', legend=None),
    column='sample:N',
    row='column:N',
).properties(height=100, width=300).resolve_scale(color='independent')

In [None]:
# Restrict to the MT10 sample on the PGC column.
small = clean_df[(clean_df['sample_name']=='MT10')&(clean_df['column']=='PGC')]
# Nine cutoffs from 0.1 to 0.9.
xs = np.linspace(0.1, 0.9, 9)

cutoff_frames = []
for cut in xs:
    # Rows must pass the cutoff on both the library and the isotope dot product.
    red = small[(small.library_dot_product>=cut)&(small.isotope_dot_product>=cut)]
    # numeric_only=True is needed on pandas >= 2.0, where mean() raises on
    # text columns instead of silently dropping them.
    g = red.groupby(['tech_rep', 'protein_accession']).mean(numeric_only=True)
    g = g.dropna().reset_index()

    # Keep proteins that have exactly two rows left (one per technical replicate
    # when only run1 and run2 exist).
    counts = g.protein_accession.value_counts()
    valid = counts[counts.values==2].keys()
    g = g[g.protein_accession.isin(valid)]

    run1 = g[g.tech_rep=='run1'].copy()
    run2 = g[g.tech_rep=='run2'].copy()
    run1.loc[:, 'log2_ms_area'] = np.log2(run1.total_area_ms1)
    run2.loc[:, 'log2_ms_area'] = np.log2(run2.total_area_ms1)

    # The groupby output is sorted by (tech_rep, protein_accession), so the two
    # arrays line up protein by protein.
    x, y = run1.log2_ms_area.to_numpy(), run2.log2_ms_area.to_numpy()
    xy = np.vstack((x, y))
    density = gaussian_kde(xy)(xy)
    print(cut, len(x))

    cutoff_frames.append(pd.DataFrame({
        'run1_intensity':x,
        'run2_intensity':y,
        # Helper columns for drawing a y = x reference line from 0 to 40.
        'line_x':np.linspace(0,40,len(x)),
        'line_y':np.linspace(0,40,len(x)),
        'density':density,
        'cutoff':cut
    }))

# Concatenate once instead of re-copying `cutoff` on every iteration.
cutoff = pd.concat(cutoff_frames, ignore_index=True)

In [None]:
# Display the per-cutoff replicate comparison table.
cutoff

In [None]:
def _intensity_scale():
    """Scale used on every axis here: fixed 15-40 domain with clamping."""
    return alt.Scale(domain=[15,40], clamp=True)


# Dashed grey reference line drawn from the line_x / line_y helper columns.
line = alt.Chart(cutoff).mark_line(strokeDash=[5,3], color='#888888').encode(
    x=alt.X('line_x', scale=_intensity_scale()),
    y=alt.Y('line_y', scale=_intensity_scale()),
)

# Run 1 vs run 2 scatter, coloured by the precomputed density.
dots = alt.Chart(cutoff).mark_circle(size=5, opacity=0.8).encode(
    x=alt.X('run1_intensity:Q', scale=_intensity_scale()),
    y=alt.Y('run2_intensity:Q', scale=_intensity_scale()),
    color='density:Q',
).properties(width=300, height=100)

# Overlay both layers and facet by cutoff, three panels per row.
# resolve_scale is still called without arguments; color='independent' stays disabled.
alt.layer(line, dots).facet('cutoff:Q', columns=3).resolve_scale()

In [None]:
# Keep rows passing 0.7 on both dot products, then average per peptide.
# numeric_only=True is needed on pandas >= 2.0, where mean() raises on the
# remaining text columns instead of silently dropping them.
confident = clean_df[(clean_df.library_dot_product>=0.7)&(clean_df.isotope_dot_product>=0.7)]
grouped = confident.groupby(['sample_name', 'column', 'protein_accession', 'modified_sequence']).mean(numeric_only=True)
grouped = grouped.reset_index()

# Keep peptides that occur exactly four times in the averaged table.
# NOTE(review): presumably "seen in both samples on both columns"; a peptide
# assigned to several proteins could also reach four rows — confirm.
counts = grouped.modified_sequence.value_counts()
valid = counts[counts.values==4].keys()
grouped = grouped[grouped.modified_sequence.isin(valid)]
# Total the retained peptides per protein.
grouped = grouped.groupby(['sample_name', 'column', 'protein_accession']).sum()

samples = ["MT10", "T10"]
columns = ["PGC", "RPLC"]

sample_frames = []
for s in samples:
    d = None
    for c in columns:
        small = grouped.loc[(s, c), :]
        # reset_index() turns protein_accession into a regular column so it is
        # an explicit merge key and is kept in the final table (it used to be
        # discarded by the closing reset_index(drop=True)).
        data = (
            small[['total_area_ms1']]
            .rename(columns={'total_area_ms1': f'{c}_ms1_area'})
            .reset_index()
        )
        data[f'{c}_log2'] = np.log2(data[f'{c}_ms1_area'])
        data['sample'] = s

        # Inner join: only proteins quantified on every column remain.
        d = data if d is None else d.merge(data, on=['protein_accession', 'sample'])

    # Helper columns for a y = x reference line from 0 to 40, computed once per
    # sample on the merged frame rather than on every pass of the inner loop.
    d = d.assign(
        line_x=np.linspace(0,40,len(d)),
        line_y=np.linspace(0,40,len(d)),
    )
    sample_frames.append(d)

# Concatenate once instead of re-copying the accumulated frame per sample.
protein_comparison = pd.concat(sample_frames, ignore_index=True)

In [None]:
def _protein_area_scale():
    """Scale used on every axis here: fixed 20-40 domain with clamping."""
    return alt.Scale(domain=[20,40], clamp=True)


# Dashed grey reference line drawn from the line_x / line_y helper columns.
line = alt.Chart(protein_comparison).mark_line(strokeDash=[5,3], color='#888888').encode(
    x=alt.X('line_x', scale=_protein_area_scale()),
    y=alt.Y('line_y', scale=_protein_area_scale()),
)

# PGC vs RPLC log2 areas, one point per row of protein_comparison.
dots = alt.Chart(protein_comparison).mark_circle(size=5, opacity=0.5).encode(
    x=alt.X('PGC_log2:Q', title='PGC', scale=_protein_area_scale()),
    y=alt.Y('RPLC_log2:Q', title='RPLC', scale=_protein_area_scale()),
).properties(width=300, height=100)

# Overlay both layers and facet by sample.
# resolve_scale is still called without arguments; color='independent' stays disabled.
alt.layer(line, dots).facet('sample:N').resolve_scale()

In [None]:
# Show the entries of `counts` whose count is exactly four.
counts[counts.values==4]

In [None]:
# Display the current `grouped` table.
grouped