In [None]:
# @REMOVE-FROM-TEMPLATE
# Import plaster's restart_kernel helper and call it immediately.
# NOTE(review): by its name this restarts the notebook kernel so that the cells
# below start from a clean state -- confirm against the plaster helper itself.
from plaster.tools.ipynb_helpers.displays import restart_kernel; restart_kernel()

In [None]:
# @IMPORT-MERGE
import numpy as np
import pandas as pd
from munch import Munch
from plaster.tools.zplots import zplots
from plaster.run.plots import plots
from plaster.run.plots import plots_dev as pdev
from plaster.run.plots import plots_dev_pro as ppro
from plaster.run.plots import plots_dev_ptm as pptm
from plaster.run.run import RunResult
from plaster.run.job import JobResult, MultiJobResult
from plaster.tools.ipynb_helpers.displays import hd
from plaster.tools.log.log import error, debug, prof
from plaster.tools.utils.utils import json_print, np_safe_divide, munch_abbreviation_string


In [None]:
# @REMOVE-FROM-TEMPLATE
# Set up zplots; `z` holds whatever zplots.setup() returns.
z = zplots.setup()
# Hard-coded absolute path to one specific job folder -- edit this (or use the
# "./" variant below) to point the report at your own job.
job = JobResult("/erisyon/internal/jobs_folder/bmf_2020_07_23_02_hemolysate_classify_v2")

# If you are running this report by dropping it into a job folder,
# then comment the above and uncomment this line before running:
# job = JobResult("./")

# First run of the job.
# NOTE(review): the cells visible below index job.runs[...] directly rather
# than using this `run` variable.
run = job.runs[0]

## Optionally Edit your Proteins or PTMs of interest
* These are typically specified when you create a plaster job
* You needn't specify anything here unless you want to override the proteins or PTM locations set when the job was created.
* Execute this cell to see the current setting.
* This cell loads all PR data and may take some minutes.

In [None]:
# Name of the classifier to report on, e.g. 'rf' or 'nn_v2', or None.
# If the value given here is not among the classifiers available for the job
# (which includes None), the first available classifier is used instead.
which_classifier = "nn_v2"  # None to use best available, or e.g. 'rf' to request specific

#
# Add entries to this list to specify proteins of interest, and optionally locations
# of interest on those proteins.  Note that if you don't set anything here, any
# --protein_of_interest you specified via pgen will be used instead, and any PTM
# locations given in a --protein_csv file will be used for that protein.
#
# Each entry is a 2-tuple: (protein id, PTM locations string).  In the examples
# below the locations are written as a ';'-separated string, with '' meaning
# no specific locations.
#
proteins_locations = [
# These are examples.  Add your own that are not commented out.
#     ( 'P10636-8', '181;184;185;199;202;214;231;237;404' ),
#     ( 'P02768'  , '25' ),
#     ( 'Q13885'  , '' )
]

# You should not edit anything below this point unless you're adventurous.  :)
#
# TODO: much/all of this code should get moved into a python file that is called from here.
#===========================================================================================

# Push any notebook-level overrides into the job before reading them back.
if proteins_locations:
    job.set_pros_of_interest(protein_ids=[entry[0] for entry in proteins_locations])
    for poi, ptms in proteins_locations:
        job.set_pro_ptm_locs(protein_id=poi, ptms=ptms)

# If there are proteins of interest, reporting will be focused on those.
# If those have PTM locations of interest, reporting will be further focused on those.
# The include_xxx_only flags determine which proteins/peptides are pulled into
# the reporting -- you can override them if you want.
columns = ["pro_id", "pro_ptm_locs"]
if "abundance" in job.runs[0].prep.pros().columns:
    # Only report abundance when the prep result actually carries that column.
    columns.append("abundance")

proteins_of_interest = (
    job.get_pros_of_interest()
    .drop_duplicates("pro_id")
    [columns]
)
# One PTM-locations value per distinct protein of interest.
ptms_for_proteins = list(map(job.get_pro_ptm_locs, proteins_of_interest["pro_id"].unique()))

# POI-only when at least one protein of interest exists; PTM-only when, in
# addition, every one of those proteins has PTM locations set.
include_poi_only = not proteins_of_interest.empty
include_ptm_only = include_poi_only and all(ptms_for_proteins)

# This section tells you what the reporting will be based on, and
# loads precision/recall/scoring information for that domain.
if include_poi_only:
    print("Proteins of interest:")
    display(proteins_of_interest)
    print()

# Choose a classifier based on availability and user request at top of cell.
available_classifiers = job.runs[0].get_available_classifiers()
print(f"Available classifiers : {available_classifiers}")

if len(available_classifiers) == 0:
    # Fail with an actionable message instead of an IndexError on [0] below.
    raise ValueError(
        "No classifiers are available for the first run of this job, "
        "so precision/recall information cannot be loaded."
    )

if which_classifier in available_classifiers:
    chosen_classifier = which_classifier
else:
    # Fall back to the first available classifier, and say so when the user
    # explicitly asked for one that is not there (None means "no preference").
    chosen_classifier = available_classifiers[0]
    if which_classifier is not None:
        print(
            f"Requested classifier '{which_classifier}' is not available; "
            f"using '{chosen_classifier}' instead."
        )

# Arguments shared by both PR-report loads below.
prs_args = Munch(
    include_poi_only=include_poi_only,
    include_ptm_only=include_ptm_only,
    force_compute_prs=False,
    classifier=chosen_classifier,
)

print("Loading PR information for peptides based on this:")
json_print(prs_args)
print("\nTakes a minute...")

# Two variants of the same report: without, and with, pr_with_abundance=True.
# Later cells check all_runs_pr_abund against None before using it.
all_runs_pr = job.peps_prs_report_df(**prs_args)
all_runs_pr_abund = job.peps_prs_report_df(**prs_args, pr_with_abundance=True)
print("done.")

## Edit your filters and find best runs

In [None]:
def best_runs_for_proteins(filters):
    """
    Find, plot and display the best-precision runs per protein for `filters`.

    Uses the abundance-adjusted PR table (all_runs_pr_abund) when
    filters.pr_with_abundance is truthy and that table is not None; otherwise
    uses all_runs_pr.  When abundance was requested but is unavailable,
    filters.pr_with_abundance is reset to False on the caller's object.

    Side effect: sets the pandas option 'display.max_columns' to None.

    Returns the best_pr object produced by
    job.get_best_precision_runs_for_pros().
    """
    if not filters.pr_with_abundance:
        pr_df, abund_title = all_runs_pr, "(equal abundance)"
    elif all_runs_pr_abund is None:
        # Requested but not available: record that on the filters themselves.
        filters.pr_with_abundance = False
        pr_df, abund_title = all_runs_pr, "(no abundance information available)"
    else:
        pr_df, abund_title = all_runs_pr_abund, "(with abundance)"

    hd("h1", f"Best runs per protein {abund_title}")
    # Uncomment to echo the filters in the report:
    #hd("h3", "Filters")
    #json_print(filters)
    best_pr = job.get_best_precision_runs_for_pros(pr_df, filters)
    ppro.plot_best_runs_pr(
        best_pr,
        pr_df,
        pdev._run_iz_count_pep_iz(best_pr),
        filters,
        _size=640,
    )

    # Show every column of the result table.
    pd.set_option('display.max_columns', None)
    display(best_pr)
    return best_pr

def pr_for_a_protein(filters,pro=None):
    """
    Plot and display the best-precision runs for the given `filters`.

    `pro` is used only in the heading; the protein restriction itself must
    already be expressed in `filters` by the caller.  Always works from
    all_runs_pr.

    Returns the best_pr object produced by
    job.get_best_precision_runs_for_pros().
    """
    summary_cols = ['run_name','pep_i','pep_start','pep_stop','prec','recall','run_i']

    hd("h1", f"Best run per protein {pro}")
    pr_df = all_runs_pr
    best_pr = job.get_best_precision_runs_for_pros(pr_df, filters)
    ppro.plot_best_runs_pr(
        best_pr,
        pr_df,
        pdev._run_iz_count_pep_iz(best_pr),
        filters,
        _size=640,
    )
    # Only a compact subset of columns is shown here.
    display(best_pr[summary_cols])
    return best_pr

# Edit the filters here, then run this cell
filters = Munch(
    allow_proline_at_2=False,      # True or False
    classifier=chosen_classifier,  # edit which_classifier in cell above to change this.
    exclude_runs=[],               # [] or List of runs to exclude, e.g. ['gluc_ph4_c_k_de_y_9880']   
    include_runs=[],               # [] or List of runs to consider, e.g. ['gluc_ph4_c_k_de_y_9880']
    max_dyes_per_ch=4,             # None, or some integer
    max_pep_len=50,                # None, or some integer
    min_recall=0.1,                # floating point number between 0 and 1
    n_best_runs=1,                 # integer >= 1
    pr_with_abundance=True,        # adjust PR for available protein abundance information
    pro_subset=[],                 # [] or subset of proteins to consider, e.g. ['Q8WXI7','P21217']
                                   # Note the proteins_of_interest is already respected with []
                                   # pro_subset is used to specify a further subset of these.
)

# Best runs across all proteins admitted by `filters`; later cells use `best_pr`.
best_pr = best_runs_for_proteins(filters)

if False: #set to True if you want PR curves for each POI individually
    # Each protein gets its own copy of the filters, and results are collected
    # per protein, so the shared `filters` and `best_pr` used by later cells
    # are not silently narrowed to whichever protein happens to come last.
    best_pr_by_pro = {}
    for pro in proteins_of_interest['pro_id']:
        pro_filters = Munch(filters, pro_subset=[pro])
        best_pr_by_pro[pro] = pr_for_a_protein(pro_filters, pro)

# The following line saves your best_pr dataframe to a CSV named for the filter settings.
# user =  ''
# best_pr.to_csv(f'./report_best_pr__{user}__{munch_abbreviation_string(filters)}.csv',index=False,float_format="%g")




# Runs that produced at least one best-precision-at-recall

In [None]:
# How many best-precision rows (at min recall) each run report shows.
nbr_best_to_show = 15

# Execute this cell to get a standard report on each run that produced at least one
# best precision-recall for a peptide.
#
# Or call run_report with your run_i of interest.
def run_report(run_i):
    """
    Display the standard run report for job.runs[run_i], followed by a table
    of that run's peptides with the highest precision at filters.min_recall
    (ties broken by higher recall), limited to nbr_best_to_show rows.

    Reads the notebook globals job, filters, all_runs_pr and nbr_best_to_show.
    """
    selected_run = job.runs[run_i]
    hd("h1", "_________________________________________________________________")
    plots.standard_run_report( selected_run, classifier=filters.classifier )

    hd("h3", f"Top {nbr_best_to_show} best precisions at min_recall={filters.min_recall} {filters.classifier}")
    # Restrict the all-runs PR table to this run before ranking.
    rows_for_run = all_runs_pr[all_runs_pr.run_i==run_i]
    ranked = pdev.peps_prec_at_min_recall_df(
        rows_for_run, min_recall=filters.min_recall
    ).sort_values(by=["prec", "recall"], ascending=[False, False])
    display(ranked.head(nbr_best_to_show))
    print()

    # Rendering large confusion matrices crashes the browser, so this is
    # disabled until we can figure out a better way to display them.
    #hd('h2', f"Confusion Matrix, with & without score threshold (best precision pep_i)")
    #row = best_pr[best_pr.run_i==run_i].sort_values(by=['prec','recall'],ascending=[False,False]).iloc[0]
    #pdev.plot_confusion_matrix_compare( job.runs[run_i],row.pep_i,row.score, classifier=filters.classifier )


# Summarise which runs appear in best_pr; run_info.run_iz is iterated and
# indexed below, and is also used by later cells.
run_info = pdev._run_iz_count_pep_iz(best_pr)
# Set to True to get a standard run report on each run that produced a "best pr"
if False:
    for run_i in run_info.run_iz:
        run_report(run_i)


# Or get a report on a specific run_i
# run_info.run_iz is a list of run_i sorted by best->worst based on filter
# best = produces most peptides with "best pr" of all runs
# NOTE(review): indexing [0] assumes run_iz is non-empty, i.e. that at least
# one run produced a "best pr" under the current filters.
if True:
    run_i = run_info.run_iz[0]  # run_iz is sorted from best->worst
    run_report(run_i)

# Explore fluorosequences

In [None]:
#
# To explore details for a given fluorosequence:
#
# Edit the flu and run you want to explore & set to True

if False:
    # The flu string is matched exactly against the `flustr` column of
    # all_runs_pr, so it must be copied character-for-character (spaces included).
    flu = '1..21...01..... ;0,0,0'
    # Defaults to the first entry of run_info.run_iz; replace with any run index.
    run_i = run_info.run_iz[0]
    # PR rows for this run and this fluorosequence only.
    peps_prs_df = all_runs_pr[(all_runs_pr.run_i==run_i)&(all_runs_pr.flustr==flu)]
    pdev.plot_flu_info( job.runs[run_i], flu, peps_prs_df=peps_prs_df, min_recall=filters.min_recall, classifier=filters.classifier )



# Explore imposters by peptide

In [None]:
# If you want to explore imposters, set this to true and
# update the several parameters below for your peptide.
show_imposters=False

# the run you are interested in
# (an index into job.runs; note this rebinds the `run_i` variable that
# earlier cells also assign)
run_i = 1

# the peptide you are interested in
# (used below as a row/column index into the confusion matrix)
pep_i = 259 

# Score Threshold:
# Only classifier calls with score above score_threshold are considered.
# Start with any number in [0,1] like 0.5, or look this up in a table above where
# you saw your peptide along with a precision, recall, and score.  You should
# get back the same precision and recall if you use the same score.
#
# If score_threshold is 0, no classifier calls will be dropped.
# The higher the threshold is set, the more classifier calls will be dropped.
# Exactly how many is shown as those assigned to pep_i==0 in the False Negatives list.
#
# As the score_threshold goes up, the precision will also go up, and 
# the recall will go down, as we drop classifier calls which were scored 
# below the threshold (and more likely to be wrong than those above).
score_threshold = 0.42

# Do we want abundance taken into account?
# (reset to False by the code below if no abundance information is available)
with_abundance=True

# How many top imposters of each class (False Positives, False Negatives) to show
topN = 20


#########################################################################################
# Users shouldn't need to edit anything below here
#
# Overview: build the confusion matrix for the chosen run at score_threshold,
# optionally scale it by peptide abundance, then list the topN largest entries
# in pep_i's row and in pep_i's column together with peptide information.

if show_imposters:
    # Test-set call bag for the chosen run and classifier, and its confusion
    # matrix at the requested score threshold.
    cb = job.runs[run_i].test_call_bag(classifier=filters.classifier)
    cm = cb.conf_mat_at_score_threshold( score_threshold )

    # scale confusion matrix by abundance if desired.
    if with_abundance:
        # NOTE(review): reaches into the call bag's private _prep_result attribute.
        pep_abundance = cb._prep_result.peps_abundance()
        if pep_abundance is None:
            print( "*** abundance requested but none available.")
            # Reflect the fallback in the heading printed below.
            with_abundance=False
        else:
            cm = cm.scale_by_abundance(pep_abundance)
            print( f"abundance of pep {pep_i} is {pep_abundance[pep_i]}")

    hd('h3',f'run {run_i}, pep {pep_i}, score_threshold {score_threshold}, abundance: {with_abundance}')

    print("confusion matrix shape", cm.shape)

    # Row pep_i of the matrix: treated below as the calls predicted to be pep_i.
    predictions = cm[pep_i,:]
    print("precision",cm.precision()[pep_i])

    # Column pep_i of the matrix: treated below as the calls whose truth is pep_i.
    truths = cm[:,pep_i]
    print("recall",cm.recall()[pep_i])

    print(f"total pep{pep_i} present: {np.sum(truths)}")
    print(f"total correct predictions to pep{pep_i}: {predictions[pep_i]}")
    print(f"total wrong predictions to pep{pep_i}: {np.sum(predictions)-predictions[pep_i]}")


    hd('h3',f'Top{topN} predictions of any peptide to peptide {pep_i} (i.e. False Positives)')

    # argsort is ascending, so take the last topN indices and reverse them to
    # get the largest counts first.
    # NOTE(review): pep_i itself is not excluded, so the correct calls
    # (the diagonal entry) can appear in this table as well.
    top_row_peps = np.argsort(predictions)[-topN:][::-1]
    top_row_peps_counts = predictions[top_row_peps]

    top_row_df = pd.DataFrame( Munch(pep_i=top_row_peps,n_predictions=top_row_peps_counts))
    display(top_row_df)

    # Peptide information table from the call bag; it is filtered and joined
    # on its pep_i column below.
    peps_flus_etc = cb.peps__pepstrs__flustrs__p2()

    # Attach n_predictions to the peptide info for the selected peptides,
    # ordered by descending count.
    top_row_peps_info = peps_flus_etc[peps_flus_etc.pep_i.isin(top_row_df.pep_i)].copy().set_index('pep_i').join(top_row_df.set_index('pep_i')).sort_values('n_predictions',ascending=False).reset_index()
    display(top_row_peps_info)


    hd('h3',f'Top{topN} predictions of peptide {pep_i} to any peptide (i.e. False Negatives)')

    # Same procedure as above, applied to the column for pep_i.
    top_col_peps = np.argsort(truths)[-topN:][::-1]
    top_col_peps_counts = truths[top_col_peps]

    top_col_df = pd.DataFrame( Munch(pep_i=top_col_peps,n_predictions=top_col_peps_counts))
    display(top_col_df)

    top_col_peps_info = peps_flus_etc[peps_flus_etc.pep_i.isin(top_col_df.pep_i)].copy().set_index('pep_i').join(top_col_df.set_index('pep_i')).sort_values('n_predictions',ascending=False).reset_index()
    display(top_col_peps_info)



# runs_pr_falses.csv for selected runs

In [None]:
# @REMOVE-FROM-TEMPLATE
# Tuple of precision thresholds; the next cell builds one set of false-rate
# rows per value in this tuple.
# NOTE(review): presumably injected by pgen when the report is generated from
# the template -- confirm.
PGEN_report_precisions = (0.9,)

In [None]:
#==========================================================================================
# Edit your desired parameters here
#==========================================================================================
precisions = PGEN_report_precisions  # see above cell, or cell at top of notebook
# Passed through unchanged to false_rates_all_peps__ptm_info() for every run.
n_falses = 1
protein_of_interest_only = False  

# This controls the ordering of the columns in the csv
# (the final frame is restricted to exactly these columns, in this order)
cols = ['run_i', 'run_name', 'pro_i', 'pro_id', 'pep_i', 'pep_start', 'pep_stop', 'at_prec', 'recall_at_prec', 'score_at_prec', 'ptms', 'P2', 'seqstr', 'seqlen', 'flustr', 'flu_pros', 'false_i', 'false_type', 'false_pro_i', 'false_pep_i','false_flustr', 'false_weight']

# This controls the default sorting
# `sort` and `ascend` are parallel lists: ascend[k] is the direction for sort[k],
# so they must be kept the same length.
sort = ['run_i','pro_i','pep_start', 'at_prec', 'recall_at_prec', 'pep_i', 'false_weight' ]
ascend = [True,True,True,False,False,True,False]

#==========================================================================================

def pr_falses_for_best_runs(_run_info, prec, n_falses, protein_of_interest_only, classifier):
    """
    Collect false-rate rows for every run listed in _run_info.run_iz.

    For each run index, the run's test call bag (for `classifier`) is asked for
    false_rates_all_peps__ptm_info(prec, n_falses, protein_of_interest_only);
    the resulting frame is tagged with `run_i` and `run_name` columns.

    Returns the per-run frames concatenated into one DataFrame with a fresh
    0..n-1 index.
    """
    per_run_frames = []
    for run_index in _run_info.run_iz:
        this_run = job.runs[run_index]
        call_bag = this_run.test_call_bag( classifier=classifier )
        falses_df = call_bag.false_rates_all_peps__ptm_info(prec, n_falses, protein_of_interest_only)
        # Tag each row with the run it came from.
        falses_df["run_i"] = run_index
        falses_df["run_name"] = this_run.manifest.run_name
        per_run_frames.append(falses_df)

    combined = pd.concat(per_run_frames)
    return combined.reset_index(drop=True)

# Build one combined table: false-rate rows for every run in run_info, repeated
# for each requested precision, then sorted by `sort`/`ascend` and restricted
# to `cols` (which also drops the extra 'index' column reset_index() adds).
pep_false_df = pd.concat([
    pr_falses_for_best_runs(run_info, prec, n_falses, protein_of_interest_only=protein_of_interest_only, classifier=filters.classifier)
    for prec in precisions
]).sort_values(by=sort,ascending=ascend).reset_index()[cols]

# Set to True to write the full table to a CSV in the current directory and
# display the rows with non-zero recall at the requested precision.
if False:
    hd('h3','peptides with non-zero recall at precision thresholds (avail as pep_false_df)')
    
    # File name encodes the precisions and an abbreviation of the current filters.
    filename = f"./runs_pr_falses__{'_'.join(map(str,precisions))}__{munch_abbreviation_string(filters)}.csv"
    pep_false_df.to_csv(filename,index=False,float_format="%g")
    print( f"Wrote full pep_false_df to: {filename}")
    
    display(pep_false_df[pep_false_df.recall_at_prec>0])

