# Purity Reviewer Example

In [1]:
# Auto-reload imported modules before each cell runs, so local edits to
# library code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
    

In [2]:
from PurityReviewer.Reviewers.MatchedPurityReviewer import MatchedPurityReviewer
from PurityReviewer.Reviewers.ManualPurityReviewer import ManualPurityReviewer
from PurityReviewer.AppComponents.utils import download_rdata
import pandas as pd
import numpy as np
import dalmatian
import os

# Set up simulated tumor data

Clone the `SimulatedTumorData` repo and install in your environment.
```
cd <path to put SimulatedTumorData repo>
git clone https://github.com/getzlab/SimulatedTumorData

conda activate <your env>
pip install -e <path to put SimulatedTumorData repo>
```

This only needs to be done once.

## Load data

In [3]:
from SimulatedTumorData.src.generate_simulated_data import load_patients_and_samples

In [4]:
# Load the simulated cohort from the cloned SimulatedTumorData repo.
# `samples` is displayed below with one row per tumor sample and
# `participants` with one row per patient.
# The path is relative to the notebook's working directory.
samples, participants = load_patients_and_samples(
    path_to_sim_data='SimulatedTumorData/sim_data'
)

/Users/cchu/Desktop/Methods/SimulatedTumorData/sim_data/patient1/phylogicNDT_results_1000
loading existing CNV pickle file SimulatedTumorData/sim_data/patient1/patient1.cnv_events.pkl
SimulatedTumorData/sim_data/patient1/sample_coverage/p1_t1.binned_coverage.tsv already exists.
SimulatedTumorData/sim_data/patient1/sample_coverage/p1_t2.binned_coverage.tsv already exists.
SimulatedTumorData/sim_data/patient1/sample_coverage/p1_t3.binned_coverage.tsv already exists.
patient variants path exists: SimulatedTumorData/sim_data/patient1/patient1.variants.tsv
Sample p1_t1 has variants_fn: SimulatedTumorData/sim_data/patient1/sample_muts/p1_t1.variants.tsv
Sample p1_t2 has variants_fn: SimulatedTumorData/sim_data/patient1/sample_muts/p1_t2.variants.tsv
Sample p1_t3 has variants_fn: SimulatedTumorData/sim_data/patient1/sample_muts/p1_t3.variants.tsv
Run SimulatedTumorData/sim_data/patient1/sample_mut_vcf/p1_t1.variants.vcf through (nexus-snp hg19 RefSeq).
Generated sif file: SimulatedTumorData/s

In [5]:
# Show the per-sample table (input file paths plus wxs_purity / wxs_ploidy).
samples

Unnamed: 0_level_0,maf_fn,wxs_purity,collection_date_dfd,cnv_seg_fn,participant_id,preservation_method,wxs_ploidy,ABSOLUTE_pp_calls_tab_fn,ABSOLUTE_pp_modes_data_fn,ABSOLUTE_pp_modes_plots_fn,ABSOLUTE_mode_res_rds_fn,ABSOLUTE_mode_tab_fn,ABSOLUTE_plot_fn,ABSOLUTE_SSNV_mode_res_rds_fn,ABSOLUTE_RData
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
p1_t1,SimulatedTumorData/sim_data/patient1/sample_mu...,0.7,50,SimulatedTumorData/sim_data/patient1/sample_cn...,patient1,,1.83,SimulatedTumorData/sim_data/patient1/sample_AB...,SimulatedTumorData/sim_data/patient1/sample_AB...,SimulatedTumorData/sim_data/patient1/sample_AB...,SimulatedTumorData/sim_data/patient1/sample_AB...,SimulatedTumorData/sim_data/patient1/sample_AB...,SimulatedTumorData/sim_data/patient1/sample_AB...,SimulatedTumorData/sim_data/patient1/sample_AB...,SimulatedTumorData/sim_data/patient1/sample_AB...
p1_t2,SimulatedTumorData/sim_data/patient1/sample_mu...,0.45,100,SimulatedTumorData/sim_data/patient1/sample_cn...,patient1,,1.9,SimulatedTumorData/sim_data/patient1/sample_AB...,SimulatedTumorData/sim_data/patient1/sample_AB...,SimulatedTumorData/sim_data/patient1/sample_AB...,SimulatedTumorData/sim_data/patient1/sample_AB...,SimulatedTumorData/sim_data/patient1/sample_AB...,SimulatedTumorData/sim_data/patient1/sample_AB...,SimulatedTumorData/sim_data/patient1/sample_AB...,SimulatedTumorData/sim_data/patient1/sample_AB...
p1_t3,SimulatedTumorData/sim_data/patient1/sample_mu...,0.9,120,SimulatedTumorData/sim_data/patient1/sample_cn...,patient1,,1.75,SimulatedTumorData/sim_data/patient1/sample_AB...,SimulatedTumorData/sim_data/patient1/sample_AB...,SimulatedTumorData/sim_data/patient1/sample_AB...,SimulatedTumorData/sim_data/patient1/sample_AB...,SimulatedTumorData/sim_data/patient1/sample_AB...,SimulatedTumorData/sim_data/patient1/sample_AB...,SimulatedTumorData/sim_data/patient1/sample_AB...,SimulatedTumorData/sim_data/patient1/sample_AB...
p2_t1,SimulatedTumorData/sim_data/patient2/sample_mu...,0.5,100,SimulatedTumorData/sim_data/patient2/sample_cn...,patient2,,1.99,SimulatedTumorData/sim_data/patient2/sample_AB...,SimulatedTumorData/sim_data/patient2/sample_AB...,SimulatedTumorData/sim_data/patient2/sample_AB...,SimulatedTumorData/sim_data/patient2/sample_AB...,SimulatedTumorData/sim_data/patient2/sample_AB...,SimulatedTumorData/sim_data/patient2/sample_AB...,SimulatedTumorData/sim_data/patient2/sample_AB...,SimulatedTumorData/sim_data/patient2/sample_AB...
p2_t2,SimulatedTumorData/sim_data/patient2/sample_mu...,0.85,150,SimulatedTumorData/sim_data/patient2/sample_cn...,patient2,,1.99,SimulatedTumorData/sim_data/patient2/sample_AB...,SimulatedTumorData/sim_data/patient2/sample_AB...,SimulatedTumorData/sim_data/patient2/sample_AB...,SimulatedTumorData/sim_data/patient2/sample_AB...,SimulatedTumorData/sim_data/patient2/sample_AB...,SimulatedTumorData/sim_data/patient2/sample_AB...,SimulatedTumorData/sim_data/patient2/sample_AB...,SimulatedTumorData/sim_data/patient2/sample_AB...
p2_t3,SimulatedTumorData/sim_data/patient2/sample_mu...,0.2,180,SimulatedTumorData/sim_data/patient2/sample_cn...,patient2,,2.0,SimulatedTumorData/sim_data/patient2/sample_AB...,SimulatedTumorData/sim_data/patient2/sample_AB...,SimulatedTumorData/sim_data/patient2/sample_AB...,SimulatedTumorData/sim_data/patient2/sample_AB...,SimulatedTumorData/sim_data/patient2/sample_AB...,SimulatedTumorData/sim_data/patient2/sample_AB...,SimulatedTumorData/sim_data/patient2/sample_AB...,SimulatedTumorData/sim_data/patient2/sample_AB...


In [6]:
# Show the per-participant table (one row per patient).
participants

Unnamed: 0_level_0,maf_fn,cluster_ccfs_fn,build_tree_posterior_fn,tumor_molecular_subtype,tumor_morphology,tumor_primary_site,cancer_stage,vital_status,death_date_dfd,follow_up_date,age_at_diagnosis,gender,notes,treatments_fn
participant_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
patient1,SimulatedTumorData/sim_data/patient1/phylogicN...,SimulatedTumorData/sim_data/patient1/phylogicN...,SimulatedTumorData/sim_data/patient1/phylogicN...,Unknown,Unknown,,,,,,32,,,SimulatedTumorData/sim_data/patient1/patient1_...
patient2,SimulatedTumorData/sim_data/patient2/phylogicN...,SimulatedTumorData/sim_data/patient2/phylogicN...,SimulatedTumorData/sim_data/patient2/phylogicN...,Unknown,Unknown,,,,,,32,,,SimulatedTumorData/sim_data/patient2/patient2_...


# Reviewer

In [7]:
from PurityReviewer.AppComponents.utils import parse_absolute_soln, CSIZE_DEFAULT
from PurityReviewer.AppComponents.utils import parse_absolute_soln_simulatedTumorData

In [8]:
# Directory handed to the reviewer as `data_path`; run() reports that it
# auto-exports under this directory.
data_path = 'simulated_data_purity_review'

In [12]:
# Build a matched purity reviewer over the simulated samples table.
matched_reviewer = MatchedPurityReviewer()

# Register the table under review: one entry per sample, keyed by the
# samples index (sample_id).
matched_reviewer.set_review_data(
    data_path=data_path, 
    description='Matched purity reviewer for simulated data', 
    df=samples,
    index=samples.index, 
)

# Tell the app which columns of `samples` hold each input it needs.
matched_reviewer.set_review_app(
    sample_info_cols=['ABSOLUTE_plot_fn', 'wxs_purity', 'wxs_ploidy'],
    acs_col='cnv_seg_fn', 
    maf_col='maf_fn',
    rdata_fn_col='ABSOLUTE_RData',
    mut_fig_hover_data=['Hugo_Symbol', 'Chromosome', 'Start_position'],
    csize=CSIZE_DEFAULT,
    custom_parse_absolute_soln=parse_absolute_soln_simulatedTumorData # <-- swap in my_custom_parse_absolute_soln (the function itself, no parentheses) for other RData layouts
)

# Use the reviewer's default annotation configuration and autofill setup.
matched_reviewer.set_default_review_data_annotations_configuration()
matched_reviewer.set_default_autofill()

In [13]:
# Launch the review dashboard; it is served as a Dash app on port 8099.
matched_reviewer.run(port=8099, mode='tab', collapsable=False, hide_history_df_cols=['source_data_fn'])

Setting auto_export_path to simulated_data_purity_review/data.auto_export
Using simulated_data_purity_review/data.auto_export for auto exporting.



You are in test mode. Your data will not be saved.



Dash app running on http://0.0.0.0:8099/


<IPython.core.display.Javascript object>

In [14]:
# Fetch the annotations entered so far as a table indexed by sample
# (samples not yet reviewed show up as empty rows).
matched_reviewer.get_annot()


Data is not frozen. Annotations will not be saved. Please freeze data in the dashboard to save annotations.



Unnamed: 0_level_0,Purity,Ploidy,Method,Absolute_solution_idx,Notes
value,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
p1_t1,0.7,1.8,Absolute,1.0,
p1_t2,0.45,1.8,Absolute,1.0,
p1_t3,,,,,
p2_t1,,,,,
p2_t2,,,,,
p2_t3,,,,,


# Purity Review from CGA characterization pipeline Terra workspace

## Load data

In [11]:
import dalmatian

```
import os
# Template: replace every <...> placeholder with your own value before running.
# Sets the Google Cloud project for this session; presumably read by
# dalmatian / gcloud when talking to Terra -- TODO confirm.
os.environ["GCLOUD_PROJECT"] = <google project>

# Fetch the pair table from the Terra workspace.
wm = dalmatian.WorkspaceManager(<namespace>/<workspace_name>)
pairs_df = wm.get_pairs()

# Download rdata from ABSOLUTE locally
rdata_dir = <directory to save rData>
downloaded_rdata_s = download_rdata(pairs_df['absolute_rdata'], rdata_dir=rdata_dir)
# Name the result so it lands in pairs_df as the 'local_absolute_rdata' column
# (presumably a Series indexed like pairs_df -- verify against download_rdata).
downloaded_rdata_s.name = 'local_absolute_rdata'
pairs_df = pd.concat([pairs_df, downloaded_rdata_s], axis=1)
```

```
from PurityReviewer.AppComponents.utils import CSIZE_DEFAULT
# Re-key the default contig-size table with "chr"-prefixed contig names,
# writing contigs 23 and 24 as X and Y.
# NOTE(review): assumes CSIZE_DEFAULT is keyed by strings such as '23' -- confirm.
sex_chr_map = {'23': 'X', '24': 'Y'}
rename_chroms = {
    contig: sex_chr_map.get(contig, contig)
    for contig in CSIZE_DEFAULT.keys()
}
custom_csize = {
    f'chr{rename_chroms[contig]}': size
    for contig, size in CSIZE_DEFAULT.items()
}
custom_csize
```

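Depending on the pipeline or the version of ABSOLUTE, the code that parses the RData file may need to be modified. The function below is an example of a custom parser; pass it to `set_review_app` through the `custom_parse_absolute_soln` argument.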

```
from rpy2.robjects import r, pandas2ri
import rpy2.robjects as robjects

def my_custom_parse_absolute_soln(rdata_path: str) -> pd.DataFrame:  # has to be a local path
    """Parse the ABSOLUTE solution table out of an RData file.

    Loads ``rdata_path`` into the embedded R session, takes the first object
    the file restores, and copies selected columns of its ``mode.tab`` matrix
    into a DataFrame (one row per row of ``mode.tab``).

    The column positions below are hard-coded. If your pipeline or ABSOLUTE
    version lays out ``mode.tab`` differently, adjust the indices.

    Parameters
    ----------
    rdata_path : str
        Path to the RData file. It is handed directly to R's ``load``, so it
        has to be a local path.

    Returns
    -------
    pd.DataFrame
        Columns ``alpha``, ``tau``, ``tau_hat``, ``0_line``, ``1_line``,
        ``sigma_H``, ``theta_Q``, ``lambda``, ``SCNA_likelihood``,
        ``Kar_likelihood``, ``SSNVs_likelihood``, followed by the helper
        column ``step_size``.
    """
    absolute_rdata_cols = ['alpha', 'tau', 'tau_hat', '0_line', '1_line',
                           'sigma_H',
                           'theta_Q',
                           'lambda',
                           'SCNA_likelihood',
                           'Kar_likelihood',
                           'SSNVs_likelihood']
    pandas2ri.activate()

    # R's load() returns the names of the objects it restored; the first one
    # is taken to be the ABSOLUTE result.
    loaded_names = robjects.r['load'](rdata_path)
    absolute_result = robjects.r[loaded_names[0]]

    # A previous version of this parser reached the table through one more
    # level of nesting: <result>[[<first name>]] -> 'mode.res' -> 'mode.tab'.
    # Use that navigation instead if your RData is laid out that way.
    # NOTE(review): the original author was unsure whether 'mode.tab' or a
    # segtab entry is the right source here -- confirm against your RData.
    mode_tab = absolute_result.rx2('mode.tab')

    mod_tab_df = pd.DataFrame(columns=absolute_rdata_cols)
    mod_tab_df['alpha'] = mode_tab[:, 0]
    mod_tab_df['tau'] = mode_tab[:, 1]
    mod_tab_df['tau_hat'] = mode_tab[:, 7]
    mod_tab_df['0_line'] = mode_tab[:, 3]
    # step_size is twice column 4 of mode.tab; the 1-line sits one step above
    # the 0-line.
    mod_tab_df['step_size'] = mode_tab[:, 4] * 2
    mod_tab_df['1_line'] = mod_tab_df['step_size'] + mod_tab_df['0_line']
    mod_tab_df['sigma_H'] = mode_tab[:, 8]
    mod_tab_df['theta_Q'] = mode_tab[:, 11]
    mod_tab_df['lambda'] = mode_tab[:, 12]
    mod_tab_df['SCNA_likelihood'] = mode_tab[:, 15]
    mod_tab_df['Kar_likelihood'] = mode_tab[:, 17]
    mod_tab_df['SSNVs_likelihood'] = mode_tab[:, 20]

    return mod_tab_df
```

Run the reviewer. See the [CGA WES Characterization Pipeline User Guide](https://docs.google.com/document/d/1VO2kX_fgfUd0x3mBS9NjLUWGZu794WbTepBel3cBg08/edit?usp=sharing) for more information about the CGA characterization pipeline outputs.

```
# Template: replace every <...> placeholder with your own value before running.
matched_reviewer = MatchedPurityReviewer()

# Register the Terra pair table as the data under review, keyed by its index.
matched_reviewer.set_review_data(
    data_path = <data path>, 
    description= <description>, 
    df=pairs_df,
    index=pairs_df.index,
)

# Tell the app which pairs_df columns hold each input it needs.
# NOTE(review): `custom_csize` (chr-prefixed contig names) is built earlier in
# this notebook but is not passed here; presumably use csize=custom_csize if
# your files use "chr"-style contig names -- confirm.
matched_reviewer.set_review_app(
    sample_info_cols=['participant', 'alleliccapseg_plot'],
    acs_col='alleliccapseg_tsv', 
    maf_col='mutation_validator_validated_maf',
    rdata_fn_col='local_absolute_rdata',  # the locally downloaded copies, not the original 'absolute_rdata' column
    mut_fig_hover_data=['Hugo_Symbol', 'Chromosome', 'Start_position'],
    csize=CSIZE_DEFAULT,
    custom_parse_absolute_soln=parse_absolute_soln # <-- swap in my_custom_parse_absolute_soln (the function itself, no parentheses) if the default parser does not fit your RData
)

# Use the reviewer's default annotation configuration and autofill setup.
matched_reviewer.set_default_review_data_annotations_configuration()
matched_reviewer.set_default_autofill()
```

```
# Launch the review dashboard, passing a two-column slice of pairs_df as
# `review_data_table_df`.
# NOTE(review): 'Purity' must be a column of pairs_df; substitute the column
# that holds purity in your workspace -- TODO confirm.
matched_reviewer.run(port=8099, review_data_table_df=pairs_df[['Purity', 'participant']], mode='tab')
```