# General Mutation Reviewer Example

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import os
from MutationReviewer.Reviewers.GeneralMutationReviewer import GeneralMutationReviewer
import dalmatian

In [3]:
# Root folder for files written by the example notebooks.
data_dir = './data/'

# Sub-folder for this notebook; the review-data pickle path is built from it further down.
# The string is kept exactly as before so any previously saved review data is still found.
notebook_data_dir = f'{data_dir}/general_local_mutation_reviewer_example/'

# makedirs(exist_ok=True) creates both levels in one call and does nothing when they already
# exist, which removes the check-then-create race of the `isdir` + `mkdir` pairs.
os.makedirs(notebook_data_dir, exist_ok=True)

# 1000 genomes bams

In [4]:
patients = ['HG00096', 'HG00097']

In [5]:
from download_1000genomes_bams import download_genomes, download_vcf

In [10]:
vcf_path = '../1k_genomes/tp53.vcf'

In [87]:
# Fetch the variant calls for the region of interest and store them at `vcf_path`.
# The source URL is the 1000 Genomes phase 3 chr17 genotype VCF; `region_str` limits the request
# to 17:7571739-7590808 (the same region is labelled TP53 in the `download_genomes` call).
# NOTE(review): the recorded output is "grep: stdout: Broken pipe" — presumably noise from a
# shell pipeline inside the helper; confirm the written VCF is complete.
download_vcf(
    vcf_path,
    onek_chr_ftp_path="https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/ALL.chr17.phase3_shapeit2_mvncall_integrated_v5b.20130502.genotypes.vcf.gz",
    region_str="17:7571739-7590808",
)

grep: stdout: Broken pipe


In [152]:
output_dir = '../1k_genomes'

In [160]:
# Download the exome alignments for each patient, restricted to the TP53 region, into `output_dir`.
# `replace_str` names the placeholder used in `patient_path_str_format`; presumably every
# occurrence is substituted with the patient ID — TODO confirm in download_1000genomes_bams.
# NOTE(review): the path pattern hard-codes the "GBR" label, so it presumably only matches
# patients filed under that population — verify before adding other patient IDs.
bam_paths_fn = download_genomes(
    patient_ids=patients, 
    output_dir=os.path.abspath(output_dir),
    onek_genomes_ftp="ftp.1000genomes.ebi.ac.uk",
    patient_path_str_format="/vol1/ftp/phase3/data/REPLACE/exome_alignment/REPLACE.mapped.ILLUMINA.bwa.GBR.exome.*.bam",
    region_str="17:7571739-7590808", # TP53
    replace_str='REPLACE',
)
# Show the returned path; the next cell reads this file as a tab-separated table of BAM/BAI paths.
bam_paths_fn

'/Users/cchu/Desktop/Methods/MutationReviewer/1k_genomes/1k_genomes_bam_paths.txt'

In [6]:
# Load the table of BAM/BAI locations whose path `download_genomes` returned as `bam_paths_fn`.
# Using that variable instead of a hard-coded, user-specific absolute path keeps the notebook
# runnable on other machines.
bam_paths_df = (
    pd.read_csv(bam_paths_fn, sep='\t', index_col=0)
    .rename_axis('patient_id')  # the file's first column (the index) holds the patient IDs
    .reset_index()              # expose it as a regular column so it can serve as the BAM reference column
)

In [7]:
bam_paths_df.local_bai_path.tolist()

['/Users/cchu/Desktop/Methods/MutationReviewer/1k_genomes/HG00096.17_7571739_7590808.bai',
 '/Users/cchu/Desktop/Methods/MutationReviewer/1k_genomes/HG00097.17_7571739_7590808.bai']

# Mutations to review

In [8]:
# The VCF column names are read from a separate header file stored next to the VCF.
# Within this notebook only `header.columns` is used (to name the columns of the parsed VCF body).
header_vcf_path = '../1k_genomes/tp53.vcf.header.txt'
header = pd.read_csv(header_vcf_path, sep='\t')

In [11]:
# Parse the VCF body. Lines beginning with '#' are dropped by `comment='#'`, so the file is read
# without a header row and the column names are taken from the separately loaded header table.
vcf_column_names = header.columns.tolist()
vcf_body_df = pd.read_csv(vcf_path, sep='\t', comment='#', header=None)
vcf_body_df.columns = vcf_column_names
# Drop the leading '#' from the first VCF column name so it matches the 'CHROM' key used later.
vcf_df = vcf_body_df.rename(columns={'#CHROM': 'CHROM'})

In [12]:
# Keep only the variants carried by at least one of the selected patients, and only the first
# nine columns (CHROM ... FORMAT) plus those patients' genotype columns.
# A genotype counts as "carried" when its string contains the character '1'.
# NOTE(review): a substring match ignores genotypes that only reference higher ALT indices
# (e.g. '0|2') — confirm this is intended for multi-allelic sites.
# The string test runs once per patient column instead of once per row, and a single `.loc`
# replaces the chained `df[mask][cols]` indexing.
carried_by_any_patient = (
    vcf_df[patients]
    .apply(lambda genotypes: genotypes.str.contains('1', na=False))
    .any(axis=1)
)
patient_vcf_df = vcf_df.loc[carried_by_any_patient, vcf_df.columns.tolist()[:9] + patients]

In [13]:
def format_patient_vcf(vcf_df, patient):
    """Return the variants carried by one patient, tagged with that patient's ID.

    Parameters
    ----------
    vcf_df : pandas.DataFrame
        Variant table whose first nine columns are the per-variant fields and which has a
        string genotype column named after ``patient``.
    patient : str
        Name of the genotype column to filter on; also the value written to ``patient_id``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the first nine columns of ``vcf_df`` plus a ``patient_id`` column,
        restricted to rows whose genotype string contains ``'1'``. Row labels are preserved and
        ``vcf_df`` itself is left untouched.
    """
    # na=False: a missing genotype counts as "not carried" instead of producing an invalid mask.
    carries_alt = vcf_df[patient].str.contains('1', na=False)
    variant_cols = vcf_df.columns.tolist()[:9]
    # `.loc` + `.assign` builds a fresh frame, so adding the column never writes into a slice
    # of the caller's frame (the chained-indexing form could raise SettingWithCopyWarning).
    return vcf_df.loc[carries_alt, variant_cols].assign(patient_id=patient)
# Build one long table with a row per (variant, patient) pair: filter the shared variant table
# once for every patient, then stack the per-patient pieces in patient order.
per_patient_frames = []
for patient_name in patients:
    per_patient_frames.append(format_patient_vcf(patient_vcf_df, patient_name))
reformat_patient_vcf_df = pd.concat(per_patient_frames)

In [14]:
reformat_patient_vcf_df

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,patient_id
9,17,7572101,.,C,T,100,PASS,AC=286;AF=0.0571086;AN=5008;NS=2504;DP=15895;E...,GT,HG00096
12,17,7572154,.,G,A,100,PASS,AC=2178;AF=0.434904;AN=5008;NS=2504;DP=18905;E...,GT,HG00096
37,17,7573229,.,C,T,100,PASS,AC=824;AF=0.164537;AN=5008;NS=2504;DP=17555;EA...,GT,HG00096
81,17,7574721,.,C,T,100,PASS,AC=65;AF=0.0129792;AN=5008;NS=2504;DP=17045;EA...,GT,HG00096
82,17,7574775,.,C,T,100,PASS,AC=2152;AF=0.429712;AN=5008;NS=2504;DP=15948;E...,GT,HG00096
85,17,7574864,.,C,T,100,PASS,AC=423;AF=0.0844649;AN=5008;NS=2504;DP=16743;E...,GT,HG00096
103,17,7575564,.,T,C,100,PASS,AC=4163;AF=0.83127;AN=5008;NS=2504;DP=10496;EA...,GT,HG00096
115,17,7576276,.,T,A,100,PASS,AC=1287;AF=0.256989;AN=5008;NS=2504;DP=13042;E...,GT,HG00096
117,17,7576348,.,C,T,100,PASS,AC=409;AF=0.0816693;AN=5008;NS=2504;DP=13257;E...,GT,HG00096
174,17,7578115,.,T,C,100,PASS,AC=4175;AF=0.833666;AN=5008;NS=2504;DP=16669;E...,GT,HG00096


# IGV js Run

No additional setup is needed: IGV is rendered inside the notebook.

In [15]:
data_pkl_fn = f'{notebook_data_dir}/1k_genomes.TP53.review_data.pkl'
review_description = 'Test reviewer'

In [16]:
notebook_data_dir

'./data//general_local_mutation_reviewer_example/'

In [17]:
# Configure the reviewer in three steps: attach the data, choose what the app displays, then
# apply the default annotation configuration.
reviewer = GeneralMutationReviewer()
reviewer.set_review_data(
    data_pkl_fn=data_pkl_fn, 
    description=review_description,     
    mutations_df=reformat_patient_vcf_df,
    mutation_groupby_cols=['CHROM', 'POS'], # columns to groupby
    mutations_df_bam_ref_col='patient_id', 
    chrom_cols='CHROM', # if a list, must be same length as pos_cols (comment previously said start_pos_cols — TODO confirm)
    pos_cols='POS',
    bams_df=bam_paths_df,
    # presumably matched against mutations_df_bam_ref_col to find each mutation's BAMs — verify
    bams_df_ref_col='patient_id',
    bam_cols='local_bam_path',
    bai_cols='local_bai_path',
)
reviewer.set_review_app(
    mutation_table_display_cols=['CHROM', 'POS', 'REF', 'ALT', 'QUAL', 'patient_id'],
    bam_table_display_cols=[],
    igv_mode='igv_js',  # the notebook's other sections use 'igv_local' and 'both'
)
reviewer.set_default_review_data_annotations_configuration()



In [18]:
reviewer.run(port=8094, collapsable=False)

Dash is running on http://0.0.0.0:8094/

Dash app running on http://0.0.0.0:8094/


In [19]:
reviewer.get_history()

Unnamed: 0,index,timestamp,source_data_fn,mutation_call,sequencing_tags,alignment_tags,normal_tags,tumor_tags,other_tag_description,Notes


In [21]:
bam_paths_df

Unnamed: 0,patient_id,original_ftp_path,local_bam_path,local_bai_path
0,HG00096,https://ftp.1000genomes.ebi.ac.uk//vol1/ftp/ph...,/Users/cchu/Desktop/Methods/MutationReviewer/1...,/Users/cchu/Desktop/Methods/MutationReviewer/1...
1,HG00097,https://ftp.1000genomes.ebi.ac.uk//vol1/ftp/ph...,/Users/cchu/Desktop/Methods/MutationReviewer/1...,/Users/cchu/Desktop/Methods/MutationReviewer/1...


In [20]:
# NOTE(review): `mutect2_vcf_wes_samples_df` is not defined anywhere in this notebook (the
# recorded output is a NameError), so this cell fails on a fresh kernel. It looks like a
# leftover from a different analysis — TODO define the input or delete the cell.
test_df = mutect2_vcf_wes_samples_df.reset_index()

NameError: name 'mutect2_vcf_wes_samples_df' is not defined

In [44]:
# NOTE(review): relies on `test_df`, whose defining cell raises NameError in this notebook;
# the recorded output comes from an earlier session with different data — TODO remove or
# restore the missing input.
test_df[test_df['sample_id'] == 'ACH-000017']

Unnamed: 0,sample_id,hg19_RRBS_bai,hg19_RRBS_bam,hg19_hybrid_capture_bai,hg19_hybrid_capture_bam,hg19_raindance_bai,hg19_raindance_bam,hg19_targeted_bai,hg19_targeted_bam,hg38_rna_bai,...,hg38_wes_bai,hg38_wes_bam,hg38_wgs_bai,hg38_wgs_bam,mutect2_parquet_wes,mutect2_parquet_wgs,mutect2_vcf_wes,mutect2_vcf_wgs,participant,stripped_cell_line_name
6,ACH-000017,gs://cclebams/RRBS/G29750/SK-BR-3/v1/SK-BR-3.bai,gs://cclebams/RRBS/G29750/SK-BR-3/v1/SK-BR-3.bam,gs://cclebams/hybrid_capture/SKBR3_BREAST.bai,gs://cclebams/hybrid_capture/SKBR3_BREAST.bam,gs://cclebams/raindance/G16640/SK-BR-3/current...,gs://cclebams/raindance/G16640/SK-BR-3/current...,,,gs://cclebams/rnasq_hg38/CDS-2MfLwH.Aligned.so...,...,gs://cclebams/hg38_wes/CDS-4Klk1G.hg38.bai,gs://cclebams/hg38_wes/CDS-4Klk1G.hg38.bam,,,[gs://ccle-mutation/mutect2_parquet/2b27f200f3...,,gs://ccle-mutation/mutect2_vcf/CDS-4Klk1G_fixe...,,ACH-000017,SKBR3


# Local IGV run

1. Install IGV here: https://software.broadinstitute.org/software/igv/download (tested on version 2.15)
1. BEFORE running the Mutation Reviewer:
    1. Open IGV
    1. Go to "Google" and log in
    1. Go to "Google" and enter your Google project ID. This is required for requester-pays buckets. If the mutations load but the BAMs do not, a missing project ID is the likely cause.
1. Run the notebook

In [22]:
# Re-configure the same reviewer to drive a locally running IGV desktop instead of igv.js.
# The display columns mirror the igv.js example: the previous values ('Hugo_Symbol', 'chr',
# 'sample_id', ...) are not columns of the tables passed to `set_review_data` in this notebook.
reviewer.set_review_app(
    mutation_table_display_cols=['CHROM', 'POS', 'REF', 'ALT', 'QUAL', 'patient_id'],
    bam_table_display_cols=[],
    igv_mode='igv_local',
)

In [23]:
reviewer.run(port=8094)

Dash is running on http://0.0.0.0:8094/

Dash app running on http://0.0.0.0:8094/


# Both options


In [27]:
# Re-configure the same reviewer to use the embedded igv.js view and the local IGV desktop together.
# The display columns mirror the igv.js example: the previous values ('Hugo_Symbol', 'chr',
# 'sample_id', ...) are not columns of the tables passed to `set_review_data` in this notebook.
reviewer.set_review_app(
    mutation_table_display_cols=['CHROM', 'POS', 'REF', 'ALT', 'QUAL', 'patient_id'],
    bam_table_display_cols=[],
    igv_mode='both',
)

In [28]:
reviewer.run(port=8094)

Dash is running on http://0.0.0.0:8094/

Dash app running on http://0.0.0.0:8094/
socket initialized
set_saveopts
Snapshots are available in /Users/cchu/Desktop/Methods/MutationReviewer/example_notebooks/igv_snapshots
set viewopts
{'chr0': '1', 'pos0': 12726048}
loci
Position to view: 1:12,726,028-12,726,068
('gs://fc-02e4b730-cb29-48bb-9c3b-562e075457fe/Getz_Ebert_IBM_13-583_Exomes_PDO-23428_182samples_July2021/RP-1886/Exome/MDA4021076/v3/MDA4021076.bam',)
socket closed


# Local IGV run on a VM

## Set up VM

1. Install VNC server (on VM and local)
1. Start VNC server
1. Port forward

## Install IGV

1. In the VNC browser, go to the IGV download page and download IGV
1. Unzip the downloaded file
1. In a terminal, run the `.sh` file to open IGV
1. If reading BAMs from Google buckets, go to the "Google" tab at the top and log in. Do this BEFORE starting the Mutation Reviewer
1. Start reviewing