# Check notebook is using the correct conda environment

In [1]:
# Print the path of the first `python` executable on the shell's PATH.
# NOTE(review): this reports the shell's python, which is not guaranteed to be
# the interpreter running this kernel (that one is `sys.executable`) — confirm.
!which python

/afs/bx.psu.edu/user/d/dzb5732/miniconda3/envs/starrseq/bin/python


# Import modules

In [2]:
import os
import sys
import json
import numpy as np
import pandas as pd
import subprocess
import itertools
from itertools import starmap
from argparse import Namespace

from bokeh.io import output_notebook
from bokeh.plotting import show, figure
from bokeh.layouts import row,column
from bokeh.palettes import Blues9, Spectral10
from bokeh.models import ColorBar, LinearColorMapper, ColumnDataSource, FactorRange, FixedTicker, NumeralTickFormatter

# Configure Bokeh so that later show() calls render inline in the notebook.
output_notebook()

In [3]:
# internal modules
# Add the project's ./src/ directory to the module search path so the `utils`
# package can be imported. The path is relative, so it only resolves correctly
# when the kernel's working directory is the repository root.
sys.path.append("./src/")

In [4]:
from utils.verify_reads import create_coverage_beds
from utils.process_regions import get_intersects, get_non_intersects
from utils.validation_helper import get_tsv, rep_bed_to_corr_mat, get_intra_lib_rep_corr_plot, get_inter_lib_scatter_plot, get_fractional_base_coverage_plot, print_library_bp_coverage_stats

# Arguments

In [5]:
# TODO: make the arguments proper
##### METADATA #####
# Per-library settings are read from this JSON file; the keys used below are
# "prefix", "replicates", "read_pairs", "umi" and "suffix".
meta_file = "./data/metadata.json"
with open(meta_file, "r") as meta_handle:
    meta_dict = json.load(meta_handle)

##### GLOBALS #####
KO_NAME = "atf2" # can be 16p12.1, atf2, ctcf, foxa1, lef1, scrt1, tcf7l2
# NOTE(review): the three paths below are absolute and machine/user specific.
REFGEN = "/data5/deepro/genomes/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta"
ROISORTED = "/afs/bx.psu.edu/user/d/dzb5732/work/girirajan_lab/starrseq/data/master.sorted.bed"
ROIMASTER = "/afs/bx.psu.edu/user/d/dzb5732/work/girirajan_lab/starrseq/data/enhancer_master_list_hek293.bed"

##### FLAGS #####
RUN = False
INPUT_FLAG = True
CONTROL_FLAG = True

##### DERIVED VALUES #####
# Metadata entry of each library (input, control, selected knockout).
_input_meta = meta_dict["input"]
_control_meta = meta_dict["control"]
_ko_meta = meta_dict[KO_NAME]

# Every derived location is the library's raw-data prefix with the "raw_data"
# path component swapped for the matching output directory.
_input_filtered = _input_meta["prefix"].replace("raw_data", "filtered_libraries")
_control_filtered = _control_meta["prefix"].replace("raw_data", "filtered_libraries")
_ko_filtered = _ko_meta["prefix"].replace("raw_data", "filtered_libraries")
_control_peaks = _control_meta["prefix"].replace("raw_data", "results/peaks")
_ko_peaks = _ko_meta["prefix"].replace("raw_data", "results/peaks")

args = Namespace(
    # from metadata and globals
    input_library_prefix=_input_meta["prefix"],
    control_library_prefix=_control_meta["prefix"],
    ko_library_prefix=_ko_meta["prefix"],
    input_library_reps=_input_meta["replicates"],
    control_library_reps=_control_meta["replicates"],
    ko_library_reps=_ko_meta["replicates"],
    input_library_pair=_input_meta["read_pairs"],
    control_library_pair=_control_meta["read_pairs"],
    ko_library_pair=_ko_meta["read_pairs"],
    input_library_umi=_input_meta["umi"],
    control_library_umi=_control_meta["umi"],
    ko_library_umi=_ko_meta["umi"],
    input_library_suffix=_input_meta["suffix"],
    control_library_suffix=_control_meta["suffix"],
    ko_library_suffix=_ko_meta["suffix"],
    reference_genome=REFGEN,
    region_of_interest_master=ROIMASTER,
    region_of_interest_sorted=ROISORTED,

    # for internal use
    region_of_interest_sorted_seq=ROISORTED.replace("sorted", "sorted.seq"),
    input_library_aligned_prefix=_input_meta["prefix"].replace("raw_data", "aligned_reads"),
    control_library_aligned_prefix=_control_meta["prefix"].replace("raw_data", "aligned_reads"),
    ko_library_aligned_prefix=_ko_meta["prefix"].replace("raw_data", "aligned_reads"),
    input_library_filtered_prefix=_input_filtered,
    control_library_filtered_prefix=_control_filtered,
    ko_library_filtered_prefix=_ko_filtered,
    input_library_filtered_bam=_input_filtered + ".bam",
    control_library_filtered_bam=_control_filtered + ".bam",
    ko_library_filtered_bam=_ko_filtered + ".bam",
    input_library_coverage_bed=_input_filtered + ".coverage.bed",
    control_library_coverage_bed=_control_filtered + ".coverage.bed",
    ko_library_coverage_bed=_ko_filtered + ".coverage.bed",
    input_library_depth_bed=_input_filtered + ".depth.bed",
    control_library_depth_bed=_control_filtered + ".depth.bed",
    ko_library_depth_bed=_ko_filtered + ".depth.bed",
    control_peaks_prefix=_control_peaks,
    ko_peaks_prefix=_ko_peaks,
    control_peak_file=_control_peaks + ".peak.final.bed",
    ko_peak_file=_ko_peaks + ".peak.final.bed",
    ko_activated_file=_ko_peaks + ".peak.activated.bed",
    ko_repressed_file=_ko_peaks + ".peak.repressed.bed",
    ko_avalidated_file=_ko_peaks + ".peak.avalidated.bed",
    ko_rvalidated_file=_ko_peaks + ".peak.rvalidated.bed",
    ko_name="eGFP-ATF2", # TODO: add this information in metadata and get this information from metadata

    # from global flags
    umi_flag=bool(_input_meta["umi"]),
    input_flag=INPUT_FLAG,
    control_flag=CONTROL_FLAG,
)

# STARRSeq region-wise composite figure
**TODO:** only include peaks for now; later include all regions with peak information

In [None]:
# Load the depth table of each library with get_tsv, in the fixed order
# input, control, KO (later cells index this list by position).
depth_dfs = [
    get_tsv(depth_bed)
    for depth_bed in (
        args.input_library_depth_bed,
        args.control_library_depth_bed,
        args.ko_library_depth_bed,
    )
]

In [16]:
# Index every depth table by its first three columns (0, 1, 2) so that a region
# can later be selected with a (chromosome, start, end) key.
# Position 0 = input, 1 = control, 2 = KO.
df_in_depth, df_cc_depth, df_ko_depth = (
    depth_dfs[lib_pos].set_index([0, 1, 2]) for lib_pos in range(3)
)

In [17]:
# TODO: bokeh plot showing enhancer region chr X: A-B and its sequence

# Load the table stored at args.region_of_interest_sorted_seq via get_tsv.
# Judging by the row displayed further down, columns 0-3 hold chromosome,
# start, end and sequence — TODO confirm against get_tsv / the file format.
df_sorted_seq = get_tsv(args.region_of_interest_sorted_seq)

In [18]:
# Select one region (the row at position 40) to visualise.
# NOTE(review): 40 is a hard-coded row position; nothing in this notebook
# explains why this region was chosen — consider a named constant.
reg = df_sorted_seq.iloc[40]

In [19]:
# Display the selected row for a visual check.
reg

0                                                 chr1
1                                              1906648
2                                              1908213
3    GAGATGCTTCTAAATGTTTGGGTGGAGAAGCAGGCATGGGGAGGGT...
Name: 40, dtype: object

In [20]:
# Unpack the row's four fields; per the displayed row these are the chromosome
# name, the start and end coordinates, and the nucleotide sequence.
chrm, st, en, seq = reg

In [21]:
# Pull the values of column 4 for the selected region out of each library's
# depth table (input, control, KO), as arrays used for the bar heights.
region_key = (chrm, st, en)
in_vbars, cc_vbars, ko_vbars = (
    depth_table.loc[region_key, 4].values
    for depth_table in (df_in_depth, df_cc_depth, df_ko_depth)
)

In [22]:
# Composite figure for the selected region: three depth tracks (input, control,
# KO) stacked above a strip that spells out the nucleotide sequence. All four
# panels share one x range, so panning/zooming one moves the others.
TOOLS = "save,pan,box_zoom,reset,wheel_zoom"
color_map = {"A": "red", "T": "green", "G": "blue", "C":"yellow", "N":"black"}

# x coordinate of every base in the region; built once and reused by all glyphs.
positions = [st + i for i in range(len(seq))]


def _depth_track(title, xs, depths, color, x_range, y_range=None):
    """Build one 900x250 bar-chart panel with a hidden x axis.

    Parameters
    ----------
    title : str
        Panel title.
    xs : sequence
        x coordinate of each bar.
    depths : sequence
        Bar heights, one per entry of ``xs``.
    color : str
        Bar colour.
    x_range : bokeh range
        Range object shared with the other panels.
    y_range : bokeh range, optional
        Range object to share with another track; when omitted the figure
        creates its own.

    Returns
    -------
    The configured bokeh figure.
    """
    range_kwargs = {"x_range": x_range}
    if y_range is not None:
        range_kwargs["y_range"] = y_range
    track = figure(title=title, x_axis_location="below", width=900, height=250,
                   tools=TOOLS, toolbar_location="above", **range_kwargs)
    # Only the sequence strip at the bottom of the column shows the x axis.
    track.xaxis.visible = False
    track.vbar(xs, top=depths, width=0.5, color=color)
    return track


# Sequence strip: one coloured letter per base, with the only visible x axis.
seq_0 = figure(title=f"Region {chrm}: {st} - {en}",
               y_range=["0", "0.5"], x_axis_location="below", width=900, height=150,
               tools=TOOLS, toolbar_location="above")
seq_0.yaxis.visible = False
seq_0.xaxis.formatter = NumeralTickFormatter(format="00")
seq_0.text(positions, [0.5] * len(seq), text=list(seq),
           # Upper-case before the lookup and fall back to black so that symbols
           # outside A/T/G/C/N (e.g. lower-case or ambiguity codes) do not raise
           # KeyError.
           text_color=[color_map.get(base.upper(), "black") for base in seq],
           text_baseline="bottom",
           text_align="center", text_font_size="60px")

# Depth tracks; control and KO reuse the input track's y range so the three
# panels are drawn on the same scale.
starrseq_in = _depth_track("STARRSeq input", positions, in_vbars, "gray",
                           x_range=seq_0.x_range)
starrseq_cc = _depth_track("STARRSeq control", positions, cc_vbars, "navy",
                           x_range=seq_0.x_range, y_range=starrseq_in.y_range)
starrseq_ko = _depth_track("STARRSeq KO", positions, ko_vbars, "firebrick",
                           x_range=seq_0.x_range, y_range=starrseq_in.y_range)

show(column(starrseq_in, starrseq_cc, starrseq_ko, seq_0))
