# Check notebook is using the correct conda environment

In [1]:
# Print the `python` executable found first on the shell PATH; it should live inside the intended conda env.
# NOTE(review): `!which` inspects the subshell's PATH, which may differ from the kernel's own interpreter —
# compare against sys.executable if in doubt.
!which python

/afs/bx.psu.edu/user/d/dzb5732/miniconda3/envs/starrseq/bin/python


# Import modules

In [2]:
import sys
from itertools import starmap

from bokeh.io import output_notebook
from bokeh.plotting import show
from bokeh.layouts import row

# Configure Bokeh so that later show() calls render their figures inline in this notebook.
output_notebook()

In [3]:
# Internal modules: make the project's code under ./src/ importable.
# The path is relative, so this assumes the kernel's working directory is the
# directory that contains ./src/ — TODO confirm.
# Guarded so that re-running this cell does not pile up duplicate sys.path entries.
if "./src/" not in sys.path:
    sys.path.append("./src/")

In [4]:
from create_arguments import create_args
from utils.verify_reads import create_coverage_beds
from utils.validation_helper import get_tsv, rep_bed_to_corr_mat, get_intra_lib_rep_corr_plot, get_fractional_base_coverage_plot, print_library_bp_coverage_stats

# Arguments

In [6]:
# TODO: building the arguments should be moved into a separate module

# --- Metadata ---------------------------------------------------------------
meta_file = "./data/metadata.json"

# --- Globals ----------------------------------------------------------------
# Knockout library to analyse. Accepted values:
#   16p12.1, atf2, ctcf, foxa1, lef1, scrt1, tcf7l2
KO_NAME = "atf2"
# Reference genome FASTA.
REFGEN = "/data5/deepro/genomes/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta"
# Regions of interest: sorted BED and master enhancer list.
ROISORTED = "/afs/bx.psu.edu/user/d/dzb5732/work/girirajan_lab/starrseq/data/master.sorted.bed"
ROIMASTER = "/afs/bx.psu.edu/user/d/dzb5732/work/girirajan_lab/starrseq/data/enhancer_master_list_hek293.bed"

# --- Flags ------------------------------------------------------------------
# RUN is forwarded as `run=`; the coverage cells below check `args.run` before
# calling create_coverage_beds.
RUN = False
INPUT_FLAG = True
CONTROL_FLAG = True

# Note the positional order: the master ROI list comes before the sorted ROI file.
args = create_args(
    meta_file,
    KO_NAME,
    REFGEN,
    ROIMASTER,
    ROISORTED,
    run=RUN,
    input_flag=INPUT_FLAG,
    control_flag=CONTROL_FLAG,
)

# Library validation

## Check coverage of the filtered libraries against the regions of interest

In [7]:
# Coverage of each library's replicates against the regions of interest.
# Skipped unless args.run is truthy.
if args.run:
    create_coverage_beds(
        args.region_of_interest_sorted,
        # (aligned-file prefix, replicate identifiers) for input / control / ko
        args.input_library_aligned_prefix, args.input_library_reps,
        args.control_library_aligned_prefix, args.control_library_reps,
        args.ko_library_aligned_prefix, args.ko_library_reps,
        iflag=args.input_flag,
        cflag=args.control_flag,
    )

In [8]:
# Coverage of the merged (filtered) libraries against the regions of interest.
# Skipped unless args.run is truthy.
if args.run:
    create_coverage_beds(
        args.region_of_interest_sorted,
        # (filtered-file prefix, "") per library: an empty string is passed in
        # the replicate slot for input / control / ko
        args.input_library_filtered_prefix, "",
        args.control_library_filtered_prefix, "",
        args.ko_library_filtered_prefix, "",
        iflag=args.input_flag,
        cflag=args.control_flag,
    )

## Replicate-wise intra-library correlation

### Coverage
Counts of reads that map, partially (by at least 1 bp) or fully, to a region of interest

In [9]:
# One argument tuple per library (input, control, ko) for rep_bed_to_corr_mat:
# (aligned prefix, replicates, coverage-file suffix, 3).
# NOTE(review): 3 is presumably the index of the column holding the read counts — confirm
# against rep_bed_to_corr_mat.
map_args = [
    (prefix, reps, "filtered.coverage.bed", 3)
    for prefix, reps in (
        (args.input_library_aligned_prefix, args.input_library_reps),
        (args.control_library_aligned_prefix, args.control_library_reps),
        (args.ko_library_aligned_prefix, args.ko_library_reps),
    )
]

# One result per library, in the same order as map_args.
corrmats = [rep_bed_to_corr_mat(*call_args) for call_args in map_args]

In [10]:
# Plot the per-library correlation matrices (input, control, ko) together.
get_intra_lib_rep_corr_plot(
    *corrmats,
    fig_title="Intra library replicate correlation of read counts",
)

### Depth
Per-base-pair coverage of a region of interest

In [11]:
# One argument tuple per library (input, control, ko) for rep_bed_to_corr_mat:
# (aligned prefix, replicates, depth-file suffix, 4).
# NOTE(review): 4 is presumably the index of the column holding the depth values — confirm
# against rep_bed_to_corr_mat.
map_args = [
    (prefix, reps, "filtered.depth.bed", 4)
    for prefix, reps in (
        (args.input_library_aligned_prefix, args.input_library_reps),
        (args.control_library_aligned_prefix, args.control_library_reps),
        (args.ko_library_aligned_prefix, args.ko_library_reps),
    )
]

# One result per library, in the same order as map_args.
corrmats = [rep_bed_to_corr_mat(*call_args) for call_args in map_args]

In [12]:
# Plot the per-library correlation matrices (input, control, ko) together.
get_intra_lib_rep_corr_plot(
    *corrmats,
    fig_title="Intra library replicate correlation of read depths",
)

## Merged library statistics

### Fraction of Regions of Interest covered by libraries

In [13]:
# Load the merged-library coverage files, in input / control / ko order.
cov_dfs = [
    get_tsv(coverage_bed)
    for coverage_bed in (
        args.input_library_coverage_bed,
        args.control_library_coverage_bed,
        args.ko_library_coverage_bed,
    )
]
# Labels and column indices are zipped with cov_dfs positionally when the plots are built.
lib_names = ["input", "control", "ko"]
# NOTE(review): column 6 is presumably the covered-fraction column of the coverage file — confirm.
col_idx = [6] * 3

In [14]:
# One fractional-coverage plot per library, pairing each table with its column index and label.
plots = [
    get_fractional_base_coverage_plot(cov_df, column, lib_name)
    for cov_df, column, lib_name in zip(cov_dfs, col_idx, lib_names)
]

In [15]:
# Lay the per-library plots out side by side in a single Bokeh row and display it.
show(row(*plots))

### Base-pair-wise coverage

1. Average reads assigned to each base pair of a region of interest
2. Coverage = (number of reads × read length) / genomic size

In [16]:
# Load the merged-library depth files, in input / control / ko order.
depth_dfs = [
    get_tsv(depth_bed)
    for depth_bed in (
        args.input_library_depth_bed,
        args.control_library_depth_bed,
        args.ko_library_depth_bed,
    )
]

In [20]:
# Print base-pair coverage statistics for each library (input, control, ko).
# A plain loop is used because the helper is called only for its printed output:
# wrapping it in list(starmap(...)) built a throwaway list and made the cell
# echo `[None, None, None]` after the statistics.
for depth_df, lib_name in zip(depth_dfs, lib_names):
    print_library_bp_coverage_stats(depth_df, lib_name)

----input library stats----

Number of Bases which have at least 50 reads assigned to it: 32167665
Total number of Bases in the library: 32963604
Percentage of Bases which have greater than 50 reads assigned to it: 0.9758540055268229

----control library stats----

Number of Bases which have at least 50 reads assigned to it: 29651356
Total number of Bases in the library: 32963604
Percentage of Bases which have greater than 50 reads assigned to it: 0.8995180260022538

----ko library stats----

Number of Bases which have at least 50 reads assigned to it: 30003053
Total number of Bases in the library: 32963604
Percentage of Bases which have greater than 50 reads assigned to it: 0.9101872780658329



[None, None, None]