# Check that the notebook is using the correct conda environment

In [1]:
# Print the interpreter this kernel is actually running. `!which python` only
# reports the first `python` found on the shell's PATH, which can differ from
# the kernel's interpreter, so it is not a reliable environment check.
import sys
print(sys.executable)

/afs/bx.psu.edu/user/d/dzb5732/miniconda3/envs/starrseq/bin/python


# Import modules

In [2]:
import os
import sys
import json
import numpy as np
import pandas as pd
import subprocess
import itertools
from itertools import starmap
from argparse import Namespace

In [3]:
# internal modules
# Make ./src/ (relative to the notebook's working directory) importable.
# The membership guard keeps re-runs of this cell from stacking duplicate
# entries on sys.path.
SRC_DIR = "./src/"
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

In [4]:
from reads_to_peaks import remove_dups, align_reads, filter_reads, call_peaks_helper

# Arguments

In [5]:
##### METADATA #####
# Per-library settings are loaded from JSON. Each entry used below supplies
# the keys "prefix", "replicates", "read_pairs", "umi" and "suffix".
meta_file = "./data/metadata.json"
with open(meta_file, "r") as f: 
    meta_dict = json.load(f)

##### GLOBALS #####
KO_NAME = "atf2" # can be 16p12.1, atf2, ctcf, foxa1, lef1, scrt1, tcf7l2
# NOTE: machine-specific absolute paths; update them when running elsewhere.
REFGEN = "/data5/deepro/genomes/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta"
ROISORTED = "/afs/bx.psu.edu/user/d/dzb5732/work/girirajan_lab/starrseq/data/master.sorted.bed"

##### FLAGS #####
# RUN_* switches gate the pipeline stages in the cells that follow;
# INPUT_FLAG / CONTROL_FLAG are forwarded to the pipeline helpers.
RUN_DEDUP=False
RUN_ALIGN=False
RUN_FILTER=True
RUN_PEAKS=False
INPUT_FLAG=True
CONTROL_FLAG=True

# Fail early, with the list of valid choices, when KO_NAME has no metadata entry.
if KO_NAME not in meta_dict:
    raise KeyError(
        "KO_NAME {!r} not found in {}; available entries: {}".format(
            KO_NAME, meta_file, sorted(meta_dict)
        )
    )


def _stage_path(library, stage_dir, extension=""):
    """Return the library's prefix with "raw_data" replaced by `stage_dir`, plus `extension`.

    library   -- key into meta_dict ("input", "control" or KO_NAME)
    stage_dir -- directory name substituted for "raw_data" in the prefix
    extension -- optional string appended to the relocated prefix
    """
    return meta_dict[library]["prefix"].replace("raw_data", stage_dir) + extension


args = Namespace(
    # from metadata and globals
    input_library_prefix = meta_dict["input"]["prefix"],
    control_library_prefix = meta_dict["control"]["prefix"],
    ko_library_prefix = meta_dict[KO_NAME]["prefix"],
    input_library_reps = meta_dict["input"]["replicates"],
    control_library_reps = meta_dict["control"]["replicates"],
    ko_library_reps = meta_dict[KO_NAME]["replicates"],
    input_library_pair = meta_dict["input"]["read_pairs"],
    control_library_pair = meta_dict["control"]["read_pairs"],
    ko_library_pair = meta_dict[KO_NAME]["read_pairs"],
    input_library_umi = meta_dict["input"]["umi"],
    control_library_umi = meta_dict["control"]["umi"],
    ko_library_umi = meta_dict[KO_NAME]["umi"],
    input_library_suffix = meta_dict["input"]["suffix"],
    control_library_suffix = meta_dict["control"]["suffix"],
    ko_library_suffix = meta_dict[KO_NAME]["suffix"],
    reference_genome = REFGEN,
    region_of_interest_sorted = ROISORTED,

    # for internal use: per-stage paths derived from each library's raw prefix
    input_library_aligned_prefix = _stage_path("input", "aligned_reads"),
    control_library_aligned_prefix = _stage_path("control", "aligned_reads"),
    ko_library_aligned_prefix = _stage_path(KO_NAME, "aligned_reads"),
    input_library_filtered_prefix = _stage_path("input", "filtered_libraries"),
    control_library_filtered_prefix = _stage_path("control", "filtered_libraries"),
    ko_library_filtered_prefix = _stage_path(KO_NAME, "filtered_libraries"),
    input_library_filtered_bam = _stage_path("input", "filtered_libraries", ".bam"),
    control_library_filtered_bam = _stage_path("control", "filtered_libraries", ".bam"),
    ko_library_filtered_bam = _stage_path(KO_NAME, "filtered_libraries", ".bam"),
    input_library_coverage_bed = _stage_path("input", "filtered_libraries", ".coverage.bed"),
    control_library_coverage_bed = _stage_path("control", "filtered_libraries", ".coverage.bed"),
    ko_library_coverage_bed = _stage_path(KO_NAME, "filtered_libraries", ".coverage.bed"),
    input_library_depth_bed = _stage_path("input", "filtered_libraries", ".depth.bed"),
    control_library_depth_bed = _stage_path("control", "filtered_libraries", ".depth.bed"),
    ko_library_depth_bed = _stage_path(KO_NAME, "filtered_libraries", ".depth.bed"),
    control_peaks_prefix = _stage_path("control", "results/peaks"),
    ko_peaks_prefix = _stage_path(KO_NAME, "results/peaks"),

    # from global flags
    # umi_flag is the truthiness of the input library's "umi" entry
    umi_flag = bool(meta_dict["input"]["umi"]),
    input_flag = INPUT_FLAG,
    control_flag = CONTROL_FLAG,
)

# Deduplicate

In [6]:
# When UMIs are present, deduplicate with starrdust here; otherwise picard is
# used later, while filtering reads.
# TODO: starrdust deduplication function

if args.umi_flag and RUN_DEDUP:
    print("Running STARRdust")
    remove_dups(
        # input library
        args.input_library_prefix, args.input_library_reps, args.input_library_pair,
        args.input_library_umi, args.input_library_suffix,
        # control library
        args.control_library_prefix, args.control_library_reps, args.control_library_pair,
        args.control_library_umi, args.control_library_suffix,
        # knockout library
        args.ko_library_prefix, args.ko_library_reps, args.ko_library_pair,
        args.ko_library_umi, args.ko_library_suffix,
        input_flag=args.input_flag,
        control_flag=args.control_flag,
    )

if args.umi_flag:
    # overwrite library suffix for every library, whether or not the
    # deduplication step itself ran in this session
    new_suffix = "001.deduped.fastq"
    for _lib in ("input", "control", "ko"):
        setattr(args, _lib + "_library_suffix", new_suffix)

# Align read files to the reference genome

In [7]:
if RUN_ALIGN:
    # One positional group per library, in the order align_reads receives them:
    # (prefix, replicates, read pairs, suffix, aligned prefix).
    input_align_args = (
        args.input_library_prefix, args.input_library_reps, args.input_library_pair,
        args.input_library_suffix, args.input_library_aligned_prefix,
    )
    control_align_args = (
        args.control_library_prefix, args.control_library_reps, args.control_library_pair,
        args.control_library_suffix, args.control_library_aligned_prefix,
    )
    ko_align_args = (
        args.ko_library_prefix, args.ko_library_reps, args.ko_library_pair,
        args.ko_library_suffix, args.ko_library_aligned_prefix,
    )
    align_reads(
        *input_align_args,
        *control_align_args,
        *ko_align_args,
        args.reference_genome,
        input_flag=args.input_flag,
        control_flag=args.control_flag,
    )

# Filter aligned read files

In [8]:
if RUN_FILTER:
    # One positional group per library, in the order filter_reads receives them:
    # (aligned prefix, replicates, filtered prefix).
    input_filter_args = (
        args.input_library_aligned_prefix, args.input_library_reps, args.input_library_filtered_prefix,
    )
    control_filter_args = (
        args.control_library_aligned_prefix, args.control_library_reps, args.control_library_filtered_prefix,
    )
    ko_filter_args = (
        args.ko_library_aligned_prefix, args.ko_library_reps, args.ko_library_filtered_prefix,
    )
    filter_reads(
        *input_filter_args,
        *control_filter_args,
        *ko_filter_args,
        args.region_of_interest_sorted,
        umi=args.umi_flag,
        input_flag=args.input_flag,
        control_flag=args.control_flag,
    )

Duplicates removed by the user
Duplicates removed by the user
Duplicates removed by the user
Duplicates removed by the user
Duplicates removed by the user
Duplicates removed by the user
Duplicates removed by the user
Duplicates removed by the user
Duplicates removed by the user


# Call peaks

In [23]:
if RUN_PEAKS:
    # Peak calling is given the filtered-library prefixes of all three libraries.
    filtered_prefixes = (
        args.input_library_filtered_prefix,
        args.control_library_filtered_prefix,
        args.ko_library_filtered_prefix,
    )
    call_peaks_helper(*filtered_prefixes, control_flag=args.control_flag)