In [None]:
# ## A01_mergefastq_preptargets overall commands

# qsub Scripts/A01a_merge_lanes.sub # *
# qsub Scripts/A01b_plate_metadata.sub # ‡


# # * = job array based on "platenum"
# # † = job array based on "batchnum" (two rows at a time)
# # ‡ = fast enough to run interactively




# # if not done from script A00, run the following:
# projdir=/u/project/cluo/chliu/Analyses/IGVF
# mkdir $projdir; cd $projdir
# mkdir fastq_demultip fastq_raw fastq_trimmed mapping_bismark mapping_star
# mkdir Metadata Notebooks Scripts sublogs

In [None]:
%%bash
# before proceeding, also check the naming convention of
# the raw .fastq files in $dir_originalfastq:
# example shown for a 16-plate IGVF pilot experiment

# directory holding the raw, lane-split .fastq.gz files
# (defined once so the three checks below always look at the same place)
fastq_dir=/u/project/cluo/Shared_Datasets/IGVF/202208_Pilot/snmCT-seq/fastq

# 128 .fastq files --> 64 read pairs (R1 and R2)
# 64/4 = 16 plates
ls "$fastq_dir" | wc -l
echo -e "\n\n"

# print example .fastq.gz names
ls "$fastq_dir" | head
echo -e "\n\n"

# print unique plate names, number of lanes per plate
# our lab's convention is date-project-platemetadata-plateindexid
# (check that this final lane-merged file is unique for each plate!)
# note: glob results are already sorted, so `uniq -c` sees each plate's
# lanes on adjacent lines and the counts are per-plate
for fastqfile in "$fastq_dir"/*R1*
do
    # strip the trailing "_L00[1-4]_..." lane suffix, then the directory
    basename "${fastqfile%_L00[1-4]_*}"
done | uniq -c

## (A01a) merge .fastq.gz by lane

In [None]:
%%bash
cat > ../Scripts/A01a_merge_lanes.sub

#!/bin/bash
#$ -cwd
#$ -o sublogs/A01a_merge_lanes.$JOB_ID.$TASK_ID
#$ -j y
#$ -l h_rt=12:00:00,h_data=16G
#$ -N A01a_merge_lanes
#$ -t 1-16

# A01a: concatenate the per-lane .fastq.gz files of one plate into a single
# R1 and a single R2 file under fastq_raw/. Runs as an SGE job array with one
# task per plate: task N handles the N-th plate name in sorted order.

echo "Job $JOB_ID.$SGE_TASK_ID started on:   " `hostname -s`
echo "Job $JOB_ID.$SGE_TASK_ID started on:   " `date `
echo " "





# environment init -------------------------------------------------------------

. /u/local/Modules/default/init/modules.sh # <--

export $(cat snmCT_parameters.env | grep -v '^#' | xargs)  # <--



# get list of plates, files ----------------------------------------------------

cd "$dir_proj" || { echo "cannot cd into dir_proj: '$dir_proj'"; exit 1; }
mkdir -p fastq_raw   # -p: every array task runs this, directory may already exist

# unmatched globs expand to nothing (instead of the literal pattern),
# so a missing/empty input directory is caught by the checks below
shopt -s nullglob

# plate name = file name with the trailing "_L00[1-4]_..." lane suffix removed
mapfile -t list_of_plates < <(
    for plateid in "$dir_originalfastq"/*R1*
    do
        basename "${plateid%_L00[1-4]_*}"
    done | sort -u)
target_plate=${list_of_plates[$SGE_TASK_ID - 1]}


# print array task and plate name
# make sure $target_plate is uniquely identifiable &
# doesn't group more than the four lanes typically expected
echo -e "\n\ntarget plate number (SGE_TASK_ID):" $SGE_TASK_ID
echo "target plate prefix:" $target_plate

# an empty plate name (task id larger than the number of plates) would
# otherwise make the globs below match the files of *every* plate
if [[ -z "$target_plate" ]]
then
    echo "no plate for SGE_TASK_ID=$SGE_TASK_ID (${#list_of_plates[@]} plates found); exiting."
    exit 1
fi



# merge R1, then R2 files across lanes -----------------------------------------

# anchor on "<plate>_L00[1-4]_" so that a plate whose name merely contains
# $target_plate is not merged in by accident
filesin_r1=("$dir_originalfastq"/"${target_plate}"_L00[1-4]_*R1*fastq.gz)
filesin_r2=("$dir_originalfastq"/"${target_plate}"_L00[1-4]_*R2*fastq.gz)
shopt -u nullglob

if (( ${#filesin_r1[@]} == 0 || ${#filesin_r2[@]} == 0 ))
then
    echo "missing R1 and/or R2 .fastq.gz files for plate '$target_plate'; exiting."
    exit 1
fi

echo -e "\n\nmerging Read 1 files:"
for file in "${filesin_r1[@]}"
do
    du -h "$file"
done
# concatenated gzip files form a valid multi-member gzip file
cat "${filesin_r1[@]}" > "fastq_raw/${target_plate}_R1.fastq.gz"

echo -e "\n\nmerging Read 2 files:"
for file in "${filesin_r2[@]}"
do
    du -h "$file"
done
cat "${filesin_r2[@]}" > "fastq_raw/${target_plate}_R2.fastq.gz"



# check output files -----------------------------------------------------------

echo -e "\n\nchecking output file sizes."
du -h "fastq_raw/${target_plate}"*fastq.gz

echo -e "\n\n'A01a_merge_lanes' completed.\n\n"





echo "Job $JOB_ID.$SGE_TASK_ID ended on:   " `hostname -s`
echo "Job $JOB_ID.$SGE_TASK_ID ended on:   " `date `
echo " "


## (A01b) parse plate metadata 

In [None]:
%%bash
cat > ../Scripts/A01b_plate_metadata.sub

#!/bin/bash
#$ -cwd
#$ -o sublogs/A01b_plate_metadata.$JOB_ID
#$ -j y
#$ -N A01b_plate_metadata
#$ -l h_rt=0:10:00,h_data=4G
#$ -hold_jid A01a_merge_lanes

# A01b: build the plate-level metadata table, then expand it to the
# well-level table. Held until the A01a_merge_lanes job has finished.

echo "Job $JOB_ID started on:   " `hostname -s`
echo "Job $JOB_ID started on:   " `date `
echo " "



# environment init -------------------------------------------------------------

. /u/local/Modules/default/init/modules.sh # <--
module load anaconda3 # <--
conda activate snmCTseq # <--

export $(cat snmCT_parameters.env | grep -v '^#' | xargs) # <--



# run metadata compilation -----------------------------------------------------

# because the two scripts are so fast,
# violating tidy convention and just running both here
# (suggest running these in interactive mode anyway)

# A01c reads the table written by A01b, so stop at the first failure rather
# than building well-level metadata from a missing or outdated plate table
python Scripts/A01b_plate_metadata.py \
    || { echo "Scripts/A01b_plate_metadata.py failed; not running A01c."; exit 1; }
python Scripts/A01c_well_filepaths.py \
    || { echo "Scripts/A01c_well_filepaths.py failed."; exit 1; }



echo -e "\n\n'A01b_plate_metadata' completed.\n\n"



echo "Job $JOB_ID ended on:   " `hostname -s`
echo "Job $JOB_ID ended on:   " `date `
echo " "

In [None]:
%%bash
cat > ../Scripts/A01b_plate_metadata.py

# ==============================================================================
# Scripts/A01b_plate_metadata.py
# should parse list of lane-merged plates -->
# extract plate-level metadata saved to $dir_proj/Metadata
# ==============================================================================

# recommend running interactively in python/Jupyter to check outputs;
# the relevant metadata parameters are very likely to change between studies


# load packages ----------------------------------------------------------------

import glob
import sys
import pandas as pd
import os

# if running interactively, check that snmCT_parameters.env is loaded, or manually set os.environ, e.g.,
# os.environ['projdir'] ="/u/project/cluo/chliu/Analyses/IGVF"; os.chdir(os.environ['projdir'])
# os.environ['ref_dir'] = "/u/project/cluo/chliu/Genomes/human_gencode_v40"
# os.environ['dir_originalfastq'] = "/u/project/cluo/Shared_Datasets/IGVF/202208_Pilot/snmCT-seq/fastq/"
# os.environ['metadat_plate'] = "Metadata/A01b_plate_metadata.csv"



# check fastq.gz names ---------------------------------------------------------

import re  # only needed for the lane-suffix pattern below

fastq_dir = os.environ['dir_originalfastq']
# os.path.join: pattern is valid whether or not fastq_dir ends with "/"
# sorted: glob returns files in arbitrary order
filepaths_raw_fastq = sorted(glob.glob(os.path.join(fastq_dir, "*fastq.gz")))
print( filepaths_raw_fastq[0:4] )

if not filepaths_raw_fastq:
    # otherwise an empty metadata table would be written without any warning
    raise FileNotFoundError("no *fastq.gz files found in: " + fastq_dir)


# data.frame of plate names ----------------------------------------------------

# same rule as the shell scripts (${file%_L00[1-4]_*}): the plate name is the
# file name up to the last "_L00[1-4]_" lane tag; names without a lane tag are
# kept whole
lane_tag = re.compile(r"_L00[1-4]_")

def plate_from_filepath(filepath):
    """Return the plate name encoded in a raw fastq file path."""
    filename = os.path.basename(filepath)
    lane_hits = list(lane_tag.finditer(filename))
    return filename[:lane_hits[-1].start()] if lane_hits else filename

# unique plate names in sorted order, with a fresh 0..N-1 index so that the
# plate numbering assigned below follows the sorted plate names
plates_df = pd.DataFrame(
    {'plate' : sorted({plate_from_filepath(filepath) for filepath in filepaths_raw_fastq})}
    )

# study specific metadata, usually separated by -
# example presented here is for IGVF cell lines
# (field 1 of the plate name is not extracted)
for column, position in {'dateseq': 0, 'line': 2, 'time': 3, 'plateindex': 4}.items():
    plates_df[column] = plates_df['plate'].map(
        lambda platename, position=position: platename.split("-")[position])

# number each plate, "platenum" used for batch submission later on
# platenum indexed by 1-Nplates for compatibility with SGE (can't qsub -t 0)
plates_df['platenum'] = plates_df.index.astype(int) + 1
plates_df.index = plates_df.index.astype(int) + 1

# export to "Metadata/A01b_plate_metadata.csv" by default
print( plates_df.head() )
print ( plates_df.shape )
plates_df.to_csv(os.environ['metadat_plate'])



## (A01c) expand plate --> all 384 wells --> final "targets" file

In [None]:
%%bash
cat > ../Scripts/A01c_well_filepaths.py

# ==============================================================================
# Scripts/A01c_well_filepaths.py
# expands plate-level metadata (A01b) into well-level metadata
# ==============================================================================

# recommend running interactively in python/Jupyter to check outputs,
# but shouldn't require any changes to defaults

# load packages ----------------------------------------------------------------

import itertools
import pandas as pd
import numpy as np
import os

# if running interactively, check that snmCT_parameters.env is loaded, or manually set os.environ, e.g.,
# os.environ['projdir'] ="/u/project/cluo/chliu/Analyses/IGVF"; os.chdir(os.environ['projdir'])
# os.environ['metadat_plate'] = "Metadata/A01b_plate_metadata.csv"
# os.environ['metadat_well'] = "Metadata/A01c_well_filepath.csv"



# expand A01b metadata by well -------------------------------------------------

# read the plate-level table written by A01b (first CSV column is the index)
plate_metadata_path = os.environ['metadat_plate']
plates_df = pd.read_csv(plate_metadata_path, index_col=0)

# from pandas documentation
def expand_grid(data_dict):
    """Build a dataframe holding the Cartesian product of the given values.

    data_dict maps column name -> iterable of values. The result has one
    column per key (in key order) and one row per combination, with the
    last key varying fastest.
    """
    column_names = list(data_dict.keys())
    combinations = itertools.product(*data_dict.values())
    return pd.DataFrame.from_records(combinations, columns=column_names)

# one record per (plate, row, column): rows "A".."P", columns "1".."24"
plate_rows = [chr(code) for code in range(ord("A"), ord("A") + 16)]
plate_cols = [str(number) for number in range(1, 25)]
filepath_df = expand_grid(
    {'plate': plates_df['plate'], 'row' : plate_rows, 'col' : plate_cols})

# well = row letter + column number; wellprefix = plate name + "_" + well
filepath_df['well'] = filepath_df['row'] + filepath_df['col']
filepath_df['wellprefix'] = filepath_df['plate'] + "_" + filepath_df['well']

# attach the plate-level metadata columns to every well of that plate
filepath_df = filepath_df.merge(plates_df, how = "left", on = "plate")



# batch into sets of 24 for bismark, STAR processing steps ---------------------
# (by default, one row at a time, incremented by platenum)

# - alternatively, could make smaller batches of wells (e.g., n = 5) for compute
#   environments that favor many small jobs versus a few long jobs,
# - or two sets of batches e.g., filepath_df['batchnum_A04a_bismark']
#   pulled by the sub scripts for the A04a script only

wells_per_batch = 24
nwellstot = len(filepath_df)

# consecutive runs of `wells_per_batch` rows share a batch number, starting
# at 1; the last batch is shorter if the total is not an exact multiple
filepath_df['batchnum'] = np.arange(nwellstot) // wells_per_batch + 1

print( "number of total wells:" )
print( nwellstot )

# shift the row index to start at 1 instead of 0
filepath_df.index = filepath_df.index.astype(int) + 1

def basename(pathin):
    """Return the part of `pathin` after its last "/" (all of it if there is none)."""
    _, _, tail = pathin.rpartition("/")
    return tail

# report the job-array sizes: highest plate number and highest batch number
n_plates = filepath_df['platenum'].max()
n_batches = filepath_df['batchnum'].max()

print( "number of plates:" )
print( f"Nplates: {n_plates}" )

print( "number of batches:" )
print( f"Nbatches: {n_batches}" )



# then extensive file paths for sections A02-A06 -------------------------------
# (inelegant, but useful for file checking/compiling info)

# A02: demultiplexing
# all in dir: fastq_demultip/

# per-well files: fastq_demultip/<plate>_<well>_indexed_R[12].fastq.gz
demultip_prefix = "fastq_demultip/" + filepath_df['plate'] + "_" + filepath_df['well']
for column, suffix in {
        'A02a_fqgz_demultip_R1': "_indexed_R1.fastq.gz",
        'A02a_fqgz_demultip_R2': "_indexed_R2.fastq.gz",
        }.items():
    filepath_df[column] = demultip_prefix + suffix

# per-plate summaries: fastq_demultip/<plate>_summary_[12].txt
demultip_plate = "fastq_demultip/" + filepath_df['plate']
for column, suffix in {
        'A02a_txt_summary1': "_summary_1.txt",
        'A02a_txt_summary2': "_summary_2.txt",
        }.items():
    filepath_df[column] = demultip_plate + suffix



# A03: trimming ----------------------------------------------------------------
# all in dir: fastq_trimmed/

# every trimming output is fastq_trimmed/<wellprefix><suffix>
trimmed_prefix = "fastq_trimmed/" + filepath_df['wellprefix']
for column, suffix in {
        'A03a_fqgz_paired_R1': "_paired_R1.fastq.gz",
        'A03a_fqgz_paired_R2': "_paired_R2.fastq.gz",
        'A03a_fqgz_singletrim_R1': "_singletrim_R1.fastq.gz",
        'A03a_fqgz_singletrim_R2': "_singletrim_R2.fastq.gz",
        'A03a_json_fastp': ".json",
        }.items():
    filepath_df[column] = trimmed_prefix + suffix



# A04: bismark -----------------------------------------------------------------
# every path below lives in the per-well directory mapping_bismark/<wellprefix>/

filepath_df['A04a_dir_bismark'] = "mapping_bismark/" + filepath_df['wellprefix'] + "/"

# regex = False throughout: ".fastq.gz" / ".fq.gz" are literal suffixes, and the
# default of `regex` differs between pandas versions

# (i) paired-end mapping outputs
filepath_df['A04a_bam_bismark_PE'] = \
filepath_df['A04a_dir_bismark'] + filepath_df['A03a_fqgz_paired_R1'].apply(basename).str.replace(".fastq.gz", "_bismark_bt2_pe.bam", regex = False)
filepath_df['A04a_fqgz_unmap_R1'] = \
filepath_df['A04a_dir_bismark'] + filepath_df['A03a_fqgz_paired_R1'].apply(basename) + "_unmapped_reads_1.fq.gz"
filepath_df['A04a_fqgz_unmap_R2'] = \
filepath_df['A04a_dir_bismark'] + filepath_df['A03a_fqgz_paired_R2'].apply(basename) + "_unmapped_reads_2.fq.gz"

# single-end mapping outputs
filepath_df['A04a_bam_bismark_SE1trim'] = filepath_df['A04a_dir_bismark'] + \
filepath_df['A03a_fqgz_singletrim_R1'].apply(basename).str.replace(".fastq.gz", "_bismark_bt2.bam", regex = False)
filepath_df['A04a_bam_bismark_SE2trim'] = filepath_df['A04a_dir_bismark'] + \
filepath_df['A03a_fqgz_singletrim_R2'].apply(basename).str.replace(".fastq.gz", "_bismark_bt2.bam", regex = False)

# A04a_fqgz_unmap_R* already start with the bismark directory, so reduce them
# to their file name before prepending the directory again (otherwise the
# directory appears twice in the resulting path)
filepath_df['A04a_bam_bismark_SE1unmap'] = filepath_df['A04a_dir_bismark'] + \
filepath_df['A04a_fqgz_unmap_R1'].apply(basename).str.replace(".fq.gz", "_bismark_bt2.bam", regex = False)
filepath_df['A04a_bam_bismark_SE2unmap'] = filepath_df['A04a_dir_bismark'] + \
filepath_df['A04a_fqgz_unmap_R2'].apply(basename).str.replace(".fq.gz", "_bismark_bt2.bam", regex = False)

# bismark logs
filepath_df['A04a_txt_bismark_PE'] = filepath_df['A04a_dir_bismark'] +\
filepath_df['wellprefix'] + "_paired_R1_bismark_bt2_PE_report.txt"
filepath_df['A04a_txt_bismark_SE1unmap'] = filepath_df['A04a_dir_bismark'] +\
filepath_df['wellprefix'] + "_paired_R1.fastq.gz_unmapped_reads_1_bismark_bt2_SE_report.txt"
filepath_df['A04a_txt_bismark_SE2unmap'] = filepath_df['A04a_dir_bismark'] +\
filepath_df['wellprefix'] + "_paired_R2.fastq.gz_unmapped_reads_2_bismark_bt2_SE_report.txt"
filepath_df['A04a_txt_bismark_SE1trim'] = filepath_df['A04a_dir_bismark'] +\
filepath_df['wellprefix'] + "_singletrim_R1_bismark_bt2_SE_report.txt"
filepath_df['A04a_txt_bismark_SE2trim'] = filepath_df['A04a_dir_bismark'] +\
filepath_df['wellprefix'] + "_singletrim_R2_bismark_bt2_SE_report.txt"

# (ii) picard de-duplication
filepath_df['A04a_bam_dedupe_PE'] = filepath_df['A04a_dir_bismark'] + "PE_dedupe.bam"
filepath_df['A04a_bam_merge_SE'] = filepath_df['A04a_dir_bismark'] + "SE_merge.bam"
filepath_df['A04a_bam_mergesort_SE'] = filepath_df['A04a_dir_bismark'] + "SE_mergesort.bam"
filepath_df['A04a_bam_mergesortdedupe_SE'] = filepath_df['A04a_dir_bismark'] + "SE_mergesortdedupe.bam"

filepath_df['A04a_txt_picard_PE'] = filepath_df['A04a_dir_bismark'] + "picard_PE.log"
filepath_df['A04a_txt_picard_SE'] = filepath_df['A04a_dir_bismark'] + "picard_SE.log"

# (iii) read-level filtering
filepath_df['A04a_sam_dedupeq10_PE'] = filepath_df['A04a_dir_bismark'] + "PE.dedupe_q10.sam"
filepath_df['A04a_sam_dedupeq10_SE'] = filepath_df['A04a_dir_bismark'] + "SE.dedupe_q10.sam"

filepath_df['A04a_allc_final'] = filepath_df['A04a_dir_bismark'] + "allc.tsv.gz"
filepath_df['A04a_allctbi_final'] = filepath_df['A04a_dir_bismark'] + "allc.tsv.gz.tbi"
filepath_df['A04a_txt_allccheck'] = filepath_df['A04a_dir_bismark'] + "allc_check.txt"

# sam stats for coverage, final counts
filepath_df['A04e_txt_samstats_PE'] = filepath_df['A04a_dir_bismark'] + "samstats_PE"
filepath_df['A04e_txt_samstats_SE'] = filepath_df['A04a_dir_bismark'] + "samstats_SE"

filepath_df['A04f_txt_covtot'] = filepath_df['A04a_dir_bismark'] + "total_cov_by_chr"
filepath_df['A04f_txt_covnsites'] = filepath_df['A04a_dir_bismark'] + "nbases_cov_by_chr"



# A05: STAR mapping ------------------------------------------------------------
# every path below is a fixed file name inside mapping_star/<wellprefix>/

star_dir = "mapping_star/" + filepath_df['wellprefix'] + "/"
filepath_df['A05a_dir_star'] = star_dir

# column name -> file name within the per-well STAR directory
# (dict order is the column order of the exported table)
star_outputs = {
    # paired-end mapping outputs (A05a)
    'A05a_bam_star_PE': "PE.Aligned.out.bam",
    'A05a_bam_star_SE1': "SE1.Aligned.out.bam",
    'A05a_bam_star_SE2': "SE2.Aligned.out.bam",
    'A05a_fq_unmap_R1': "PE.Unmapped.out.mate1",
    'A05a_fq_unmap_R2': "PE.Unmapped.out.mate2",
    'A05a_txt_star_PE': "PE.Log.final.out",
    'A05a_txt_star_SE1': "SE1.Log.final.out",
    'A05a_txt_star_SE2': "SE2.Log.final.out",
    # filtered outputs (A05c)
    'A05c_bam_starfilt_PE': "PE.Final.bam",
    'A05c_bam_starfilt_SE1': "SE1.Final.bam",
    'A05c_bam_starfilt_SE2': "SE2.Final.bam",
    # samtools & picard output (A05e)
    'A05e_txt_samtools_PE': "samstats_PE",
    'A05e_txt_samtools_SE1': "samstats_SE1",
    'A05e_txt_samtools_SE2': "samstats_SE2",
    'A05e_txt_picard_PE': "picard_PE",
    'A05e_txt_picard_SE1': "picard_SE1",
    'A05e_txt_picard_SE2': "picard_SE2",
}
for column, filename in star_outputs.items():
    filepath_df[column] = star_dir + filename



# finally, export --------------------------------------------------------------

# show the table's (rows, columns), then write it to the path in $metadat_well
print(filepath_df.shape)
well_metadata_path = os.environ['metadat_well']
filepath_df.to_csv(well_metadata_path)



