In [1]:
# # A01_mergefastq_preptargets overall commands ================================

# qsub Scripts/A01a_merge_lanes.sub # *
# qsub Scripts/A01b_plate_metadata.sub # ‡

# # * = job array based on "platenum"
# # † = job array based on "batchnum" (two rows at a time)
# # ‡ fast enough to run interactively

In [1]:
%%bash
# before proceeding, check naming convention of
# the raw .fastq files in $dir_originalfastq: 
# example shown for a 32-plate experiment from the IGVF m3C dataset

# 512 .fastq files --> 256 read pairs (R1 and R2)
# 256 read pairs/8 lanes = 32 plates
dir_originalfastq=/u/project/cluo/Shared_Datasets/source_fastq/yzcl47

# plate_prefixes DIR
# prints, one per line, the plate prefix (file name with everything from the
# lane field "_L00[1-8]_" onwards removed) of every R1 .fastq.gz file in DIR.
# prints nothing if DIR holds no matching file.
plate_prefixes() {
    local fastqfile fname
    for fastqfile in "$1"/*R1*.fastq.gz; do
        # an unmatched glob stays literal; skip it instead of reporting a bogus plate
        [[ -e "${fastqfile}" ]] || continue
        fname=${fastqfile##*/}
        printf '%s\n' "${fname%_L00[1-8]_*}"
    done
}

echo "number of .fastq R1+R2 files"
fastq_files=("${dir_originalfastq}"/*fastq.gz)
[[ -e "${fastq_files[0]}" ]] || fastq_files=()
echo "${#fastq_files[@]}"
echo -e "\n\n"

# print .fastq.gz examples names
echo "example fastq names"
ls -- "${dir_originalfastq}" | head
echo -e "\n\n"

# print unique plate names, number of lanes per plate
# our lab's convention is date-project-platemetadata-plateindexid
# (check that this final lane-merged file is unique for each plate!)
echo "plate names (based on file name before lane 'L00*')"
echo "check that these are unique -- will be used to merge across lane!"
# sort first so the per-plate counts never depend on glob ordering
plate_prefixes "${dir_originalfastq}" | sort | uniq -c
echo -e "\n\n"

# number of samples
echo "number of plates"
plate_prefixes "${dir_originalfastq}" | sort -u | wc -l

number of .fastq R1+R2 files
512



example fastq names
20231005-3C29D12-Pos1-B06_S16_L001_R1_001.fastq.gz
20231005-3C29D12-Pos1-B06_S16_L001_R2_001.fastq.gz
20231005-3C29D12-Pos1-B06_S16_L002_R1_001.fastq.gz
20231005-3C29D12-Pos1-B06_S16_L002_R2_001.fastq.gz
20231005-3C29D12-Pos1-B06_S16_L003_R1_001.fastq.gz
20231005-3C29D12-Pos1-B06_S16_L003_R2_001.fastq.gz
20231005-3C29D12-Pos1-B06_S16_L004_R1_001.fastq.gz
20231005-3C29D12-Pos1-B06_S16_L004_R2_001.fastq.gz
20231005-3C29D12-Pos1-B06_S16_L005_R1_001.fastq.gz
20231005-3C29D12-Pos1-B06_S16_L005_R2_001.fastq.gz



plate names (based on file name before lane 'L00*')
check that these are unique -- will be used to merge across lane!
      8 20231005-3C29D12-Pos1-B06_S16
      8 20231005-3C29D12-Pos2-C03_S24
      8 20231005-3C29D16-Pos1-B01_S11
      8 20231005-3C29D16-Pos2-B10_S19
      8 20231005-3C29D1-Pos1-C04_S25
      8 20231005-3C29D1-Pos2-A02_S1
      8 20231005-3C37D5-Pos1-C10_S31
      8 20231005-3C37D5-Pos2-A08_S7
      8 2023100

## (A01a) merge .fastq.gz files across lanes (one merged R1/R2 pair per plate)

In [3]:
%%bash
cat > ../Scripts/A01a_merge_lanes.sub

#!/bin/bash
#$ -cwd
#$ -o sublogs/A01a_merge_lanes.$JOB_ID.$TASK_ID
#$ -j y
#$ -l h_rt=8:00:00,h_data=16G
#$ -N A01a_merge_lanes
#$ -t 1-32

# ==============================================================================
# Scripts/A01a_merge_lanes.sub
# SGE array job: task N concatenates the per-lane R1 and R2 .fastq.gz files of
# the N-th plate prefix found in $dir_originalfastq into
#   fastq_raw/<plate>_R1.fastq.gz and fastq_raw/<plate>_R2.fastq.gz
# expects snm3C_parameters.env in the working directory
# (dir_originalfastq is presumably defined there; it is not set in this script)
# ==============================================================================

echo "Job $JOB_ID.$SGE_TASK_ID started on:   " "$(hostname -s)"
echo "Job $JOB_ID.$SGE_TASK_ID started on:   " "$(date)"
echo " "





# environment init -------------------------------------------------------------

. /u/local/Modules/default/init/modules.sh # <--

if [[ ! -r snm3C_parameters.env ]]; then
    echo "ERROR: snm3C_parameters.env not readable in $(pwd)" >&2
    exit 1
fi
# export every non-comment KEY=VALUE line
# (the unquoted expansion is intentional: one KEY=VALUE per word)
export $(grep -v '^#' snm3C_parameters.env | xargs)  # <--

if [[ -z "${dir_originalfastq:-}" || ! -d "${dir_originalfastq}" ]]; then
    echo "ERROR: dir_originalfastq ('${dir_originalfastq:-}') is unset or not a directory" >&2
    exit 1
fi



# get list of plates, files ----------------------------------------------------

# -p: no error if the directory exists or a sibling array task creates it first
mkdir -p fastq_raw || { echo "ERROR: could not create fastq_raw/" >&2; exit 1; }

# unique plate prefixes = R1 file names with the lane field onwards removed.
# sort -u (rather than uniq | sort) also removes non-adjacent duplicates;
# LC_ALL=C gives the same code-point ordering that A01b_plate_metadata.py
# uses to assign "platenum"
mapfile -t list_of_plates < <(
    for fastqfile in "${dir_originalfastq}"/*R1*; do
        [[ -e "${fastqfile}" ]] || continue
        fname=${fastqfile##*/}
        printf '%s\n' "${fname%_L00[1-8]_*}"
    done | LC_ALL=C sort -u
)

# refuse task IDs without a plate: an empty prefix would otherwise
# make the globs below match (and merge) every file in the directory
nplates=${#list_of_plates[@]}
if ! [[ "${SGE_TASK_ID:-}" =~ ^[0-9]+$ ]] \
    || (( SGE_TASK_ID < 1 || SGE_TASK_ID > nplates )); then
    echo "ERROR: SGE_TASK_ID '${SGE_TASK_ID:-}' is outside 1-${nplates} (plates found)" >&2
    exit 1
fi
target_plate=${list_of_plates[SGE_TASK_ID - 1]}


# print array task and plate name
# make sure ${target_plate} is uniquely identifiable &
# doesn't group more than the number of lanes expected per plate
echo -e "\n\ntarget plate number (SGE_TASK_ID):" "$SGE_TASK_ID"
echo "target plate prefix:" "${target_plate}"



# merge R1, then R2 files across lanes -----------------------------------------

# anchor on "<plate>_L00[1-8]_" so a plate whose prefix is the start of another
# plate's prefix (e.g. ..._S1 vs ..._S16) cannot pick up the other plate's lanes
filesin_r1=("${dir_originalfastq}/${target_plate}"_L00[1-8]_*R1*fastq.gz)
filesin_r2=("${dir_originalfastq}/${target_plate}"_L00[1-8]_*R2*fastq.gz)
[[ -e "${filesin_r1[0]}" ]] || filesin_r1=()
[[ -e "${filesin_r2[0]}" ]] || filesin_r2=()

# globs expand in sorted order, so R1 and R2 lanes are concatenated in the
# same lane order; unequal counts would desynchronise the read pairs
if (( ${#filesin_r1[@]} == 0 || ${#filesin_r1[@]} != ${#filesin_r2[@]} )); then
    echo "ERROR: found ${#filesin_r1[@]} R1 and ${#filesin_r2[@]} R2 lane files for ${target_plate}" >&2
    exit 1
fi

echo -e "\n\nmerging Read 1 files:"
du -h -- "${filesin_r1[@]}"
cat -- "${filesin_r1[@]}" > "fastq_raw/${target_plate}_R1.fastq.gz" \
    || { echo "ERROR: merging Read 1 files failed" >&2; exit 1; }

echo -e "\n\nmerging Read 2 files:"
du -h -- "${filesin_r2[@]}"
cat -- "${filesin_r2[@]}" > "fastq_raw/${target_plate}_R2.fastq.gz" \
    || { echo "ERROR: merging Read 2 files failed" >&2; exit 1; }



# check output files -----------------------------------------------------------

echo -e "\n\nchecking output file sizes."
du -h -- "fastq_raw/${target_plate}_R1.fastq.gz" "fastq_raw/${target_plate}_R2.fastq.gz"

echo -e "\n\n'A01a_merge_lanes' completed.\n\n"





echo "Job $JOB_ID.$SGE_TASK_ID ended on:   " "$(hostname -s)"
echo "Job $JOB_ID.$SGE_TASK_ID ended on:   " "$(date)"


## (A01b) parse plate metadata 

In [4]:
%%bash
cat > ../Scripts/A01b_plate_metadata.sub

#!/bin/bash
#$ -cwd
#$ -o sublogs/A01b_plate_metadata.$JOB_ID
#$ -j y
#$ -N A01b_plate_metadata
#$ -l h_rt=0:10:00,h_data=4G
#$ -hold_jid A01a_merge_lanes

# ==============================================================================
# Scripts/A01b_plate_metadata.sub
# runs Scripts/A01b_plate_metadata.py, then Scripts/A01c_well_filepaths.py;
# held until the A01a_merge_lanes job has finished (-hold_jid above)
# ==============================================================================

echo "Job $JOB_ID started on:   " "$(hostname -s)"
echo "Job $JOB_ID started on:   " "$(date)"
echo " "





# environment init -------------------------------------------------------------

. /u/local/Modules/default/init/modules.sh
module load anaconda3 # <--
conda activate snm3Cseq_taurus # <--

if [[ ! -r snm3C_parameters.env ]]; then
    echo "ERROR: snm3C_parameters.env not readable in $(pwd)" >&2
    exit 1
fi
# export every non-comment KEY=VALUE line
# (the unquoted expansion is intentional: one KEY=VALUE per word)
export $(grep -v '^#' snm3C_parameters.env | xargs) # <--



# run metadata compilation -----------------------------------------------------

# because the two scripts are so fast,
# violating my .sub & .py paired tidy convention and just running both here
# (suggest running these in interactive mode anyway)

# A01c reads the csv written by A01b, so stop at the first failure instead of
# expanding a missing or stale plate table
python Scripts/A01b_plate_metadata.py \
    || { echo "ERROR: Scripts/A01b_plate_metadata.py failed" >&2; exit 1; }
python Scripts/A01c_well_filepaths.py \
    || { echo "ERROR: Scripts/A01c_well_filepaths.py failed" >&2; exit 1; }





echo -e "\n\n'A01b_plate_metadata' completed.\n\n"



echo "Job $JOB_ID ended on:   " "$(hostname -s)"
echo "Job $JOB_ID ended on:   " "$(date)"


In [5]:
%%bash
cat > ../Scripts/A01b_plate_metadata.py

# ==============================================================================
# Scripts/A01b_plate_metadata.py
# parses the raw .fastq.gz file names in $dir_originalfastq -->
# extracts plate-level metadata, saved to $metadat_plate
# ==============================================================================

# recommend running interactively in python/Jupyter to check outputs,
# the relevant metadata parameters very likely to change between studies


# load packages ----------------------------------------------------------------

import glob
import os
import re
import sys
import pandas as pd

# # if running interactively, need to load some lines from snm3C_parameters.env
# # or manually spec os.environ -- e.g., via os.environ['dir_proj'] = "mydirectory" or this below loop
# # (check relative path of parameters.env file or change to absolute if below not working!)
# envvar_needed = ['dir_proj', 'dir_originalfastq', 'metadat_plate']
# try:
#     os.environ['dir_proj']
# except KeyError:
#     envspec = pd.read_csv("../snm3C_parameters.env", sep = "=", comment="#", header = None
#                ).set_axis(['varname', 'varpath'], axis = 1
#                ).query('varname in @envvar_needed')
#     for index, row in envspec.iterrows():
#         os.environ[row["varname"]] = row["varpath"]
# os.chdir(os.environ['dir_proj'])



# check fastq.gz names ---------------------------------------------------------

fastq_dir = os.environ['dir_originalfastq']
# os.path.join: pattern is valid whether or not dir_originalfastq ends in "/"
# (plain string concatenation matches nothing without the trailing slash)
filepaths_raw_fastq = sorted(glob.glob(os.path.join(fastq_dir, "*fastq.gz")))
print( filepaths_raw_fastq[0:4] )

if not filepaths_raw_fastq:
    sys.exit("no *fastq.gz files found in dir_originalfastq: " + fastq_dir)


# data.frame of plate names ----------------------------------------------------

def plate_prefix(filepath):
    """Return the file name with everything from the lane field onwards removed.

    Same rule as the shell scripts (${name%_L00[1-8]_*}): cut at the last
    "_L00[1-8]_"; a name without a lane field is returned unchanged.
    """
    filename = os.path.basename(filepath)
    lane_match = re.match(r"(.*)_L00[1-8]_", filename)
    return lane_match.group(1) if lane_match else filename

# split before lane (L00[1-8]) to get unique plate names, sorted
plates_df = pd.DataFrame(
    {'plate' : sorted({plate_prefix(filepath) for filepath in filepaths_raw_fastq})}
    )

# study specific metadata: edit these! # <--
# example presented here is for IGVF cell lines (e.g., 20231005-3C29D1-Pos1-C04_S25)
# info in filenames separated by -, change accordingly; will throw errors if fewer fields than in example
plates_df['dateseq'] = plates_df['plate'].map(lambda platename: platename.split("-")[0])
plates_df['sample'] = plates_df['plate'].map(lambda platename: platename.split("-")[1])
plates_df['sort'] = plates_df['plate'].map(lambda platename: platename.split("-")[2])
plates_df['plateindex'] = plates_df['plate'].map(lambda platename: platename.split("-")[3])

# separating by "D" and removing "3" prefix from sample ID
# because in this example, IGVF sample identifiers are "3" (for 3C) + linename + "D" + timepoint
plates_df['line'] = plates_df['sample'].map(lambda samplename: samplename.split("D")[0]
                                           ).str.replace("^3", "", regex = True)
plates_df['time'] = plates_df['sample'].map(lambda samplename: samplename.split("D")[1])

# number each plate, "platenum" used for batch submission later on
# platenum indexed by 1-Nplates for compatibility with SGE (can't qsub -t 0)
plates_df['platenum'] = plates_df.index.astype(int) + 1
plates_df.index = plates_df.index.astype(int) + 1

# export to "Metadata/A01b_plate_metadata.csv" by default
print( plates_df.head() )
print ( plates_df.shape )
metadat_plate_dir = os.path.dirname(os.environ['metadat_plate'])
if metadat_plate_dir:
    # create the output folder on first use so to_csv cannot fail on a missing directory
    os.makedirs(metadat_plate_dir, exist_ok = True)
plates_df.to_csv(os.environ['metadat_plate'])



## (A01c) expand plate --> all 384 wells --> final "targets" file

In [6]:
%%bash
cat > ../Scripts/A01c_well_filepaths.py

# ==============================================================================
# Scripts/A01c_well_filepaths.py
# expands plate-level metadata (A01b) into well-level metadata:
# one row per plate x well (16 rows x 24 columns), a "batchnum" for job arrays,
# and the expected file paths of steps A02-A06; saved to $metadat_well
# ==============================================================================

# recommend running interactively in python/Jupyter to check outputs,
# but shouldn't require any changes to defaults

# load packages ----------------------------------------------------------------

import itertools
import pandas as pd
import numpy as np
import os

# # if running interactively, need to load some lines from snm3C_parameters.env
# # or manually spec os.environ -- e.g., via os.environ['dir_proj'] = "mydirectory" or this below loop
# # (check relative path of parameters.env file or change to absolute if below not working!)
# envvar_needed = ['dir_proj', 'dir_originalfastq', 'metadat_plate', 'metadat_well']
# try:
#     os.environ['dir_proj']
# except KeyError:
#     envspec = pd.read_csv("../snm3C_parameters.env", sep = "=", comment="#", header = None
#                ).set_axis(['varname', 'varpath'], axis = 1
#                ).query('varname in @envvar_needed')
#     for index, row in envspec.iterrows():
#         os.environ[row["varname"]] = row["varpath"]
# os.chdir(os.environ['dir_proj'])



# expand A01b metadata by well -------------------------------------------------

# load A01b
plates_df = pd.read_csv(os.environ['metadat_plate'], index_col=0)

# from pandas documentation
def expand_grid(data_dict):
    """Create a dataframe from every combination of given values."""
    rows = itertools.product(*data_dict.values())
    return pd.DataFrame.from_records(rows, columns=data_dict.keys())

# rows A-P (16) x columns 1-24 --> 384 wells per plate
filepath_df = expand_grid({'plate': plates_df['plate'],
    'row' : [chr(x) for x in range(65, 65+16)],
    'col' : [str(x + 1) for x in range(24)]})
filepath_df['well'] = filepath_df['row'] + filepath_df['col']
filepath_df['wellprefix'] = filepath_df['plate'] + "_" + filepath_df['well']

filepath_df = pd.merge(filepath_df, plates_df, how = "left", on = "plate")



# batch into sets of 24 for bismark mapping, contact calling, etc --------------
# (by default, one row at a time, incremented by "batchnum")

# - alternatively, could make smaller batches of wells (e.g., n = 5) for compute
#   environments that favor many small jobs versus a few long jobs,
# - or... two sets of batches e.g., filepath_df['batchnum_A04a_bismark']
#   pulled by the sub scripts for the A04a script / really resource-intensive jobs only

# wells per batch: 24 (one plate row) unless an optional "wells_per_batch"
# environment variable overrides it
wells_per_batch = int(os.environ.get('wells_per_batch', 24))

nwellstot = filepath_df.shape[0]
# consecutive runs of wells_per_batch wells share a batch number, starting at 1
filepath_df['batchnum'] = np.arange(nwellstot) // wells_per_batch + 1

print( "number of total wells:" )
print( nwellstot )

print( "wells per 'batchnum':" )
print( wells_per_batch )

filepath_df.index = filepath_df.index.astype(int) + 1

print( "number of plates:" )
print( "Nplates: " + str( filepath_df['platenum'].max() ) )

print( "number of batches:" )
print( "Nbatches: " + str( filepath_df['batchnum'].max() ) )



# then extensive file paths for sections A02-A06 -------------------------------
# (inelegant, but useful for file checking/compiling info)

wellprefix = filepath_df['wellprefix']

# A02: demultiplexing 
# all in dir: fastq_demultip/

for read in ("R1", "R2"):
    filepath_df['A02a_fqgz_demultip_' + read] = \
        "fastq_demultip/" + filepath_df['plate'] + "_" + filepath_df['well'] + "_indexed_" + read + ".fastq.gz"

for summary_num in ("1", "2"):
    filepath_df['A02a_txt_summary' + summary_num] = \
        "fastq_demultip/" + filepath_df['plate'] + "_summary_" + summary_num + ".txt"



# A03: trimming ----------------------------------------------------------------
# all in dir: fastq_trimmed/

for readset in ("paired_R1", "paired_R2", "singletrim_R1", "singletrim_R2"):
    filepath_df['A03a_fqgz_' + readset] = "fastq_trimmed/" + wellprefix + "_" + readset + ".fastq.gz"

filepath_df['A03a_json_fastp'] = "fastq_trimmed/" + wellprefix + ".json"


# A04: bismark -----------------------------------------------------------------

filepath_df['A04a_dir_bismark'] = "mapping_bismark/" + wellprefix + "/"
bismark_dir = filepath_df['A04a_dir_bismark']

# column-name tag --> trimmed read set it is derived from
step1_readsets = [("R1p", "paired_R1"), ("R2p", "paired_R2"),
                  ("R1trims", "singletrim_R1"), ("R2trims", "singletrim_R2")]

# (i) taurus step 1 mapping outputs
# regex = False: ".fastq.gz" is a literal suffix, not a pattern
# (the default of "regex" differs between pandas versions)
for tag, readset in step1_readsets:
    filepath_df['A04a_bam_' + tag] = \
        bismark_dir + filepath_df['A03a_fqgz_' + readset].map(os.path.basename
            ).str.replace(".fastq.gz", "_bismark.bam", regex = False)

# step 1 logs
for tag, readset in step1_readsets:
    filepath_df['A04a_bismarktxt_' + tag] = \
        bismark_dir + wellprefix + "_" + readset + "_bismark_SE_report.txt"

# (ii) taurus step 2 logs
for read in ("R1", "R2"):
    for subseq_num in ("1", "2", "3"):
        filepath_df['A04a_bismarktxt_' + read + "p" + subseq_num] = \
            bismark_dir + "subseq_" + read + "_" + subseq_num + "_bismark_SE_report.txt"

# (iii) picard de-duplication
filepath_df['A04a_log_picard'] = bismark_dir + "picard.log"

# (iv) final merged, sorted, dedupe bam
filepath_df['A04a_bam_final'] = bismark_dir + "merged_dedupe.bam"

# A04c: mapping stats ----------------------------------------------------------

filepath_df['A04c_txt_samstats'] = bismark_dir + "samstats.txt"
# NOTE(review): "covtot" points at nbases_cov_by_chr.txt and "covnsites" at
# total_cov_by_chr.txt -- the names look swapped; left unchanged here, confirm
# against the A04c script that writes these files
filepath_df['A04c_txt_covtot'] = bismark_dir + "nbases_cov_by_chr.txt"
filepath_df['A04c_txt_covnsites'] = bismark_dir + "total_cov_by_chr.txt"

# A05a: methylation quantification ---------------------------------------------

filepath_df['A05a_allc'] = bismark_dir + "allc.tsv.gz"
filepath_df['A05a_allctbi'] = bismark_dir + "allc.tsv.gz.tbi"

# A06: contact mapping ---------------------------------------------------------

filepath_df['A06a_pairs'] = bismark_dir + "pairs.tsv.gz"
filepath_df['A06a_3c_metadat'] = bismark_dir + "metadat_pairs.tsv"



# finally, export --------------------------------------------------------------
# by default exports to Metadata/A01c_well_filepath.csv


print("final metadata file dimensions:")
print(filepath_df.shape)
metadat_well_dir = os.path.dirname(os.environ['metadat_well'])
if metadat_well_dir:
    # create the output folder on first use so to_csv cannot fail on a missing directory
    os.makedirs(metadat_well_dir, exist_ok = True)
filepath_df.to_csv(os.environ['metadat_well'])



