In [None]:
# # A00_environment_and_genome_setup

# # in terminal, set up snmCT pipeline environment
# module load anaconda3 # or otherwise activate conda
# conda env create -f Documentation/snmCTseq.yml

# # optional: at this stage, i typically create a project-specific downstream analysis
# # environment as well (e.g., scverse/scanpy, etc) for reproducibility/working in jupyter, e.g.,
# # conda env create --name IGVF_mCT # followed by conda/mamba pkg installs, incl ipykernel
# # conda env export | grep -v "^prefix: " > conda-environment.yml # export versions
# # python -m ipykernel install --user --name IGVF_mCT --display-name "IGVF" # jupyter kernel

# # genome setup overall commands
# # ‡ fast enough to run interactively

# qsub Scripts/A00a_genome_dl_index.sub  # ‡
# qsub Scripts/A00b_genome_prep_bismark.sub
# qsub Scripts/A00c_genome_prep_star.sub
# qsub Scripts/A00d_gtf_annotations_bed.sub # ‡

## minimal project directory setup

In [None]:
# # run in terminal
# projdir=/u/project/cluo/chliu/Analyses/IGVF
# mkdir $projdir; cd $projdir
# mkdir fastq_demultip fastq_raw fastq_trimmed mapping_bismark mapping_star
# mkdir Metadata Notebooks Scripts sublogs

## snmCT_parameters (crucial to review)

In [None]:
%%bash
cat > ../snmCT_parameters.env

# parameters file --------------------------------------------------------------
# note: filepaths are relative to project directory or absolute paths
# recommend using absolute filepaths for all "ref_dir"/genome related files by
# find & replace "/u/project/cluo/chliu/Genomes/human_gencode_v40" -->
#        folder where your genome assembly resides
#
# format: one KEY=value per line, without spaces or quotes in the value.
# the pipeline scripts load this file via
#   export $(cat snmCT_parameters.env | grep -v '^#' | xargs)
# so every comment line must start with "#" in the first column.

# primary analysis/project folder
projdir=/u/project/cluo/chliu/Analyses/mCT_Pipeline

# scratch folder if available, otherwise can set as $projdir
dir_scratch=/u/project/cluo_scratch/chliu

# folder with raw data
# (.fastq, for our group usually split across 4 lanes of a novaseq run)
dir_originalfastq=/u/project/cluo/Shared_Datasets/IGVF/202208_Pilot/snmCT-seq/fastq/

# reference genome/files
# if a Hoffman2 user with access to our partition, can use the below hg38 paths
# (versions also exist +/- lambda spike-in)
ref_dir=/u/project/cluo/chliu/Genomes/human_gencode_v40
ref_fasta=/u/project/cluo/chliu/Genomes/human_gencode_v40/GRCh38.primary_assembly.genome.fa
ref_gtf=/u/project/cluo/chliu/Genomes/human_gencode_v40/gencode.v40.primary_assembly.annotation.gtf

# # (or this commented code for mouse)
# ref_dir=/u/project/cluo/chliu/Genomes/mouse_gencode_v29
# ref_fasta=/u/project/cluo/chliu/Genomes/mouse_gencode_v29/GRCm39.primary_assembly.genome.fa
# ref_gtf=/u/project/cluo/chliu/Genomes/mouse_gencode_v29/gencode.vM29.primary_assembly.annotation.gtf



# parameters that usually do not have to be changed -----------------

# reference genome
# (chromsizes.tsv is written by A00a; the annotations/ files are written by A00d,
#  which exports genebody/geneslop2k as .bed)
ref_chromsizes=/u/project/cluo/chliu/Genomes/human_gencode_v40/chromsizes.tsv
ref_genebody=/u/project/cluo/chliu/Genomes/human_gencode_v40/annotations/genebody.bed
ref_geneslop2k=/u/project/cluo/chliu/Genomes/human_gencode_v40/annotations/geneslop2k.bed
ref_flat=/u/project/cluo/chliu/Genomes/human_gencode_v40/annotations/refFlat.txt.gz
ref_rrna=/u/project/cluo/chliu/Genomes/human_gencode_v40/annotations/rRNA.intervallist

# STAR index folder (built by A00c)
ref_starfolder=/u/project/cluo/chliu/Genomes/human_gencode_v40/STAR

# metadata files
metadat_plate=Metadata/A01b_plate_metadata.csv
metadat_well=Metadata/A01c_well_filepath.csv

## (A00a) download reference .fasta, index, get chrom sizes

In [None]:
%%bash
cat > ../Scripts/A00a_genome_dl_index.sub

#!/bin/bash
#$ -cwd
#$ -o sublogs/A00a_genome_dl_index.$JOB_ID
#$ -j y
#$ -N A00a_genome_dl_index
#$ -l h_rt=12:00:00,h_data=8G,highp
#$ -pe shared 4

# A00a: download the GENCODE reference .fa/.gtf into $ref_dir, then build the
# samtools .fai index, chromsizes.tsv and the Picard sequence dictionary (.dict).
# submit from the project directory (the folder containing snmCT_parameters.env).



echo "Job $JOB_ID started on:   " $(hostname -s)
echo "Job $JOB_ID started on:   " $(date)
echo " "



# environment init -----------------------------------------------------------

. /u/local/Modules/default/init/modules.sh # <--
module load anaconda3 # <--
conda activate snmCTseq # <--

# load every non-comment KEY=value line of the parameters file into the environment
export $(cat snmCT_parameters.env | grep -v '^#' | xargs) # <--



# download/extract ref -------------------------------------------------------

# create the genome folder if needed and stop if it cannot be entered;
# otherwise the downloads below would land in the project directory
mkdir -p $ref_dir
cd $ref_dir || { echo "ERROR: cannot cd into ref_dir=$ref_dir"; exit 1; }

# download gencode v40 (GRCh38.p13) .fa.gz & .gtf.gz
# double check filepaths and hard-coded names here
# (the names below must agree with ref_fasta / ref_gtf in snmCT_parameters.env)
wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_40/GRCh38.primary_assembly.genome.fa.gz
wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_40/gencode.v40.primary_assembly.annotation.gtf.gz

# extract (-c keeps the original .gz alongside the uncompressed copy)
gunzip -c GRCh38.primary_assembly.genome.fa.gz > GRCh38.primary_assembly.genome.fa
gunzip -c gencode.v40.primary_assembly.annotation.gtf.gz > gencode.v40.primary_assembly.annotation.gtf



# note on working with Lambda phage or other spike-in (bisulfite conversion efficiency QC)
# nice to add to reference genome at this step
# e.g., Escherichia phage Lambda, complete genome (GenBank: J02459.1) via NCBI
# wget "https://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?tool=portal&save=file&log$=seqview&db=nuccore&report=fasta&id=215104" -O lambda.fa
# sed -i "1s/.*/>chrL/" lambda.fa
# cat GRCh38.primary_assembly.genome.fa lambda.fa > GRCh38_plus_lambda.fa



# index, chrom sizes -----------------------------------------------------------

# extract bp length/chromosome (first two .fai columns: sequence name, length)
samtools faidx GRCh38.primary_assembly.genome.fa
cut -f 1-2 GRCh38.primary_assembly.genome.fa.fai > chromsizes.tsv

# .fa --> .dict
picard CreateSequenceDictionary -R GRCh38.primary_assembly.genome.fa





echo -e "\n\n'A00a_genome_dl_index' completed.\n\n"


echo "Job $JOB_ID ended on:   " $(hostname -s)
echo "Job $JOB_ID ended on:   " $(date)
echo " "

## (A00b) prep bismark genome ref

In [None]:
%%bash
cat > ../Scripts/A00b_genome_prep_bismark.sub

#!/bin/bash
#$ -cwd
#$ -o sublogs/A00b_genome_prep_bismark.$JOB_ID
#$ -j y
#$ -N A00b_prep_bismark
#$ -l h_rt=24:00:00,h_data=8G,highp
#$ -pe shared 4
#$ -hold_jid A00a_genome_dl_index

# A00b: build the bisulfite-converted Bismark/bowtie2 genome index inside $ref_dir.
# held until the A00a_genome_dl_index job (reference download) has finished.
# submit from the project directory (the folder containing snmCT_parameters.env).



echo "Job $JOB_ID started on:   " $(hostname -s)
echo "Job $JOB_ID started on:   " $(date)
echo " "



# environment init -----------------------------------------------------------

. /u/local/Modules/default/init/modules.sh # <--
module load anaconda3 # <--
conda activate snmCTseq # <--

# load every non-comment KEY=value line of the parameters file into the environment
export $(cat snmCT_parameters.env | grep -v '^#' | xargs) # <--



# bismark index ----------------------------------------------------------------

cd $ref_dir || { echo "ERROR: cannot cd into ref_dir=$ref_dir"; exit 1; }

# bismark_genome_preparation takes the FOLDER holding the reference .fa/.fasta
# file(s) as its argument (not the .gtf annotation)
bismark_genome_preparation --bowtie2 $ref_dir




echo -e "\n\n'A00b_genome_prep_bismark' completed.\n\n"


echo "Job $JOB_ID ended on:   " $(hostname -s)
echo "Job $JOB_ID ended on:   " $(date)
echo " "

## (A00c) prep STAR genome ref

In [None]:
%%bash
cat > ../Scripts/A00c_genome_prep_star.sub

#!/bin/bash
#$ -cwd
#$ -o sublogs/A00c_genome_prep_star.$JOB_ID
#$ -j y
#$ -N A00c_prep_star
#$ -l h_rt=24:00:00,h_data=8G,highp
#$ -pe shared 4
#$ -hold_jid A00a_genome_dl_index

# A00c: build the STAR genome index from $ref_fasta + $ref_gtf.
# held until the A00a_genome_dl_index job (reference download) has finished.
# submit from the project directory (the folder containing snmCT_parameters.env).



echo "Job $JOB_ID started on:   " $(hostname -s)
echo "Job $JOB_ID started on:   " $(date)
echo " "



# environment init -----------------------------------------------------------

. /u/local/Modules/default/init/modules.sh # <--
module load anaconda3 # <--
conda activate snmCTseq # <--

# load every non-comment KEY=value line of the parameters file into the environment
export $(cat snmCT_parameters.env | grep -v '^#' | xargs) # <--



# star index -------------------------------------------------------------------

# write the index to ref_starfolder from snmCT_parameters.env so the location
# agrees with the parameters file; fall back to $ref_dir/STAR if it is not set
star_dir=${ref_starfolder:-$ref_dir/STAR}
mkdir -p $star_dir

# thread count follows the slots granted by "-pe shared" (NSLOTS), default 4
STAR --runThreadN ${NSLOTS:-4} \
--runMode genomeGenerate \
--genomeDir $star_dir \
--genomeFastaFiles $ref_fasta \
--sjdbGTFfile $ref_gtf \
--sjdbOverhang 149




echo -e "\n\n'A00c_genome_prep_star' completed.\n\n"


echo "Job $JOB_ID ended on:   " $(hostname -s)
echo "Job $JOB_ID ended on:   " $(date)
echo " "

## (A00d) extract annotations from gtf

In [None]:
%%bash
cat > ../Scripts/A00d_gtf_annotations_bed.sub

#!/bin/bash
#$ -cwd
#$ -o sublogs/A00d_gtf_annotations_bed.$JOB_ID
#$ -j y
#$ -N A00d_gtf_annotations_bed
#$ -l h_rt=0:30:00,h_data=8G

# A00d: derive annotation files under $ref_dir/annotations from the .gtf:
# .bed files (via A00d_gtf_annotations_bed.py), an rRNA interval list and a
# gzipped refFlat file for later QC metrics.
# submit from the project directory (the folder containing snmCT_parameters.env).



echo "Job $JOB_ID started on:   " $(hostname -s)
echo "Job $JOB_ID started on:   " $(date)
echo " "



# environment init -----------------------------------------------------------

. /u/local/Modules/default/init/modules.sh # <--
module load anaconda3 # <--
conda activate snmCTseq # <--

# load every non-comment KEY=value line of the parameters file into the environment
export $(cat snmCT_parameters.env | grep -v '^#' | xargs) # <--



# process .gtf --> .bed --------------------------------------------------------

# -p: do not fail when the folder already exists (e.g., when re-running)
mkdir -p $ref_dir/annotations

# run before the cd below: the script path is relative to the project directory
python Scripts/A00d_gtf_annotations_bed.py


# UCSC/Picard utilities for QC metrics -----------------------------------------

cd $ref_dir || { echo "ERROR: cannot cd into ref_dir=$ref_dir"; exit 1; }

# rRNA interval list (uses the .dict written by A00a)
picard BedToIntervalList -I annotations/rRNA.bed -O \
    annotations/rRNA.intervallist -SD GRCh38.primary_assembly.genome.dict

# ref flat
# genePredExt columns 1-10 = name ... exonEnds, column 12 = name2 (gene name)
gtfToGenePred -genePredExt -geneNameAsName2 $ref_gtf refFlat.tmp.txt
cut -f 1-10 refFlat.tmp.txt > refFlat.tmp1
cut -f 12 refFlat.tmp.txt > refFlat.tmp2

# refFlat layout puts the gene name FIRST, followed by the genePred columns
paste refFlat.tmp2 refFlat.tmp1 > annotations/refFlat.txt
# -f: overwrite an existing refFlat.txt.gz when re-running
gzip -f annotations/refFlat.txt

rm refFlat.tmp*




echo -e "\n\n'A00d_gtf_annotations_bed' completed.\n\n"


echo "Job $JOB_ID ended on:   " $(hostname -s)
echo "Job $JOB_ID ended on:   " $(date)
echo " "

In [None]:
%%bash
cat > ../Scripts/A00d_gtf_annotations_bed.py


# ==============================================================================
# A00d_gtf_annotations_bed.py
# exports four annotation-related files to $ref_dir/annotations (reference genome)
# for mcds creation & down-stream analysis:
#   ensembl_to_symbol.tsv, genebody.bed, geneslop2k.bed, rRNA.bed
# ==============================================================================

# recommend running interactively in python/Jupyter to check outputs
# works for .gtfs from GENCODE, but double check fmt for other sources

# if running interactively, check snmCT_parameters.env loaded or manually spec os.environ e.g.,
# os.environ['ref_dir'] = "/u/project/cluo/chliu/Genomes/human_gencode_v40"
# os.environ['ref_gtf'] = "/u/project/cluo/chliu/Genomes/human_gencode_v40/gencode.v40.primary_assembly.annotation.gtf"
# os.environ['ref_chromsizes'] = "/u/project/cluo/chliu/Genomes/human_gencode_v40/chromsizes.tsv"



# load packages -------------------------------------------------------------------

import os

import pandas as pd


# column layout of the bed4 outputs
BED4_COLS = ['#chr', 'start', 'end', 'gene']



# genebody ------------------------------------------------------------------------

def build_genebody(gtf_file):
    """Return one row per 'gene' feature of a header-less 9-column .gtf table.

    Output columns: '#chr', 'start' (converted from 1-based to 0-based), 'end',
    'annot' (raw attribute string), 'gene', 'symbol', 'type'. The last three
    are the 1st, 3rd and 2nd double-quoted values of the attribute string,
    which works for GENCODE .gtfs (gene_id, gene_type, gene_name order).

    Note that this contains mitochondrial, ribosomal, lncRNAs, etc;
    this may or may not be desirable in downstream analyses.
    """
    is_gene = gtf_file.iloc[:, 2] == 'gene'
    # .copy(): work on an independent frame rather than a slice of gtf_file
    genebody = gtf_file.loc[is_gene, :].iloc[:, [0, 3, 4, 8]].copy()
    genebody.columns = ['#chr', 'start', 'end', 'annot']
    genebody['start'] = genebody['start'] - 1 # start changes to 0-pos
    genebody = genebody.reset_index(drop = True)

    # extract info from the annot column (;, ")
    genebody['gene'] = genebody['annot'].map(lambda x: str(x).split('\"')[1])
    genebody['symbol'] = genebody['annot'].map(lambda x: str(x).split('\"')[5])
    genebody['type'] = genebody['annot'].map(lambda x: str(x).split('\"')[3])
    return genebody


def gtf_checks_ok(genebody):
    """Return True when no gene ID is duplicated and every start < end."""
    n_duplicated = int(genebody['gene'].duplicated().sum())
    n_bad_interval = int((genebody['start'] >= genebody['end']).sum())
    # plain boolean logic: the bitwise "|" binds tighter than "!=" and would
    # turn "a != 0 | b != 0" into a chained comparison
    return n_duplicated == 0 and n_bad_interval == 0



# gene +/- 2kb --------------------------------------------------------------------

def pad_genes(bed4, chrom_sizes, pad = 2000):
    """Extend each interval by `pad` bp on both sides, kept within the chromosome.

    `chrom_sizes` needs the columns '#chr' and 'chrlen'. Starts are floored at 0
    and ends capped at 'chrlen'. Rows whose '#chr' is missing from `chrom_sizes`
    are dropped by the inner merge.
    Past manuscripts pad for higher mC modality coverage, but less interpretable.
    """
    g2k = bed4.copy()
    g2k['start'] = (g2k['start'] - pad).clip(lower = 0)
    g2k['end'] = g2k['end'] + pad

    # check +/- pad still within chromosome length
    # (low # genes affected, but may cause downstream issues)
    g2k = pd.merge(g2k, chrom_sizes, on = '#chr')
    g2k['end'] = g2k[['end', 'chrlen']].min(axis = 1)
    return g2k.drop(columns = 'chrlen')



# rRNA genes -----------------------------------------------------------------------

def select_rrna(genebody):
    """Return the bed4 rows of genes whose attribute string mentions "rRNA".

    The mask is computed on `genebody` itself so that it shares its index;
    a mask built on the full .gtf table would pick unrelated rows.
    """
    is_rrna = genebody['annot'].astype(str).str.contains("rRNA")
    return genebody.loc[is_rrna, BED4_COLS]



# run ------------------------------------------------------------------------------

def main():
    """Read $ref_gtf / $ref_chromsizes and write the annotation files."""

    # load reference info
    os.chdir(os.environ['ref_dir'])
    os.makedirs("annotations", exist_ok = True)

    gtf_file = pd.read_csv(os.environ['ref_gtf'],
                           comment = "#", delimiter = "\t", header = None)
    chrom_sizes = pd.read_csv(os.environ['ref_chromsizes'], sep = "\t", header = None)
    chrom_sizes.columns = ['#chr', 'chrlen']

    # .gtf to .bed (1-based --> 0 based)
    genebody = build_genebody(gtf_file)

    # .gtf checks: should be zero duplicated IDs / empty intervals
    if not gtf_checks_ok(genebody):
        print("WARNING: check .bed outputs; was gene info was processed correctly from .gtf?")

    # for a later QC metric after RNA alignments
    # (selected while the attribute column is still available)
    rRNA = select_rrna(genebody)

    # export ENSG --> Symbol
    genebody.drop(columns = 'annot').to_csv(
        "annotations/ensembl_to_symbol.tsv", sep = "\t", index = False)

    # bed4 format for .allcools
    bed4 = genebody.loc[:, BED4_COLS]
    bed4.to_csv("annotations/genebody.bed", sep = "\t", index = False)

    # above, but padding a 2kb region
    g2k = pad_genes(bed4, chrom_sizes)
    g2k.to_csv("annotations/geneslop2k.bed", sep = "\t", index = False)

    rRNA.to_csv("annotations/rRNA.bed", sep = "\t", index = False)


if __name__ == "__main__":
    main()


