# Get main genome

1. [A super pan-genomic landscape of rice](https://www.nature.com/articles/s41422-022-00685-z)
The Oxford Nanopore data used below are taken from this study.

2. [Comprehensive mapping and modelling of the rice regulome landscape unveils the regulatory architecture underlying complex traits](https://www.nature.com/articles/s41467-024-50787-y)

In [None]:
%%bash
# https://www.ncbi.nlm.nih.gov/datasets/genome/GCF_034140825.1/
# Grab the genome and gene annotations from NCBI using genome ASM3414082v1
wget -O oryza_sativa_genome.fa.gz https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/034/140/825/GCF_034140825.1_ASM3414082v1/GCF_034140825.1_ASM3414082v1_genomic.fna.gz
wget -O oryza_sativa_genes.gff.gz https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/034/140/825/GCF_034140825.1_ASM3414082v1/GCF_034140825.1_ASM3414082v1_genomic.gff.gz

# Also grab the OLD genome (IRGSP-1.0) that is compatible with ChIPHub.
# The decompressed copy is written only when the download succeeded, because
# the later cells read the plain oryza_sativa_genome_IRGSP-1.0.fa file.
wget -O oryza_sativa_genome_IRGSP-1.0.fa.gz https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/001/433/935/GCF_001433935.1_IRGSP-1.0/GCF_001433935.1_IRGSP-1.0_genomic.fna.gz \
  && gzip -dc oryza_sativa_genome_IRGSP-1.0.fa.gz > oryza_sativa_genome_IRGSP-1.0.fa

# Fetch the GFF3 annotation so the file content matches its .gff.gz name
# (the previous URL pointed at the GTF file).
wget -O oryza_sativa_genes_IRGSP-1.0.gff.gz https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/001/433/935/GCF_001433935.1_IRGSP-1.0/GCF_001433935.1_IRGSP-1.0_genomic.gff.gz


In [None]:
# Rename the NCBI sequence headers because they do not match the sequence
# names of the genome used downstream in the visualisation.
# The four substitutions, in order:
#   1. drop everything up to and including "chromosome "
#   2. collapse everything up to "Syng" into just "Syng"
#   3. drop the first comma and everything after it
#   4. drop the first whitespace character and everything after it
seqkit replace -p ".+chromosome " oryza_sativa_genome_IRGSP-1.0.fa \
  | seqkit replace -p ".+Syng" -r "Syng" \
  | seqkit replace -p ",.+" \
  | seqkit replace -p "\s.+" \
  > oryza_sativa_genome_IRGSP_rename.fa

#######################################
# Rewrite the sequence-id column (column 1) of a GFF/GTF stream.
# Arguments: $1 - two-column TSV: old sequence id <TAB> new sequence id
# Inputs:    annotation records on stdin
# Outputs:   renamed records on stdout; ids missing from the map and
#            comment lines pass through unchanged, except that
#            "##sequence-region" directives are renamed as well
#######################################
rename_seqids() {
  awk -v map="$1" '
    BEGIN {
      FS = OFS = "\t"
      # Load the map in BEGIN (instead of the NR == FNR idiom) so that an
      # empty map file cannot swallow the annotation records.
      while ((getline line < map) > 0) {
        split(line, kv, "\t")
        new_name[kv[1]] = kv[2]
      }
      close(map)
    }
    /^##sequence-region[ \t]/ {
      n = split($0, part, /[ \t]+/)
      if (n >= 4 && (part[2] in new_name)) {
        print part[1] " " new_name[part[2]] " " part[3] " " part[4]
        next
      }
    }
    /^#/ { print; next }
    ($1 in new_name) { $1 = new_name[$1] }
    { print }
  '
}

# Rename the annotation (this step used to be a second copy of the FASTA
# pipeline above, so the annotation was never actually renamed).
# seqkit keeps the record order, so pairing the ids of the original and the
# renamed FASTA line by line gives the old -> new name map.
paste \
  <(seqkit seq -n -i oryza_sativa_genome_IRGSP-1.0.fa) \
  <(seqkit seq -n -i oryza_sativa_genome_IRGSP_rename.fa) \
  > oryza_sativa_genome_IRGSP_rename_map.tsv

gzip -dc oryza_sativa_genes_IRGSP-1.0.gff.gz \
  | rename_seqids oryza_sativa_genome_IRGSP_rename_map.tsv \
  > oryza_sativa_genes_IRGSP_rename.gff


In [None]:
# Build the minimap2 index (.mmi) of the renamed IRGSP-1.0 genome inside a
# dedicated sub-folder of the input directory.
IN_FOLDER="/data/Epigenetics_Workshop/input_data/"
mkdir -p "${IN_FOLDER}/oryza_sativa_mm2"
minimap2 \
  -d "${IN_FOLDER}/oryza_sativa_mm2/ref_rename.mmi" \
  "${IN_FOLDER}/oryza_sativa_genome_IRGSP_rename.fa"


In [None]:
# Download the Oxford Nanopore reads (run SRR15421487) from reference 1
# and store them under a shorter .fq.gz name.
wget \
  --output-document=SRR15421487_1.fq.gz \
  'ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR154/087/SRR15421487/SRR15421487_1.fastq.gz'


In [None]:
# Filter the single-end reads of the sample with fastp (16 worker threads)
# and write the HTML and JSON reports next to the filtered FASTQ.
SAMPLE_NAME="SRR15421487"
IN_FOLDER="/data/Epigenetics_Workshop/input_data/"
OUT_FOLDER="/data/Epigenetics_Workshop/input_data/"
fastp \
  --thread 16 \
  --in1 "${IN_FOLDER}/${SAMPLE_NAME}_1.fq.gz" \
  --out1 "${OUT_FOLDER}/${SAMPLE_NAME}_r1_filt.fq.gz" \
  --html "${OUT_FOLDER}/${SAMPLE_NAME}_filt.html" \
  --json "${OUT_FOLDER}/${SAMPLE_NAME}_filt.json"


In [None]:
# Map the filtered nanopore reads to the renamed genome and produce a
# coordinate-sorted, indexed BAM.
# IX_DIR   - minimap2 index (.mmi) built from the renamed genome
# TEMP_DIR - prefix used by samtools sort for its temporary files
# OUT_DIR and SAMPLE_NAME are reused by the bamCoverage cell that follows.
IX_DIR="/data/Epigenetics_Workshop/input_data/oryza_sativa_mm2/ref_rename.mmi"
IN_DIR="/data/Epigenetics_Workshop/input_data/"
OUT_DIR="/data/Epigenetics_Workshop/input_data/"
TEMP_DIR="/data/Epigenetics_Workshop/input_data/tmp1"
SAMPLE_NAME="SRR15421487"
cd "${IN_DIR}"

# samtools sort already writes BAM, so it writes the final file itself via
# -o; the former extra "samtools view -hbS" pass only decompressed and
# recompressed the same records (and -S is an obsolete no-op flag).
minimap2 -t 16 -ax map-ont \
  "${IX_DIR}" \
  "${IN_DIR}/${SAMPLE_NAME}_r1_filt.fq.gz" \
  | samtools sort -T "${TEMP_DIR}" -@ 8 \
      -o "${OUT_DIR}/${SAMPLE_NAME}_oryza_sativa_nanopore_mm_cordSorted.bam" -
samtools index "${OUT_DIR}/${SAMPLE_NAME}_oryza_sativa_nanopore_mm_cordSorted.bam"


In [None]:
# Turn the sorted nanopore BAM into a bigWig coverage track.
# OUT_DIR and SAMPLE_NAME are expected to still be set from the mapping cell.
bamCoverage \
  --bam "${OUT_DIR}/${SAMPLE_NAME}_oryza_sativa_nanopore_mm_cordSorted.bam" \
  --outFileName "${OUT_DIR}/${SAMPLE_NAME}_oryza_sativa_nanopore_mm.bw"

In [None]:
# Download both mates of the chromatin accessibility run SRR23699890 from
# reference 2; -nc skips a file that is already present locally.
wget -nc \
  'ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR236/090/SRR23699890/SRR23699890_1.fastq.gz' \
  'ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR236/090/SRR23699890/SRR23699890_2.fastq.gz'


In [None]:
# Filter the paired-end reads with fastp (16 worker threads), letting fastp
# detect the adapters for paired-end data, and write HTML/JSON reports.
SAMPLE_NAME="SRR23699890"
IN_FOLDER="/data/Epigenetics_Workshop/input_data/"
OUT_FOLDER="/data/Epigenetics_Workshop/input_data/"
fastp \
  --thread 16 \
  --in1 "${IN_FOLDER}/${SAMPLE_NAME}_1.fastq.gz" \
  --in2 "${IN_FOLDER}/${SAMPLE_NAME}_2.fastq.gz" \
  --out1 "${OUT_FOLDER}/${SAMPLE_NAME}_r1_filt.fq.gz" \
  --out2 "${OUT_FOLDER}/${SAMPLE_NAME}_r2_filt.fq.gz" \
  --html "${OUT_FOLDER}/${SAMPLE_NAME}_filt.html" \
  --json "${OUT_FOLDER}/${SAMPLE_NAME}_filt.json" \
  --detect_adapter_for_pe


In [None]:
# Build the chromap index of the IRGSP-1.0 genome (original NCBI headers)
# from inside the input folder, so that relative file names resolve there.
IN_FOLDER="/data/Epigenetics_Workshop/input_data/"
cd "${IN_FOLDER}"
chromap -i -r oryza_sativa_genome_IRGSP-1.0.fa -o oryza_sativa_IRGSP_chromap.idx


In [None]:
# Map the ATAC-seq reads with chromap against the IRGSP-1.0 index.
IN_FOLDER="/data/Epigenetics_Workshop/input_data/"
IX_MAP=${IN_FOLDER}/"oryza_sativa_IRGSP_chromap.idx"
SAMPLE_NAME="SRR23699890"

cd "${IN_FOLDER}"

# Scratch run on t1.fq / t2.fq with a MAPQ threshold of 10.
# NOTE(review): t1.fq / t2.fq are not created anywhere in this notebook -
# presumably a small hand-made test subset; confirm or drop this run.
chromap -x "${IX_MAP}" -t 8 \
  -r oryza_sativa_genome_IRGSP-1.0.fa \
  -q 10 \
  -1 t1.fq \
  -2 t2.fq \
  -o test.bed

# ATAC preset on the fastp-filtered reads. This run has its own output name:
# it used to share the output file of the raw-read run below, which then
# overwrote this result.
chromap --preset atac -x "${IX_MAP}" -t 8 \
  -r oryza_sativa_genome_IRGSP-1.0.fa \
  -1 "${SAMPLE_NAME}_r1_filt.fq.gz" \
  -2 "${SAMPLE_NAME}_r2_filt.fq.gz" \
  -o "${SAMPLE_NAME}_oryza_sativa_chromap_filt.bed"

# ATAC preset on the raw (unfiltered) reads; keeps the original output name.
chromap --preset atac -x "${IX_MAP}" -t 8 \
  -r oryza_sativa_genome_IRGSP-1.0.fa \
  -1 "${SAMPLE_NAME}_1.fastq.gz" \
  -2 "${SAMPLE_NAME}_2.fastq.gz" \
  -o "${SAMPLE_NAME}_oryza_sativa_chromap.bed"


In [None]:
# Map the filtered paired-end ATAC-seq reads with minimap2 and build a
# coverage track for visualisation.
# IX_DIR   - minimap2 index (.mmi) built from the renamed genome
# TEMP_DIR - prefix used by samtools sort for its temporary files
IX_DIR="/data/Epigenetics_Workshop/input_data/oryza_sativa_mm2/ref_rename.mmi"
IN_DIR="/data/Epigenetics_Workshop/input_data/"
OUT_DIR="/data/Epigenetics_Workshop/input_data/"
TEMP_DIR="/data/Epigenetics_Workshop/input_data/tmp"
SAMPLE_NAME="SRR23699890"
cd "${IN_DIR}"

# Map using minimap2 with the short-read (sr) preset and sort the BAM for
# calling peaks. samtools sort already writes BAM, so it writes the final
# file itself via -o; the former extra "samtools view -hbS" pass only
# decompressed and recompressed the same records.
minimap2 -t 16 -ax sr \
  "${IX_DIR}" \
  "${SAMPLE_NAME}_r1_filt.fq.gz" "${SAMPLE_NAME}_r2_filt.fq.gz" \
  | samtools sort -T "${TEMP_DIR}" -@ 8 \
      -o "${SAMPLE_NAME}_oryza_sativa_mm2_sort.bam" -
samtools index "${SAMPLE_NAME}_oryza_sativa_mm2_sort.bam"

# Make a bigWig for visualisation; --Offset is given the two values 5 and -1.
bamCoverage --Offset 5 -1 -b "${SAMPLE_NAME}_oryza_sativa_mm2_sort.bam" \
  -o "${SAMPLE_NAME}_oryza_sativa_mm2.bw"


In [None]:
# Estimate the effective genome size of rice with khmer's unique-kmers.py,
# using a k-mer size chosen with the read length in mind.
# The value 340451842 is the one passed to macs3 as the genome size.
unique-kmers.py \
  -k 80 \
  oryza_sativa_genome_IRGSP-1.0.fa

In [None]:
%%bash
# Call ATAC-seq peaks on the sorted minimap2 BAM, keep the summits on
# chromosomes 1-12 and compute 350 bp flanks around the summits.
IN_DIR="/data/Epigenetics_Workshop/input_data/"
SAMPLE_NAME="SRR23699890"
GENOME_SIZE="340451842"
cd "${IN_DIR}"

#######################################
# Keep only records whose first column is a chromosome name from 1 to 12.
# Inputs:  BED-like records on stdin
# Outputs: matching records on stdout
# The explicit pattern replaces a numeric range test that depended on awk
# deciding per field whether to compare as number or as string.
#######################################
keep_chromosomes_1_to_12() {
  awk '$1 ~ /^([1-9]|1[0-2])$/'
}

# Call peaks using two methods
macs3 callpeak -f BAMPE --call-summits \
  -t "${SAMPLE_NAME}_oryza_sativa_mm2_sort.bam" \
  -g "${GENOME_SIZE}" -B -q 0.05 --trackline \
  -n "${SAMPLE_NAME}_oryza_sativa_atac.macs3.default.summits.bampe"
macs3 hmmratac -i "${SAMPLE_NAME}_oryza_sativa_mm2_sort.bam" \
  -n "${SAMPLE_NAME}_oryza_sativa_atac.macs3.hmmratac.bampe"

# Filter by chromosome only 1-12
keep_chromosomes_1_to_12 \
  < "${SAMPLE_NAME}_oryza_sativa_atac.macs3.default.summits.bampe_summits.bed" \
  > "${SAMPLE_NAME}_oryza_sativa_atac_summits_filt.bed"

# bedtools flank needs a genome file (sequence name <TAB> length) after -g;
# the option used to have no argument. The table is taken from the index of
# the BAM the peaks were called on, so the names match the peak file.
# idxstats ends with a "*" row for unplaced reads, which is dropped.
samtools idxstats "${SAMPLE_NAME}_oryza_sativa_mm2_sort.bam" \
  | awk -F'\t' '$1 != "*" { print $1 "\t" $2 }' \
  > "${SAMPLE_NAME}_oryza_sativa_mm2_sort.chrom.sizes"

# Flanks are printed to stdout, as before.
bedtools flank -b 350 \
  -g "${SAMPLE_NAME}_oryza_sativa_mm2_sort.chrom.sizes" \
  -i "${SAMPLE_NAME}_oryza_sativa_atac.macs3.default.summits.bampe_summits.bed"

# Interpreting results with public data
Lots of rice ChIP data (and data for other plants) is available in [ChIPHub](https://biobigdata.nju.edu.cn/ChIPHub/).
## Build a session that loads into the browser
[Oryza Sativa Genome Viewer](https://biobigdata.nju.edu.cn/browser/?genome=oryza_sativa)

