# Get main genome

1. [Chromosome-scale genome assembly of an important medicinal plant honeysuckle](https://www.nature.com/articles/s41597-022-01385-4)
This appears to be the best available assembly; it provides the genome FASTA, gene annotations, transcript (RNA) FASTA, and protein FASTA.
[Attachments from above paper](https://figshare.com/articles/online_resource/honeysuckle_genome_final_gene_gff3/18092708/6?file=34639925)

2. [The honeysuckle genome provides insight into the molecular mechanism of carotenoid metabolism underlying dynamic flower coloration](https://nph.onlinelibrary.wiley.com/doi/10.1111/nph.16552)

3. [Epigenetic changes in the regulation of carotenoid metabolism during honeysuckle flower development](https://www.sciencedirect.com/science/article/pii/S2468014122001352)



In [None]:
%%bash
# Fetch the reference files for paper 1: the assembly FASTA from the NCBI FTP
# site and the accompanying tables from figshare.
# Each row of the table below is "<local file name> <remote URL>".
while read -r dest url; do
    wget -O "${dest}" "${url}"
done <<'EOF'
lonicera_japonica_genome.fa.gz https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/021/464/415/GCA_021464415.1_ASM2146441v1/GCA_021464415.1_ASM2146441v1_genomic.fna.gz
lonicera_japonica_genes.gff3 https://figshare.com/ndownloader/files/32821655
lonicera_japonica_trans_pep.fa https://figshare.com/ndownloader/files/34632194
lonicera_japonica_repeats.gff3 https://figshare.com/ndownloader/files/34639925
lonicera_japonica_pep_annotation.txt https://figshare.com/ndownloader/files/34631303
EOF


In [None]:
%%bash
# Massage the downloaded reference files into the forms the later cells expect.

# The assembly is downloaded gzip-compressed (lonicera_japonica_genome.fa.gz),
# so create the plain FASTA the commands below read if it is not there yet.
if [ ! -s lonicera_japonica_genome.fa ]; then
    gunzip -c lonicera_japonica_genome.fa.gz > lonicera_japonica_genome.fa
fi

# Rename the NCBI headers because they do not match the gene annotation:
#   1. drop everything up to and including "Sijihua "
#   2. replace "chromosome " with the "Chr0" prefix
#   3. drop the first comma and everything after it
seqkit replace -p ".+Sijihua " lonicera_japonica_genome.fa | \
    seqkit replace -p "chromosome " -r "Chr0" | \
    seqkit replace -p ",.+" > lonicera_japonica_rename_genome.fa

# Use AGAT to build cDNA sequences from the renamed genome and the gene models
# (exon features, merged per transcript)
agat_sp_extract_sequences.pl \
    -g lonicera_japonica_genes.gff3 \
    -f lonicera_japonica_rename_genome.fa \
    --cdna -t exon --merge > lonicera_japonica_cdna.fa

# Chromosome sizes table via pyfaidx's `faidx` command-line tool
faidx lonicera_japonica_rename_genome.fa -i chromsizes > lonicera_japonica_rename_sizes.genome


In [None]:
%%bash
# Grab some long-read PacBio data from 1 (run SRR17509642, fetched from the EBI FTP server).
# The %%bash magic is required: this is a shell command, not Python.
wget -O lonicera_japonica_pacbio_dna_SRR17509642.fq.gz \
    ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR175/042/SRR17509642/SRR17509642_subreads.fastq.gz


In [None]:
%%bash
# Build the minimap2 index (ref.mmi) for the renamed genome.
IN_FOLDER="/data/Epigenetics_Workshop/input_data"
mkdir -p "${IN_FOLDER}/lonicera_japonica_mm2"

# The k-mer / minimizer-window settings are fixed when the index is written and
# cannot be changed at mapping time, so build the index with the same preset
# (map-hifi) that the mapping cell passes to `minimap2 -ax`.
minimap2 -x map-hifi -d "${IN_FOLDER}/lonicera_japonica_mm2/ref.mmi" \
    "${IN_FOLDER}/lonicera_japonica_rename_genome.fa"


In [None]:
%%bash
# Run fastp over the PacBio reads; writes the filtered FASTQ plus HTML/JSON reports.
SAMPLE_NAME="lonicera_japonica_pacbio_dna_SRR17509642"
IN_FOLDER="/data/Epigenetics_Workshop/input_data"
OUT_FOLDER="/data/Epigenetics_Workshop/input_data"

# -Q disables quality filtering (these are PacBio reads); -w 16 uses 16 worker threads
fastp -w 16 -Q -i "${IN_FOLDER}/${SAMPLE_NAME}.fq.gz" \
      -o "${OUT_FOLDER}/${SAMPLE_NAME}_filt.fq.gz" \
      -h "${OUT_FOLDER}/${SAMPLE_NAME}_filt.html" \
      -j "${OUT_FOLDER}/${SAMPLE_NAME}_filt.json"


In [None]:
%%bash
# Map the filtered PacBio reads to the genome and produce a coordinate-sorted,
# indexed BAM for visualisation.
IX_DIR="/data/Epigenetics_Workshop/input_data/lonicera_japonica_mm2/ref.mmi"  # path of the .mmi index file
IN_DIR="/data/Epigenetics_Workshop/input_data"
OUT_DIR="/data/Epigenetics_Workshop/input_data"
TEMP_DIR="/data/Epigenetics_Workshop/input_data/tmp1"  # used as the prefix for samtools sort temp files
SAMPLE_NAME="lonicera_japonica_pacbio_dna_SRR17509642"
cd "${IN_DIR}"

# Map using minimap2 with the PacBio HiFi preset. `samtools sort` already writes
# BAM (format picked from the .bam suffix of -o), so no extra `samtools view`
# conversion step is needed.
minimap2 -t 16 -ax map-hifi \
    "${IX_DIR}" \
    "${IN_DIR}/${SAMPLE_NAME}_filt.fq.gz" | \
    samtools sort -T "${TEMP_DIR}" -@ 8 \
        -o "${OUT_DIR}/${SAMPLE_NAME}_mm_cordSorted.bam" -
samtools index "${OUT_DIR}/${SAMPLE_NAME}_mm_cordSorted.bam"


In [None]:
%%bash
# Build the kallisto index from the cDNA FASTA
CDNA_FASTA="lonicera_japonica_cdna.fa"
KALLISTO_INDEX="lonicera_japonica_kallisto.idx"
kallisto index -i "${KALLISTO_INDEX}" "${CDNA_FASTA}"


In [None]:
%%bash
# Install build tooling and headers (cmake, zlib, autoconf, HDF5) with apt.
# -y answers the confirmation prompt, which a notebook cell cannot do interactively.
# NOTE(review): presumably these are needed to compile one of the tools used in
# this notebook from source; confirm which one, and run this cell before it.
apt-get install -y cmake zlib1g-dev autoconf libhdf5-dev

In [None]:
%%bash
#Get some RNA-seq from 2 RNA-seq in PRJNA813701
#Green third stage (SRX14408209,SRX14408210,SRX14408211)
#Golden flower stage (SRX14408205,SRX14408206,SRX14408207)
#
# Each row of the table is "<output prefix> <run accession>". Both mates are
# fetched per run: <run>_1.fastq.gz -> <prefix>_r1.fq.gz and
# <run>_2.fastq.gz -> <prefix>_r2.fq.gz. (Previously the r2 files of rep2_green
# and rep3_green were downloaded from the _1 URL, i.e. they were copies of r1.)
while read -r prefix run; do
    # Directory layout used by these URLs:
    #   vol1/fastq/<first 6 characters>/0<last 2 digits>/<run>/
    run_url="ftp://ftp.sra.ebi.ac.uk/vol1/fastq/${run:0:6}/0${run: -2}/${run}"
    for mate in 1 2; do
        wget -O "${prefix}_r${mate}.fq.gz" "${run_url}/${run}_${mate}.fastq.gz"
    done
done <<'EOF'
rep1_green SRR18269918
rep2_green SRR18269917
rep3_green SRR18269916
rep1_golden SRR18269922
rep2_golden SRR18269921
rep3_golden SRR18269920
EOF



In [None]:
%%bash
# Run fastp on every paired-end RNA-seq sample; each gets filtered R1/R2 FASTQ
# files plus an HTML and a JSON report.
IN_FOLDER="/data/Epigenetics_Workshop/input_data"
OUT_FOLDER="/data/Epigenetics_Workshop/input_data"

# This one sample is written uncompressed (.fq) because STAR could not read the
# gzipped reads through zcat on NTFS (Windows) drives. The other samples are
# only used by kallisto, so they stay gzipped.
UNCOMPRESSED_SAMPLE="rep1_golden"

for SAMPLE_NAME in rep1_golden rep2_golden rep3_golden \
                   rep1_green rep2_green rep3_green; do
    if [ "${SAMPLE_NAME}" = "${UNCOMPRESSED_SAMPLE}" ]; then
        OUT_EXT="fq"
    else
        OUT_EXT="fq.gz"
    fi
    fastp -w 16 -i "${IN_FOLDER}/${SAMPLE_NAME}_r1.fq.gz" \
          -I "${IN_FOLDER}/${SAMPLE_NAME}_r2.fq.gz" \
          -o "${OUT_FOLDER}/${SAMPLE_NAME}_r1_filt.${OUT_EXT}" \
          -O "${OUT_FOLDER}/${SAMPLE_NAME}_r2_filt.${OUT_EXT}" \
          -h "${OUT_FOLDER}/${SAMPLE_NAME}_filt.html" \
          -j "${OUT_FOLDER}/${SAMPLE_NAME}_filt.json"
done


In [None]:
%%bash

# Make the STAR reference (genome index plus splice junctions from the gene models)
REF_DIR="/data/Epigenetics_Workshop/input_data"

# The annotation is GFF3, not GTF. STAR's default expects a `transcript_id`
# attribute on exon lines; GFF3 links exons to their transcript through
# `Parent`, so the parent tag has to be set explicitly.
STAR --runThreadN 16 \
     --runMode genomeGenerate --outTmpDir "${REF_DIR}/tmp_star" \
     --genomeDir "${REF_DIR}/lonicera_japonica_star" \
     --genomeFastaFiles "${REF_DIR}/lonicera_japonica_rename_genome.fa" \
     --sjdbGTFfile "${REF_DIR}/lonicera_japonica_genes.gff3" \
     --sjdbGTFtagExonParentTranscript Parent \
     --sjdbOverhang 149 --limitGenomeGenerateRAM 30000000000 --genomeSAindexNbases 12



In [None]:
%%bash
# Align one sample to the genome with STAR so it can be visualised. Also emits
# wiggle signal tracks, a transcriptome-space BAM and per-gene read counts.
IX_DIR="/data/Epigenetics_Workshop/input_data/lonicera_japonica_star"
IN_DIR="/data/Epigenetics_Workshop/input_data"
OUT_DIR="/data/Epigenetics_Workshop/input_data"
TEMP_DIR="/data/Epigenetics_Workshop/input_data/tmp1"
SAMPLE_NAME="rep1_golden"
cd "${IN_DIR}"

# The output prefix is a directory; make sure it exists before STAR writes into it
mkdir -p "${OUT_DIR}/${SAMPLE_NAME}_star"

# The reads for this sample were written uncompressed by fastp, so they are
# passed as plain .fq files with no decompression command.
STAR --runMode alignReads \
     --genomeDir "${IX_DIR}" \
     --outSAMtype BAM SortedByCoordinate --runThreadN 32 \
     --outFileNamePrefix "${OUT_DIR}/${SAMPLE_NAME}_star/" \
     --outTmpDir "${TEMP_DIR}" \
     --limitBAMsortRAM 25000000000 \
     --outWigType wiggle \
     --quantMode TranscriptomeSAM GeneCounts \
     --readFilesIn "${SAMPLE_NAME}_r1_filt.fq" "${SAMPLE_NAME}_r2_filt.fq"



In [None]:
%%bash
# Convert STAR's wiggle signal tracks to bigWig for visualisation.
# The %%bash magic is required: these are shell commands, not Python.
CHROM_SIZES="lonicera_japonica_rename_sizes.genome"
IN_DIR="/data/Epigenetics_Workshop/input_data"
SAMPLE_NAME="rep1_golden"

# Work inside the STAR output directory; the wigToBigWig binary is expected one level up
cd "${IN_DIR}/${SAMPLE_NAME}_star"

# str1 track -> *_pos.bw
../wigToBigWig Signal.UniqueMultiple.str1.out.wig \
    "${IN_DIR}/${CHROM_SIZES}" \
    "${SAMPLE_NAME}_lonicera_japonica_pos.bw"
# str2 track -> *_neg.bw
../wigToBigWig Signal.UniqueMultiple.str2.out.wig \
    "${IN_DIR}/${CHROM_SIZES}" \
    "${SAMPLE_NAME}_lonicera_japonica_neg.bw"

In [None]:
%%bash
# Quantify all samples with kallisto (golden flower stage, then green bud stage).
# Each sample gets its own output directory lonicera_japonica_<sample>/ holding
# a plain-text abundance table (--plaintext).
for SAMPLE_NAME in rep1_golden rep2_golden rep3_golden \
                   rep1_green rep2_green rep3_green; do
    # The fastp cell writes rep1_golden uncompressed (.fq) and the other samples
    # gzipped (.fq.gz), so use whichever form of the filtered reads exists
    # instead of assuming .fq.gz for every sample.
    READ_EXT="fq.gz"
    if [ ! -f "${SAMPLE_NAME}_r1_filt.${READ_EXT}" ]; then
        READ_EXT="fq"
    fi
    kallisto \
        quant -t 8 -i lonicera_japonica_kallisto.idx \
        -o "lonicera_japonica_${SAMPLE_NAME}" --plaintext \
        "${SAMPLE_NAME}_r1_filt.${READ_EXT}" "${SAMPLE_NAME}_r2_filt.${READ_EXT}"
done



In [None]:
import os

import pandas as pd

# Differential expression, step 1: load each sample's kallisto abundance table
# and assemble a transcripts x samples matrix of integer estimated counts.

in_dir = "/data/Epigenetics_Workshop/input_data"
samples = [
    "rep1_golden", "rep2_golden", "rep3_golden",
    "rep1_green", "rep2_green", "rep3_green",
]

# One abundance table per sample, in the same order as `samples`
sample_df_list = [
    pd.read_csv(os.path.join(in_dir, f'lonicera_japonica_{sample}/abundance.tsv'), sep="\t")
    for sample in samples
]

# est_counts is fractional; astype(int) truncates it to whole counts.
# One column per sample, labelled with the sample name.
sample_counts = pd.concat(
    [abundance['est_counts'].astype(int) for abundance in sample_df_list],
    axis=1,
    keys=samples,
)

# Label the rows with the transcript IDs taken from the first sample's table
sample_counts.index = sample_df_list[0]["target_id"]

In [None]:
# Keep only the rows whose counts, summed across the columns, are above zero
sample_counts = sample_counts.loc[sample_counts.sum(axis="columns").gt(0)]
sample_counts

In [None]:
# Put samples on the rows and transcripts on the columns. Transpose only while
# the transcript IDs ("target_id") are still on the rows, so that re-running
# this cell is a no-op instead of flipping the matrix back again.
if sample_counts.index.name == "target_id":
    sample_counts = sample_counts.T

# One metadata row per sample, indexed by sample name. The condition is the part
# of the sample name after "rep<N>_", capitalised ("rep1_golden" -> "Golden"),
# so the labels always follow the actual row order of the count matrix.
metadata = pd.DataFrame(
    {"Condition": [name.split("_", 1)[1].capitalize() for name in sample_counts.index]},
    index=pd.Index(sample_counts.index, name="Sample"),
)
metadata

In [None]:
from pydeseq2.dds import DeseqDataSet
from pydeseq2.ds import DeseqStats

# Dataset for the differential-expression fit: the count matrix plus the
# per-sample metadata, with "Condition" as the only design factor.
dds = DeseqDataSet(
    counts=sample_counts,
    metadata=metadata,
    design_factors="Condition",
)



In [None]:

# Fit the model with pydeseq2's all-in-one deseq2() step; it operates on `dds` itself
dds.deseq2()

In [None]:
from pydeseq2.default_inference import DefaultInference

# Inference backend configured with 8 CPUs
inference = DefaultInference(n_cpus=8)

# Statistics for the Condition contrast "Golden" vs "Green" on the fitted dataset
stat_res = DeseqStats(
    dds,
    contrast=('Condition', 'Golden', 'Green'),
    inference=inference,
)
stat_res.summary()


In [None]:
# Results table held on the stats object (one row per tested feature)
res = stat_res.results_df
res

In [None]:
# Keep only rows whose baseMean is at least 10
res = res.loc[res["baseMean"].ge(10)]
res

In [None]:
# Significant hits: adjusted p-value below 0.005 and |log2 fold change| above 2
sigs = res.loc[res["padj"].lt(0.005) & res["log2FoldChange"].abs().gt(2)]
sigs

In [None]:
# Row labels of positions 1-9 of the significant set (position 0 is not included)
# NOTE(review): if the first ten hits were intended, this should be iloc[:10]; confirm
sigs.iloc[1:10].index

In [None]:
# Label of the second significant row, cut at the first "."
# NOTE(review): presumably the part after the "." is an isoform suffix; confirm
sigs.iloc[1].name.split(".")[0]

In [None]:
%%bash

# Look up which annotations exist for a handful of IDs in the annotation table
ANNOTATION_FILE="../input_data/lonicera_japonica_pep_annotation.txt"
for TRANSCRIPT_ID in "EVM0011999.1" "EVM0018585.2" "EVM0009674.1" "EVM0010779.2"; do
    grep "${TRANSCRIPT_ID}" "${ANNOTATION_FILE}"
done


# Places to go from here:

https://github.com/mousepixels/sanbomics_scripts/blob/main/PyDeseq2_DE_tutorial.ipynb