## Analysis 

In [None]:
%%bash
# Set up the directory that holds the African green monkey (AGM) genome and gene annotations.

# -p creates missing parent directories and keeps the cell safe to re-run
# (plain mkdir errors out when the directory already exists).
mkdir -p /input_dir/corona_analysis/annotations/AGM
# NOTE: a %%bash cell runs in its own subprocess, so this cd does not carry over to later cells.
cd /input_dir/corona_analysis/annotations/AGM

In [1]:
%%bash
# Build the African green monkey genome FASTA from the per-chromosome files
# (there is no primary_assembly file to download for this species).
# Fail fast: stop at the first failed download instead of concatenating broken pieces.
set -euo pipefail
cd /input_dir/corona_analysis/annotations/AGM/

base_url=ftp://ftp.ensembl.org/pub/release-99/fasta/chlorocebus_sabaeus/dna
prefix=Chlorocebus_sabaeus.ChlSab1.1.dna_rm

chromosomes=( {1..29} X Y MT )
for chrom in "${chromosomes[@]}"
do
    part="${prefix}.chromosome.${chrom}.fa.gz"
    # -f: return an error on a server failure rather than saving the error body as a .fa.gz
    # -S: still print the error message even though -s silences the progress meter
    curl -f -s -S -L -o "$part" "${base_url}/${part}"
done

# Concatenated gzip members form a valid gzip stream, so the pieces are joined first
# and decompressed once. The output name does not match the chromosome.* glob.
cat "${prefix}".chromosome.*.fa.gz > "${prefix}.all_chr.fa.gz"
# -f overwrites a FASTA left over from a previous run instead of aborting.
gunzip -f "${prefix}.all_chr.fa.gz"

In [None]:
%%bash
# Download the Ensembl release-99 gene annotation (GTF) for the African green monkey.
set -euo pipefail
# A %%bash cell starts in the kernel's working directory, so change into the annotation
# directory explicitly: the STAR and RSEM index cells read the GTF from this location.
cd /input_dir/corona_analysis/annotations/AGM/

gtf=Chlorocebus_sabaeus.ChlSab1.1.99.gtf
# -f: fail on a server error rather than saving the error body as the .gtf.gz
curl -f -s -S -L -o "${gtf}.gz" \
    "ftp://ftp.ensembl.org/pub/release-99/gtf/chlorocebus_sabaeus/${gtf}.gz"
# -f overwrites a GTF left over from a previous run instead of aborting.
gunzip -f "${gtf}.gz"

    

# Get reference transcriptome for vero cells

* DRX017021 - DRR018832
* DRX017022 - DRR018833
* DRX017023 - DRR018834
* DRX017024 - DRR018835


In [None]:
%%bash
## Prefetch the SRA archive of each Vero reference experiment into input_reads/
for accession in DRX017021 DRX017022 DRX017023 DRX017024
do
    prefetch -o "/input_dir/corona_analysis/input_reads/${accession}.sra" "$accession"
done


In [None]:
%%bash
#Unpack SRA files, toss technicals, zip into fq.gz files
parallel-fastq-dump -t 8 --tmpdir /input_dir/corona_analysis/temp/ \
    -s /input_dir/corona_analysis/input_reads/DRX017021.sra \
    --dumpbase --clip --gzip --skip-technical --readids \
    --read-filter pass --split-files \
    --outdir /input_dir/corona_analysis/input_reads/

parallel-fastq-dump -t 8 --tmpdir /input_dir/corona_analysis/temp/ \
    -s /input_dir/corona_analysis/input_reads/DRX017022.sra \
    --dumpbase --clip --gzip --skip-technical --readids \
    --read-filter pass --split-files \
    --outdir /input_dir/corona_analysis/input_reads/

parallel-fastq-dump -t 8 --tmpdir /input_dir/corona_analysis/temp/ \
    -s /input_dir/corona_analysis/input_reads/DRX017023.sra \
    --dumpbase --clip --gzip --skip-technical --readids \
    --read-filter pass --split-files \
    --outdir /input_dir/corona_analysis/input_reads/

parallel-fastq-dump -t 8 --tmpdir /input_dir/corona_analysis/temp/ \
    -s /input_dir/corona_analysis/input_reads/DRX017024.sra \
    --dumpbase --clip --gzip --skip-technical --readids \
    --read-filter pass --split-files \
    --outdir /input_dir/corona_analysis/input_reads/


In [None]:
%%bash
# Filter reads and QC runs using fastp.
# The %%bash magic is required: without it the cell is parsed as Python and fails.
# For every accession, the *_pass_{1,2} mates go in, filtered *_filt_{1,2} mates come out,
# and an HTML plus a JSON QC report are written next to them.
# NOTE(review): fastp documents -p as the overrepresentation-analysis switch and -w as the
# thread count; confirm whether "-p 8" was meant to be "-w 8". Left unchanged here.
reads_dir=/input_dir/corona_analysis/input_reads
for accession in DRX017021 DRX017022 DRX017023 DRX017024
do
    fastp -p 8 -i "${reads_dir}/${accession}_pass_1.fastq.gz" \
          -I "${reads_dir}/${accession}_pass_2.fastq.gz" \
          -o "${reads_dir}/${accession}_filt_1.fastq.gz" \
          -O "${reads_dir}/${accession}_filt_2.fastq.gz" \
          -h "${reads_dir}/${accession}_fastp.html" \
          -j "${reads_dir}/${accession}_fastp.json"
done


In [None]:
%%bash
# Map the filtered Vero control reads against the AGM STAR index.
# Caveat: sjdbOverhang is wrong for this dataset — reads here are 80bp, the other dataset is 100bp.
cd /input_dir/corona_analysis/input_reads/

# Comma-separated run lists: every mate-1 file first, then the mate-2 files in the same order.
mate1=DRX017021_filt_1.fastq.gz,DRX017022_filt_1.fastq.gz,DRX017023_filt_1.fastq.gz,DRX017024_filt_1.fastq.gz
mate2=DRX017021_filt_2.fastq.gz,DRX017022_filt_2.fastq.gz,DRX017023_filt_2.fastq.gz,DRX017024_filt_2.fastq.gz

STAR --runMode alignReads \
     --genomeDir /input_dir/corona_analysis/annotations/AGM/STAR_ix \
     --runThreadN 14 \
     --readFilesCommand zcat \
     --readFilesIn "$mate1" "$mate2" \
     --outSAMtype BAM SortedByCoordinate \
     --outFileNamePrefix /input_dir/corona_analysis/alignment_out/AGM_vero_control \
     --outTmpDir /input_dir/corona_analysis/temp/star2 \
     --outReadsUnmapped Fastx \
     --limitBAMsortRAM 25000000000 \
     --outWigType wiggle \
     --quantMode TranscriptomeSAM


In [None]:
%%bash
# Quantify the control Vero transcriptome using RSEM.
# The input BAM is given by absolute path: the STAR alignment cell writes it under
# alignment_out/ (its --outFileNamePrefix), whereas a %%bash cell starts in the kernel's
# working directory, so a bare file name only resolves if the kernel happens to run there.
# Results are still written to the current directory with the AGM_rsem_control_vero prefix.
rsem-calculate-expression --paired-end \
                       --alignments \
                       -p 14 --no-bam-output \
                       /input_dir/corona_analysis/alignment_out/AGM_vero_controlAligned.toTranscriptome.out.bam \
                       /input_dir/corona_analysis/annotations/AGM/RSEM_ix/Chlorocebus_sabaeus_rsem \
                       AGM_rsem_control_vero


In [None]:
%%bash
# Generate the STAR genome index for AGM from the concatenated genome FASTA and the GTF.
# This index must exist before any STAR alignment cell that points --genomeDir at STAR_ix is run.
# -p keeps the cell re-runnable (plain mkdir errors out when the directory already exists).
mkdir -p /input_dir/corona_analysis/annotations/AGM/STAR_ix
# sjdbOverhang 99 corresponds to 100bp reads (read length - 1).
STAR --runThreadN 16 \
     --runMode genomeGenerate \
     --genomeDir /input_dir/corona_analysis/annotations/AGM/STAR_ix \
     --genomeFastaFiles /input_dir/corona_analysis/annotations/AGM/Chlorocebus_sabaeus.ChlSab1.1.dna_rm.all_chr.fa \
     --sjdbGTFfile /input_dir/corona_analysis/annotations/AGM/Chlorocebus_sabaeus.ChlSab1.1.99.gtf \
     --sjdbOverhang 99 --limitGenomeGenerateRAM 18000000000


In [None]:
%%bash
# Map the reads that did not align to the Covid reference against the AGM STAR index.
align_dir=/input_dir/corona_analysis/alignment_out
# Common stem of the two unmapped-mate files written by the Covid alignment
unmapped="${align_dir}/Covid_EPI_ISL_407193Unmapped.out"

STAR --runMode alignReads \
     --genomeDir /input_dir/corona_analysis/annotations/AGM/STAR_ix \
     --runThreadN 14 \
     --readFilesIn "${unmapped}.mate1" "${unmapped}.mate2" \
     --outSAMtype BAM SortedByCoordinate \
     --outFileNamePrefix "${align_dir}/AGM_host_GCF_000409795" \
     --outTmpDir /input_dir/corona_analysis/temp/star \
     --outReadsUnmapped Fastx \
     --limitBAMsortRAM 25000000000 \
     --outWigType wiggle \
     --quantMode TranscriptomeSAM


# Use RSEM to quantify

In [None]:
%%bash
# Build the RSEM reference for AGM from the GTF and the concatenated genome FASTA.
# The %%bash magic is required: without it the cell is parsed as Python and fails.
# This reference must exist before any rsem-calculate-expression cell that uses it is run.
# RSEM writes its reference files using the given prefix, so the target directory has to exist first.
mkdir -p /input_dir/corona_analysis/annotations/AGM/RSEM_ix
rsem-prepare-reference --gtf /input_dir/corona_analysis/annotations/AGM/Chlorocebus_sabaeus.ChlSab1.1.99.gtf  \
                       /input_dir/corona_analysis/annotations/AGM/Chlorocebus_sabaeus.ChlSab1.1.dna_rm.all_chr.fa \
                       /input_dir/corona_analysis/annotations/AGM/RSEM_ix/Chlorocebus_sabaeus_rsem

In [None]:
%%bash
# Quantify the host (AGM) transcriptome of the Covid sample using RSEM.
# The %%bash magic is required: without it the cell is parsed as Python and fails.
# The input BAM is given by absolute path: the STAR host-alignment cell writes it under
# alignment_out/ (its --outFileNamePrefix), whereas a %%bash cell starts in the kernel's
# working directory, so a bare file name only resolves if the kernel happens to run there.
# Results are still written to the current directory with the AGM_rsem_corona_vero prefix.
rsem-calculate-expression --paired-end \
                       --alignments \
                       -p 14 --no-bam-output \
                       /input_dir/corona_analysis/alignment_out/AGM_host_GCF_000409795Aligned.toTranscriptome.out.bam \
                       /input_dir/corona_analysis/annotations/AGM/RSEM_ix/Chlorocebus_sabaeus_rsem \
                       AGM_rsem_corona_vero

# Differential expression between normal and infected Vero cells

In [None]:
%%bash
# Combine the per-gene RSEM results of both samples into one count matrix
# (column order: control first, then the Covid sample).
# The %%bash magic is required: without it the cell is parsed as Python and fails.
rsem-generate-data-matrix AGM_rsem_control_vero.genes.results AGM_rsem_corona_vero.genes.results > AGM_rsem_counts.txt

``` conda install -c conda-forge readline=6.* ```

In [None]:
%%bash
# Run EBSeq on the count matrix to test for differential expression.
# The %%bash magic is required: without it the cell is parsed as Python and fails.
# "1,1" is the number of replicates per condition, in the column order of the matrix:
# one control sample and one Covid sample, i.e. no replicates on either side.
rsem-run-ebseq AGM_rsem_counts.txt 1,1 AGM_corona_geneDiff_rsem

In [None]:
# TODO: use mygene to convert the gene IDs to gene symbols and find the corresponding human genes