
## Make human genome annotation directory

## Get human genome fasta

Get the human genome FASTA file (soft-masked GRCh38 primary assembly) from Ensembl release 99

## Get human gene annotations

Get human gene annotations in GTF form from Ensembl release 99 (the release that corresponds to GENCODE v33)


In [None]:
%%bash
# Create the directory that will hold the human genome FASTA and GTF.
# -p creates any missing parent directories and makes the cell safe to re-run
# when the directory already exists.
mkdir -p /input_dir/corona_analysis/annotations/human

# NOTE: every %%bash cell runs in its own shell, so this cd does not carry over
# to later cells; it only confirms that the directory can be entered.
cd /input_dir/corona_analysis/annotations/human


In [None]:
%%bash
# Download the soft-masked GRCh38 primary assembly from Ensembl release 99,
# decompress it, and write a filtered FASTA containing only the first 25 records.
annot_dir=/input_dir/corona_analysis/annotations/human
genome_fa="${annot_dir}/Homo_sapiens.GRCh38.dna_sm.primary_assembly.fa"

curl -s -L ftp://ftp.ensembl.org/pub/release-99/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna_sm.primary_assembly.fa.gz > \
    "${genome_fa}.gz"

# Absolute paths are used throughout: a %%bash cell starts in the notebook's
# working directory, so a bare file name would not resolve to the download above.
gunzip "${genome_fa}.gz"

#Toss everything except canonical chromosomes
# n counts FASTA header lines; awk exits on the 26th header, so exactly the
# first 25 records are kept.
# NOTE(review): presumably these are chromosomes 1-22, X, Y and MT, listed ahead
# of the unplaced scaffolds -- confirm against the downloaded file.
# The output is written next to the source FASTA because the STAR and RSEM
# index cells read annotations/human/GRCh38_filt_dna_sm.fa.
awk '/^>/ {n++} n>25 {exit} {print}' "${genome_fa}" > "${annot_dir}/GRCh38_filt_dna_sm.fa"


In [None]:
%%bash
# Fetch the Ensembl release 99 gene annotation (GTF) and decompress it in place.
gtf_gz=/input_dir/corona_analysis/annotations/human/Homo_sapiens.GRCh38.99.gtf.gz

curl -s -L ftp://ftp.ensembl.org/pub/release-99/gtf/homo_sapiens/Homo_sapiens.GRCh38.99.gtf.gz > "${gtf_gz}"

# gunzip replaces the .gz with the plain .gtf in the same directory
gunzip "${gtf_gz}"

# Get reference transcriptome for normal BALFs

The reference transcriptome comes from RNA-seq of BALF from 3 normal controls, sequenced on an Illumina HiSeq 2000:

* SRX7253176 - SRR10571724 Ctrl1
* SRX7253170 - SRR10571730 Ctrl2
* SRX7253168 - SRR10571732 Ctrl3


In [None]:
%%bash
##prefetch SRA files
# Download the SRA archive of each control BALF experiment to <accession>.sra.
reads_dir=/input_dir/corona_analysis/input_reads/BALF_control

# No visible cell creates this directory, so make sure it exists before
# prefetch is asked to write into it.
mkdir -p "${reads_dir}"

for acc in SRX7253176 SRX7253170 SRX7253168; do
    prefetch -o "${reads_dir}/${acc}.sra" "${acc}"
done


In [None]:
%%bash
# Convert each prefetched control .sra archive into gzipped FASTQ files,
# split per mate, skipping technical reads and keeping only reads that pass
# the read filter. The same options are applied to all three accessions.
reads_dir=/input_dir/corona_analysis/input_reads/BALF_control

for acc in SRX7253176 SRX7253170 SRX7253168; do
    parallel-fastq-dump -t 8 --tmpdir /input_dir/corona_analysis/temp/ \
        -s "${reads_dir}/${acc}.sra" \
        --dumpbase --clip --gzip --skip-technical --readids \
        --read-filter pass --split-files \
        --outdir "${reads_dir}/"
done


In [None]:
%%bash

#Filter input RNA-seq reads from control BALF lung of 3 donors
# Order: Ctrl1 (SRX7253176), Ctrl2 (SRX7253170), Ctrl3 (SRX7253168).
#
# Thread count: fastp's worker-thread option is -w/--thread. The previous
# "-p 14" did not request 14 threads: -p is the value-less switch
# --overrepresentation_analysis, and the 14 was a stray argument. The thread
# count is now passed with -w; -p is kept so the HTML/JSON reports still
# include the overrepresented-sequence analysis they had before.
reads_dir=/input_dir/corona_analysis/input_reads/BALF_control

for acc in SRX7253176 SRX7253170 SRX7253168; do
    fastp -w 14 -p \
          -i "${reads_dir}/${acc}_pass_1.fastq.gz" \
          -I "${reads_dir}/${acc}_pass_2.fastq.gz" \
          -o "${reads_dir}/${acc}_filt_1.fq.gz" \
          -O "${reads_dir}/${acc}_filt_2.fq.gz" \
          -h "${reads_dir}/${acc}_fastp.html" \
          -j "${reads_dir}/${acc}_fastp.json"
done


In [None]:
%%bash
#Align reads against human transcriptome for BALF control
#
# Runs STAR once per control sample, writing a coordinate-sorted BAM, wiggle
# tracks, unmapped reads and a transcriptome-space BAM (--quantMode
# TranscriptomeSAM) under the prefix BALF_control_s<N>_.
#
# NOTE: this cell uses the STAR index in annotations/human/STAR_ix, which the
# notebook builds in the "Make genome annotations for STAR and RSEM" section
# further down; that section has to be run before this cell.
out_dir=/input_dir/corona_analysis/alignment_out/BALF_control_quant

# -p: create alignment_out/ as well if needed, and do not fail on a re-run.
mkdir -p "${out_dir}"

cd /input_dir/corona_analysis/input_reads/BALF_control

# Sample numbering follows the accession order: s1 = Ctrl1, s2 = Ctrl2, s3 = Ctrl3.
sample=0
for acc in SRX7253176 SRX7253170 SRX7253168; do
    sample=$((sample + 1))
    STAR --runMode alignReads \
         --genomeDir /input_dir/corona_analysis/annotations/human/STAR_ix \
         --outSAMtype BAM SortedByCoordinate --runThreadN 14 \
         --outFileNamePrefix "${out_dir}/BALF_control_s${sample}_" \
         --outTmpDir /input_dir/corona_analysis/temp/star2 \
         --outReadsUnmapped Fastx --limitBAMsortRAM 27000000000 \
         --outWigType wiggle --quantMode TranscriptomeSAM --readFilesCommand zcat \
         --readFilesIn "${acc}_filt_1.fq.gz" \
                       "${acc}_filt_2.fq.gz"
done


In [None]:
%%bash
#Quantify expression in control BALF samples 1-3
# Each iteration feeds the transcriptome-space BAM of one control sample
# (Ctrl1..Ctrl3) to RSEM, using the RSEM reference prefix RSEM_ix and writing
# results under the sample name BALF_control_rsem_s<N>_.
cd /input_dir/corona_analysis/alignment_out/BALF_control_quant/

for sample in 1 2 3; do
    rsem-calculate-expression --paired-end \
        --alignments \
        -p 14 --no-bam-output \
        "BALF_control_s${sample}_Aligned.toTranscriptome.out.bam" \
        /input_dir/corona_analysis/annotations/human/RSEM_ix \
        "BALF_control_rsem_s${sample}_"
done


# Get transcriptome for infected BALFs

Transcriptome data comes from the RNA-seq published in [Xiong et al 2020](https://www.tandfonline.com/doi/full/10.1080/22221751.2020.1747363). Bronchoalveolar lavage fluid (BALF) from 2 patients was used to generate RNA-seq libraries, which were sequenced on MiSeq:

* CRR119897 - patient2_rep2
* CRR119896 - patient2_rep1
* CRR119895 - patient1_rep2
* CRR119894 - patient1_rep1

In [None]:
%%bash
#Get fq.gz from database for the four infected BALF runs (project CRA002390)
# Each run has a forward (f1) and a reverse (r2) file; both are saved under
# their original names in the BALF_rna reads directory.
reads_dir=/input_dir/corona_analysis/input_reads/BALF_rna

# -p keeps the cell re-runnable when the directory already exists.
mkdir -p "${reads_dir}"

for run in CRR119897 CRR119896 CRR119895 CRR119894; do
    for mate in f1 r2; do
        curl -s -L "ftp://download.big.ac.cn/gsa3/CRA002390/${run}/${run}_${mate}.fq.gz" > \
            "${reads_dir}/${run}_${mate}.fq.gz"
    done
done
    

In [None]:
%%bash
# Filter reads and QC runs using Fastp
#
# The %%bash magic is required: without it the notebook would try to run these
# shell commands as Python.
#
# Thread count: fastp's worker-thread option is -w/--thread. The previous
# "-p 8" did not request 8 threads: -p is the value-less switch
# --overrepresentation_analysis, and the 8 was a stray argument. The thread
# count is now passed with -w; -p is kept so the HTML/JSON reports still
# include the overrepresented-sequence analysis they had before.
reads_dir=/input_dir/corona_analysis/input_reads/BALF_rna

for run in CRR119897 CRR119896 CRR119895 CRR119894; do
    fastp -w 8 -p \
          -i "${reads_dir}/${run}_f1.fq.gz" \
          -I "${reads_dir}/${run}_r2.fq.gz" \
          -o "${reads_dir}/${run}_filt_1.fq.gz" \
          -O "${reads_dir}/${run}_filt_2.fq.gz" \
          -h "${reads_dir}/${run}_fastp.html" \
          -j "${reads_dir}/${run}_fastp.json"
done


# Make genome annotations for STAR and RSEM


In [None]:
%%bash
#Generate genome indices for STAR
# Builds the index from the filtered genome FASTA, with splice junctions taken
# from the Ensembl 99 GTF.
star_ix=/input_dir/corona_analysis/annotations/human/STAR_ix

# -p keeps the cell re-runnable when the index directory already exists.
mkdir -p "${star_ix}"

# --sjdbOverhang 149: NOTE(review) presumably chosen as read length - 1 for
#   150 bp reads -- confirm against the libraries aligned with this index.
# --limitGenomeGenerateRAM / --genomeSAsparseD 2: NOTE(review) presumably set
#   to fit index generation into roughly 30 GB of RAM -- confirm.
STAR --runThreadN 12 \
     --runMode genomeGenerate --outTmpDir /input_dir/corona_analysis/temp/star2 \
     --genomeDir "${star_ix}" \
     --genomeFastaFiles /input_dir/corona_analysis/annotations/human/GRCh38_filt_dna_sm.fa \
     --sjdbGTFfile /input_dir/corona_analysis/annotations/human/Homo_sapiens.GRCh38.99.gtf \
     --sjdbOverhang 149 --limitGenomeGenerateRAM 30000000000 --genomeSAsparseD 2


In [None]:
%%bash
#Generate genome annotations for RSEM
# Builds the RSEM reference from the filtered genome FASTA and the Ensembl 99
# GTF; all reference files share the prefix annotations/human/RSEM_ix.
annot_dir=/input_dir/corona_analysis/annotations/human

rsem-prepare-reference \
    --gtf "${annot_dir}/Homo_sapiens.GRCh38.99.gtf" \
    "${annot_dir}/GRCh38_filt_dna_sm.fa" \
    "${annot_dir}/RSEM_ix"


# Align reads to human transcriptome using STAR

In [None]:
%%bash
#Align coronavirus BALF reads to human transcriptome
# The two sequencing runs of each patient are passed to STAR as comma-separated
# lists, so both replicates are aligned together in a single run per patient.
cd /input_dir/corona_analysis/input_reads/BALF_rna

# align_patient <sample number> <first run accession> <second run accession>
align_patient() {
    local sample=$1 rep_a=$2 rep_b=$3
    STAR --runMode alignReads \
         --genomeDir /input_dir/corona_analysis/annotations/human/STAR_ix \
         --outSAMtype BAM SortedByCoordinate --runThreadN 14 \
         --outFileNamePrefix "/input_dir/corona_analysis/alignment_out/BALF_corona_s${sample}_" \
         --outTmpDir /input_dir/corona_analysis/temp/star2 \
         --outReadsUnmapped Fastx --limitBAMsortRAM 27000000000 \
         --outWigType wiggle --quantMode TranscriptomeSAM --readFilesCommand zcat \
         --readFilesIn "${rep_a}_filt_1.fq.gz,${rep_b}_filt_1.fq.gz" \
                       "${rep_a}_filt_2.fq.gz,${rep_b}_filt_2.fq.gz"
}

#Patient 1
align_patient 1 CRR119894 CRR119895
#Patient 2
align_patient 2 CRR119896 CRR119897


In [None]:
%%bash

cd /input_dir/corona_analysis/alignment_out/
#Quantify infected BALF transcriptome (patient 1)
# The cell that builds the expression matrices reads the infected results from
# BALF_infected_quant/, so the RSEM output is written into that directory
# rather than directly into alignment_out/.
mkdir -p BALF_infected_quant

rsem-calculate-expression --paired-end \
                       --alignments \
                       -p 14 --no-bam-output \
                       BALF_corona_s1_Aligned.toTranscriptome.out.bam \
                      /input_dir/corona_analysis/annotations/human/RSEM_ix \
                       BALF_infected_quant/BALF_infected_s1_


In [None]:
%%bash

# Each %%bash cell starts in the notebook's working directory, so change into
# the alignment directory first; otherwise the relative BAM path below does
# not resolve.
cd /input_dir/corona_analysis/alignment_out/
#Quantify infected BALF transcriptome (patient 2)
# The cell that builds the expression matrices reads the infected results from
# BALF_infected_quant/, so the RSEM output is written into that directory
# rather than directly into alignment_out/.
mkdir -p BALF_infected_quant

rsem-calculate-expression --paired-end \
                       --alignments \
                       -p 14 --no-bam-output \
                       BALF_corona_s2_Aligned.toTranscriptome.out.bam \
                      /input_dir/corona_analysis/annotations/human/RSEM_ix \
                       BALF_infected_quant/BALF_infected_s2_


# Differential expression between normal and infected BALF

In [None]:
%%bash
# Combine the per-sample RSEM results into one matrix per feature level.
# Column order is the three control samples followed by the two infected ones.
cd /input_dir/corona_analysis/alignment_out/

# build_matrix <level: genes|isoforms> <output file>
build_matrix() {
    local level=$1 outfile=$2
    rsem-generate-data-matrix \
        "BALF_control_quant/BALF_control_rsem_s1_.${level}.results" \
        "BALF_control_quant/BALF_control_rsem_s2_.${level}.results" \
        "BALF_control_quant/BALF_control_rsem_s3_.${level}.results" \
        "BALF_infected_quant/BALF_infected_s1_.${level}.results" \
        "BALF_infected_quant/BALF_infected_s2_.${level}.results" \
        > "${outfile}"
}

build_matrix genes BALF_rsem_gene_counts.txt
build_matrix isoforms BALF_rsem_isoform_counts.txt


In [None]:
%%bash
# Run EBSeq on the gene-level count matrix.
# No visible cell creates diffExp_out, so create it here if needed.
mkdir -p /input_dir/corona_analysis/diffExp_out/
cd /input_dir/corona_analysis/diffExp_out/

# The gene count matrix is written to alignment_out/ by the matrix cell, not to
# this directory, so it is read from there by absolute path.
# "3,2" matches the matrix column order: 3 control samples, then 2 infected.
rsem-run-ebseq /input_dir/corona_analysis/alignment_out/BALF_rsem_gene_counts.txt 3,2 BALF_corona_geneDiff_rsem.txt