# Get reference transcriptome for normal BLFs

Transcriptome comes in the form of 3 normal controls RNA-seq BALF sequenced on Illumina Hiseq 2000

* SRX7253176 - SRR10571724 Ctrl1
* SRX7253170 - SRR10571730 Ctlr2
* SRX7253168 - SRR10571732 Ctlr3


In [None]:
%%bash
##prefetch SRA files
prefetch -o /input_dir/corona_analysis/input_reads/BALF_control/SRX7253176.sra SRX7253176
prefetch -o /input_dir/corona_analysis/input_reads/BALF_control/SRX7253170.sra SRX7253170
prefetch -o /input_dir/corona_analysis/input_reads/BALF_control/SRX7253168.sra SRX7253168


In [None]:
%%bash 
#Unpack SRA files, toss technicals, zip into fq.gz files
parallel-fastq-dump -t 8 --tmpdir /input_dir/corona_analysis/temp/ \
    -s /input_dir/corona_analysis/input_reads/BALF_control/SRX7253176.sra \
    --dumpbase --clip --gzip --skip-technical --readids \
    --read-filter pass --split-files \
    --outdir /input_dir/corona_analysis/input_reads/BALF_control/

parallel-fastq-dump -t 8 --tmpdir /input_dir/corona_analysis/temp/ \
    -s /input_dir/corona_analysis/input_reads/BALF_control/SRX7253170.sra \
    --dumpbase --clip --gzip --skip-technical --readids \
    --read-filter pass --split-files \
    --outdir /input_dir/corona_analysis/input_reads/BALF_control/

parallel-fastq-dump -t 8 --tmpdir /input_dir/corona_analysis/temp/ \
    -s /input_dir/corona_analysis/input_reads/BALF_control/SRX7253168.sra \
    --dumpbase --clip --gzip --skip-technical --readids \
    --read-filter pass --split-files \
    --outdir /input_dir/corona_analysis/input_reads/BALF_control/


In [None]:
%%bash

#Filter input RNA-seq reads from control BALF lung of 3 donors
#Ctrl1
fastp -p 14 -i /input_dir/corona_analysis/input_reads/BALF_control/SRX7253176_pass_1.fastq.gz \
      -I /input_dir/corona_analysis/input_reads/BALF_control/SRX7253176_pass_2.fastq.gz \
      -o /input_dir/corona_analysis/input_reads/BALF_control/SRX7253176_filt_1.fq.gz \
      -O /input_dir/corona_analysis/input_reads/BALF_control/SRX7253176_filt_2.fq.gz \
      -h /input_dir/corona_analysis/input_reads/BALF_control/SRX7253176_fastp.html \
      -j /input_dir/corona_analysis/input_reads/BALF_control/SRX7253176_fastp.json 

#Ctrl2
fastp -p 14 -i /input_dir/corona_analysis/input_reads/BALF_control/SRX7253170_pass_1.fastq.gz \
      -I /input_dir/corona_analysis/input_reads/BALF_control/SRX7253170_pass_2.fastq.gz \
      -o /input_dir/corona_analysis/input_reads/BALF_control/SRX7253170_filt_1.fq.gz \
      -O /input_dir/corona_analysis/input_reads/BALF_control/SRX7253170_filt_2.fq.gz \
      -h /input_dir/corona_analysis/input_reads/BALF_control/SRX7253170_fastp.html \
      -j /input_dir/corona_analysis/input_reads/BALF_control/SRX7253170_fastp.json 

#Ctrl3
fastp -p 14 -i /input_dir/corona_analysis/input_reads/BALF_control/SRX7253168_pass_1.fastq.gz \
      -I /input_dir/corona_analysis/input_reads/BALF_control/SRX7253168_pass_2.fastq.gz \
      -o /input_dir/corona_analysis/input_reads/BALF_control/SRX7253168_filt_1.fq.gz \
      -O /input_dir/corona_analysis/input_reads/BALF_control/SRX7253168_filt_2.fq.gz \
      -h /input_dir/corona_analysis/input_reads/BALF_control/SRX7253168_fastp.html \
      -j /input_dir/corona_analysis/input_reads/BALF_control/SRX7253168_fastp.json 


In [None]:
%%bash
mkdir /input_dir/corona_analysis/alignment_out/BALF_control_quant/

#Align reads against human transcriptome for BALF control
cd /input_dir/corona_analysis/input_reads/BALF_control
#Ctlr1
STAR --runMode alignReads      \
     --genomeDir /input_dir/corona_analysis/annotations/human/STAR_ix \
     --outSAMtype BAM SortedByCoordinate --runThreadN 14 \
     --outFileNamePrefix /input_dir/corona_analysis/alignment_out/BALF_control_quant/BALF_control_s1_\
     --outTmpDir /input_dir/corona_analysis/temp/star2 \
     --outReadsUnmapped Fastx --limitBAMsortRAM 27000000000 \
     --outWigType wiggle --quantMode TranscriptomeSAM   --readFilesCommand zcat \
     --readFilesIn SRX7253176_filt_1.fq.gz \
                   SRX7253176_filt_2.fq.gz
#Ctlr2
STAR --runMode alignReads      \
     --genomeDir /input_dir/corona_analysis/annotations/human/STAR_ix \
     --outSAMtype BAM SortedByCoordinate --runThreadN 14 \
     --outFileNamePrefix /input_dir/corona_analysis/alignment_out/BALF_control_quant/BALF_control_s2_\
     --outTmpDir /input_dir/corona_analysis/temp/star2 \
     --outReadsUnmapped Fastx --limitBAMsortRAM 27000000000 \
     --outWigType wiggle --quantMode TranscriptomeSAM   --readFilesCommand zcat \
     --readFilesIn SRX7253170_filt_1.fq.gz \
                   SRX7253170_filt_2.fq.gz

#Ctlr3
STAR --runMode alignReads      \
     --genomeDir /input_dir/corona_analysis/annotations/human/STAR_ix \
     --outSAMtype BAM SortedByCoordinate --runThreadN 14 \
     --outFileNamePrefix /input_dir/corona_analysis/alignment_out/BALF_control_quant/BALF_control_s3_\
     --outTmpDir /input_dir/corona_analysis/temp/star2 \
     --outReadsUnmapped Fastx --limitBAMsortRAM 27000000000 \
     --outWigType wiggle --quantMode TranscriptomeSAM   --readFilesCommand zcat \
     --readFilesIn SRX7253168_filt_1.fq.gz \
                   SRX7253168_filt_2.fq.gz


In [None]:
%%bash

cd /input_dir/corona_analysis/alignment_out/BALF_control_quant/
#Quantify expression in control BALF samples 1-3

#Ctlr1
rsem-calculate-expression --paired-end \
                       --alignments \
                       -p 14 --no-bam-output \
                       BALF_control_s1_Aligned.toTranscriptome.out.bam \
                      /input_dir/corona_analysis/annotations/human/RSEM_ix \
                       BALF_control_rsem_s1_
#Ctlr2
rsem-calculate-expression --paired-end \
                       --alignments \
                       -p 14 --no-bam-output \
                       BALF_control_s2_Aligned.toTranscriptome.out.bam \
                      /input_dir/corona_analysis/annotations/human/RSEM_ix \
                       BALF_control_rsem_s2_
#Ctlr3
rsem-calculate-expression --paired-end \
                       --alignments \
                       -p 14 --no-bam-output \
                       BALF_control_s3_Aligned.toTranscriptome.out.bam \
                      /input_dir/corona_analysis/annotations/human/RSEM_ix \
                       BALF_control_rsem_s3_


# Get scRNA-seq (and scTCR-seq) from BALF CoVid patients

## Paper: [Single-cell landscape of bronchoalveolar immune cells in patients with COVID-19](https://www.nature.com/articles/s41591-020-0901-9#additional-information)

### Bioproject: PRJNA608742

Input metadata for raw data stored in PRJNA608742_metadata.txt


In [6]:
import pandas as pd

In [7]:
PRJNA608742_meta_in = pd.read_csv("PRJNA608742_metadata.txt", header=1, delimiter=r'\t+')

  PRJNA608742_meta_in = pd.read_csv("PRJNA608742_metadata.txt", header=1, delimiter=r'\t+')


In [8]:
PRJNA608742_meta_in

Unnamed: 0,SRR11537951,RNA-Seq,Total cell,SRX8108997,GSM4475053,severe COVID-19 patient,GSM4475053.1,SRP250732
0,SRR11181954,RNA-Seq,Total cell,SRX7802546,GSM4339769,mild,GSM4339769,SRP250732
1,SRR11181955,RNA-Seq,Total cell,SRX7802547,GSM4339770,mild,GSM4339770,SRP250732
2,SRR11181956,RNA-Seq,Total cell,SRX7802548,GSM4339771,severe,GSM4339771,SRP250732
3,SRR11181957,RNA-Seq,Total cell,SRX7802549,GSM4339772,mild,GSM4339772,SRP250732
4,SRR11181958,RNA-Seq,Total cell,SRX7802550,GSM4339773,severe,GSM4339773,SRP250732
5,SRR11181959,RNA-Seq,Total cell,SRX7802551,GSM4339774,severe,GSM4339774,SRP250732
6,SRR11537946,RNA-Seq,CD45+,SRX8108992,GSM4475048,healthy control,GSM4475048,SRP250732
7,SRR11537947,RNA-Seq,CD45+,SRX8108993,GSM4475049,healthy control,GSM4475049,SRP250732
8,SRR11537948,RNA-Seq,CD45+,SRX8108994,GSM4475050,healthy control,GSM4475050,SRP250732
9,SRR11537949,RNA-Seq,Total cell,SRX8108995,GSM4475051,severe COVID-19 patient,GSM4475051,SRP250732


In [9]:
%%bash

#Prefetch BALF covid
#Do one severe patient
prefetch -o /input_dir/corona_analysis/input_reads/lung_scRNA_BALF/SRR11181956.sra \
    --max_size 100000000 SRR11181956




2020-07-25T23:54:16 prefetch.2.10.3: 1) 'SRR11181956' (95GB) is larger than maximum allowed: skipped 

Download of some files was skipped because they are too large
You can change size download limit by setting
--min-size and --max-size command line arguments


2020-07-25T23:54:16 prefetch.2.10.3 int: buffer insufficient while reading uri within cloud module - cannot Get Cloud Location
2020-07-25T23:54:16 prefetch.2.10.3 warn: Maximum file size download limit is 20GB 


In [None]:
%%bash
#BALF covid
parallel-fastq-dump -t 12 --tmpdir /input_dir/corona_analysis/temp/ \
    -s /input_dir/corona_analysis/input_reads/lung_scRNA_BALF/SRR11181956.sra \
    --dumpbase --clip --skip-technical --readids  \
    --read-filter pass --split-files --origfmt \
    --outdir /input_dir/corona_analysis/input_reads/lung_scRNA_BALF


In [None]:
%%bash

### optional (tbd) Fastp based read filtering
fastp -p 8 -i /input_dir/corona_analysis/input_reads/kidney_scRNA_control/SRR11181956_pass_1.fastq.gz \
      -I /input_dir/corona_analysis/input_reads/lung_scRNA_BALF/SRR11181956_pass_2.fastq.gz \
      -o /input_dir/corona_analysis/input_reads/lung_scRNA_BALF/SRR11181956_1_filt.fq.gz \
      -O /input_dir/corona_analysis/input_reads/lung_scRNA_BALF/SRR11181956_2_filt.fq.gz \
      -h /input_dir/corona_analysis/input_reads/lung_scRNA_BALF/SRR11181956_fastp.html \
      -j /input_dir/corona_analysis/input_reads/lung_scRNA_BALF/SRR11181956_fastp.json \
      --umi --umi_loc read1 --umi_len 10


In [None]:
%%bash
mkdir /input_dir/corona_analysis/alignment_out/lung_scRNA_BALF/

#Align reads against human transcriptome for BALF CoVid 
cd /input_dir/corona_analysis/alignment_out/lung_scRNA_BALF

#Severe covid sample1
STAR --runMode alignReads      \
     --genomeDir /input_dir/corona_analysis/annotations/human/STAR_ix \
     --outSAMtype BAM SortedByCoordinate --runThreadN 14 \
     --outFileNamePrefix /input_dir/corona_analysis/alignment_out/lung_scRNA_BALF/lung_scRNA_covid_SRR11181956_ \
     --outTmpDir /input_dir/corona_analysis/temp/star2 \
     --outReadsUnmapped Fastx --limitBAMsortRAM 27000000000 \
     --outWigType wiggle --quantMode TranscriptomeSAM  \
#     --soloType CB_UMI_Complex --soloCBwhitelist None --soloCBposition  0_0_0_6 0_22_0_27 0_43_0_48 --soloUMIposition  0_49_0_54 \
     --soloFeatures Gene --soloOutFileNames lung_scRNA_covid_SRR11181956_solo_ \
     --readFilesIn SRR11181956_filt_1.fq.gz \
                   SRR11181956_filt_2.fq.gz


In [None]:
%%bash

cd /input_dir/corona_analysis/alignment_out/lung_scRNA_BALF

#Attempt doing a severe scRNA-seq CoVid sample
salmon alevin -l ISR \
              -1 SRR11181956_pass_1.fastq \
              -2 SRR11181956_pass_2.fastq \
              --chromium \
              -i /input_dir/corona_analysis/annotations/human/salmon/salmon_hg38_index \
              -p 3 \
              --mrna /input_dir/corona_analysis/annotations/human/gencode_mt.txt \
              --rrna /input_dir/corona_analysis/annotations/human/rRNA_ensembl.txt \
              -o /input_dir/corona_analysis/alignment_out/lung_scRNA_covid_/lung_scRNA_covid_SRR11181956_salmon \
              --tgMap /input_dir/corona_analysis/annotations/human/salmon_grch38_gencode_tran2gene.txt


# Get transcriptome for infected BALFs

Transcriptome data comes in the form of RNA-seq from paper [Xiong et al 2020](https://www.tandfonline.com/doi/full/10.1080/22221751.2020.1747363). 2 patients Brancheolar Lavage Fluid (BALF) had RNA-seq libraries generated and sequenced on MiSeq
CRR119897 - patient2_rep2
CRR119896 - patient2_rep1
CRR119895 - patient1_rep2
CRR119894 - patient1_rep1

In [None]:
%%bash
mkdir /input_dir/corona_analysis/input_reads/BALF_rna

#Get fq.gz from database for 
curl -s -L ftp://download.big.ac.cn/gsa3/CRA002390/CRR119897/CRR119897_f1.fq.gz  > \
    /input_dir/corona_analysis/input_reads/BALF_rna/CRR119897_f1.fq.gz
curl -s -L ftp://download.big.ac.cn/gsa3/CRA002390/CRR119897/CRR119897_r2.fq.gz  > \
    /input_dir/corona_analysis/input_reads/BALF_rna/CRR119897_r2.fq.gz
    
curl -s -L ftp://download.big.ac.cn/gsa3/CRA002390/CRR119896/CRR119896_f1.fq.gz  > \
    /input_dir/corona_analysis/input_reads/BALF_rna/CRR119896_f1.fq.gz
curl -s -L ftp://download.big.ac.cn/gsa3/CRA002390/CRR119896/CRR119896_r2.fq.gz  > \
    /input_dir/corona_analysis/input_reads/BALF_rna/CRR119896_r2.fq.gz
    
curl -s -L ftp://download.big.ac.cn/gsa3/CRA002390/CRR119895/CRR119895_f1.fq.gz  > \
    /input_dir/corona_analysis/input_reads/BALF_rna/CRR119895_f1.fq.gz
curl -s -L ftp://download.big.ac.cn/gsa3/CRA002390/CRR119895/CRR119895_r2.fq.gz  > \
    /input_dir/corona_analysis/input_reads/BALF_rna/CRR119895_r2.fq.gz
    
curl -s -L ftp://download.big.ac.cn/gsa3/CRA002390/CRR119894/CRR119894_f1.fq.gz  > \
    /input_dir/corona_analysis/input_reads/BALF_rna/CRR119894_f1.fq.gz
curl -s -L ftp://download.big.ac.cn/gsa3/CRA002390/CRR119894/CRR119894_r2.fq.gz  > \
    /input_dir/corona_analysis/input_reads/BALF_rna/CRR119894_r2.fq.gz
    

In [None]:
# Filter reads and QC runs using Fastp

fastp -p 8 -i /input_dir/corona_analysis/input_reads/BALF_rna/CRR119897_f1.fq.gz \
      -I /input_dir/corona_analysis/input_reads/BALF_rna/CRR119897_r2.fq.gz \
      -o /input_dir/corona_analysis/input_reads/BALF_rna/CRR119897_filt_1.fq.gz \
      -O /input_dir/corona_analysis/input_reads/BALF_rna/CRR119897_filt_2.fq.gz \
      -h /input_dir/corona_analysis/input_reads/BALF_rna/CRR119897_fastp.html \
      -j /input_dir/corona_analysis/input_reads/BALF_rna/CRR119897_fastp.json 

fastp -p 8 -i /input_dir/corona_analysis/input_reads/BALF_rna/CRR119896_f1.fq.gz \
      -I /input_dir/corona_analysis/input_reads/BALF_rna/CRR119896_r2.fq.gz \
      -o /input_dir/corona_analysis/input_reads/BALF_rna/CRR119896_filt_1.fq.gz \
      -O /input_dir/corona_analysis/input_reads/BALF_rna/CRR119896_filt_2.fq.gz \
      -h /input_dir/corona_analysis/input_reads/BALF_rna/CRR119896_fastp.html \
      -j /input_dir/corona_analysis/input_reads/BALF_rna/CRR119896_fastp.json 

fastp -p 8 -i /input_dir/corona_analysis/input_reads/BALF_rna/CRR119895_f1.fq.gz \
      -I /input_dir/corona_analysis/input_reads/BALF_rna/CRR119895_r2.fq.gz \
      -o /input_dir/corona_analysis/input_reads/BALF_rna/CRR119895_filt_1.fq.gz \
      -O /input_dir/corona_analysis/input_reads/BALF_rna/CRR119895_filt_2.fq.gz \
      -h /input_dir/corona_analysis/input_reads/BALF_rna/CRR119895_fastp.html \
      -j /input_dir/corona_analysis/input_reads/BALF_rna/CRR119895_fastp.json 

fastp -p 8 -i /input_dir/corona_analysis/input_reads/BALF_rna/CRR119894_f1.fq.gz \
      -I /input_dir/corona_analysis/input_reads/BALF_rna/CRR119894_r2.fq.gz \
      -o /input_dir/corona_analysis/input_reads/BALF_rna/CRR119894_filt_1.fq.gz \
      -O /input_dir/corona_analysis/input_reads/BALF_rna/CRR119894_filt_2.fq.gz \
      -h /input_dir/corona_analysis/input_reads/BALF_rna/CRR119894_fastp.html \
      -j /input_dir/corona_analysis/input_reads/BALF_rna/CRR119894_fastp.json 


# Align reads to human transcriptome using STAR

In [None]:
%%bash

#Align coronavirus BALF reads to human transcriptome
cd /input_dir/corona_analysis/input_reads/BALF_rna
#Patient 1
STAR --runMode alignReads      \
     --genomeDir /input_dir/corona_analysis/annotations/human/STAR_ix \
     --outSAMtype BAM SortedByCoordinate --runThreadN 14 \
     --outFileNamePrefix /input_dir/corona_analysis/alignment_out/BALF_corona_s1_\
     --outTmpDir /input_dir/corona_analysis/temp/star2 \
     --outReadsUnmapped Fastx --limitBAMsortRAM 27000000000 \
     --outWigType wiggle --quantMode TranscriptomeSAM   --readFilesCommand zcat \
     --readFilesIn CRR119894_filt_1.fq.gz,CRR119895_filt_1.fq.gz \
                   CRR119894_filt_2.fq.gz,CRR119895_filt_2.fq.gz
#Patient 2
STAR --runMode alignReads      \
     --genomeDir /input_dir/corona_analysis/annotations/human/STAR_ix \
     --outSAMtype BAM SortedByCoordinate --runThreadN 14 \
     --outFileNamePrefix /input_dir/corona_analysis/alignment_out/BALF_corona_s2_\
     --outTmpDir /input_dir/corona_analysis/temp/star2 \
     --outReadsUnmapped Fastx --limitBAMsortRAM 27000000000 \
     --outWigType wiggle --quantMode TranscriptomeSAM   --readFilesCommand zcat \
     --readFilesIn CRR119896_filt_1.fq.gz,CRR119897_filt_1.fq.gz \
                   CRR119896_filt_2.fq.gz,CRR119897_filt_2.fq.gz


In [None]:
%%bash 

cd /input_dir/corona_analysis/alignment_out/
#Quantify infected BALF transcriptome
rsem-calculate-expression --paired-end \
                       --alignments \
                       -p 14 --no-bam-output \
                       BALF_corona_s1_Aligned.toTranscriptome.out.bam \
                      /input_dir/corona_analysis/annotations/human/RSEM_ix \
                       BALF_infected_s1_


In [None]:
%%bash

#Quantify infected BALF transcriptome
rsem-calculate-expression --paired-end \
                       --alignments \
                       -p 14 --no-bam-output \
                       BALF_corona_s2_Aligned.toTranscriptome.out.bam \
                      /input_dir/corona_analysis/annotations/human/RSEM_ix \
                       BALF_infected_s2_


# Differential expression between vero normal and infected

In [None]:
%%bash
cd /input_dir/corona_analysis/alignment_out/
rsem-generate-data-matrix BALF_control_quant/BALF_control_rsem_s1_.genes.results \
                          BALF_control_quant/BALF_control_rsem_s2_.genes.results \
                          BALF_control_quant/BALF_control_rsem_s3_.genes.results \
                          BALF_infected_quant/BALF_infected_s1_.genes.results \
                          BALF_infected_quant/BALF_infected_s2_.genes.results \
                            > BALF_rsem_gene_counts.txt

rsem-generate-data-matrix BALF_control_quant/BALF_control_rsem_s1_.isoforms.results \
                          BALF_control_quant/BALF_control_rsem_s2_.isoforms.results \
                          BALF_control_quant/BALF_control_rsem_s3_.isoforms.results \
                          BALF_infected_quant/BALF_infected_s1_.isoforms.results \
                          BALF_infected_quant/BALF_infected_s2_.isoforms.results \
                            > BALF_rsem_isoform_counts.txt


In [None]:
%%bash
cd /input_dir/corona_analysis/diffExp_out/
rsem-run-ebseq BALF_rsem_gene_counts.txt 3,2 BALF_corona_geneDiff_rsem.txt