<a href="https://colab.research.google.com/github/djgarayb/RNA-Seq_introduction/blob/master/S3_Notebook_From_fastq_to_matrix_v0_8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install packages

In [0]:
pip install --upgrade rpy2

In [0]:
# Download the kallisto v0.46.1 Linux release archive from GitHub.
!wget https://github.com/pachterlab/kallisto/releases/download/v0.46.1/kallisto_linux-v0.46.1.tar.gz

In [0]:
# Unpack the kallisto archive; later cells call the binary as kallisto/kallisto.
!tar -xf kallisto_linux-v0.46.1.tar.gz

In [0]:
# Download the FastQC v0.11.9 zip archive.
!wget https://www.bioinformatics.babraham.ac.uk/projects/fastqc/fastqc_v0.11.9.zip

In [0]:
# Extract FastQC; later cells use the launcher at FastQC/fastqc.
!unzip fastqc_v0.11.9.zip

In [0]:
# Mark the FastQC launcher script as executable so it can be run directly.
!chmod +x FastQC/fastqc

In [0]:
# Install MultiQC, which is run in the QC section of this notebook.
!pip install multiqc

# Get the cDNA file

In [0]:

# Download the human (GRCh38) cDNA FASTA from Ensembl release 95.
!wget ftp://ftp.ensembl.org/pub/release-95/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz

In [0]:
# Build the kallisto index (written to kallisto_index.idx) from the gzipped
# cDNA FASTA. --make-unique replaces repeated target names with unique ones.
!kallisto/kallisto index --make-unique -i kallisto_index.idx Homo_sapiens.GRCh38.cdna.all.fa.gz

In [0]:
# Install BiocManager and update the installed Bioconductor packages without
# opening an interactive R session. This reproduces the answers of the former
# typed transcript: "a" (update all) at the update prompt, and no workspace
# saved on exit (Rscript never saves the workspace).
!Rscript -e 'options(Ncpus = 2)' -e 'install.packages("BiocManager", repos = "https://cloud.r-project.org/")' -e 'BiocManager::install(update = TRUE, ask = FALSE)'

In [0]:
%load_ext rpy2.ipython

In [0]:
%%R
# Install the CRAN/Bioconductor packages used by the R cells of this notebook.
# Ncpus: number of parallel processes used when installing several packages.
options(Ncpus = 4)
# Fetch CRAN packages over HTTPS instead of plain HTTP.
options(repos = c(CRAN = "https://cloud.r-project.org/"))
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}

BiocManager::install(c(
  "tximport", "biomaRt", "dplyr", "tidyverse",
  "ensembldb", "EnsDb.Hsapiens.v86", "rhdf5"
))

# Creating folder structure

In [0]:
# Create the input (fastq) and output (kalisto_results) directories.
# -p keeps the cell safe to re-run: no error if a directory already exists.
!mkdir -p fastq
!mkdir -p kalisto_results

In [0]:
%cd fastq 

# Download fastq

sample1


In [0]:
# Fetch both FASTQ files (_1 and _2) of run SRR6914400 into sample1/,
# relative to the current directory (set to fastq by the preceding %cd cell).
!wget -P sample1/ ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR691/000/SRR6914400/SRR6914400_1.fastq.gz
!wget -P sample1/ ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR691/000/SRR6914400/SRR6914400_2.fastq.gz

sample2

In [0]:
# Fetch both FASTQ files (_1 and _2) of run SRR6914401 into sample2/.
!wget -P sample2/ ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR691/001/SRR6914401/SRR6914401_1.fastq.gz
!wget -P sample2/ ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR691/001/SRR6914401/SRR6914401_2.fastq.gz

sample3


In [0]:
# Fetch both FASTQ files (_1 and _2) of run SRR6914402 into sample3/.
!wget -P sample3/ ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR691/002/SRR6914402/SRR6914402_1.fastq.gz
!wget -P sample3/ ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR691/002/SRR6914402/SRR6914402_2.fastq.gz

sample4

In [0]:
# Fetch both FASTQ files (_1 and _2) of run SRR6914403 into sample4/.
!wget -P sample4/ ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR691/003/SRR6914403/SRR6914403_1.fastq.gz
!wget -P sample4/ ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR691/003/SRR6914403/SRR6914403_2.fastq.gz

In [0]:
%cd ..

# Alignment

## Alignment for running with tximport

In [0]:
%%time
%%!
# Quantify each sample directory under fastq/ with kallisto, passing both
# mate files. Results go to /content/kalisto_results/<run accession>; the
# tool output is appended to a file named "log" inside each sample directory.
cd fastq
for DIR in $(ls -d */); do
  echo $DIR
  cd $DIR
  # The glob patterns are stored unexpanded; the shell expands them to the
  # real file names wherever the variables are used unquoted below.
  seq_file1=*_1.fastq.gz
  seq_file2=*_2.fastq.gz
  # Drop the trailing 11 characters ("_1.fastq.gz") to get the output name.
  base=$(echo $seq_file1|awk '{print substr($0, 1, length()-11)}')
  echo $base
  echo "Running Kallisto with the following code"
  # Show the command as it is actually run: the output directory is named
  # after the run accession (base), not after the sample directory.
  echo "/content/kallisto/kallisto quant -i /content/kallisto_index.idx --bias -o /content/kalisto_results/$base -t 4 $seq_file1 $seq_file2 >>log 2>&1"
  echo $seq_file1
  echo $seq_file2
  /content/kallisto/kallisto quant -i /content/kallisto_index.idx --bias -o /content/kalisto_results/$base -t 4 $seq_file1 $seq_file2 >>log 2>&1
  cd ..
done

## Alignment for running with Sleuth

In [0]:
%%!
# Per-sample kallisto quantification with 50 bootstrap rounds (-b 50).
# Results go to /content/kalisto_results/<sample directory>; the tool output
# is appended to a file named "log" inside each sample directory.
cd fastq
for sample_dir in $(ls -d */)
do
  echo $sample_dir
  cd $sample_dir
  # Unexpanded glob patterns; the shell expands them on unquoted use.
  mate1=*_1.fastq.gz
  mate2=*_2.fastq.gz
  echo "Running Kallisto with the following code"
  echo "../../kallisto/kallisto quant -i ../../kallisto_index.idx --bias -b 50 -o /content/kalisto_results/$sample_dir -t 2 $mate1 $mate2"
  echo $mate1
  echo $mate2
  ../../kallisto/kallisto quant -i ../../kallisto_index.idx --bias -b 50 -o /content/kalisto_results/$sample_dir -t 2 $mate1 $mate2 >>log 2>&1
  cd ..
done

# QC

## FastQC

In [0]:
ls

In [0]:
%%!
# Run FastQC on every gzipped FASTQ file, one sample directory at a time.
cd fastq
for sample_dir in $(ls -d */)
do
  echo $sample_dir
  cd $sample_dir
  /content/FastQC/fastqc *.fastq.gz
  cd ..
done

In [0]:
ls

## MultiQC

In [0]:
# Run MultiQC on the current directory so that the reports found beneath it
# are collected into one summary report.
!multiqc .

# Load packages


In [0]:
%%R
# Load all the R libraries we will be using in the notebook
library(tximport)
library(biomaRt)
library(Biobase)
library(ggplot2)
library(dplyr)
library(tidyverse) 
library(Biostrings)
library(ensembldb)
library(EnsDb.Hsapiens.v86) 
library(rhdf5)


## Set WD

In [0]:
%%R
# Work from the kallisto output directory: the following cells use relative
# paths (<folder>/abundance.h5 inputs and the exported CSV files).
setwd("/content/kalisto_results")

## Transcript-to-gene (T2G) table

In [0]:
%%R
# Build the transcript-to-gene (T2G) lookup table passed to tximport as
# `tx2gene`.
# The exploratory calls are wrapped in print() because an %%R cell only
# displays the value of its last expression automatically.
print(listTables(EnsDb.Hsapiens.v86))
print(listColumns(EnsDb.Hsapiens.v86, "tx"))
Tx <- transcripts(
  EnsDb.Hsapiens.v86,
  columns = c(listColumns(EnsDb.Hsapiens.v86, "tx"), "gene_name")
)
Tx <- as_tibble(Tx)
# Keep two columns only: transcript ID first, gene name second.
Tx <- dplyr::rename(Tx, target_id = tx_id)
Tx <- dplyr::select(Tx, target_id, gene_name)
print(dim(Tx))
head(Tx)

## Create metadata

In [0]:
%%R
# Sample sheet: one row per sample, pairing the sample label with the name of
# the run folder that holds its kallisto output.
metadata <- data.frame(
  sample = c("sample1", "sample2", "sample3", "sample4"),
  folder = c("SRR6914400", "SRR6914401", "SRR6914402", "SRR6914403")
)
metadata

## Upload metadata from csv

In [0]:
%%R
# Alternative to the hand-built table: load the sample sheet from a CSV file.
# Replace "Path to file" with the real path. The following cells read the
# `folder` column to locate each abundance.h5 file and to label the columns.
metadata <- read.csv("Path to file")

## Check paths

In [0]:
%%R
# Expected location of each run's kallisto output, relative to the working
# directory: <folder>/abundance.h5.
path <- file.path(metadata$folder, "abundance.h5")
found <- file.exists(path)
# Name the missing files instead of only reporting FALSE.
if (!all(found)) {
  message("Missing kallisto output: ", paste(path[!found], collapse = ", "))
}
all(found)

In [0]:
%%R
# Import the kallisto quantifications listed in `path` and summarise them
# per gene using the Tx lookup table.
Txi_gene <- tximport(
  path,
  type = "kallisto",
  tx2gene = Tx,
  ignoreTxVersion = TRUE,  # match transcript IDs without their version suffix
  countsFromAbundance = "lengthScaledTPM",
  txOut = FALSE  # How does the result change if this =FALSE vs =TRUE?
)

# Column sums of the resulting counts matrix
colSums(Txi_gene$counts)

In [0]:
%%R
# Export the counts matrix from tximport, with columns labelled by run folder.
raw.data <- Txi_gene$counts
colnames(raw.data) <- metadata$folder
# print() is needed here: an %%R cell only auto-prints its last expression.
print(dim(raw.data))
write.csv(raw.data, file = "Data_kallisto_counts.csv")

In [0]:
%%R
head(raw.data)

In [0]:
%%R -o raw_data
# Export the abundance matrix from tximport, with columns labelled by run
# folder. The -o flag also hands `raw_data` back to the Python session.
raw_data <- Txi_gene$abundance
colnames(raw_data) <- metadata$folder
# print() is needed here: an %%R cell only auto-prints its last expression.
print(dim(raw_data))
write.csv(raw_data, file = "Data_kallisto_TPM.csv")

In [0]:
%%R
head(raw_data)