# Processing Syed Single-cell RNA-seq rep1

This tutorial is a step-by-step guide to pre-processing single-cell RNA-seq data on Hydra.

In [2]:
import subprocess, os, sys, signal, pip
import rpy2
from joblib import Parallel, delayed
import multiprocessing
import threading
import time
import numpy as np

We set up the input and the output folders. The input folder contains all the fastq files, and the output folder is where all the results will be saved.

In [3]:
# intputFolder (sic): folder containing all the fastq files.
# The misspelled variable name is kept because the cell that lists the fastq files reads it.
intputFolder = '/mnt/fls01-bcf01/ngsdata/Analysis/2016/nextseq/161102_NB500968_0059_AHG7LJAFXX_analysis/Connor_Rogerson_17oct16/fastqs'

# outputFolder: root folder for the results; one sub-folder per pipeline step is created below it
outputFolder = '/mnt/mr01-home01/mqbsxsm2/scratch/Syed_scRNA_seq_rep1/output/'

## Custom Functions

Here we define all the custom functions

We use `lambda` to write a one-off function, i.e. a small function that is only needed in one place.

In [4]:
# Keep only the text before the first '.', e.g. 'A01_S17_R1_001.fastq.gz' -> 'A01_S17_R1_001'
remove_extension = lambda x: x.partition('.')[0]

In [5]:
def get_args(read1, read2, ref_genome, genome_dir, output_folders, output_root=None):
    '''Build the placeholder values for one pair of reads.

    Parameters
    ----------
    read1, read2 : str
        Paths of the R1 and R2 fastq files of one sample.
    ref_genome : str
        Path of the annotation file, stored under the key 'ref_genome'.
    genome_dir : str
        Path of the genome index folder, stored under the key 'genome_dir'.
    output_folders : iterable of str
        Names of the per-step output sub-folders. Each name becomes a key whose
        value is <output_root>/<folder>/<read1 file name up to its first '.'>.
    output_root : str, optional
        Folder below which the per-step sub-folders live. Defaults to the
        notebook-level `outputFolder`, which is what this function always used.

    Returns
    -------
    dict
        Input paths, reference paths and one output prefix per folder name.
    '''
    if output_root is None:
        # Backward-compatible default: fall back to the notebook-level setting.
        output_root = outputFolder

    # Output files of a sample are named after its R1 file, without extensions.
    r1_shortname = remove_extension(os.path.basename(read1))

    args = {
        'r1_input': read1,
        'r2_input': read2,
        'ref_genome': ref_genome,
        'genome_dir': genome_dir,
    }

    output_paths = {folder: os.path.join(output_root, folder, r1_shortname) for folder in output_folders}

    return dict(args, **output_paths)

## Setting up the Genome folder

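We set the reference annotation and the STAR genome index. Note: this text originally said the cells are mouse, but the paths below point to the human hg38 analysis set (GENCODE v24) — confirm that the reference matches the organism of the samples before running.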

In [6]:
# ref_genome: annotation file (GTF) substituted for {ref_genome} in the STAR and htseq command templates
ref_genome = '/mnt/data-sets/bcf/genomeIndexes/hg38_analysisSet/annotation/gencode.v24.annotation.gtf'
# genome_dir: STAR genome index folder substituted for {genome_dir} (passed to STAR as --genomeDir)
genome_dir = '/mnt/data-sets/bcf/genomeIndexes/hg38_analysisSet/STAR_gencode24'
# NOTE(review): both paths are hg38 (human); the commented-out alternatives that used to
# follow were exact duplicates of the two lines above and carried no other reference.

## Setting up the Software paths

Now we set the invocation for each tool we will be using. For this session we use Trimmomatic, STAR and htseq-count.

In [7]:
# Each value is a shell snippet: the `module load ...;` call(s) for the tool followed by
# the executable itself. The snippets are substituted into the command templates and
# executed through a shell, so the module is loaded right before the tool runs.
trimmomatic = 'module load apps/trimmomatic/0.36/noarch; trimmomatic'
STAR = 'module load apps/star/2.4.2a/gcc-4.8.5; STAR'
# htseq-count is started as a Python module and needs the pysam module loaded as well
htseq = 'module load apps/htseq/0.6.1p1/gcc-4.8.5+python-2.7.8+numpy-1.9.2; module load libs/pysam/0.8.3/gcc-4.8.5+python-2.7.8; python -m HTSeq.scripts.count'

In [8]:
# Tool name -> shell invocation; the names match the {trimmomatic}, {STAR} and {htseq}
# placeholders used in the command templates.
softwares = dict(
    trimmomatic=trimmomatic,
    STAR=STAR,
    htseq=htseq,
)

## Creating output folder

Next we create the output folders, one sub-folder per tool.

In [9]:
# One output sub-folder per pipeline step; the same names are used as placeholders
# in the command templates.
output_folders = [
    'Trimmomatic_Output',  # Trimmomatic
    'STAR_output',         # STAR
    'htseq_output',        # htseq-count
]

In [10]:
# Create one sub-folder per pipeline step below outputFolder (parents included).
# exist_ok=True keeps the cell re-runnable: without it os.makedirs raises
# FileExistsError as soon as the folders exist from a previous run.
for folder in output_folders:
    os.makedirs(os.path.join(outputFolder, folder), exist_ok=True)

## Reading all the fastq files for read1 and read2

We now list all the fastq files and split them into read 1 (R1) and read 2 (R2) files.

In [11]:
# Only the first os.walk() entry is used: the top level of the input folder,
# without descending into sub-folders. os.walk() yields nothing for a missing or
# unreadable folder, so that case is reported explicitly.
top_level = next(os.walk(intputFolder), None)
if top_level is None:
    raise FileNotFoundError('Input folder not found or not readable: ' + intputFolder)
root, folders, files = top_level

files = [f for f in files if not f.startswith('.')]  # remove hidden files if there are any
# Sorting both lists makes the R1 and R2 file of a sample line up by position.
reads1 = sorted(os.path.join(root, f) for f in files if 'R1' in f)
reads2 = sorted(os.path.join(root, f) for f in files if 'R2' in f)

# The run pairs reads1[i] with reads2[i] through zip(), which would silently drop
# or mis-pair files when the two lists have different lengths.
if len(reads1) != len(reads2):
    raise ValueError('Found {} R1 files but {} R2 files in {}'.format(
        len(reads1), len(reads2), intputFolder))

In [12]:
# Show both file lists, one per line, so the R1/R2 pairing can be checked by eye
print(reads1, reads2, sep='\n')

['/mnt/fls01-bcf01/ngsdata/Analysis/2016/nextseq/161102_NB500968_0059_AHG7LJAFXX_analysis/Connor_Rogerson_17oct16/fastqs/A01_S17_R1_001.fastq.gz', '/mnt/fls01-bcf01/ngsdata/Analysis/2016/nextseq/161102_NB500968_0059_AHG7LJAFXX_analysis/Connor_Rogerson_17oct16/fastqs/A02_S9_R1_001.fastq.gz', '/mnt/fls01-bcf01/ngsdata/Analysis/2016/nextseq/161102_NB500968_0059_AHG7LJAFXX_analysis/Connor_Rogerson_17oct16/fastqs/A03_S1_R1_001.fastq.gz', '/mnt/fls01-bcf01/ngsdata/Analysis/2016/nextseq/161102_NB500968_0059_AHG7LJAFXX_analysis/Connor_Rogerson_17oct16/fastqs/A04_S65_R1_001.fastq.gz', '/mnt/fls01-bcf01/ngsdata/Analysis/2016/nextseq/161102_NB500968_0059_AHG7LJAFXX_analysis/Connor_Rogerson_17oct16/fastqs/A05_S57_R1_001.fastq.gz', '/mnt/fls01-bcf01/ngsdata/Analysis/2016/nextseq/161102_NB500968_0059_AHG7LJAFXX_analysis/Connor_Rogerson_17oct16/fastqs/A06_S49_R1_001.fastq.gz', '/mnt/fls01-bcf01/ngsdata/Analysis/2016/nextseq/161102_NB500968_0059_AHG7LJAFXX_analysis/Connor_Rogerson_17oct16/fastqs/A07_S

## Setting up the command list

Here we set up the commands that we will use for the pre-processing. For this example we run `Trimmomatic`, the `STAR` aligner and `htseq-count`, in that order.

In [13]:
# Command templates, run in this order for every sample. The {placeholders} are
# filled with str.format(): tool invocations, input reads, reference paths and one
# output prefix per output folder.
cmds = [
    # Step 1 - Trimmomatic, paired-end mode: writes paired and unpaired files for R1 and R2.
    # (The last output used to be spelled '_r2_upaired.fq'; fixed to match the R1 naming.)
    '{trimmomatic} PE -phred33 {r1_input} {r2_input} {Trimmomatic_Output}_r1_paired.fq {Trimmomatic_Output}_r1_unpaired.fq {Trimmomatic_Output}_r2_paired.fq {Trimmomatic_Output}_r2_unpaired.fq ILLUMINACLIP:/opt/gridware/depots/4baff5c5/el7/pkg/apps/trimmomatic/0.36/noarch/share/adapters/TruSeq3-PE-2.fa:2:30:10 SLIDINGWINDOW:4:20 MINLEN:36', 
    # Step 2 - STAR: aligns the paired Trimmomatic outputs and writes an unsorted BAM.
    # The backslash at the end of each line continues the same string literal.
    '{STAR} --runMode alignReads \
    --runThreadN 1 \
    --genomeLoad NoSharedMemory \
    --genomeDir {genome_dir} \
    --sjdbGTFfile {ref_genome} \
    --readFilesIn {Trimmomatic_Output}_r1_paired.fq {Trimmomatic_Output}_r2_paired.fq \
    --outSAMtype BAM Unsorted \
    --outFileNamePrefix {STAR_output}',
    # Step 3 - htseq-count: counts reads per gene_id from the STAR BAM into a text file.
    '{htseq} --format=bam --stranded=reverse --type=exon --idattr=gene_id {STAR_output}Aligned.out.bam {ref_genome} > {htseq_output}.txt'
    ]

__We define the function in case we want to parallelize the run__

In [14]:
def run_cmd(cmds, args):
    '''Run the command templates for one sample, in order, through a shell.

    Parameters
    ----------
    cmds : iterable of str
        Command templates containing str.format() placeholders.
    args : dict
        Values for the placeholders.

    Returns
    -------
    list of int
        Exit code of every command that was executed. The run stops at the first
        non-zero exit code, because each step reads the files written by the
        previous one; the failing code is the last entry of the list.
    '''
    thread_name = threading.current_thread().name
    exit_codes = []
    for cmd in cmds:
        print(thread_name, 'Starting')
        command = cmd.format(**args)
        # shell=True is required: the tool strings contain `module load ...;` prefixes.
        exit_code = subprocess.call(command, shell=True)
        exit_codes.append(exit_code)
        if exit_code != 0:
            # Do not feed missing or partial output into the next step.
            print(thread_name, 'Command failed with exit code', exit_code,
                  '- skipping the remaining steps:', command)
            break
    return exit_codes

Looking at the Number of cores in the node

In [15]:
# Number of CPUs that multiprocessing reports for this node
n_cores = multiprocessing.cpu_count()
print("Total Cores:", n_cores)

Total Cores: 16


__Multithreaded run (disabled: the cell below is commented out and kept for reference)__

In [16]:
#for read1, read2 in zip(reads1, reads2):
#    args = get_args(read1, read2, ref_genome, genome_dir, output_folders)
#    args = dict(args, **softwares)
#    try:
#        t = threading.Thread(target=run_cmd, args = (cmds,args))
#        t.daemon = True       
#        t.start()        
#        #t.join(timeout=None)
#    except:
#        print("Cannot Start Thread")

## Sequential Run

In [16]:
# Process the samples one after another: build the placeholder values for each
# R1/R2 pair, add the tool invocations, then run all command templates.
for r1_path, r2_path in zip(reads1, reads2):
    args = get_args(r1_path, r2_path, ref_genome, genome_dir, output_folders)
    args.update(softwares)
    run_cmd(cmds, args)

MainThread Starting
MainThread Starting
MainThread Starting
MainThread Starting
MainThread Starting
MainThread Starting
MainThread Starting
MainThread Starting
MainThread Starting
MainThread Starting
MainThread Starting
MainThread Starting
MainThread Starting
MainThread Starting
MainThread Starting
MainThread Starting
MainThread Starting
MainThread Starting
MainThread Starting
MainThread Starting
MainThread Starting
MainThread Starting
MainThread Starting
MainThread Starting
MainThread Starting
MainThread Starting
MainThread Starting
MainThread Starting
MainThread Starting
MainThread Starting
MainThread Starting
MainThread Starting
MainThread Starting
MainThread Starting
MainThread Starting
MainThread Starting
MainThread Starting
MainThread Starting
MainThread Starting
MainThread Starting
MainThread Starting
MainThread Starting
MainThread Starting
MainThread Starting
MainThread Starting
MainThread Starting
MainThread Starting
MainThread Starting
MainThread Starting
MainThread Starting


In [60]:
# Show the CPU layout of the node (CPU count, sockets, cores per socket, NUMA nodes)
! lscpu

Architecture:          x86_64
CPU op-mode(s):        32-bit, 64-bit
Byte Order:            Little Endian
CPU(s):                16
On-line CPU(s) list:   0-15
Thread(s) per core:    1
Core(s) per socket:    8
Socket(s):             2
NUMA node(s):          2
Vendor ID:             GenuineIntel
CPU family:            6
Model:                 62
Stepping:              4
CPU MHz:               2599.968
BogoMIPS:              5199.24
Virtualization:        VT-x
L1d cache:             32K
L1i cache:             32K
L2 cache:              256K
L3 cache:              20480K
NUMA node0 CPU(s):     0-7
NUMA node1 CPU(s):     8-15


## Creating genome-index

In [49]:
# Build a STAR genome index from the mm10 fasta and the Gencode M5 annotation
# (4 threads, --sjdbOverhang 100), written to /mnt/single-cell/users/mqbsxsm2/STAR_Gencode_vM5.
# NOTE(review): this is not the index the pipeline cells use (genome_dir points to an
# hg38 index) and it loads the gcc-4.4.7 build of STAR rather than the gcc-4.8.5 build
# used in the pipeline - confirm which reference and build are intended.
! module load apps/star/2.4.2a/gcc-4.4.7; STAR  --runMode genomeGenerate   --runThreadN 4   --genomeDir /mnt/single-cell/users/mqbsxsm2/STAR_Gencode_vM5  --genomeFastaFiles /mnt/data-sets/bcf/genomeIndexes/mm10_random_chrM_chrUn/fasta/mm10_random_chrM_chrUn.fa     --sjdbGTFfile /mnt/data-sets/bcf/genomeIndexes/mm10_random_chrM_chrUn/annotation/Gencode-M5/gencode.vM5.annotation.gtf   --sjdbOverhang 100

apps/star/2.4.2a/gcc-4.4.7
 | -- libs/gcc/system
 |    * --> OK
 |
 OK
Jun 08 12:38:50 ..... Started STAR run
Jun 08 12:38:50 ... Starting to generate Genome files
Jun 08 12:40:05 ... starting to sort  Suffix Array. This may take a long time...
Jun 08 12:40:23 ... sorting Suffix Array chunks and saving them to disk...
Jun 08 13:07:05 ... loading chunks from disk, packing SA...
Jun 08 13:14:14 ... Finished generating suffix array
Jun 08 13:14:14 ... starting to generate Suffix Array index...
Jun 08 13:36:07 ..... Processing annotations GTF
Jun 08 13:36:23 ..... Inserting junctions into the genome indices
Jun 08 13:40:04 ... writing Genome to disk ...
Jun 08 13:40:31 ... writing Suffix Array to disk ...
Jun 08 13:43:53 ... writing SAindex to disk
Jun 08 13:44:10 ..... Finished successfully


### Testing htseq

In [65]:
# Stand-alone htseq-count check on a single BAM (C01_R1) from the
# Single_cell_Read_Effect output folder, counted against the mm10 Gencode M5
# annotation. The htseq-count options are the same as in the pipeline template;
# the module builds (gcc-4.4.7 / python-2.7.5) differ from the ones set for the pipeline.
! module load apps/htseq/0.6.1p1/gcc-4.4.7+python-2.7.5+numpy-1.7.1; module load libs/pysam/0.8.3/gcc-4.4.7+python-2.7.5; python -m HTSeq.scripts.count --format=bam --stranded=reverse --type=exon --idattr=gene_id /mnt/mr01-home01/mqbsxsm2/scratch/Single_cell_Read_Effect/AllLanes/output/STAR_output/C01_R1Aligned.out.bam /mnt/data-sets/bcf/genomeIndexes/mm10_random_chrM_chrUn/annotation/Gencode-M5/gencode.vM5.annotation.gtf > /mnt/mr01-home01/mqbsxsm2/scratch/Single_cell_Read_Effect/AllLanes/output/htseq_output/C01_R1.txt

apps/htseq/0.6.1p1/gcc-4.4.7+python-2.7.5+numpy-1.7.1
 | -- libs/gcc/system
 |    * --> OK
 | -- apps/python/2.7.5/gcc-4.4.7
 |    | -- libs/gcc/system ... SKIPPED (already loaded)
 |    * --> OK
 | -- libs/numpy/1.7.1/gcc-4.4.7+python-2.7.5+atlas-3.10.1
 |    | -- libs/gcc/system ... SKIPPED (already loaded)
 |    | -- apps/python/2.7.5/gcc-4.4.7 ... SKIPPED (already loaded)
 |    * --> OK
 |
 OK
libs/pysam/0.8.3/gcc-4.4.7+python-2.7.5
 | -- libs/gcc/system ... SKIPPED (already loaded)
 | -- apps/python/2.7.5/gcc-4.4.7 ... SKIPPED (already loaded)
 |
 OK
100000 GFF lines processed.
200000 GFF lines processed.
300000 GFF lines processed.
400000 GFF lines processed.
500000 GFF lines processed.
600000 GFF lines processed.
700000 GFF lines processed.
800000 GFF lines processed.
900000 GFF lines processed.
1000000 GFF lines processed.
1100000 GFF lines processed.
1200000 GFF lines processed.
1300000 GFF lines processed.
1400000 GFF lines processed.
1500000 GFF lines processed.
1523143 GFF 

In [23]:
# Run gffread (loaded with the cufflinks module) on gtf/All.gff and write
# gtf/AllConverted.gtf. Both paths are relative to the notebook's working directory.
! module load apps/cufflinks/2.2.1/gcc-4.4.7+boost-1.49.0+samtools-0.1.19+eigen-3.0.5; gffread -E gtf/All.gff -T -o gtf/AllConverted.gtf 
# Kept for reference: the same module load followed by `gffread -h`
#! module load apps/cufflinks/2.2.1/gcc-4.4.7+boost-1.49.0+samtools-0.1.19+eigen-3.0.5; gffread -h

apps/cufflinks/2.2.1/gcc-4.4.7+boost-1.49.0+samtools-0.1.19+eigen-3.0.5
 | -- libs/gcc/system
 |    * --> OK
 | -- libs/boost/1.49.0/gcc-4.4.7+openmpi-1.6.5+python-2.7.5
 |    | -- libs/gcc/system ... SKIPPED (already loaded)
 |    | -- mpi/openmpi/1.6.5/gcc-4.4.7
 |    |    | -- libs/gcc/system ... SKIPPED (already loaded)
 |    |    * --> OK
 |    * --> OK
 | -- apps/samtools/0.1.19/gcc-4.4.7
 |    | -- libs/gcc/system ... SKIPPED (already loaded)
 |    * --> OK
 |
 OK


In [None]:
# Print the notebook's working directory (the relative gtf/ paths in the gffread cell resolve against it)
! pwd