# I) Overview
- This notebook is for HUMAN paired-end 75 bp RNA-seq libraries.
- If your read length is different, change the `read_length` value (75) in the main function, `run_RNAseq`.

In [None]:
import sys
import pandas as pd
from multiprocessing.pool import Pool
import os

### QoRTs must be installed (uncomment the next cell to install it with conda)

In [None]:
#!conda install -c bioconda qorts --yes

In [None]:
# Load the tab-separated sample sheet; later cells read its 'figueroa_id' column.
sample_sheet_file = '/rnaseq_data/SampleID.txt'
config = pd.read_csv(sample_sheet_file, sep="\t")

In [None]:
# Preview the first five rows of the sample sheet as a sanity check.
config.head(n=5)

In [None]:
# Directory holding the raw FASTQ files; the trailing '/' is required because
# file names are appended to it by plain string concatenation.
path_to_fastq: str = '/rnaseq_data/fastq/'

In [None]:
# Derive the per-sample input FASTQ paths and the sample name from the
# 'figueroa_id' column of the sample sheet.
sample_ids = config['figueroa_id']
config['fastq_R1'] = path_to_fastq + sample_ids + '_R1.fastq.gz'
config['fastq_R2'] = path_to_fastq + sample_ids + '_R2.fastq.gz'
config['name'] = sample_ids

In [None]:
# Locations used by the pipeline. Every value keeps its trailing '/' because
# run_RNAseq appends file names to these strings directly.
genome_path = '/genome/STAR_v2.7.3/Indices/'  # STAR index; change for a different genome/species
fastq_path = '/rnaseq_data/fastq/'  # where trimmed FASTQ files are written; change as necessary
working_dir = '/rnaseq_data/'  # parent directory of the output folders

In [None]:
# Output folders, created under working_dir: STAR alignments and QoRTs counts.
# The trailing '/' on the folder names is kept on purpose: run_RNAseq builds
# file paths by concatenating names onto aligned_path / counts_path.
aligned_dir = 'STAR/'
counts_dir = 'counts/'
aligned_path = os.path.join(working_dir, aligned_dir)
counts_path = os.path.join(working_dir, counts_dir)
# exist_ok=True lets this cell be re-run (or the notebook be restarted)
# without raising FileExistsError when the folders are already there.
os.makedirs(aligned_path, exist_ok=True)
os.makedirs(counts_path, exist_ok=True)

### Set `DEBUG` to False in order to actually run the commands (True only prints them)

In [None]:
# Dry-run switch read by run_RNAseq: the shell commands are always printed,
# but they are only executed when DEBUG is False.
# Use `DEBUG = True` instead to just inspect the commands.
DEBUG = False

In [None]:
def _run_step(cmd):
    """Print one shell command and, unless DEBUG is set, execute it.

    Returns the status reported by os.system (0 means success). In DEBUG
    mode nothing is executed and 0 is returned.
    """
    print('=' * 30)
    print(cmd)
    if DEBUG:
        return 0
    return os.system(cmd)


def run_RNAseq(x, read_length=75):
    """Run the RNA-seq pipeline for one paired-end sample.

    Steps: (1) fastqc on both FASTQ files, (2) adapter trimming with
    cutadapt, (3) alignment with STAR, (4) name-sorting the BAM with
    samtools, (5) stranded paired-end QC/counting with QoRTs.

    Every command is printed before it is run; when the module-level DEBUG
    flag is true the commands are only printed.

    Args:
        x: mapping-like record (e.g. one row of ``config``) with the keys
            'fastq_R1' and 'fastq_R2' (paths of the two input FASTQ files)
            and 'name' (sample name used to build every output file name).
        read_length: read length in bp. It sets the cutadapt length
            thresholds (``-m read_length - 3``, ``--length read_length - 2``)
            and QoRTs ``--maxReadLength``. Defaults to 75.

    Returns:
        None. If a step from trimming onwards reports a non-zero status the
        remaining steps for this sample are skipped, since each of them
        consumes the previous step's output.

    Module-level settings used: DEBUG, fastq_path, aligned_path,
    counts_path and genome_path (the path strings must end with '/').
    The QoRTs jar and the GTF annotation paths are hard-coded below.
    """
    name = x['name']
    fastq_R1_fullpath = x['fastq_R1']
    fastq_R2_fullpath = x['fastq_R2']

    # Intermediate files shared between steps, built once.
    trim_R1_fullpath = fastq_path + name + '_R1_trim.fastq.gz'
    trim_R2_fullpath = fastq_path + name + '_R2_trim.fastq.gz'
    aligned_name = aligned_path + name + '_Aligned.sortedByCoord.out.bam'
    sorted_bam = aligned_path + name + '.sorted.bam'

    # 1) Perform fastqc on both R1 and R2 fastq files.
    # The single '&' starts the R1 job in the background while R2 runs in
    # the foreground, so only the R2 job's status is reported.
    cmd1 = f'fastqc  --noextract -f fastq -t 5 {fastq_R1_fullpath} & fastqc  --noextract -f fastq -t 5 {fastq_R2_fullpath}'
    if _run_step(cmd1) != 0:
        # The QC report is not an input of any later step, so keep going.
        print(f'WARNING [{name}]: fastqc reported a failure; continuing')

    # 2) Trim adapters (R1 and R2 together).
    min_length = read_length - 3
    trim_length = read_length - 2
    cmd2 = f'cutadapt -a AGATCGGAAGAGC -A AGATCGGAAGAGC -m {min_length} --length {trim_length} -o {trim_R1_fullpath}  -p {trim_R2_fullpath}  {fastq_R1_fullpath} {fastq_R2_fullpath}'

    # 3) Align trimmed fastq with STAR. The command runs from aligned_path
    # so that the relative --outFileNamePrefix puts all outputs there.
    cmd3 = (
        f'cd {aligned_path}; STAR --runThreadN 5  --genomeDir {genome_path} '
        f'--readFilesIn {trim_R1_fullpath} {trim_R2_fullpath} '
        f'--outFileNamePrefix {name}_ --outFilterType BySJout '
        '--outFilterMultimapNmax 20 --alignSJoverhangMin 8 '
        '--alignSJDBoverhangMin 1 --outFilterMismatchNmax 999 '
        '--alignIntronMin 20 --alignIntronMax 1000000 '
        '--alignMatesGapMax 1000000  --readFilesCommand gunzip -c  '
        '--outSAMtype BAM SortedByCoordinate --outWigType bedGraph '
        '--outWigStrand Stranded --outWigNorm RPM --alignEndsType EndToEnd; cd ../'
    )

    # 4) Name sort the .bam file (QoRTs is run with --nameSorted below).
    cmd4 = f'samtools sort -n {aligned_name} -@ 5 -o {sorted_bam}'

    # 5) Call counts with QoRTs (stranded, paired-end). Each sample gets its
    # own output directory; 'mkdir -p' keeps re-runs from printing an error
    # when that directory already exists.
    cmd5 = f'cd {counts_path}; mkdir -p {name}; cd ../; java -jar /opt/conda/pkgs/qorts-1.3.0-2/share/qorts-1.3.0-2/QoRTs.jar  QC  --nameSorted --maxReadLength {read_length} --stranded --generatePlots  {sorted_bam}  /genome/hg19_GTF/gencode.v19.ann_wERCC_wo_rRNA.gtf {counts_path}{name}/'

    dependent_steps = (
        ('cutadapt', cmd2),
        ('STAR', cmd3),
        ('samtools sort', cmd4),
        ('QoRTs', cmd5),
    )
    for label, cmd in dependent_steps:
        if _run_step(cmd) != 0:
            print(f'ERROR [{name}]: {label} failed; skipping the remaining steps for this sample')
            return

In [None]:
# Number of worker processes, i.e. how many samples are processed at once.
PROCESSORS: int = 5

In [None]:
# Worker pool that runs the per-sample pipeline in parallel.
p_dup = Pool(processes=PROCESSORS)

In [None]:
# Fan the pipeline out over the pool: one task per row of the sample sheet.
sample_rows = [config.iloc[row_idx] for row_idx in range(len(config))]
p_dup.map(run_RNAseq, sample_rows)