In [1]:
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:

# Read the RNA-seq sample sheet (one row per SRA run) into a DataFrame.
# The bare name on the last line renders the table as the cell output.
RNA_TABLE = pd.read_csv(
    filepath_or_buffer='./data/my_RNA_SRA_table.csv',
)
RNA_TABLE

Unnamed: 0,run_accession,SAMPLE_ID,TARGET,CELL_TYPE,REP,OLD_SAMPLE_ID
0,SRR900927,RNA_ESC_1_Wamstad_2013_PE,RNA,ESC,1,RNA_ESC_Rep_1_JW-H9B
1,SRR900928,RNA_ESC_2_Wamstad_2013_PE,RNA,ESC,2,RNA_ESC_Rep_2_JW-H10B
2,SRR900929,RNA_MES_1_Wamstad_2013_PE,RNA,MES,1,RNA_MES_Rep_1_JW-H3B
3,SRR900930,RNA_MES_2_Wamstad_2013_PE,RNA,MES,2,RNA_MES_Rep_2_JW-H4B
4,SRR900931,RNA_CP_1_Wamstad_2013_PE,RNA,CP,1,RNA_CP_Rep_1_JW-H5B
5,SRR900932,RNA_CP_2_Wamstad_2013_PE,RNA,CP,2,RNA_CP_Rep_2_JW-H6B
6,SRR900933,RNA_CM_1_Wamstad_2013_PE,RNA,CM,1,RNA_CM_Rep_1_JW-H7B
7,SRR900934,RNA_CM_2_Wamstad_2013_PE,RNA,CM,2,RNA_CM_Rep_2_JW-H8B


### Download SRA

In [3]:
import sys
import subprocess 
import shlex
# Directory the FASTQ files are downloaded to and later read from.
OUTDIR = '/usr/local/molbio/01_raw_data/Wamstad/'
# Thread count interpolated into the parallel-fastq-dump command line (kept as a string).
NT = '12'
# (start, stop) bounds used as range(start, stop) over RNA_TABLE index labels,
# i.e. rows 1..7 are processed and row 0 is skipped.
# NOTE(review): presumably row 0 was handled in an earlier run -- confirm.
RANGE = (1, 8)

# Prefix joined with SAMPLE_ID to locate a sample's mapping output directory.
MAPFILES = '/usr/local/molbio/02_map_files/mm10/'
# Destination the per-sample mapping directories are rsync'ed to.
SSD_DIR = '/Volumes/MariusSSD/prj/crg/02_map_files/mm10/Wamstad/'


In [None]:
%%time

for i,row in RNA_TABLE.iterrows():
    if i in range(RANGE[0],RANGE[1]):
        SAMPLE_ID = row['SAMPLE_ID']
        SRA = row['run_accession']
        print(f'\nIndex   = {i}')
        print(f'SRA       = {SRA}')
        print(f'SAMPLE_ID = {SAMPLE_ID}') 
        print(f'OUTDIR    = {OUTDIR}\n')
        ! parallel-fastq-dump --tmpdir . --sra-id {SRA} --threads {NT} --outdir {OUTDIR} --gzip --split-files

### Map with TopHat2

In [10]:
%%time

for i,row in RNA_TABLE.iterrows():
    if i in range(RANGE[0],RANGE[1]):
        try:
            SRA = row['run_accession']
            SAMPLE_ID = row['SAMPLE_ID']
            FQ1 = OUTDIR+SRA+'_1.fastq.gz'
            FQ2 = OUTDIR+SRA+'_2.fastq.gz'
            print(f'\nIndex     = {i}')
            print(f'SRA       = {SRA}')
            print(f'SAMPLE_ID = {SAMPLE_ID}')
            print(f'FQ_DIR    = {OUTDIR}')
            print(f'FQ1 = {FQ1}')
            print(f'FQ2 = {FQ2}')
            print(f'OUTDIR = {MAPFILES}{SAMPLE_ID}\n')
            #subprocess.run(['pigz', '-d', '-k', '-p','6', FQ1 +'.gz'])
            #subprocess.run(['pigz', '-d', '-k', '-p','6', FQ2 +'.gz'])
            #subprocess.run(shlex.split(f'ProcessRNAseq.pl -v -p mouse mm10 {FQ1} {FQ2} {SAMPLE_ID}'))
            ! ./ProcessRNAseq.pl -v -p mouse mm10 {FQ1} {FQ2} {SAMPLE_ID}
            #subprocess.run(['rm', '-f', FQ1, FQ2])
            ! rsync -avh --remove-source-files {MAPFILES}{SAMPLE_ID} {SSD_DIR}
        except Exception:
            print('Error')
            break


Index     = 1
SRA       = SRR900928
SAMPLE_ID = RNA_ESC_2_Wamstad_2013_PE
FQ_DIR    = /usr/local/molbio/01_raw_data/Wamstad/
FQ1 = /usr/local/molbio/01_raw_data/Wamstad/SRR900928_1.fastq.gz
FQ2 = /usr/local/molbio/01_raw_data/Wamstad/SRR900928_2.fastq.gz
OUTDIR = /usr/local/molbio/02_map_files/mm10/RNA_ESC_2_Wamstad_2013_PE

[1;32m%%%% ProcessRNAseq.pl by Enrique Blanco @ CRG (2018)

[0m[1;32m%%%% Stage 0.  Reading options[0m[1;32m [OK]

[0m[1;32m%%%% Creating the info directory
[0m[1;32m%%%% It is already existing
[0m[1;32m%%%% Stage 1.  Mapping the RNAseq sample/s (/usr/local/molbio/01_raw_data/Wamstad/SRR900928_1.fastq.gz,/usr/local/molbio/01_raw_data/Wamstad/SRR900928_2.fastq.gz -> $MAPFILES/mm10/RNA_ESC_2_Wamstad_2013_PE/accepted_hits.bam)
[0m[1;32m%%%% Running tophat2 --zpacker pigz --transcriptome-index=$MOUSE10/transcriptome_index/refGene --no-coverage-search --mate-inner-dist 176 --mate-std-dev 11 -p 3 -g 1 -o $MAPFILES/mm10/RNA_ESC_2_Wamstad_2013_PE --library-ty

### Assert all the SAMPLE IDs have a BAM file


In [4]:
import os

# Sample IDs listed in the sample sheet.
SAMPLE_IDS = pd.Index(RNA_TABLE['SAMPLE_ID'].tolist())
# Names of the entries found directly inside SSD_DIR.
# NOTE(review): this checks for an entry named after each sample, not for the
# BAM file inside it -- confirm that is sufficient.
BAMS = pd.Index(os.listdir(SSD_DIR))

# Compute the missing IDs explicitly so that a failure names them, and so that
# duplicate IDs in the sheet are not mistaken for missing entries.
_missing_ids = SAMPLE_IDS.difference(BAMS)
assert _missing_ids.empty, (
    f'{len(_missing_ids)} SAMPLE_ID(s) not found in {SSD_DIR}: {list(_missing_ids)}'
)

In [5]:

# Display the sample IDs that also appear among the entries of SSD_DIR (BAMS).
SAMPLE_IDS.intersection(BAMS)

Index(['RNA_ESC_1_Wamstad_2013_PE', 'RNA_ESC_2_Wamstad_2013_PE',
       'RNA_MES_1_Wamstad_2013_PE', 'RNA_MES_2_Wamstad_2013_PE',
       'RNA_CP_1_Wamstad_2013_PE', 'RNA_CP_2_Wamstad_2013_PE',
       'RNA_CM_1_Wamstad_2013_PE', 'RNA_CM_2_Wamstad_2013_PE'],
      dtype='object')