In [1]:
import os, subprocess 

import pandas as pd 

from ftplib import FTP
from tqdm import tqdm 

## 1. Run kallisto on reads of *Aedes aegypti*
    1. Collect the TSA master file
    2. Build the kallisto index
    3. Write the kallisto quant command lines to a bash script
        - Then run them in parallel: `$ parallel -j 8 < all_kallisto_quant.sh`

In [55]:
def collect_urls(tsa_id, n_parts=2):
    """Build the NCBI server paths of the FASTA parts of a TSA record.

    The function connects to the NCBI FTP server and changes into the
    directory of the record, which fails with an ftplib error when that
    directory cannot be entered. The part file names themselves are
    generated, not listed from the server.

    Parameters
    ----------
    tsa_id : str
        TSA master accession, e.g. ``'GFNA01'``. Characters 0-1 and 2-3 are
        used as the two directory levels under ``/sra/wgs_aux/``.
    n_parts : int, optional
        Number of ``<tsa_id>.<i>.fsa_nt.gz`` parts to generate, numbered
        from 1. Defaults to 2.

    Returns
    -------
    list of str
        Server-side paths, one per part, in increasing part order.
    """
    prefix = tsa_id[:2]
    suffix = tsa_id[2:4]
    url_base = f'/sra/wgs_aux/{prefix}/{suffix}/{tsa_id}/'

    print('Getting data URLs...')
    # The context manager closes the connection even when login() or cwd()
    # raises, so no socket is left open behind the notebook kernel.
    with FTP('ftp.ncbi.nih.gov') as ftp:
        ftp.login()
        ftp.cwd(url_base)

    return [f'{url_base}{tsa_id}.{part}.fsa_nt.gz' for part in range(1, n_parts + 1)]

        
def get_tsa_data(tsa_id, folder_output):
    """Print the rsync commands for a TSA record and return its file names.

    Nothing is downloaded here: for every path returned by ``collect_urls``
    the function prints a "Downloading" line and the matching rsync command
    line, so the transfer can be run by hand.

    Parameters
    ----------
    tsa_id : str
        TSA master accession passed on to ``collect_urls``.
    folder_output : str
        Destination folder, used only in the printed text.

    Returns
    -------
    list of str
        The last path component of each URL, in the order given by
        ``collect_urls``.
    """
    file_names = []
    for remote_path in collect_urls(tsa_id):
        source = f'rsync://ftp.ncbi.nlm.nih.gov{remote_path}'
        print(f'Downloading {source} {folder_output}')
        print(f'rsync --copy-links --recursive --times --verbose {source} {folder_output}')
        # Keep only the file name, i.e. whatever follows the final slash.
        file_names.append(remote_path.rsplit('/', 1)[-1])
    return file_names


def build_tsa_input():
    """Concatenate the GFNA01 TSA parts into one gzipped master FASTA file.

    The part file names come from ``get_tsa_data('GFNA01', ...)``, which only
    prints the rsync commands; the parts must already be present in the
    output folder. If ``GFNA01_transcripts.fasta.gz`` already exists there,
    nothing is done.

    All paths are absolute, so the working directory of the process is not
    changed by this function.

    Raises
    ------
    FileNotFoundError
        If no part is listed or a listed part is missing from the output
        folder. Checking first prevents an empty master file from being
        gzipped and later reported as "already generated".
    subprocess.CalledProcessError
        If ``zcat`` or ``gzip`` exits with a non-zero status.
    """
    folder_base = '/home/xqua/data/Oskar_Evolution'
    folder_output = f'{folder_base}/Data/02_Oskar_analyses/2.12/aedes_aegypti/'
    tsa = get_tsa_data('GFNA01', folder_output)

    transcript_output = 'GFNA01_transcripts.fasta.gz'
    gz_path = os.path.join(folder_output, transcript_output)
    if os.path.isfile(gz_path):
        print(f'{transcript_output} already generated')
        return

    print('\n\nGenerating TSA master file')
    part_paths = [os.path.join(folder_output, name) for name in tsa]
    missing = [path for path in part_paths if not os.path.isfile(path)]
    if not part_paths or missing:
        raise FileNotFoundError(
            'TSA part file(s) not found, run the printed rsync commands first: '
            + ', '.join(missing or [folder_output])
        )

    fasta_path = gz_path[:-len('.gz')]
    # Argument lists (no shell) avoid quoting problems; check=True stops the
    # pipeline before gzip runs on a partially written FASTA file.
    with open(fasta_path, 'wb') as fasta:
        subprocess.run(['zcat', *part_paths], stdout=fasta, check=True)
    subprocess.run(['gzip', fasta_path], check=True)
        
        
def run_kallisto_index():
    """Print the kallisto index command for the GFNA01 master file.

    The command is only displayed, together with the folder to run it in;
    this function does not execute kallisto itself.
    """
    data_root = '/home/xqua/data/Oskar_Evolution'
    work_dir = f'{data_root}/Data/02_Oskar_analyses/2.12/aedes_aegypti/'
    fasta_gz = 'GFNA01_transcripts.fasta.gz'
    index_name = 'GFNA01_transcripts.idx'

    # The banner shows the FASTA name without its .gz extension.
    fasta_name = fasta_gz.replace(".gz", "")
    print(f'\nBuilding index for {fasta_name}')

    index_cmd = f'kallisto index -i {index_name} {fasta_gz}'
    print(f'Run ${index_cmd} in {work_dir} and move to /home/xqua/data/')
    # Intentionally not executed here: subprocess.run(index_cmd, shell=True)
    
            
def kallisto_quant_cmds(work_dir='/home/xqua/data/', fastq_folders='/home/xqua/bucket/'):
    """Write one ``kallisto quant`` command per sequencing run to a bash script.

    A ``.csv`` run table with ``Run`` and ``LibraryLayout`` columns is looked
    up in ``fastq_folders``; each run is expected to have a sub-folder of the
    same name holding its FASTQ files. The commands are written to
    ``<work_dir>/all_kallisto_quant.sh`` and ``<work_dir>/quant/`` is created
    if needed. The working directory of the process is not changed.

    Parameters
    ----------
    work_dir : str, optional
        Folder holding ``GFNA01_transcripts.idx``; receives the script and
        the ``quant`` output folder.
    fastq_folders : str, optional
        Folder holding the run table and the per-run FASTQ folders.

    Raises
    ------
    FileNotFoundError
        If no ``.csv`` file is found in ``fastq_folders`` or a run folder
        is missing.
    """
    work_dir = os.path.abspath(work_dir)
    os.makedirs(os.path.join(work_dir, 'quant'), exist_ok=True)
    index = f'{work_dir}/GFNA01_transcripts.idx'

    # os.listdir order is arbitrary; sorting makes "last CSV wins" repeatable.
    library_layout = None
    for entry in tqdm(sorted(os.listdir(fastq_folders))):
        if entry.endswith('.csv'):
            sra_table_info = pd.read_csv(os.path.join(fastq_folders, entry), sep=',')
            library_layout = sra_table_info[['Run', 'LibraryLayout']]
    if library_layout is None:
        raise FileNotFoundError(f'No .csv run table found in {fastq_folders}')

    print('Writing bash command lines')
    # Build every command before touching the script, so a failure half-way
    # through does not leave a truncated file behind.
    commands = []
    for run, layout in library_layout.values:
        run_dir = os.path.join(fastq_folders, run)
        # Sorted so that paired files are always passed in the same order
        # (e.g. *_1 before *_2) regardless of the directory listing order.
        fastq = ' '.join(os.path.join(run_dir, name) for name in sorted(os.listdir(run_dir)))
        output = f'{work_dir}/quant/{run}_output'
        if layout == 'SINGLE':
            # -l / -s: fragment length mean and standard deviation, which
            # kallisto requires for single-end reads.
            cmd = f'kallisto quant -i {index} -o {output} --single -l 200 -s 20 {fastq}'
        elif layout == 'PAIRED':
            cmd = f'kallisto quant -i {index} -o {output} {fastq}'
        else:
            # Previously this reused the command of the preceding run.
            print(f'Skipping {run}: unsupported LibraryLayout {layout!r}')
            continue
        commands.append(cmd)

    with open(f'{work_dir}/all_kallisto_quant.sh', 'w') as script:
        script.writelines(f'{cmd} \n' for cmd in commands)
    
def check_abundance_file(quant_path='/home/xqua/data/quant'):
    """Report which kallisto output folders have no ``abundance.tsv`` yet.

    NOTE(review): the original definition was cut off after ``for folder``
    (a syntax error), so this behaviour is inferred from the function name
    and the loop it started. Confirm it matches the intent.

    Parameters
    ----------
    quant_path : str, optional
        Folder containing one sub-folder per quantified run.

    Returns
    -------
    list of str
        Names of the sub-folders without an ``abundance.tsv`` file, sorted
        alphabetically. Empty when ``quant_path`` does not exist.
    """
    if not os.path.isdir(quant_path):
        print(f'{quant_path} does not exist')
        return []

    run_folders = [
        folder for folder in sorted(os.listdir(quant_path))
        if os.path.isdir(os.path.join(quant_path, folder))
    ]
    missing = [
        folder for folder in run_folders
        if not os.path.isfile(os.path.join(quant_path, folder, 'abundance.tsv'))
    ]
    done = len(run_folders) - len(missing)
    print(f'{done}/{len(run_folders)} abundance.tsv files found')
    return missing

In [57]:
# Main

In [54]:
# Run the pipeline steps in order: build the gzipped TSA master file, print
# the kallisto index command to run by hand, then write the kallisto quant
# command lines to all_kallisto_quant.sh.
build_tsa_input()
run_kallisto_index()
kallisto_quant_cmds()

Getting data URLs...
Downloading /sra/wgs_aux/GF/NA/GFNA01/GFNA01.1.fsa_nt.gz /home/xqua/data/Oskar_Evolution/Data/02_Oskar_analyses/2.12/aedes_aegypti/
rsync --copy-links --recursive --times --verbose rsync://ftp.ncbi.nlm.nih.gov/sra/wgs_aux/GF/NA/GFNA01/GFNA01.1.fsa_nt.gz /home/xqua/data/Oskar_Evolution/Data/02_Oskar_analyses/2.12/aedes_aegypti/
Downloading /sra/wgs_aux/GF/NA/GFNA01/GFNA01.2.fsa_nt.gz /home/xqua/data/Oskar_Evolution/Data/02_Oskar_analyses/2.12/aedes_aegypti/
rsync --copy-links --recursive --times --verbose rsync://ftp.ncbi.nlm.nih.gov/sra/wgs_aux/GF/NA/GFNA01/GFNA01.2.fsa_nt.gz /home/xqua/data/Oskar_Evolution/Data/02_Oskar_analyses/2.12/aedes_aegypti/
Generating TSA master file


100%|██████████| 148/148 [00:00<00:00, 20016.03it/s]


Building index for GFNA01_transcripts.fasta
Run $kallisto index -i GFNA01_transcripts.idx GFNA01_transcripts.fasta.gz in /home/xqua/data/Oskar_Evolution/Data/02_Oskar_analyses/2.12/aedes_aegypti/ and move to /home/xqua/data/
Writing bash command lines





In [59]:
#### Check the number of abundance.tsv files generated so far

In [62]:
# List every .tsv file under the quant folder and pipe the list to wc, which
# prints line, word and byte counts; the first number is the file count.
!find /home/xqua/data/quant/ -type f -name '*.tsv' -print | wc

      4       4     216
