In [1]:
# 2. Prepare for denoising
import os
import pandas as pd
import qiime2
from qiime2.plugins import demux, vsearch, quality_filter
from tqdm import tqdm

# All folders are resolved relative to the current working directory.
base_dir = os.getcwd()

# Top-level folders:
#   data_dir       - various configuration files
#   results_dir    - intermediate results and final outputs
#   input_path_dir - files holding the absolute paths of the sequence files
data_dir, results_dir, input_path_dir = (
    os.path.join(base_dir, folder)
    for folder in ('data', 'results', 'input_path')
)

# Per-stage output folders inside results_dir:
#   demux_seqs_dir                       - raw sequences imported via QIIME 2
#   per_sample_sequences_demux_dir       - visualization files generated by QIIME 2 from raw sequences
#   quality_filtered_sequences_dir       - sequences after quality filtering
#   quality_filtered_sequences_demux_dir - visualization files after quality filtering
(
    demux_seqs_dir,
    per_sample_sequences_demux_dir,
    quality_filtered_sequences_dir,
    quality_filtered_sequences_demux_dir,
) = (
    os.path.join(results_dir, stage)
    for stage in (
        'demux_seqs',
        'per_sample_sequences_demux',
        'quality_filtered_sequences',
        'quality_filtered_sequences_demux',
    )
)

# Run configuration table; the processing loop reads its 'project' and
# 'seq_type' columns, one row per project.
config_df = pd.read_csv(
    os.path.join(data_dir, 'config_temp.csv')
)

In [2]:
# Step 2: import every project's reads, merge read pairs where needed, then
# quality-filter the result.
#
# For each row of config_df this cell:
#   1. imports the FASTQ manifest <input_path_dir>/<project>.txt as a QIIME 2
#      artifact; anything whose seq_type is not 'single' is imported as
#      paired-end and merged with vsearch,
#   2. saves the (merged) sequences and their demux summary visualization,
#   3. runs q_score quality filtering and saves the filtered sequences together
#      with their demux summary visualization.
# Every output file is named after the project.

MERGE_THREADS = 8   # threads passed to vsearch merge_pairs
MIN_QUALITY = 25    # min_quality passed to quality_filter q_score

# The .save() calls below write into these folders; create them up front so
# this cell does not depend on them having been made beforehand.
for output_dir in (demux_seqs_dir,
                   per_sample_sequences_demux_dir,
                   quality_filtered_sequences_dir,
                   quality_filtered_sequences_demux_dir):
    os.makedirs(output_dir, exist_ok=True)

n_projects = len(config_df)

# iterrows() is a generator without a length, so pass total= explicitly to give
# tqdm a real progress bar. enumerate() supplies the row position, which keeps
# the "i / n" message correct even if the DataFrame index is not 0..n-1.
for index, (row_label, row) in enumerate(tqdm(config_df.iterrows(), total=n_projects)):
    project = row['project']
    manifest_path = os.path.join(input_path_dir, project + '.txt')

    if row['seq_type'] == 'single':
        demux_seqs = qiime2.Artifact.import_data(type='SampleData[SequencesWithQuality]',
                                                 view_type='SingleEndFastqManifestPhred33V2',
                                                 view=manifest_path)
    else:
        # Paired-end reads: import both directions, then keep only the merged reads.
        demux_seqs_t = qiime2.Artifact.import_data(type='SampleData[PairedEndSequencesWithQuality]',
                                                   view_type='PairedEndFastqManifestPhred33V2',
                                                   view=manifest_path)
        demux_seqs = vsearch.methods.merge_pairs(demux_seqs_t,
                                                 threads=MERGE_THREADS).merged_sequences

    # Persist the imported/merged sequences and their summary visualization.
    demux_seqs.save(os.path.join(demux_seqs_dir, project))
    demux_summary = demux.visualizers.summarize(demux_seqs)
    demux_summary.visualization.save(os.path.join(per_sample_sequences_demux_dir, project))

    # Quality-filter, then persist the filtered sequences and their summary.
    # quality_filter_stats is returned by q_score but is not saved here.
    quality_filtered_sequences, quality_filter_stats = quality_filter.methods.q_score(
        demux_seqs, min_quality=MIN_QUALITY)
    quality_filtered_sequences.save(os.path.join(quality_filtered_sequences_dir, project))
    quality_filtered_sequences_demux = demux.visualizers.summarize(quality_filtered_sequences)
    quality_filtered_sequences_demux.visualization.save(
        os.path.join(quality_filtered_sequences_demux_dir, project))

    print(project + '\t' + str(index + 1) + ' / ' + str(n_projects) + ' done')
print('STEP 2 Done!')


0it [00:00, ?it/s]

Running external command line application. This may print messages to stdout and/or stderr.
The command being run is below. This command cannot be manually re-run as it will depend on temporary files that no longer exist.

Command: vsearch --fastq_mergepairs /tmp/qiime2/gaoyuze/data/dc09e27f-8d3c-4322-97d3-844a82f3917c/data/SRR23216662_0_L001_R1_001.fastq.gz --reverse /tmp/qiime2/gaoyuze/data/dc09e27f-8d3c-4322-97d3-844a82f3917c/data/SRR23216662_6_L001_R2_001.fastq.gz --fastqout /tmp/q2-SingleLanePerSampleSingleEndFastqDirFmt-_rl9gsfd/SRR23216662_0_L001_R1_001.fastq --fastq_ascii 33 --fastq_minlen 1 --fastq_minovlen 10 --fastq_maxdiffs 10 --fastq_qmin 0 --fastq_qminout 0 --fastq_qmax 41 --fastq_qmaxout 41 --fasta_width 0 --threads 8



vsearch v2.22.1_linux_x86_64, 503.8GB RAM, 20 cores
https://github.com/torognes/vsearch

Merging reads 100%
     61949  Pairs
     50853  Merged (82.1%)
     11096  Not merged (17.9%)

Pairs that failed merging due to various reasons:
       259  too few kmers found on same diagonal
         1  multiple potential alignments
      3341  too many differences
      7469  alignment score too low, or score drop too high
        26  staggered read pairs

Statistics of all reads:
    292.95  Mean read length

Statistics of merged reads:
    403.38  Mean fragment length
      8.92  Standard deviation of fragment length
      0.35  Mean expected error in forward sequences
      2.10  Mean expected error in reverse sequences
      0.15  Mean expected error in merged sequences
      0.18  Mean observed errors in merged region of forward sequences
      1.60  Mean observed errors in merged region of reverse sequences
      1.78  Mean observed errors in merged region


Running external command line application. This may print messages to stdout and/or stderr.
The command being run is below. This command cannot be manually re-run as it will depend on temporary files that no longer exist.

Command: gzip /tmp/q2-SingleLanePerSampleSingleEndFastqDirFmt-_rl9gsfd/SRR23216662_0_L001_R1_001.fastq

Running external command line application. This may print messages to stdout and/or stderr.
The command being run is below. This command cannot be manually re-run as it will depend on temporary files that no longer exist.

Command: vsearch --fastq_mergepairs /tmp/qiime2/gaoyuze/data/dc09e27f-8d3c-4322-97d3-844a82f3917c/data/SRR23216663_1_L001_R1_001.fastq.gz --reverse /tmp/qiime2/gaoyuze/data/dc09e27f-8d3c-4322-97d3-844a82f3917c/data/SRR23216663_7_L001_R2_001.fastq.gz --fastqout /tmp/q2-SingleLanePerSampleSingleEndFastqDirFmt-_rl9gsfd/SRR23216663_1_L001_R1_001.fastq --fastq_ascii 33 --fastq_minlen 1 --fastq_minovlen 10 --fastq_maxdiffs 10 --fastq_qmin 0 --fastq_qmi

vsearch v2.22.1_linux_x86_64, 503.8GB RAM, 20 cores
https://github.com/torognes/vsearch

Merging reads 100%
     70753  Pairs
     55308  Merged (78.2%)
     15445  Not merged (21.8%)

Pairs that failed merging due to various reasons:
       399  too few kmers found on same diagonal
      5505  too many differences
      9536  alignment score too low, or score drop too high
         1  overlap too short
         4  staggered read pairs

Statistics of all reads:
    292.83  Mean read length

Statistics of merged reads:
    402.24  Mean fragment length
     13.25  Standard deviation of fragment length
      0.27  Mean expected error in forward sequences
      2.49  Mean expected error in reverse sequences
      0.15  Mean expected error in merged sequences
      0.12  Mean observed errors in merged region of forward sequences
      2.12  Mean observed errors in merged region of reverse sequences
      2.25  Mean observed errors in merged region


Running external command line application. This may print messages to stdout and/or stderr.
The command being run is below. This command cannot be manually re-run as it will depend on temporary files that no longer exist.

Command: gzip /tmp/q2-SingleLanePerSampleSingleEndFastqDirFmt-_rl9gsfd/SRR23216663_1_L001_R1_001.fastq

Running external command line application. This may print messages to stdout and/or stderr.
The command being run is below. This command cannot be manually re-run as it will depend on temporary files that no longer exist.

Command: vsearch --fastq_mergepairs /tmp/qiime2/gaoyuze/data/dc09e27f-8d3c-4322-97d3-844a82f3917c/data/SRR23216664_2_L001_R1_001.fastq.gz --reverse /tmp/qiime2/gaoyuze/data/dc09e27f-8d3c-4322-97d3-844a82f3917c/data/SRR23216664_8_L001_R2_001.fastq.gz --fastqout /tmp/q2-SingleLanePerSampleSingleEndFastqDirFmt-_rl9gsfd/SRR23216664_2_L001_R1_001.fastq --fastq_ascii 33 --fastq_minlen 1 --fastq_minovlen 10 --fastq_maxdiffs 10 --fastq_qmin 0 --fastq_qmi

vsearch v2.22.1_linux_x86_64, 503.8GB RAM, 20 cores
https://github.com/torognes/vsearch

Merging reads 100%
     66670  Pairs
     55407  Merged (83.1%)
     11263  Not merged (16.9%)

Pairs that failed merging due to various reasons:
       172  too few kmers found on same diagonal
      4627  too many differences
      6367  alignment score too low, or score drop too high
        97  staggered read pairs

Statistics of all reads:
    292.93  Mean read length

Statistics of merged reads:
    400.64  Mean fragment length
      7.02  Standard deviation of fragment length
      0.34  Mean expected error in forward sequences
      2.27  Mean expected error in reverse sequences
      0.15  Mean expected error in merged sequences
      0.17  Mean observed errors in merged region of forward sequences
      1.86  Mean observed errors in merged region of reverse sequences
      2.04  Mean observed errors in merged region


Running external command line application. This may print messages to stdout and/or stderr.
The command being run is below. This command cannot be manually re-run as it will depend on temporary files that no longer exist.

Command: gzip /tmp/q2-SingleLanePerSampleSingleEndFastqDirFmt-_rl9gsfd/SRR23216664_2_L001_R1_001.fastq

Running external command line application. This may print messages to stdout and/or stderr.
The command being run is below. This command cannot be manually re-run as it will depend on temporary files that no longer exist.

Command: vsearch --fastq_mergepairs /tmp/qiime2/gaoyuze/data/dc09e27f-8d3c-4322-97d3-844a82f3917c/data/SRR23216665_3_L001_R1_001.fastq.gz --reverse /tmp/qiime2/gaoyuze/data/dc09e27f-8d3c-4322-97d3-844a82f3917c/data/SRR23216665_9_L001_R2_001.fastq.gz --fastqout /tmp/q2-SingleLanePerSampleSingleEndFastqDirFmt-_rl9gsfd/SRR23216665_3_L001_R1_001.fastq --fastq_ascii 33 --fastq_minlen 1 --fastq_minovlen 10 --fastq_maxdiffs 10 --fastq_qmin 0 --fastq_qmi

vsearch v2.22.1_linux_x86_64, 503.8GB RAM, 20 cores
https://github.com/torognes/vsearch

Merging reads 100%
     50057  Pairs
     41202  Merged (82.3%)
      8855  Not merged (17.7%)

Pairs that failed merging due to various reasons:
       111  too few kmers found on same diagonal
         1  multiple potential alignments
      3897  too many differences
      4706  alignment score too low, or score drop too high
       140  staggered read pairs

Statistics of all reads:
    292.54  Mean read length

Statistics of merged reads:
    399.64  Mean fragment length
     17.00  Standard deviation of fragment length
      0.35  Mean expected error in forward sequences
      2.55  Mean expected error in reverse sequences
      0.15  Mean expected error in merged sequences
      0.17  Mean observed errors in merged region of forward sequences
      2.05  Mean observed errors in merged region of reverse sequences
      2.23  Mean observed errors in merged region


Running external command line application. This may print messages to stdout and/or stderr.
The command being run is below. This command cannot be manually re-run as it will depend on temporary files that no longer exist.

Command: gzip /tmp/q2-SingleLanePerSampleSingleEndFastqDirFmt-_rl9gsfd/SRR23216665_3_L001_R1_001.fastq

Running external command line application. This may print messages to stdout and/or stderr.
The command being run is below. This command cannot be manually re-run as it will depend on temporary files that no longer exist.

Command: vsearch --fastq_mergepairs /tmp/qiime2/gaoyuze/data/dc09e27f-8d3c-4322-97d3-844a82f3917c/data/SRR23216666_4_L001_R1_001.fastq.gz --reverse /tmp/qiime2/gaoyuze/data/dc09e27f-8d3c-4322-97d3-844a82f3917c/data/SRR23216666_10_L001_R2_001.fastq.gz --fastqout /tmp/q2-SingleLanePerSampleSingleEndFastqDirFmt-_rl9gsfd/SRR23216666_4_L001_R1_001.fastq --fastq_ascii 33 --fastq_minlen 1 --fastq_minovlen 10 --fastq_maxdiffs 10 --fastq_qmin 0 --fastq_qm

vsearch v2.22.1_linux_x86_64, 503.8GB RAM, 20 cores
https://github.com/torognes/vsearch

Merging reads 100%
     87172  Pairs
     72950  Merged (83.7%)
     14222  Not merged (16.3%)

Pairs that failed merging due to various reasons:
       133  too few kmers found on same diagonal
      6973  too many differences
      7069  alignment score too low, or score drop too high
        47  staggered read pairs

Statistics of all reads:
    292.75  Mean read length

Statistics of merged reads:
    401.65  Mean fragment length
     15.59  Standard deviation of fragment length
      0.36  Mean expected error in forward sequences
      2.51  Mean expected error in reverse sequences
      0.16  Mean expected error in merged sequences
      0.19  Mean observed errors in merged region of forward sequences
      1.99  Mean observed errors in merged region of reverse sequences
      2.17  Mean observed errors in merged region


Running external command line application. This may print messages to stdout and/or stderr.
The command being run is below. This command cannot be manually re-run as it will depend on temporary files that no longer exist.

Command: gzip /tmp/q2-SingleLanePerSampleSingleEndFastqDirFmt-_rl9gsfd/SRR23216666_4_L001_R1_001.fastq

Running external command line application. This may print messages to stdout and/or stderr.
The command being run is below. This command cannot be manually re-run as it will depend on temporary files that no longer exist.

Command: vsearch --fastq_mergepairs /tmp/qiime2/gaoyuze/data/dc09e27f-8d3c-4322-97d3-844a82f3917c/data/SRR23216667_5_L001_R1_001.fastq.gz --reverse /tmp/qiime2/gaoyuze/data/dc09e27f-8d3c-4322-97d3-844a82f3917c/data/SRR23216667_11_L001_R2_001.fastq.gz --fastqout /tmp/q2-SingleLanePerSampleSingleEndFastqDirFmt-_rl9gsfd/SRR23216667_5_L001_R1_001.fastq --fastq_ascii 33 --fastq_minlen 1 --fastq_minovlen 10 --fastq_maxdiffs 10 --fastq_qmin 0 --fastq_qm

vsearch v2.22.1_linux_x86_64, 503.8GB RAM, 20 cores
https://github.com/torognes/vsearch

Merging reads 100%
     83363  Pairs
     65997  Merged (79.2%)
     17366  Not merged (20.8%)

Pairs that failed merging due to various reasons:
       312  too few kmers found on same diagonal
         1  multiple potential alignments
      6881  too many differences
     10072  alignment score too low, or score drop too high
       100  staggered read pairs

Statistics of all reads:
    292.70  Mean read length

Statistics of merged reads:
    401.52  Mean fragment length
     14.15  Standard deviation of fragment length
      0.38  Mean expected error in forward sequences
      2.65  Mean expected error in reverse sequences
      0.16  Mean expected error in merged sequences
      0.22  Mean observed errors in merged region of forward sequences
      2.11  Mean observed errors in merged region of reverse sequences
      2.33  Mean observed errors in merged region


Running external command line application. This may print messages to stdout and/or stderr.
The command being run is below. This command cannot be manually re-run as it will depend on temporary files that no longer exist.

Command: gzip /tmp/q2-SingleLanePerSampleSingleEndFastqDirFmt-_rl9gsfd/SRR23216667_5_L001_R1_001.fastq



  context['result_data'] = context['result_data'].append(df)
  context['result_data'] = context['result_data'].append(df)
1it [02:16, 136.71s/it]

PRJNA924021	1 / 2 done


  context['result_data'] = context['result_data'].append(df)
  context['result_data'] = context['result_data'].append(df)
2it [04:55, 147.56s/it]

PRJEB14602	2 / 2 done
STEP 2 Done!



