In [1]:
# To run this file, configure the left_trim_len and trim_length parameters in the config_temp.csv file 
# based on the visualization results in the quality_filtered_sequences_demux_dir directory. 
# Save the modified configuration as config.csv.
# The required config.csv file for this script has already been configured.
import os
import pandas as pd
import qiime2
from qiime2.plugins import ( deblur, taxa, feature_classifier, feature_table)
from tqdm import tqdm


# --- Input locations ---------------------------------------------------------
# Every path is resolved relative to the directory the notebook is started from.
base_dir = os.getcwd()
data_dir = os.path.join(base_dir, 'data')

# gg_2022_10_backbone_full_length.nb.qza: the artifact this notebook loads as its
# taxonomy classifier and passes to classify_sklearn (it is NOT a phylogenetic tree).
# The file name indicates the Greengenes2 (GG2) 2022.10 full-length 16S backbone;
# the ".nb" part presumably stands for "naive Bayes" -- confirm against the GG2 release notes.
classifier_path = os.path.join(data_dir, 'gg_2022_10_backbone_full_length.nb.qza')

# --- Result locations --------------------------------------------------------
results_dir = os.path.join(base_dir, 'results')
# Per-project quality-filtered sequence artifacts are read from this directory.
quality_filtered_sequences_dir = os.path.join(results_dir, 'quality_filtered_sequences')

# rep_seqs_dir: Stores representative sequences
rep_seqs_dir = os.path.join(results_dir, 'rep_seqs')
# table_dir: Stores feature tables
table_dir = os.path.join(results_dir, 'table')
# merged_dir: Stores merged results from all projects
merged_dir = os.path.join(results_dir, 'merged')

# exist_ok=True makes re-running the cell safe and avoids the check-then-create
# race of "if not os.path.exists(...): os.makedirs(...)". It still raises if the
# path exists but is not a directory, which surfaces a broken layout early.
for output_dir in (rep_seqs_dir, table_dir, merged_dir):
    os.makedirs(output_dir, exist_ok=True)


# sample_metadata.txt holds the sample metadata that is configured by hand for each project.
# The same tab-separated file is read twice: once as a QIIME 2 Metadata object and once as
# a plain pandas DataFrame.
sample_metadata_path = os.path.join(data_dir, 'sample_metadata.txt')
merged_sample_metadata = qiime2.Metadata.load(sample_metadata_path)
merged_sample_metadata_df = pd.read_csv(sample_metadata_path, sep='\t')

# config.csv: one row per project; the denoising cell reads its 'project',
# 'left_trim_len' and 'trim_length' columns.
config_path = os.path.join(data_dir, 'config.csv')
config_df = pd.read_csv(config_path)

# Classifier artifact used later for taxonomic annotation of the representative sequences.
gg_classifier = qiime2.Artifact.load(classifier_path)

In [2]:
# Denoises the 16S rRNA sequences of every project listed in config_df with Deblur
# (denoise_16S), saving one feature table and one set of representative sequences per
# project, and finally merges the per-project results into a single dataset.

# Number of parallel jobs handed to Deblur (jobs_to_start); tune to the available cores.
DEBLUR_JOBS = 16

# Fail fast with a readable message instead of an obscure error from the merge step.
if config_df.empty:
    raise ValueError('config.csv contains no projects; nothing to denoise.')

table_list = []
rep_seqs_list = []
# iterrows() is a generator without a length, so tqdm needs an explicit total to show
# progress as "k/N" with an ETA instead of a bare iteration counter.
for index, row in tqdm(config_df.iterrows(), total=len(config_df)):
    project = row['project']
    print(project + ' start! ')
    quality_filtered_sequences = qiime2.Artifact.load(
        os.path.join(quality_filtered_sequences_dir, project + '.qza'))
    # deblur_stats is returned because sample_stats=True, but this cell does not persist it.
    table, rep_seqs, deblur_stats = deblur.methods.denoise_16S(
        quality_filtered_sequences,
        left_trim_len=row['left_trim_len'],
        trim_length=row['trim_length'],
        sample_stats=True,
        jobs_to_start=DEBLUR_JOBS)
    # Keep a per-project copy on disk in addition to the in-memory lists used for merging.
    table.save(os.path.join(table_dir, project))
    rep_seqs.save(os.path.join(rep_seqs_dir, project))
    table_list.append(table)
    rep_seqs_list.append(rep_seqs)
    print(project + ' end! ')

# Combine the per-project artifacts; the "_original" files are the merged results
# before any taxonomy- or abundance-based filtering.
merged_table = feature_table.methods.merge(table_list).merged_table
merged_table.save(os.path.join(merged_dir, 'merged_table_clean_original'))
merged_rep_seqs = feature_table.methods.merge_seqs(rep_seqs_list).merged_data
merged_rep_seqs.save(os.path.join(merged_dir, 'merged_rep_seqs_original'))

print('table & rep seqs merged!')

# Assign taxonomy to the merged representative sequences with the pre-loaded
# gg_classifier artifact, then persist the resulting classification.
classification_results = feature_classifier.methods.classify_sklearn(
    reads=merged_rep_seqs,
    classifier=gg_classifier,
    n_jobs=16,
)
merged_taxonomy_class = classification_results.classification

taxonomy_output_path = os.path.join(merged_dir, 'merged_taxonomy_class')
merged_taxonomy_class.save(taxonomy_output_path)

print('merged_taxonomy_class saved!')

# Stage 1: taxonomy-based filter, excluding on the search terms 'mitochondria' and
# 'chloroplast' (exact matching rules follow q2-taxa's filter_table defaults).
organelle_filtered = taxa.methods.filter_table(
    table=merged_table,
    taxonomy=merged_taxonomy_class,
    exclude='mitochondria,chloroplast',
)

# Stage 2: feature filter with min_samples=2.
prevalence_filtered = feature_table.methods.filter_features(
    organelle_filtered.filtered_table,
    min_samples=2,
)

# Stage 3: sample filter by minimum frequency; the sample metadata is passed along as well.
min_frequency = 1000
depth_filtered = feature_table.methods.filter_samples(
    prevalence_filtered.filtered_table,
    min_frequency=min_frequency,
    metadata=merged_sample_metadata,
)

merged_table_clean = depth_filtered.filtered_table
merged_table_clean.save(os.path.join(merged_dir, 'merged_table_clean'))
print('merged table cleaned!')

# Restrict the representative sequences to the features that remain in the cleaned table.
seq_filter_results = feature_table.methods.filter_seqs(
    data=merged_rep_seqs,
    table=merged_table_clean,
)
merged_rep_seqs = seq_filter_results.filtered_data
merged_rep_seqs.save(os.path.join(merged_dir, 'merged_rep_seqs'))
print('merged rep seqs generated!')


print('STEP 3  Done!')

0it [00:00, ?it/s]

PRJNA924021 start! 


  logger.warn('deblur version %s workflow started on %s' %
  logger.warn('parameters: %s' % locals())
1it [01:45, 105.34s/it]

PRJNA924021 end! 
PRJEB14602 start! 


  logger.warn('deblur version %s workflow started on %s' %
  logger.warn('parameters: %s' % locals())
2it [04:34, 137.34s/it]

PRJEB14602 end! 



  for id_, seq in data.iteritems():


table & rep seqs merged!
merged_taxonomy_class saved!
merged table cleaned!


  for id_, seq in data.iteritems():


merged rep seqs generated!
STEP 3  Done!
