# Nanopore Microbiome Workflow

# 1. dorado Basecalling

#### Download Dorado basecaller
- if necessary

In [None]:
!wget https://cdn.oxfordnanoportal.com/software/analysis/dorado-0.9.0-linux-x64.tar.gz -O /mnt/NanoporeRawData/dorado.tar.gz
!tar -xvf /mnt/NanoporeRawData/dorado.tar.gz -C /mnt/NanoporeRawData/

#### Fast5 Files Conversion
- if necessary

In [43]:
import os
import subprocess

# Define paths
raw_data_path = "/mnt/NanoporeRawData/2023-000029/"
fast5_path = os.path.join(raw_data_path, "fast5")
pod5_path = os.path.join(raw_data_path, "pod5")

# Function to check if the number of files match and none are empty
def check_pod5_files(fast5_path, pod5_path):
    fast5_files = [f for f in os.listdir(fast5_path) if f.endswith('.fast5')]
    pod5_files = [f for f in os.listdir(pod5_path) if f.endswith('.pod5')]

    # Check if counts match and ensure no empty files
    if len(fast5_files) != len(pod5_files):
        return False

    for pod5_file in pod5_files:
        if os.path.getsize(os.path.join(pod5_path, pod5_file)) == 0:  # Check if the file is empty
            return False

    return True

# Check if only the fast5 directory exists, and convert FAST5 to POD5
if os.path.exists(fast5_path) and not os.path.exists(pod5_path):
    print("FAST5 directory detected, converting to POD5...")

    # Install pod5 package if not already installed
    subprocess.run(["pip", "install", "pod5"], check=True)

    # Ensure the pod5 directory exists
    os.makedirs(pod5_path, exist_ok=True)

    # Convert FAST5 to POD5, using -o for output and -O for one-to-one mapping
    subprocess.run([
        "pod5", "convert", "fast5", 
        "-o", pod5_path,  # Output directory for POD5 files
        "-O", fast5_path,  # Parent directory for input files
        fast5_path  # Input path
    ], check=True)

# Rerun conversion if checks fail
if not check_pod5_files(fast5_path, pod5_path):
    print("File count mismatch or empty POD5 files detected. Re-running conversion...")
    subprocess.run([
        "pod5", "convert", "fast5", 
        "-o", pod5_path, 
        "-O", fast5_path,  
        fast5_path  
    ], check=True)

print("POD5 conversion completed.")

FAST5 directory detected, converting to POD5...


Converting 536 Fast5s: 100%|##########| 2142460/2142460 [4:05:15<00:00, 145.59Reads/s]  


POD5 conversion completed.


## 1-1. Pod5 Files Basecalling
- basecall
- convert BAM to fastq
- generate NanoPlot

In [1]:
import os
import subprocess
from packaging import version


''' ===== Configuration ===== '''
dorado_base_path = "/mnt/NanoporeRawData"
raw_data_path = "/mnt/NanoporeRawData/2024-000025/pod5"
basecall_data_path = "/mnt/Data/2024-000025/1_dorado"


''' ===== Workflow ===== '''
def get_latest_dorado_path(base_path):
    dorado_versions = [
        os.path.join(base_path, d) for d in os.listdir(base_path)
        if d.startswith("dorado-") and os.path.isdir(os.path.join(base_path, d))
    ]
    dorado_versions = sorted(
        dorado_versions, 
        key=lambda d: version.parse(d.split('-')[1].split('-')[0]),
        reverse=True
    )
    if dorado_versions:
        return os.path.join(dorado_versions[0], "bin", "dorado")
    else:
        raise FileNotFoundError("No Dorado versions found in the specified base path.")

try:
    dorado_path = get_latest_dorado_path(dorado_base_path)
    print(f"Using Dorado binary: {dorado_path}")
except FileNotFoundError as e:
    print(e)
    exit(1)

os.makedirs(basecall_data_path, exist_ok=True)

print("Running dorado basecaller...")
bam_file_path = os.path.join(basecall_data_path, "all.bam")
with open(bam_file_path, "w") as bam_file:
    subprocess.run([
        dorado_path, "basecaller", "sup", raw_data_path
    ], stdout=bam_file, check=True)

print("Converting BAM to FASTQ...")
subprocess.run([
    "bedtools", "bamtofastq", 
    "-i", bam_file_path,
    "-fq", os.path.join(basecall_data_path, "all.fastq")
], check=True)

print("Generating NanoPlot...")
subprocess.run([
    "NanoPlot", 
    "--fastq", os.path.join(basecall_data_path, "all.fastq"),
    "-o", basecall_data_path
], check=True)

print("Dorado basecalling and analysis completed.")


Running dorado basecaller...


[2024-12-23 18:05:53.122] [info] Running: "basecaller" "sup" "/mnt/NanoporeRawData/2024-000025/pod5"
[2024-12-23 18:05:55.069] [info]  - downloading dna_r10.4.1_e8.2_400bps_sup@v5.0.0 with httplib
[2024-12-23 18:06:57.923] [info] > Creating basecall pipeline
[2024-12-23 18:07:12.602] [info] Calculating optimized batch size for GPU "NVIDIA GeForce RTX 4090" and model /mnt/Data/2024-000025/.temp_dorado_model-9c7a34f08328b121/dna_r10.4.1_e8.2_400bps_sup@v5.0.0. Full benchmarking will run for this device, which may take some time.
[2024-12-23 18:07:42.852] [info] cuda:0 using chunk size 11520, batch size 128
[2024-12-23 18:07:43.498] [info] cuda:0 using chunk size 5760, batch size 128
[2024-12-23 20:13:45.007] [info] > Finished in (ms): 7560949
[2024-12-23 20:13:45.007] [info] > Simplex reads basecalled: 467683
[2024-12-23 20:13:45.007] [info] > Simplex reads filtered: 5
[2024-12-23 20:13:45.007] [info] > Basecalled @ Samples/s: 3.403250e+06
[2024-12-23 20:13:45.423] [info] > Finished


Converting BAM to FASTQ...
Generating NanoPlot...
Dorado basecalling and analysis completed.


# 2. NanoACT Demultiplexing & Processing

##### Rename SampleID
- if necessary

In [5]:
import os
import csv


''' ===== Configuration ===== '''
project_id = "2024-000012"  # Set this to your project ID
sample_type = "DNA samples"  # Specify the sample type for filtering
replace_name = True  # Set True to replace SampleID with Reference ID from reference.csv

''' File path settings '''
base_dir = f"/mnt/Data/{project_id}"  # Base directory containing the csv files
barcode_ID_file = os.path.join(base_dir, f"{project_id}.csv")
reference_file = os.path.join(base_dir, "reference.csv")
output_file = os.path.join(base_dir, f"{project_id}_modf.csv")


''' ===== Workflow ===== '''
# Initialize containers for data
barcode_IDs = []
reference_data = {}

# Load reference data if the file exists
if replace_name and os.path.exists(reference_file):
    # Read the reference file and extract relevant columns
    with open(reference_file, 'r') as ref_file:
        reader = csv.DictReader(ref_file)  # Assuming CSV file
        for row in reader:
            # Filter by the specified sample type
            if row['Sample type'] == sample_type:
                reference_data[row['PCR ID']] = row['Name or Morph description']
    print(f"Loaded reference data for sample type: {sample_type}.")
else:
    print("reference.csv not found. Skipping sample replacement step.")

# Read the barcode_ID file
with open(barcode_ID_file, 'r') as barcode_file:
    reader = csv.DictReader(barcode_file)
    barcode_header = reader.fieldnames  # Save the header
    for row in reader:
        sample_id = row['SampleID']  # Original SampleID from the barcode file
        # Replace SampleID if reference data is available and matches
        if reference_data and sample_id in reference_data:
            row['SampleID'] = reference_data[sample_id]  # Replace with Reference ID
        barcode_IDs.append(row)  # Add modified row to the list

# Prepare output data
output_data = barcode_IDs  # Append modified barcode data

# Write to the new CSV file
with open(output_file, 'w', newline='') as output_csv:
    writer = csv.DictWriter(output_csv, fieldnames=barcode_header)
    writer.writeheader()
    writer.writerows(output_data)

print(f"File written to: {output_file}")

Loaded reference data for sample type: DNA samples.


AttributeError: 'list' object has no attribute 'keys'

## 2-1. Load NanoAct

In [2]:
import os

working_directory = os.getcwd()

# Change to home directory
os.chdir(os.path.expanduser("~"))

# Check if 'nanoACT' directory exists
if not os.path.exists("nanoACT"):
    # If not, clone the repository
    !git clone https://github.com/Raingel/nanoACT.git
    os.chdir(os.path.expanduser("~/nanoACT/"))
else:
    # If the directory exists, reset local changes and pull the latest updates
    os.chdir(os.path.expanduser("~/nanoACT/"))
    !git fetch --all > /dev/null 2>&1
    !git reset --hard origin/main > /dev/null 2>&1 # Force reset to the latest commit
    !git pull > /dev/null 2>&1

# Install requirements if necessary
"""
!pip install --upgrade pip
!pip install -r requirements.txt
"""

# Import nanoAct and initialize
from nanoact import nanoact
dumb = nanoact.NanoAct(TEMP = "/home/huanglabserver/nanoACT/temp/")

# Change back to the original working directory
os.chdir(working_directory)

# Verify the current working directory
print(os.getcwd())

/mnt/Data/2024-000012


## 2-2. Processing sequences
- Quality filtering
- Demultiplexing
- Orientation 
- Trimming artificial reads

In [7]:
import os


''' ===== Configuration ===== '''
project_id = "2024-000012"  # Set this to your project ID


''' Analysis settings '''
input_format = "fastq"
output_format = "fastq" #輸出檔案的格式，預設為 'both'。可以是 fastq 或 fasta。'both' 代表同時輸出 fastq 和 fasta
mismatch_ratio_f = 0.1 #FwIndex容許的錯誤率，預設為0.15。例如barcode長度為20bp，則容許0.15*20=3bp的錯誤(edit distance)
mismatch_ratio_r = 0.1 #RvAnchor容許的錯誤率，預設為0.15

# Quality Filter Configuration
QSCORE = 9 #recommended 7-9
MIN_LEN = 700 #depends on the length of your reads
MAX_LEN = 2000 #depends on the length of your reads

# Demultiplexing Configuration
expected_length_variation = 0.75 #預期的read長度變異，預設為0.3。例如預期的read長度為300bp，則容許0.3*300=90bp的變異
search_range = 150 #搜尋barcode的範圍，預設為150bp。代表搜尋範圍為前150bp和後150bp
rvc_rvanchor = False #預設為'False'。'True'則程式執行reverse-complement。

# Orientation Correction Configuration
orientation_search_range = 500 #搜尋FwPrimer和RvPrimer的範圍，預設為200bp。代表搜尋範圍為前200bp和後200bp。

# Trim Reads Configuration
fw_offset = 0 #從距離找到的切除位點開始往後切除幾個bp，預設為0，可以是負數。例如fw_offset=-10，則從距離找到的切除位點開始往前切除10個bp
rv_offset = 0 #從距離找到的切除位點開始往前切除幾個bp，預設為0，可以是負數。例如rv_offset=-10，則從距離找到的切除位點開始往後切除10個bp
discard_no_match = False
check_both_directions = True
reverse_complement_rv_col = True
trimming_search_range = 200

# Clustering Configuration (mmseqs_cluster)
cluster_min_seq_id = 0.98
cluster_mode = 0
cov_mode = 0
kmer_length = 15
kmer_per_seq = 20
sensitivity = 8.5
min_read_num = 2
suppress_output = False #suppress_output=False will output all details of the clustering process. Use it when unknown error occurs.

# Consensus Configuration (mafft_consensus)
minimal_reads = 2  # minimal_reads for consensus
max_reads = -1 #max_reads: 設定最多的序列數量，-1 代表不限制。例如max_reads=100，則只會隨機取100個序列進行排比
adjustdirection = False


''' ===== Workflow ===== '''
def process_data(project_id):
    data_base_path = f"/mnt/Data/{project_id}"
    src_path_dorado = os.path.join(data_base_path, "1_dorado")
    des_path_nanofilt = os.path.join(data_base_path, "2_nanofilt")
    des_path_demultiplex = os.path.join(data_base_path, "3_demultiplex")
    des_path_orientation = os.path.join(data_base_path, "4_orientation")  # Orientation output folder
    des_path_trimmed = os.path.join(data_base_path, "5_trimmed")  # Trimming output folder
    barcode_index_file = os.path.join(data_base_path, f"{project_id}.csv")


    # Step 1. Filter by Quality and Length
    filtered_fastq = dumb.qualityfilt(
        src = os.path.join(src_path_dorado, 'all.fastq'),
        des = des_path_nanofilt,
        name = 'all_qualityfilt.fastq',
        QSCORE = QSCORE,
        MIN_LEN = MIN_LEN,
        MAX_LEN = MAX_LEN
    )

    # Step 2. Demultiplexing
    demultiplexed = dumb.singlebar(
        src = os.path.join(des_path_nanofilt, 'all_qualityfilt.fastq'),
        des = des_path_demultiplex,
        BARCODE_INDEX_FILE = barcode_index_file,
        mismatch_ratio_f = mismatch_ratio_f,
        mismatch_ratio_r = mismatch_ratio_r,
        expected_length_variation = expected_length_variation,
        search_range = search_range,
        rvc_rvanchor = rvc_rvanchor,
        input_format = input_format,
        output_format = output_format
    )

    # Step 3. Orientation correction
    orientation = dumb.orientation(
        src = des_path_demultiplex,
        des = des_path_orientation,
        input_format = input_format,
        output_format = output_format,
        BARCODE_INDEX_FILE = barcode_index_file,
        FwPrimer = "FwPrimer",
        RvPrimer = "RvPrimer",
        search_range = orientation_search_range
    )
    
    # Step 4. Trim Reads
    trimmed = dumb.trim_reads(
        src = des_path_orientation,
        des = des_path_trimmed,
        BARCODE_INDEX_FILE = barcode_index_file,
        fw_col = "FwPrimer",
        rv_col = "RvPrimer",
        input_format = input_format,
        output_format = output_format,
        mode = "table",
        fw_offset = fw_offset,
        rv_offset = rv_offset,
        mismatch_ratio_f = mismatch_ratio_f,
        mismatch_ratio_r = mismatch_ratio_r,
        discard_no_match = discard_no_match,
        check_both_directions = check_both_directions,
        reverse_complement_rv_col = reverse_complement_rv_col,
        search_range = trimming_search_range
    )

    return "Data processing complete."

# Call the function with the desired project ID:
result = process_data(project_id)
print(result)

[13:45:12] Start Qualityfilt...
[13:45:12] QSCORE: 9, MIN_LEN: 700, MAX_LEN: 2000
[13:46:16] 248060/570266 (43.50%) reads were passed quality filter
[13:46:16] BARCODE_INDEX_FILE loaded
[13:46:16] Parsed 10000
[13:46:17] Parsed 20000
[13:46:18] Parsed 30000
[13:46:18] Parsed 40000
[13:46:19] Parsed 50000
[13:46:20] Parsed 60000
[13:46:21] Parsed 70000
[13:46:21] Parsed 80000
[13:46:22] Parsed 90000
[13:46:23] Parsed 100000
[13:46:24] Parsed 110000
[13:46:24] Parsed 120000
[13:46:25] Parsed 130000
[13:46:26] Parsed 140000
[13:46:26] Parsed 150000
[13:46:27] Parsed 160000
[13:46:28] Parsed 170000
[13:46:29] Parsed 180000
[13:46:29] Parsed 190000
[13:46:30] Parsed 200000
[13:46:31] Parsed 210000
[13:46:32] Parsed 220000
[13:46:32] Parsed 230000
[13:46:33] Parsed 240000
[13:47:17] 140610/248060 (56.68%) reads were demultiplexed successfully
[13:47:18] Processing D6NTU.fastq
[13:47:21] Processing D6NTL.fastq
[13:47:29] Processing D6WTL.fastq
[13:47:32] Processing D6WTU.fastq
[13:47:32] Proc

#### Copy files for Kraken Analysis
- if necessary

In [28]:
import os
import pandas as pd
import shutil


''' ===== Configuration ===== '''
project_id = "Taiwan_soil_microbiome"

base_path = f"/mnt/Data/{project_id}"
csv_file = os.path.join(base_path, "Soil_Microbiome_Project.csv")  # Path to your CSV file
des_path = os.path.join(base_path, "fastq") # Destination directory for copied FASTQ files

''' File path settings '''
subfolder_col_base = "Nanopore ID"  # Base name for subfolder columns
fastq_col_base = "PCR ID"  # Base name for fastq columns
gene_name_lists = ["16S", "ITS"]  # Different gene name suffixes
fastq_folder = "5_trimmed"

''' ===== Workflow ===== '''
def copy_fastq_files(csv_file, des_path):
    df = pd.read_csv(csv_file, sep=',')
    
    # Loop through each gene name in the list
    for gene_name in gene_name_lists:
        # Create column names based on the base column name and gene suffix
        subfolder_cols = [f'{subfolder_col_base} ({gene_name}_1)', f'{subfolder_col_base} ({gene_name}_2)']
        fastq_cols = [f'{fastq_col_base} ({gene_name}_1)', f'{fastq_col_base} ({gene_name}_2)']

        # Loop through each row in the DataFrame
        for index, row in df.iterrows():
            for i, (subfolder_col, fastq_col) in enumerate(zip(subfolder_cols, fastq_cols), start=1):
                subfolder = row.get(subfolder_col)
                fastq_file = row.get(fastq_col)
                
                # If both subfolder and fastq file exist, construct the path
                if pd.notna(subfolder) and pd.notna(fastq_file):
                    # Construct the full path to the FASTQ file
                    source_file = os.path.join(base_path, str(subfolder), fastq_folder, str(fastq_file) + ".fastq")
                    print(source_file)

                    # Define the destination subfolder based on gene name and replicate number
                    replicate_folder = f"{gene_name}_replicate_{i}"
                    full_des_path = os.path.join(des_path, replicate_folder)

                    # Ensure the destination directory exists
                    os.makedirs(full_des_path, exist_ok=True)

                    # Construct the destination file path
                    dest_file = os.path.join(full_des_path, str(fastq_file) + ".fastq")

                    try:
                        # Copy the FASTQ file to the destination directory
                        shutil.copy2(source_file, dest_file)
                        print(f"Copied: {source_file} to {dest_file}")
                    except FileNotFoundError:
                        print(f"File not found: {source_file}")
                    except Exception as e:
                        print(f"Error copying file {source_file}: {e}")

# Call the function to copy the files
copy_fastq_files(csv_file, des_path)


# 3. Microbiome Sequences Processing

## 3-1. Kraken2 execution
- Kracken2 taxonomic assignment

In [1]:
import os
import concurrent.futures
import subprocess


''' ===== Configuration ===== '''
project_id = "2024-000012"
num_workers = 1  # Adjust this number based on your system's capabilities

''' File path settings '''
base_path =  f"/mnt/Data/{project_id}"  # Base directory containing the csv files
fastq_dir = os.path.join(base_path, "5_trimmed/Termites_soil")
output_dir = os.path.join(base_path, "Termites_soil/kraken2_output")
kraken_db = "/mnt/localdatabase/k2_unite-allEUK/"


''' ===== Workflow  ===== '''
os.makedirs(output_dir, exist_ok=True) # Create the output directory if it doesn't exist

# Function to check if Kraken 2 analysis is already done for a file
def is_already_processed(base_name):
    kraken_output = os.path.join(output_dir, f"{base_name}.kraken2.out")
    kraken_report = os.path.join(output_dir, f"{base_name}.kraken2.report")
    return os.path.exists(kraken_output) and os.path.exists(kraken_report)

# Function to process each FASTQ file for Kraken 2 analysis
def process_fastq_for_kraken(fastq_file):
    base_name = os.path.splitext(fastq_file)[0]
    print(f"Processing file for Kraken2: {fastq_file} (Base name: {base_name})")
    if is_already_processed(base_name): # Skip processing if already processed
        print(f"Skipping {base_name}, already processed by Kraken2.")
        return

    # Kraken 2 analysis
    kraken_output = os.path.join(output_dir, f"{base_name}.kraken2.out")
    kraken_report = os.path.join(output_dir, f"{base_name}.kraken2.report")
    
    kraken_cmd = [
        "kraken2", "--db", kraken_db, "--threads", "20",
        "--confidence", "0.01",  # Adjust confidence between 0 and 1 as needed
        "--memory-mapping",
        "--output", kraken_output,
        "--report", kraken_report,
        os.path.join(fastq_dir, fastq_file)
    ]
    try:
        subprocess.run(kraken_cmd, check=True)
        print(f"Kraken2 analysis completed for {base_name}")
    except subprocess.CalledProcessError as e:
        print(f"Error during Kraken2 analysis for {base_name}: {e}")

if __name__ == "__main__":
    os.makedirs(output_dir, exist_ok=True)
    
    # Check if there are FASTQ files in the directory
    fastq_files = [f for f in os.listdir(fastq_dir) if f.endswith(".fastq")]
    
    if not fastq_files:
        print(f"No FASTQ files found in {fastq_dir}.")
    else:
        print(f"Found {len(fastq_files)} FASTQ files. Processing Kraken2...")

    # Using ThreadPoolExecutor with a specified number of workers
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
        executor.map(process_fastq_for_kraken, fastq_files)

    print("All Kraken2 analysis completed.")


Found 9 FASTQ files. Processing Kraken2...
Processing file for Kraken2: D3NTL.fastq (Base name: D3NTL)


Loading database information... done.
20534 sequences (16.25 Mbp) processed in 0.822s (1498.6 Kseq/m, 1185.79 Mbp/m).
  20532 sequences classified (99.99%)
  2 sequences unclassified (0.01%)


Kraken2 analysis completed for D3NTL
Processing file for Kraken2: D3NTU.fastq (Base name: D3NTU)


Loading database information... done.
8400 sequences (6.76 Mbp) processed in 0.111s (4526.4 Kseq/m, 3640.85 Mbp/m).
  8400 sequences classified (100.00%)
  0 sequences unclassified (0.00%)


Kraken2 analysis completed for D3NTU
Processing file for Kraken2: D3WTL.fastq (Base name: D3WTL)


Loading database information... done.
5238 sequences (5.77 Mbp) processed in 0.129s (2440.7 Kseq/m, 2687.60 Mbp/m).
  5238 sequences classified (100.00%)
  0 sequences unclassified (0.00%)


Kraken2 analysis completed for D3WTL
Processing file for Kraken2: D3WTU.fastq (Base name: D3WTU)


Loading database information... done.
7862 sequences (8.72 Mbp) processed in 0.134s (3511.8 Kseq/m, 3896.80 Mbp/m).
  7862 sequences classified (100.00%)
  0 sequences unclassified (0.00%)


Kraken2 analysis completed for D3WTU
Processing file for Kraken2: D6NTL.fastq (Base name: D6NTL)


Loading database information... done.
40964 sequences (31.63 Mbp) processed in 0.253s (9700.0 Kseq/m, 7490.29 Mbp/m).
  40962 sequences classified (100.00%)
  2 sequences unclassified (0.00%)


Kraken2 analysis completed for D6NTL
Processing file for Kraken2: D6NTU.fastq (Base name: D6NTU)


Loading database information... done.
17298 sequences (13.54 Mbp) processed in 0.167s (6231.1 Kseq/m, 4876.25 Mbp/m).
  17296 sequences classified (99.99%)
  2 sequences unclassified (0.01%)


Kraken2 analysis completed for D6NTU
Processing file for Kraken2: D6WTL.fastq (Base name: D6WTL)


Loading database information... done.
11498 sequences (12.49 Mbp) processed in 0.146s (4717.5 Kseq/m, 5125.62 Mbp/m).
  11498 sequences classified (100.00%)
  0 sequences unclassified (0.00%)


Kraken2 analysis completed for D6WTL
Processing file for Kraken2: D6WTU.fastq (Base name: D6WTU)


Loading database information... done.
1656 sequences (1.87 Mbp) processed in 0.107s (931.8 Kseq/m, 1053.91 Mbp/m).
  1656 sequences classified (100.00%)
  0 sequences unclassified (0.00%)


Kraken2 analysis completed for D6WTU
Processing file for Kraken2: DoLower.fastq (Base name: DoLower)


Loading database information... done.
8140 sequences (8.99 Mbp) processed in 0.127s (3830.7 Kseq/m, 4230.06 Mbp/m).
  8140 sequences classified (100.00%)
  0 sequences unclassified (0.00%)


Kraken2 analysis completed for DoLower
All Kraken2 analysis completed.


#### Rarefaction Curve
- To determine how much sequencing depth you need

In [5]:
import os
import csv
import pandas as pd
import numpy as np
from pathlib import Path
from rpy2.robjects import pandas2ri, r
import rpy2.robjects.packages as rpackages
from rpy2.robjects.vectors import FloatVector

pandas2ri.activate() # Activate pandas2ri for DataFrame conversion

''' ===== Configuration ===== '''
project_id = "Taiwan_soil_microbiome"
coverage_cutoff = 0.995

''' File path settings '''
base_path =  f"/mnt/Data/{project_id}"
input_dir = Path(base_path) / "thesis_data"
output_dir = Path(base_path) / "thesis_data"
files = list(input_dir.glob("16S_r*_kraken_filtered/*.kraken2.report"))
output_csv = os.path.join(base_path, "thesis_data/16S_species_richness_matrix.csv")

r.assign("output_dir", str(output_dir))


''' ===== Function ===== '''
def parse_kraken_report(file_path, level="S"):
    data = []
    with open(file_path, 'r') as file:
        for line in file:
            fields = line.strip().split("\t")
            rank_code = fields[3]
            if rank_code == level:  # Filter by taxonomic level
                count = int(fields[1])
                name = fields[5].strip()
                data.append({"Taxon": name, "Reads": count})
    return pd.DataFrame(data)


''' ===== Workflow ===== '''
richness_matrix = pd.DataFrame()

for file in files:
    file_name = file.name.replace(".kraken2.report", "")  
    species_data = parse_kraken_report(file)
    
    if not species_data.empty:
        species_data.set_index("Taxon", inplace=True)
        species_data = species_data["Reads"].transpose()
        
        richness_matrix = pd.concat([richness_matrix, species_data], axis=1).fillna(0)

richness_matrix.columns = [f.name.replace(".kraken2.report", "") for f in files]
#richness_matrix = richness_matrix.transpose()
richness_matrix.to_csv(output_csv)
print("Analysis completed... Creating species richness matrix...")

print("Executing R codes using Python")
utils = rpackages.importr("utils")
utils.chooseCRANmirror(ind=1)  # Select the first CRAN mirror
iNEXT = rpackages.importr("iNEXT")

richness_matrix_r = pandas2ri.py2rpy(richness_matrix)
#print(richness_matrix_r)
#print(richness_matrix.shape)
#print(richness_matrix.head())
#print(richness_matrix.isnull().sum())
rf = iNEXT.iNEXT(richness_matrix_r, q=FloatVector([0]), datatype="abundance", endpoint = 50000)
r.assign("rf", rf)
r.assign("coverage_cutoff", coverage_cutoff)

r('''
library(iNEXT)
library(dplyr)
library(ggplot2)

rarefaction_curve <- ggiNEXT(rf, type = 1) + theme(legend.position="none")
ggsave(file.path(output_dir, "rarefaction_curve.png"), plot = rarefaction_curve, device = "png", width = 10, height = 8)

rf_cov <- rf[["iNextEst"]][["coverage_based"]]
uniq_cov <- sort(unique(rf_cov$Assemblage))
cov <- rf_cov[rf_cov$SC >= coverage_cutoff, ]
cov$Assemblage <- as.numeric(cov$Assemblage)

low_cov <- cov %>% 
  group_by(Assemblage) %>%
  slice_min(order_by = SC, with_ties = FALSE) %>%
  arrange(Assemblage)

low_cov <- low_cov[!is.infinite(low_cov$m), ]
seq_depth <- max(low_cov$m)
''')

seq_depth = r("seq_depth")
print("Species richness matrix and rarefaction analysis complete.")
print(f"The minimum sequencing depth is {seq_depth}")

Analysis completed... Creating species richness matrix...
Executing R codes using Python


R[write to console]: In addition: 

R[write to console]: 1: 
R[write to console]: In (function (package, help, pos = 2, lib.loc = NULL, character.only = FALSE,  :
R[write to console]: 
 
R[write to console]:  libraries ‘/usr/local/lib/R/site-library’, ‘/usr/lib/R/site-library’ contain no packages

R[write to console]: 2: 
R[write to console]: In (function (package, help, pos = 2, lib.loc = NULL, character.only = FALSE,  :
R[write to console]: 
 
R[write to console]:  libraries ‘/usr/local/lib/R/site-library’, ‘/usr/lib/R/site-library’ contain no packages

R[write to console]: 3: The shape palette can deal with a maximum of 6 discrete values because more
than 6 becomes difficult to discriminate
ℹ you have requested 218 values. Consider specifying shapes manually if you
  need that many have them. 

R[write to console]: 4: Removed 212 rows containing missing values or values outside the scale range
(`geom_point()`). 



Species richness matrix and rarefaction analysis complete.
The minimum sequencing depth is [19474.]


## 3-2. Kraken2 reads report

In [18]:
''' latest'''
import re
import csv
import os
from pathlib import Path


''' ===== Configuration ===== ''' 
project_id = "2024-000012"

''' File path settings '''
base_path = f"/mnt/Data/{project_id}"  # Base directory containing the CSV files
input_dir = os.path.join(base_path, "Termites_soil/kraken2_output")
output_csv = os.path.join(base_path, "2024-000012_reads.csv")
taxon_rank = 'D'                            # Taxon rank input by user 
taxid = 9                                # Taxid input by user
taxn = "Fungi"


''' ===== Functions ===== '''
def parse_kraken2_report(file_path, taxid, taxon_rank):
    match = re.match(r"(.+?)\.kraken2(?:\.report)?$", Path(file_path).name)
    sample_id = match.group(1) if match else Path(file_path).stem
    total_reads_root = None
    taxid_reads = None
    taxn_reads = None
    taxon_name_root = ""
    taxon_name = ""
    
    with open(file_path, 'r') as file:
        for line in file:
            fields = line.strip().split('\t')
            perc_reads, total_reads, reads_assigned, rank, tax_id, taxon_name_temp = fields

            if tax_id == '1':
                total_reads_root = int(total_reads)
                taxon_name_root = taxon_name_temp.strip()
            
            if taxn in taxon_name_temp and rank == taxon_rank:
                taxn_reads = int(total_reads)
                taxon_name = taxon_name_temp.strip()
      
    return {
        'sample_id': sample_id,
        'total_reads_root': total_reads_root,
        'taxon_name_root': taxon_name_root,
        'taxon_reads': taxn_reads,
        'taxon_name': taxon_name
    }

def process_reports(input_dir, taxn, taxon_rank, output_csv):
    report_files = list(Path(input_dir).rglob("*.kraken2.report"))
    total_files = len(report_files)
    results = []
    
    for count, report_path in enumerate(report_files, start=1):
        result = parse_kraken2_report(report_path, taxn, taxon_rank)
        print(result)
        if result['total_reads_root'] is not None and result['taxon_reads'] is not None:
            results.append([
                result['sample_id'], 
                result['total_reads_root'], 
                result['taxon_name_root'], 
                result['taxon_reads'], 
                result['taxon_name']
            ])
        
        print(f"Processing file {count}/{total_files} ...", end='\r')

    results.sort(key=lambda x: x[0])  # Sort by the first column (Sample ID)

    # Write sorted data to CSV
    with open(output_csv, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow([
            'Sample ID', 
            'Root Reads (taxid=1)', 
            'Root Name (rank=R)', 
            f'Taxon Reads (taxon={taxn}, rank={taxon_rank})', 
            f'Taxon Name (taxon={taxn}, rank={taxon_rank})'
        ])
        writer.writerows(results)
    
    print("\nReads report generated")


''' ===== Workflow ===== '''
process_reports(input_dir, taxn, taxon_rank, output_csv)

Debug: Rank=D, Taxon Name=  Fungi, Tax ID=9, Reads=19990
Debug: Rank=D, Taxon Name=  Eukaryota kgd Incertae sedis, Tax ID=8, Reads=338
Debug: Rank=D, Taxon Name=  Metazoa, Tax ID=14, Reads=88
Debug: Rank=D, Taxon Name=  Viridiplantae, Tax ID=23, Reads=44
Debug: Rank=D, Taxon Name=  Amoebozoa, Tax ID=3, Reads=4
{'sample_id': 'D3NTL', 'total_reads_root': 20532, 'taxon_name_root': 'root', 'taxon_reads': 19990, 'taxon_name': 'Fungi'}
Debug: Rank=D, Taxon Name=  Fungi, Tax ID=9, Reads=8096
Debug: Rank=D, Taxon Name=  Eukaryota kgd Incertae sedis, Tax ID=8, Reads=240
Debug: Rank=D, Taxon Name=  Viridiplantae, Tax ID=23, Reads=24
{'sample_id': 'D3NTU', 'total_reads_root': 8400, 'taxon_name_root': 'root', 'taxon_reads': 8096, 'taxon_name': 'Fungi'}
Debug: Rank=D, Taxon Name=  Fungi, Tax ID=9, Reads=5232
Debug: Rank=D, Taxon Name=  Eukaryota kgd Incertae sedis, Tax ID=8, Reads=4
{'sample_id': 'D3WTL', 'total_reads_root': 5238, 'taxon_name_root': 'root', 'taxon_reads': 5232, 'taxon_name': 'Fungi

#### Reads Comparison between Database
- if necessary

In [3]:
import csv
import os


''' ===== Configuration ===== '''
project_id = "2024-000012"

''' File path settings '''
base_path = f"/mnt/Data/{project_id}"  # Base directory containing the CSV files
file_1 = os.path.join(base_path, "kraken_refseq_qc.csv")  # First QC CSV file
file_2 = os.path.join(base_path, "kraken_D3_qc.csv")  # Second QC CSV file
output_csv = os.path.join(base_path, "kraken_qc_comparison.csv")  # Output comparison CSV


''' ===== Functions ===== '''
def read_qc_file(file_path):
    data = {}
    with open(file_path, 'r') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            sample_id = row['Sample ID']
            # Extract root and taxon reads as integers
            root_reads = int(row['Reads (taxid=1)'])
            taxon_reads = int(row[f'Reads (taxid={taxid}, rank={taxon_rank})'])
            data[sample_id] = {'Root Reads': root_reads, 'Taxon Reads': taxon_reads}
    return data

def compare_qc_data(data_1, data_2):
    comparison = []
    all_sample_ids = set(data_1.keys()).union(data_2.keys())
    
    for sample_id in sorted(all_sample_ids):
        root_reads_1 = data_1.get(sample_id, {}).get('Root Reads', 'N/A')
        taxon_reads_1 = data_1.get(sample_id, {}).get('Taxon Reads', 'N/A')
        root_reads_2 = data_2.get(sample_id, {}).get('Root Reads', 'N/A')
        taxon_reads_2 = data_2.get(sample_id, {}).get('Taxon Reads', 'N/A')

        comparison.append({
            'Sample ID': sample_id,
            'Root Reads (File 1)': root_reads_1,
            'Taxon Reads (File 1)': taxon_reads_1,
            'Root Reads (File 2)': root_reads_2,
            'Taxon Reads (File 2)': taxon_reads_2,
            'Root Reads Difference': (root_reads_2 - root_reads_1) if root_reads_1 != 'N/A' and root_reads_2 != 'N/A' else 'N/A',
            'Taxon Reads Difference': (taxon_reads_2 - taxon_reads_1) if taxon_reads_1 != 'N/A' and taxon_reads_2 != 'N/A' else 'N/A'
        })
    
    return comparison

def write_comparison_to_csv(comparison_data, output_csv):
    with open(output_csv, 'w', newline='') as csvfile:
        fieldnames = [
            'Sample ID', 
            'Root Reads (File 1)', 
            'Taxon Reads (File 1)', 
            'Root Reads (File 2)', 
            'Taxon Reads (File 2)',
            'Root Reads Difference', 
            'Taxon Reads Difference'
        ]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(comparison_data)

    print("\nComparison report generated")


''' ===== Workflow ===== '''
# Read both QC files
data_1 = read_qc_file(file_1)
data_2 = read_qc_file(file_2)

# Compare data
comparison_data = compare_qc_data(data_1, data_2)

# Write comparison results to CSV
write_comparison_to_csv(comparison_data, output_csv)


KeyError: 'Reads (taxid=1)'

#### Kraken Checkpoint Filtering
- if necessary
- List reports with reads < 2000

In [17]:
import os
import csv


''' ===== Configuration ===== '''      
project_id = "Taiwan_soil_microbiome"

''' File path settings '''
base_path = f"/mnt/Data/{project_id}" 
input_dir = os.path.join(base_path, "")  
output_csv = os.path.join(base_path, "kraken2_failed.csv")
target_dir = "chemical_analysis"


''' ===== Functions  ===== '''
# Function to extract sample ID from .kraken2.report files
def extract_sample_id_from_kraken_reports(input_dir, output_csv):
    results = []

    # Iterate through all subdirectories in the given folder
    for root, dirs, files in os.walk(input_dir):
        # Only process subdirectories containing "chemical_analysis"
        if target_dir in root:
            for filename in files:
                if filename.endswith(".kraken2.report"):
                    file_path = os.path.join(root, filename)

                    # Open and read the file line by line to find rows with tax rank 'R'
                    with open(file_path, 'r') as file:
                        for line in file:
                            columns = line.strip().split()

                            # Ensure the row has enough columns and check if taxonomic rank is 'R'
                            if len(columns) >= 6 and columns[3] == 'R':
                                # Extract the second column (reads number) and convert to integer
                                second_column_value = int(columns[1])

                                # Filter: only include sample if reads number (second column) <= 2000
                                if second_column_value <= 2000:
                                    # Extract sample ID (assuming the sample ID is the number in the filename)
                                    sample_id = filename.split('.')[0]
                                    # Store the sample ID and second column value
                                    results.append((sample_id, second_column_value))
                                break  # Exit loop after processing the first 'R' rank

    # Sort the results by sample ID in ascending order
    results.sort(key=lambda x: int(x[0]))

    # Write the sorted results to the CSV file
    with open(output_csv, 'w', newline='') as csvfile:  # 'w' mode overwrites the file
        writer = csv.writer(csvfile)
        writer.writerow(['Sample ID', 'Reads'])
        writer.writerows(results)

    print(f"Results saved to {output_csv}")


''' ===== Workflow ===== '''
extract_sample_id_from_kraken_reports(input_dir, output_csv)


Results saved to /mnt/Data/Taiwan_soil_microbiome/kraken2_failed.csv


# 3-2. Kraken Data Filtering

In [3]:
import os
import shutil
import pandas as pd


''' ===== Configuration ===== '''          
project_id = "Taiwan_soil_microbiome"

''' File path settings '''
base_path = f"/mnt/Data/{project_id}" 
input_dir = os.path.join(base_path, "thesis_data/16S_r2_kraken")
output_dir = os.path.join(base_path, "thesis_data/16S_r2_kraken_filtered")
include_taxid = 2  # Taxid to filter
taxon_rank = 'D'  # Rank code to filter (e.g., D for domain)


''' ===== Functions ===== ''' 
def create_taxon_rank_hierarchy():
    # Define base ranks
    ranks = ['R', 'D', 'K', 'P', 'C', 'O', 'F', 'G', 'S']
    taxon_rank_hierarchy = {} # Initialize the hierarchy dictionary
    rank_counter = 1
    # Loop over the base ranks and their numbered versions
    for rank in ranks:
        taxon_rank_hierarchy[rank] = rank_counter # Add the base rank to the hierarchy
        rank_counter += 1
        for i in range(1, 10):
            rank_code = f"{rank}{i}"
            taxon_rank_hierarchy[rank_code] = rank_counter
            rank_counter += 1
    return taxon_rank_hierarchy

def get_primary_rank(rank_code):
    return ''.join(filter(str.isalpha, rank_code))

def parse_kraken_report(file_path, include_taxid, taxon_rank, output_file):
    # Load Kraken report
    with open(file_path, 'r') as file:
        data = []
        for line in file:
            fields = line.strip().split('\t')
            percentage, num_reads, direct_reads, rank_code, taxid, name = fields
            # Create a dictionary for each entry
            data.append({
                'percentage': float(percentage),
                'num_reads': int(num_reads),
                'direct_reads': int(direct_reads),
                'rank_code': rank_code,
                'primary_rank': get_primary_rank(rank_code),  # Determine the primary rank
                'taxid': int(taxid),
                'name': name,  # Keep unstripped name with indentation
                'indentation': len(name) - len(name.strip())  # Indentation level (spaces before name)
            })
    df = pd.DataFrame(data)

    # Step 1: Initialize variables for filtering
    taxon_rank_hierarchy = create_taxon_rank_hierarchy()
    start_index = df[df['taxid'] != 0].index[0]
    filtered_data = []
    upstream_data = []
    target_index = None
    target_indentation = None
    found_target = False
    downstream_end_index = None

    # Step 2: Process the rows
    for i in range(start_index, len(df)):
        row = df.iloc[i]

        # Check if we found the target row with the given taxon_rank and taxid
        if (row['rank_code'] == taxon_rank and row['taxid'] == include_taxid):
            target_index = i
            target_indentation = row['indentation']
            found_target = True
            continue

        # Append until a higher rank is found or until the end of the dataframe
        if found_target:
            if (row['rank_code'] == taxon_rank and row['taxid'] != include_taxid):
                filtered_data.append(row)
                if downstream_end_index is None:
                    downstream_end_index = i

            # Flexible filtering logic based on ranks and indentation after the target
            if (taxon_rank_hierarchy.get(row['rank_code'], float('inf')) < taxon_rank_hierarchy[taxon_rank]
                and row['indentation'] < target_indentation):
                filtered_data.append(row)

    if downstream_end_index is None:
        downstream_end_index = len(df)

    # Convert collected data into DataFrames
    df_filtered_data = pd.DataFrame(filtered_data)
    df_upstream_data = pd.DataFrame(df.iloc[start_index : target_index])
    df_downstream_data = pd.DataFrame(df.iloc[target_index : downstream_end_index])

    # Deduct the filtered data reads
    if df_filtered_data.empty:
        filtered_reads = 0
    else:
        filtered_reads = df_filtered_data['num_reads'].sum()
    df_upstream_data['num_reads'] = df_upstream_data['num_reads'] - filtered_reads

    # Final data
    filtered_data = pd.concat([df_upstream_data, df_downstream_data]) # Combine the data before and after the input taxon_rank and taxid

    # Set the first row percentage to 100% and round the percentages to 3 decimal places
    first_row_reads = filtered_data['num_reads'].iloc[0]
    filtered_data['percentage'] = round((filtered_data['num_reads'] / first_row_reads) * 100, 3)

    # Rearrange the DataFrame to match the original Kraken2 report format
    filtered_data = filtered_data[['percentage', 'num_reads', 'direct_reads', 'rank_code', 'taxid', 'name']]

    # Save the filtered data to a new .report file
    filtered_data.to_csv(output_file, sep='\t', index=False, header=False)

    print(f"Output saved to {output_file}")

def process_all_kraken_reports(input_dir, output_dir, include_taxid, taxon_rank):
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)  # Delete the output directory if it exists
    os.makedirs(output_dir)  # Create the output directory
    
    print("Processing ...")
    for filename in os.listdir(input_dir):
        if filename.endswith('.kraken2.report'):
            file_path = os.path.join(input_dir, filename)
            base_name = os.path.splitext(filename)[0]  # Get the base filename without extension
            output_file = os.path.join(output_dir, f'{base_name}.report')  # Output as .kraken2.report

            print(f"Processing {file_path} -> {output_file}")
            parse_kraken_report(file_path, include_taxid, taxon_rank, output_file)


''' ===== Workflow ===== '''
process_all_kraken_reports(input_dir, output_dir, include_taxid, taxon_rank)

Processing ...
Processing /mnt/Data/Taiwan_soil_microbiome/thesis_data/16S_r2_kraken/10957.kraken2.report -> /mnt/Data/Taiwan_soil_microbiome/thesis_data/16S_r2_kraken_filtered/10957.kraken2.report
Output saved to /mnt/Data/Taiwan_soil_microbiome/thesis_data/16S_r2_kraken_filtered/10957.kraken2.report
Processing /mnt/Data/Taiwan_soil_microbiome/thesis_data/16S_r2_kraken/10958.kraken2.report -> /mnt/Data/Taiwan_soil_microbiome/thesis_data/16S_r2_kraken_filtered/10958.kraken2.report
Output saved to /mnt/Data/Taiwan_soil_microbiome/thesis_data/16S_r2_kraken_filtered/10958.kraken2.report
Processing /mnt/Data/Taiwan_soil_microbiome/thesis_data/16S_r2_kraken/10959.kraken2.report -> /mnt/Data/Taiwan_soil_microbiome/thesis_data/16S_r2_kraken_filtered/10959.kraken2.report
Output saved to /mnt/Data/Taiwan_soil_microbiome/thesis_data/16S_r2_kraken_filtered/10959.kraken2.report
Processing /mnt/Data/Taiwan_soil_microbiome/thesis_data/16S_r2_kraken/10960.kraken2.report -> /mnt/Data/Taiwan_soil_micro

# 3-3. Bracken analysis

In [8]:
import os
import concurrent.futures
import subprocess


''' ===== Configuration ===== '''          
project_id = "2024-000012"
taxon_ranks = ['P', 'C', 'O', 'F', 'G', 'S']
read_len = "1000" #Adjust this depends the type of genes (either 16S rDNA: 1600 or ITS: 1000)as
num_workers = 1  # Adjust this number based on your system's capabilities

''' File path settings '''
base_path =  f"/mnt/Data/{project_id}"
kraken_output_dir = os.path.join(base_path, "Termites_soil/kraken2_output")
output_dir = os.path.join(base_path, "Termites_soil/kraken2_output/bracken")
bracken_db = "/mnt/localdatabase/k2_unite-allEUK/"


''' ===== Functions ===== ''' 
def bracken_exists(base_name):
    missing_levels = []
    for level in taxon_ranks:
        output_file = os.path.join(output_dir, f"{base_name}.bracken_{level}.tsv")
        if not os.path.exists(output_file):
            missing_levels.append(level)
    return missing_levels

def process_kraken_report_for_bracken(kraken_report_file):
    base_name = os.path.splitext(kraken_report_file)[0]
    print(f"Processing Kraken report for Bracken: {kraken_report_file} (Base name: {base_name})")

    # Skip processing if already processed
    if not bracken_exists(base_name):
        print(f"Skipping {base_name}, all Bracken analyses already completed.")
        return

    # Bracken analysis for different taxonomic levels
    for level in bracken_exists(base_name):  # Genus, Family, Species
        bracken_cmd = [
            "bracken", "-d", bracken_db,
            "-i", os.path.join(kraken_output_dir, kraken_report_file),
            "-o", os.path.join(output_dir, f"{base_name}.bracken_{level}.tsv"),
            "-r", read_len, "-l", level
        ]
        try:
            subprocess.run(bracken_cmd, check=True)
            print(f"Bracken {level} level analysis completed for {base_name}")
        except subprocess.CalledProcessError as e:
            print(f"Error during Bracken {level} analysis for {base_name}: {e}")


''' ===== Workflow ===== '''
if __name__ == "__main__":
    os.makedirs(output_dir, exist_ok=True)
    
    # Check if there are Kraken2 reports in the directory
    kraken_reports = [f for f in os.listdir(kraken_output_dir) if f.endswith(".kraken2.report")]
    
    if not kraken_reports:
        print(f"No Kraken2 reports found in {kraken_output_dir}.")
    else:
        print(f"Found {len(kraken_reports)} Kraken2 reports. Processing Bracken...")

    # Using ThreadPoolExecutor with a specified number of workers
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
        executor.map(process_kraken_report_for_bracken, kraken_reports)

    print("All Bracken analysis completed.")

Found 9 Kraken2 reports. Processing Bracken...
Processing Kraken report for Bracken: D3NTL.kraken2.report (Base name: D3NTL.kraken2)
 >> Checking for Valid Options...
 >> Running Bracken 
      >> python3 /opt/Bracken/src/est_abundance.py -i /mnt/Data/2024-000012/Termites_soil/kraken2_output/D3NTL.kraken2.report -o /mnt/Data/2024-000012/Termites_soil/kraken2_output/bracken/D3NTL.kraken2.bracken_P.tsv -k /mnt/localdatabase/k2_unite-allEUK/database1000mers.kmer_distrib -l P -t 10


>> Checking report file: /mnt/Data/2024-000012/Termites_soil/kraken2_output/D3NTL.kraken2.report


PROGRAM START TIME: 01-03-2025 05:42:58
BRACKEN SUMMARY (Kraken report: /mnt/Data/2024-000012/Termites_soil/kraken2_output/D3NTL.kraken2.report)
    >>> Threshold: 10 
    >>> Number of phylums in sample: 13 
	  >> Number of phylums with reads > threshold: 8 
	  >> Number of phylums with reads < threshold: 5 
    >>> Total reads in sample: 20534
	  >> Total reads kept at phylums level (reads > threshold): 20404
	  >> Total reads discarded (phylums reads < threshold): 18
	  >> Reads distributed: 110
	  >> Reads not distributed (eg. no phylums above threshold): 0
	  >> Unclassified reads: 2
BRACKEN OUTPUT PRODUCED: /mnt/Data/2024-000012/Termites_soil/kraken2_output/bracken/D3NTL.kraken2.bracken_P.tsv
PROGRAM END TIME: 01-03-2025 05:42:58
  Bracken complete.
Bracken P level analysis completed for D3NTL.kraken2
 >> Checking for Valid Options...
 >> Running Bracken 
      >> python3 /opt/Bracken/src/est_abundance.py -i /mnt/Data/2024-000012/Termites_soil/kraken2_output/D3NTL.kraken2.report 

>> Checking report file: /mnt/Data/2024-000012/Termites_soil/kraken2_output/D3NTL.kraken2.report
>> Checking report file: /mnt/Data/2024-000012/Termites_soil/kraken2_output/D3NTL.kraken2.report
>> Checking report file: /mnt/Data/2024-000012/Termites_soil/kraken2_output/D3NTL.kraken2.report
>> Checking report file: /mnt/Data/2024-000012/Termites_soil/kraken2_output/D3NTL.kraken2.report


PROGRAM START TIME: 01-03-2025 05:42:58
BRACKEN SUMMARY (Kraken report: /mnt/Data/2024-000012/Termites_soil/kraken2_output/D3NTL.kraken2.report)
    >>> Threshold: 10 
    >>> Number of genuses in sample: 74 
	  >> Number of genuses with reads > threshold: 17 
	  >> Number of genuses with reads < threshold: 57 
    >>> Total reads in sample: 20534
	  >> Total reads kept at genuses level (reads > threshold): 19402
	  >> Total reads discarded (genuses reads < threshold): 172
	  >> Reads distributed: 880
	  >> Reads not distributed (eg. no genuses above threshold): 78
	  >> Unclassified reads: 2
BRACKEN OUTPUT PRODUCED: /mnt/Data/2024-000012/Termites_soil/kraken2_output/bracken/D3NTL.kraken2.bracken_G.tsv
PROGRAM END TIME: 01-03-2025 05:42:58
  Bracken complete.
Bracken G level analysis completed for D3NTL.kraken2
 >> Checking for Valid Options...
 >> Running Bracken 
      >> python3 /opt/Bracken/src/est_abundance.py -i /mnt/Data/2024-000012/Termites_soil/kraken2_output/D3NTL.kraken2.rep

>> Checking report file: /mnt/Data/2024-000012/Termites_soil/kraken2_output/D3NTL.kraken2.report
>> Checking report file: /mnt/Data/2024-000012/Termites_soil/kraken2_output/D3NTU.kraken2.report
>> Checking report file: /mnt/Data/2024-000012/Termites_soil/kraken2_output/D3NTU.kraken2.report
>> Checking report file: /mnt/Data/2024-000012/Termites_soil/kraken2_output/D3NTU.kraken2.report


PROGRAM START TIME: 01-03-2025 05:42:59
BRACKEN SUMMARY (Kraken report: /mnt/Data/2024-000012/Termites_soil/kraken2_output/D3NTU.kraken2.report)
    >>> Threshold: 10 
    >>> Number of classes in sample: 15 
	  >> Number of classes with reads > threshold: 8 
	  >> Number of classes with reads < threshold: 7 
    >>> Total reads in sample: 8400
	  >> Total reads kept at classes level (reads > threshold): 8296
	  >> Total reads discarded (classes reads < threshold): 26
	  >> Reads distributed: 78
	  >> Reads not distributed (eg. no classes above threshold): 0
	  >> Unclassified reads: 0
BRACKEN OUTPUT PRODUCED: /mnt/Data/2024-000012/Termites_soil/kraken2_output/bracken/D3NTU.kraken2.bracken_C.tsv
PROGRAM END TIME: 01-03-2025 05:42:59
  Bracken complete.
Bracken C level analysis completed for D3NTU.kraken2
 >> Checking for Valid Options...
 >> Running Bracken 
      >> python3 /opt/Bracken/src/est_abundance.py -i /mnt/Data/2024-000012/Termites_soil/kraken2_output/D3NTU.kraken2.report -o 

>> Checking report file: /mnt/Data/2024-000012/Termites_soil/kraken2_output/D3NTU.kraken2.report
>> Checking report file: /mnt/Data/2024-000012/Termites_soil/kraken2_output/D3NTU.kraken2.report
>> Checking report file: /mnt/Data/2024-000012/Termites_soil/kraken2_output/D3NTU.kraken2.report
>> Checking report file: /mnt/Data/2024-000012/Termites_soil/kraken2_output/D3WTL.kraken2.report


PROGRAM START TIME: 01-03-2025 05:42:59
BRACKEN SUMMARY (Kraken report: /mnt/Data/2024-000012/Termites_soil/kraken2_output/D3NTU.kraken2.report)
    >>> Threshold: 10 
    >>> Number of species in sample: 51 
	  >> Number of species with reads > threshold: 8 
	  >> Number of species with reads < threshold: 43 
    >>> Total reads in sample: 8400
	  >> Total reads kept at species level (reads > threshold): 7612
	  >> Total reads discarded (species reads < threshold): 146
	  >> Reads distributed: 436
	  >> Reads not distributed (eg. no species above threshold): 206
	  >> Unclassified reads: 0
BRACKEN OUTPUT PRODUCED: /mnt/Data/2024-000012/Termites_soil/kraken2_output/bracken/D3NTU.kraken2.bracken_S.tsv
PROGRAM END TIME: 01-03-2025 05:42:59
  Bracken complete.
Bracken S level analysis completed for D3NTU.kraken2
Processing Kraken report for Bracken: D3WTL.kraken2.report (Base name: D3WTL.kraken2)
 >> Checking for Valid Options...
 >> Running Bracken 
      >> python3 /opt/Bracken/src/est_

>> Checking report file: /mnt/Data/2024-000012/Termites_soil/kraken2_output/D3WTL.kraken2.report
>> Checking report file: /mnt/Data/2024-000012/Termites_soil/kraken2_output/D3WTL.kraken2.report
>> Checking report file: /mnt/Data/2024-000012/Termites_soil/kraken2_output/D3WTL.kraken2.report
>> Checking report file: /mnt/Data/2024-000012/Termites_soil/kraken2_output/D3WTL.kraken2.report


PROGRAM START TIME: 01-03-2025 05:42:59
BRACKEN SUMMARY (Kraken report: /mnt/Data/2024-000012/Termites_soil/kraken2_output/D3WTL.kraken2.report)
    >>> Threshold: 10 
    >>> Number of families in sample: 9 
	  >> Number of families with reads > threshold: 2 
	  >> Number of families with reads < threshold: 7 
    >>> Total reads in sample: 5238
	  >> Total reads kept at families level (reads > threshold): 5196
	  >> Total reads discarded (families reads < threshold): 26
	  >> Reads distributed: 10
	  >> Reads not distributed (eg. no families above threshold): 6
	  >> Unclassified reads: 0
BRACKEN OUTPUT PRODUCED: /mnt/Data/2024-000012/Termites_soil/kraken2_output/bracken/D3WTL.kraken2.bracken_F.tsv
PROGRAM END TIME: 01-03-2025 05:42:59
  Bracken complete.
Bracken F level analysis completed for D3WTL.kraken2
 >> Checking for Valid Options...
 >> Running Bracken 
      >> python3 /opt/Bracken/src/est_abundance.py -i /mnt/Data/2024-000012/Termites_soil/kraken2_output/D3WTL.kraken2.repor

>> Checking report file: /mnt/Data/2024-000012/Termites_soil/kraken2_output/D3WTL.kraken2.report
>> Checking report file: /mnt/Data/2024-000012/Termites_soil/kraken2_output/D3WTU.kraken2.report
>> Checking report file: /mnt/Data/2024-000012/Termites_soil/kraken2_output/D3WTU.kraken2.report
>> Checking report file: /mnt/Data/2024-000012/Termites_soil/kraken2_output/D3WTU.kraken2.report


PROGRAM START TIME: 01-03-2025 05:42:59
BRACKEN SUMMARY (Kraken report: /mnt/Data/2024-000012/Termites_soil/kraken2_output/D3WTU.kraken2.report)
    >>> Threshold: 10 
    >>> Number of classes in sample: 7 
	  >> Number of classes with reads > threshold: 2 
	  >> Number of classes with reads < threshold: 5 
    >>> Total reads in sample: 7862
	  >> Total reads kept at classes level (reads > threshold): 7848
	  >> Total reads discarded (classes reads < threshold): 10
	  >> Reads distributed: 4
	  >> Reads not distributed (eg. no classes above threshold): 0
	  >> Unclassified reads: 0
BRACKEN OUTPUT PRODUCED: /mnt/Data/2024-000012/Termites_soil/kraken2_output/bracken/D3WTU.kraken2.bracken_C.tsv
PROGRAM END TIME: 01-03-2025 05:42:59
  Bracken complete.
Bracken C level analysis completed for D3WTU.kraken2
 >> Checking for Valid Options...
 >> Running Bracken 
      >> python3 /opt/Bracken/src/est_abundance.py -i /mnt/Data/2024-000012/Termites_soil/kraken2_output/D3WTU.kraken2.report -o /m

>> Checking report file: /mnt/Data/2024-000012/Termites_soil/kraken2_output/D3WTU.kraken2.report
>> Checking report file: /mnt/Data/2024-000012/Termites_soil/kraken2_output/D3WTU.kraken2.report
>> Checking report file: /mnt/Data/2024-000012/Termites_soil/kraken2_output/D3WTU.kraken2.report
>> Checking report file: /mnt/Data/2024-000012/Termites_soil/kraken2_output/D6NTL.kraken2.report


PROGRAM START TIME: 01-03-2025 05:43:00
BRACKEN SUMMARY (Kraken report: /mnt/Data/2024-000012/Termites_soil/kraken2_output/D3WTU.kraken2.report)
    >>> Threshold: 10 
    >>> Number of species in sample: 8 
	  >> Number of species with reads > threshold: 2 
	  >> Number of species with reads < threshold: 6 
    >>> Total reads in sample: 7862
	  >> Total reads kept at species level (reads > threshold): 5800
	  >> Total reads discarded (species reads < threshold): 12
	  >> Reads distributed: 2032
	  >> Reads not distributed (eg. no species above threshold): 18
	  >> Unclassified reads: 0
BRACKEN OUTPUT PRODUCED: /mnt/Data/2024-000012/Termites_soil/kraken2_output/bracken/D3WTU.kraken2.bracken_S.tsv
PROGRAM END TIME: 01-03-2025 05:43:00
  Bracken complete.
Bracken S level analysis completed for D3WTU.kraken2
Processing Kraken report for Bracken: D6NTL.kraken2.report (Base name: D6NTL.kraken2)
 >> Checking for Valid Options...
 >> Running Bracken 
      >> python3 /opt/Bracken/src/est_abu

>> Checking report file: /mnt/Data/2024-000012/Termites_soil/kraken2_output/D6NTL.kraken2.report
>> Checking report file: /mnt/Data/2024-000012/Termites_soil/kraken2_output/D6NTL.kraken2.report
>> Checking report file: /mnt/Data/2024-000012/Termites_soil/kraken2_output/D6NTL.kraken2.report
>> Checking report file: /mnt/Data/2024-000012/Termites_soil/kraken2_output/D6NTL.kraken2.report


PROGRAM START TIME: 01-03-2025 05:43:00
BRACKEN SUMMARY (Kraken report: /mnt/Data/2024-000012/Termites_soil/kraken2_output/D6NTL.kraken2.report)
    >>> Threshold: 10 
    >>> Number of orders in sample: 44 
	  >> Number of orders with reads > threshold: 24 
	  >> Number of orders with reads < threshold: 20 
    >>> Total reads in sample: 40964
	  >> Total reads kept at orders level (reads > threshold): 40112
	  >> Total reads discarded (orders reads < threshold): 54
	  >> Reads distributed: 790
	  >> Reads not distributed (eg. no orders above threshold): 6
	  >> Unclassified reads: 2
BRACKEN OUTPUT PRODUCED: /mnt/Data/2024-000012/Termites_soil/kraken2_output/bracken/D6NTL.kraken2.bracken_O.tsv
PROGRAM END TIME: 01-03-2025 05:43:00
  Bracken complete.
Bracken O level analysis completed for D6NTL.kraken2
 >> Checking for Valid Options...
 >> Running Bracken 
      >> python3 /opt/Bracken/src/est_abundance.py -i /mnt/Data/2024-000012/Termites_soil/kraken2_output/D6NTL.kraken2.report -o /

>> Checking report file: /mnt/Data/2024-000012/Termites_soil/kraken2_output/D6NTL.kraken2.report
>> Checking report file: /mnt/Data/2024-000012/Termites_soil/kraken2_output/D6NTU.kraken2.report
>> Checking report file: /mnt/Data/2024-000012/Termites_soil/kraken2_output/D6NTU.kraken2.report
>> Checking report file: /mnt/Data/2024-000012/Termites_soil/kraken2_output/D6NTU.kraken2.report


PROGRAM START TIME: 01-03-2025 05:43:00
BRACKEN SUMMARY (Kraken report: /mnt/Data/2024-000012/Termites_soil/kraken2_output/D6NTU.kraken2.report)
    >>> Threshold: 10 
    >>> Number of phylums in sample: 14 
	  >> Number of phylums with reads > threshold: 6 
	  >> Number of phylums with reads < threshold: 8 
    >>> Total reads in sample: 17298
	  >> Total reads kept at phylums level (reads > threshold): 17118
	  >> Total reads discarded (phylums reads < threshold): 26
	  >> Reads distributed: 152
	  >> Reads not distributed (eg. no phylums above threshold): 0
	  >> Unclassified reads: 2
BRACKEN OUTPUT PRODUCED: /mnt/Data/2024-000012/Termites_soil/kraken2_output/bracken/D6NTU.kraken2.bracken_P.tsv
PROGRAM END TIME: 01-03-2025 05:43:00
  Bracken complete.
Bracken P level analysis completed for D6NTU.kraken2
 >> Checking for Valid Options...
 >> Running Bracken 
      >> python3 /opt/Bracken/src/est_abundance.py -i /mnt/Data/2024-000012/Termites_soil/kraken2_output/D6NTU.kraken2.report 

>> Checking report file: /mnt/Data/2024-000012/Termites_soil/kraken2_output/D6NTU.kraken2.report
>> Checking report file: /mnt/Data/2024-000012/Termites_soil/kraken2_output/D6NTU.kraken2.report
>> Checking report file: /mnt/Data/2024-000012/Termites_soil/kraken2_output/D6NTU.kraken2.report
>> Checking report file: /mnt/Data/2024-000012/Termites_soil/kraken2_output/D6WTL.kraken2.report


PROGRAM START TIME: 01-03-2025 05:43:00
BRACKEN SUMMARY (Kraken report: /mnt/Data/2024-000012/Termites_soil/kraken2_output/D6NTU.kraken2.report)
    >>> Threshold: 10 
    >>> Number of genuses in sample: 106 
	  >> Number of genuses with reads > threshold: 21 
	  >> Number of genuses with reads < threshold: 85 
    >>> Total reads in sample: 17298
	  >> Total reads kept at genuses level (reads > threshold): 15940
	  >> Total reads discarded (genuses reads < threshold): 270
	  >> Reads distributed: 982
	  >> Reads not distributed (eg. no genuses above threshold): 104
	  >> Unclassified reads: 2
BRACKEN OUTPUT PRODUCED: /mnt/Data/2024-000012/Termites_soil/kraken2_output/bracken/D6NTU.kraken2.bracken_G.tsv
PROGRAM END TIME: 01-03-2025 05:43:00
  Bracken complete.
Bracken G level analysis completed for D6NTU.kraken2
 >> Checking for Valid Options...
 >> Running Bracken 
      >> python3 /opt/Bracken/src/est_abundance.py -i /mnt/Data/2024-000012/Termites_soil/kraken2_output/D6NTU.kraken2.r

>> Checking report file: /mnt/Data/2024-000012/Termites_soil/kraken2_output/D6WTL.kraken2.report
>> Checking report file: /mnt/Data/2024-000012/Termites_soil/kraken2_output/D6WTL.kraken2.report
>> Checking report file: /mnt/Data/2024-000012/Termites_soil/kraken2_output/D6WTL.kraken2.report
>> Checking report file: /mnt/Data/2024-000012/Termites_soil/kraken2_output/D6WTL.kraken2.report


PROGRAM START TIME: 01-03-2025 05:43:00
BRACKEN SUMMARY (Kraken report: /mnt/Data/2024-000012/Termites_soil/kraken2_output/D6WTL.kraken2.report)
    >>> Threshold: 10 
    >>> Number of orders in sample: 10 
	  >> Number of orders with reads > threshold: 4 
	  >> Number of orders with reads < threshold: 6 
    >>> Total reads in sample: 11498
	  >> Total reads kept at orders level (reads > threshold): 11456
	  >> Total reads discarded (orders reads < threshold): 24
	  >> Reads distributed: 16
	  >> Reads not distributed (eg. no orders above threshold): 2
	  >> Unclassified reads: 0
BRACKEN OUTPUT PRODUCED: /mnt/Data/2024-000012/Termites_soil/kraken2_output/bracken/D6WTL.kraken2.bracken_O.tsv
PROGRAM END TIME: 01-03-2025 05:43:00
  Bracken complete.
Bracken O level analysis completed for D6WTL.kraken2
 >> Checking for Valid Options...
 >> Running Bracken 
      >> python3 /opt/Bracken/src/est_abundance.py -i /mnt/Data/2024-000012/Termites_soil/kraken2_output/D6WTL.kraken2.report -o /mnt

>> Checking report file: /mnt/Data/2024-000012/Termites_soil/kraken2_output/D6WTL.kraken2.report
>> Checking report file: /mnt/Data/2024-000012/Termites_soil/kraken2_output/D6WTU.kraken2.report
>> Checking report file: /mnt/Data/2024-000012/Termites_soil/kraken2_output/D6WTU.kraken2.report
>> Checking report file: /mnt/Data/2024-000012/Termites_soil/kraken2_output/D6WTU.kraken2.report


PROGRAM START TIME: 01-03-2025 05:43:01
BRACKEN SUMMARY (Kraken report: /mnt/Data/2024-000012/Termites_soil/kraken2_output/D6WTL.kraken2.report)
    >>> Threshold: 10 
    >>> Number of species in sample: 17 
	  >> Number of species with reads > threshold: 5 
	  >> Number of species with reads < threshold: 12 
    >>> Total reads in sample: 11498
	  >> Total reads kept at species level (reads > threshold): 8812
	  >> Total reads discarded (species reads < threshold): 36
	  >> Reads distributed: 2620
	  >> Reads not distributed (eg. no species above threshold): 30
	  >> Unclassified reads: 0
BRACKEN OUTPUT PRODUCED: /mnt/Data/2024-000012/Termites_soil/kraken2_output/bracken/D6WTL.kraken2.bracken_S.tsv
PROGRAM END TIME: 01-03-2025 05:43:01
  Bracken complete.
Bracken S level analysis completed for D6WTL.kraken2
Processing Kraken report for Bracken: D6WTU.kraken2.report (Base name: D6WTU.kraken2)
 >> Checking for Valid Options...
 >> Running Bracken 
      >> python3 /opt/Bracken/src/est_

>> Checking report file: /mnt/Data/2024-000012/Termites_soil/kraken2_output/D6WTU.kraken2.report
>> Checking report file: /mnt/Data/2024-000012/Termites_soil/kraken2_output/D6WTU.kraken2.report
>> Checking report file: /mnt/Data/2024-000012/Termites_soil/kraken2_output/D6WTU.kraken2.report


PROGRAM START TIME: 01-03-2025 05:43:01
BRACKEN SUMMARY (Kraken report: /mnt/Data/2024-000012/Termites_soil/kraken2_output/D6WTU.kraken2.report)
    >>> Threshold: 10 
    >>> Number of families in sample: 6 
	  >> Number of families with reads > threshold: 1 
	  >> Number of families with reads < threshold: 5 
    >>> Total reads in sample: 1656
	  >> Total reads kept at families level (reads > threshold): 1644
	  >> Total reads discarded (families reads < threshold): 12
	  >> Reads distributed: 0
	  >> Reads not distributed (eg. no families above threshold): 0
	  >> Unclassified reads: 0
BRACKEN OUTPUT PRODUCED: /mnt/Data/2024-000012/Termites_soil/kraken2_output/bracken/D6WTU.kraken2.bracken_F.tsv
PROGRAM END TIME: 01-03-2025 05:43:01
  Bracken complete.
Bracken F level analysis completed for D6WTU.kraken2
 >> Checking for Valid Options...
 >> Running Bracken 
      >> python3 /opt/Bracken/src/est_abundance.py -i /mnt/Data/2024-000012/Termites_soil/kraken2_output/D6WTU.kraken2.report

>> Checking report file: /mnt/Data/2024-000012/Termites_soil/kraken2_output/DoLower.kraken2.report
>> Checking report file: /mnt/Data/2024-000012/Termites_soil/kraken2_output/DoLower.kraken2.report
>> Checking report file: /mnt/Data/2024-000012/Termites_soil/kraken2_output/DoLower.kraken2.report
>> Checking report file: /mnt/Data/2024-000012/Termites_soil/kraken2_output/DoLower.kraken2.report


PROGRAM START TIME: 01-03-2025 05:43:01
BRACKEN SUMMARY (Kraken report: /mnt/Data/2024-000012/Termites_soil/kraken2_output/DoLower.kraken2.report)
    >>> Threshold: 10 
    >>> Number of phylums in sample: 2 
	  >> Number of phylums with reads > threshold: 2 
	  >> Number of phylums with reads < threshold: 0 
    >>> Total reads in sample: 8140
	  >> Total reads kept at phylums level (reads > threshold): 8126
	  >> Total reads discarded (phylums reads < threshold): 0
	  >> Reads distributed: 14
	  >> Reads not distributed (eg. no phylums above threshold): 0
	  >> Unclassified reads: 0
BRACKEN OUTPUT PRODUCED: /mnt/Data/2024-000012/Termites_soil/kraken2_output/bracken/DoLower.kraken2.bracken_P.tsv
PROGRAM END TIME: 01-03-2025 05:43:01
  Bracken complete.
Bracken P level analysis completed for DoLower.kraken2
 >> Checking for Valid Options...
 >> Running Bracken 
      >> python3 /opt/Bracken/src/est_abundance.py -i /mnt/Data/2024-000012/Termites_soil/kraken2_output/DoLower.kraken2.repo

>> Checking report file: /mnt/Data/2024-000012/Termites_soil/kraken2_output/DoLower.kraken2.report
>> Checking report file: /mnt/Data/2024-000012/Termites_soil/kraken2_output/DoLower.kraken2.report


##### backup

In [None]:
'''backup'''
import os
import concurrent.futures
import subprocess


''' ===== Configuration ===== '''
project_id = "2024-000012"
taxon_ranks = ['P', 'C', 'O', 'F', 'G', 'S'] # Add taxonomic ranks that you need for analysis
num_workers = 1  # Adjust this number based on your system's capabilities

''' File path settings '''
base_path =  f"/mnt/Data/{project_id}"  # Base directory containing the csv files
input_dir = os.path.join(base_path, "7_kraken_filtered")
output_dir = os.path.join(base_path, "8_bracken")
bracken_db = "/mnt/localdatabase/k2_refmicrobiome/"


''' ===== Functions ===== ''' 
def bracken_exists(base_name):
    output_path = Path(output_dir)
    pattern = f"{base_name}.bracken_*.tsv"
    return any(base_path.glob(pattern))
    
def process_kraken_report_for_bracken(kraken_report_file):
    base_name = os.path.splitext(kraken_report_file)[0]
    print(f"Processing Kraken report for Bracken: {kraken_report_file} (Base name: {base_name})")

    existing_levels = set(
        file.stem.split(f"{base_name}.bracken_")[-1]
        for file in Path(output_dir).glob(f"{base_name}.bracken_*.tsv")
    )

    if not existing_levels:
        print(f"No existing Bracken levels detected for {base_name}. Skipping analysis.")
        return
    print(f"Detected Bracken levels for {base_name}: {', '.join(existing_levels)}.")

    for level in existing_levels:
        bracken_cmd = [
            "bracken", "-d", bracken_db,
            "-i", os.path.join(input_dir, kraken_report_file),
            "-o", os.path.join(output_dir, f"{base_name}.bracken_{level}.tsv"),
            "-r", "300", "-l", level
        ]
        try:
            subprocess.run(bracken_cmd, check=True)
            print(f"Bracken {level} level analysis completed for {base_name}")
        except subprocess.CalledProcessError as e:
            print(f"Error during Bracken {level} analysis for {base_name}: {e}")
            

''' ===== Workflow ===== '''
if __name__ == "__main__":
    os.makedirs(output_dir, exist_ok=True)
    kraken_reports = [f for f in os.listdir(input_dir) if f.endswith(".kraken2.report")]

    if not kraken_reports:
        print(f"No Kraken2 reports found in {input_dir}.")
    else:
        print(f"Found {len(kraken_reports)} Kraken2 reports. Processing Bracken...")

    with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
        executor.map(process_kraken_report_for_bracken, kraken_reports)

    print("All Bracken analysis completed.")

# 3-4. Bracken merge
##### merge results of the samples for analysis

In [9]:
import pandas as pd
from pathlib import Path
import os


''' ===== Configuration ===== '''
project_id = "2024-000012"

''' File path settings '''
base_path = f"/mnt/Data/{project_id}"  # Base directory containing the csv files
input_dir = os.path.join(base_path, "Termites_soil/kraken2_output/bracken")
output_dir = os.path.join(base_path, "Termites_soil/kraken2_output/mergedBracken")


''' ===== Functions ===== '''
def combine_taxonomic_levels(input_dir, output_dir):
    input_path = Path(input_dir)
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # Extract identifier from input folder (e.g., 'D2' or 'D1')
    folder_identifier = input_path.stem.split('_')[-1]  # Extracts 'D2' or 'D1'

    # Debugging: Print the absolute path and identifier to check if it's correct
    print(f"Input folder: {input_path.absolute()}")
    print(f"Output folder: {output_path.absolute()}")
    print(f"Folder identifier: {folder_identifier}")

    # Attempt to find all .tsv files that match the naming convention
    files = list(input_path.glob('*bracken_*.tsv'))

    if not files:
        print("No files found. Check your input folder path and file naming convention.")
        return

    # Dynamically detect levels from filenames
    detected_levels = {file.stem.split('_')[-1] for file in files}  # Extract levels like 'S', 'G', 'F'
    print(f"Detected taxonomic levels: {detected_levels}")

    # Combine files for each taxonomic level
    for level in detected_levels:
        print(f"Processing taxonomic level: {level}")
        file_paths = [file for file in files if file.stem.endswith(f"_{level}")]
        combined_df = pd.DataFrame()

        for file_path in file_paths:
            df = pd.read_csv(file_path, sep='\t')
            sample_name = file_path.stem.split('.')[0]  # Extracts sample name, e.g., '5985'
            
            # Check if 'name' exists in the file
            if 'name' not in df.columns:
                print(f"Warning: 'name' column missing in file {file_path}")
                continue

            # Create the Max column with NaN initially, place it in the fourth column position
            df = df[['name', 'taxonomy_lvl', 'taxonomy_id', 'new_est_reads']].rename(columns={
                'taxonomy_lvl': 'taxRank',
                'taxonomy_id': 'taxID',
                'new_est_reads': sample_name
            })
            
            # Add 'Max' column with NaN
            df.insert(3, 'Max', pd.NA)

            if combined_df.empty:
                combined_df = df
            else:
                combined_df = pd.merge(combined_df, df, on=['name', 'taxRank', 'taxID', 'Max'], how='outer')

        if not combined_df.empty:
            # Fill NA values for the sample columns with 0
            for col in combined_df.columns:
                if col not in ['name', 'taxRank', 'taxID', 'Max']:
                    combined_df[col] = combined_df[col].fillna(0)

            # Convert sample columns to numeric after merging
            sample_cols = combined_df.columns[4:]  # Get all sample columns after 'Max'
            combined_df[sample_cols] = combined_df[sample_cols].astype(float)

            # Recalculate the 'Max' column across all sample columns
            combined_df['Max'] = combined_df[sample_cols].max(axis=1)

            # Sort the DataFrame by 'name' in alphabetical order
            combined_df = combined_df.sort_values(by='name')

            # Save the combined DataFrame
            output_file_path = output_path / f'combined_{folder_identifier}_{level}.tsv'
            combined_df.to_csv(output_file_path, index=False, sep='\t')
            print(f"Combined file for level {level} saved to: {output_file_path}")
        else:
            print(f"No valid data for level {level}")

''' ===== Workflow ===== '''
combine_taxonomic_levels(input_dir, output_dir)

Input folder: /mnt/Data/2024-000012/Termites_soil/kraken2_output/bracken
Output folder: /mnt/Data/2024-000012/Termites_soil/kraken2_output/mergedBracken
Folder identifier: bracken
Detected taxonomic levels: {'G', 'F', 'P', 'C', 'S', 'O'}
Processing taxonomic level: G
Combined file for level G saved to: /mnt/Data/2024-000012/Termites_soil/kraken2_output/mergedBracken/combined_bracken_G.tsv
Processing taxonomic level: F
Combined file for level F saved to: /mnt/Data/2024-000012/Termites_soil/kraken2_output/mergedBracken/combined_bracken_F.tsv
Processing taxonomic level: P
Combined file for level P saved to: /mnt/Data/2024-000012/Termites_soil/kraken2_output/mergedBracken/combined_bracken_P.tsv
Processing taxonomic level: C
Combined file for level C saved to: /mnt/Data/2024-000012/Termites_soil/kraken2_output/mergedBracken/combined_bracken_C.tsv
Processing taxonomic level: S
Combined file for level S saved to: /mnt/Data/2024-000012/Termites_soil/kraken2_output/mergedBracken/combined_bracke

# 4. Microbiome Analysis

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import entropy
from scipy.spatial.distance import pdist, squareform
from sklearn.manifold import MDS
from matplotlib.backends.backend_pdf import PdfPages
import datetime

########### Configuration ###########

# Read the data file
file_path = '/mnt/Data/Taiwan_soil_microbiome/seminar'
data = pd.read_csv(file_path, sep='\t')

pdf_name = '/mnt/Data/Taiwan_soil_microbiome/seminar/results/Taiwan_Soil_Microbiome_16S.pdf'
svg_folder = '/mnt/Data/Taiwan_soil_microbiome/seminar/results/svg_files'  # Update this with your desired directory
output_choice = 'SVG' # Set the output choice (default as 'SVG')

"""Calculates diversity indices: Richness, Shannon, and Simpson."""
def calculate_diversity_indices(data):
    richness = data.apply(lambda x: (x > 0).sum(), axis=0)
    shannon = data.apply(lambda x: entropy(x[x > 0]), axis=0)
    simpson = data.apply(lambda x: 1 - sum((i/sum(x))**2 for i in x[x > 0]), axis=0)
    return pd.DataFrame({'Richness': richness, 'Shannon': shannon, 'Simpson': simpson})

"""Plots bar charts of the three alpha diversity indices."""
def plot_alpha_diversity(alpha_diversity):
    colors = sns.color_palette('viridis', n_colors=3)
    fig, axes = plt.subplots(1, 3, figsize=(18, 6))
    for ax, (index, color) in zip(axes, zip(alpha_diversity.columns, colors)):
        alpha_diversity[index].plot(kind='bar', ax=ax, color=color)
        ax.set_title(f'{index} (Alpha Diversity)')
        ax.set_ylabel('Value')
        ax.set_xlabel('Soil Samples')
    plt.subplots_adjust(wspace=0.3)
    plt.close(fig)
    print('Alpha diversity... Done...')
    return fig

def plot_heatmaps(samples_data, normalized_data, data):
    fig1 = plt.figure(figsize=(10, 8))
    bray_curtis_dissimilarity = squareform(pdist(samples_data.T, metric='braycurtis'))
    
    sns.heatmap(bray_curtis_dissimilarity, cmap="YlGnBu", 
                xticklabels=samples_data.columns, 
                yticklabels=samples_data.columns)
    plt.title('Bray-Curtis Dissimilarity Between Soil Samples')
    plt.xlabel('Sample ID')
    plt.ylabel('Sample ID')
    plt.close(fig1)
    print('Bray-Curtis Dissimilarity... Done...')

    fig2 = plt.figure(figsize=(10, 8))
    sns.heatmap(normalized_data.set_index(data['name']), cmap="YlGnBu")
    plt.title('Microbial Abundance Across Different Soil Samples')
    plt.close(fig2)
    print('Heatmap... Done...')
    
    return fig1, fig2

def plot_pcoa(samples_data):
    fig = plt.figure(figsize=(10, 8))
    bray_curtis_dissimilarity = pdist(samples_data.T, metric='braycurtis')
    mds = MDS(n_components=2, dissimilarity='precomputed', random_state=42)
    pcoa_result = mds.fit_transform(squareform(bray_curtis_dissimilarity))
    pcoa_df_temp = pd.DataFrame(data=pcoa_result, columns=['PCoA 1', 'PCoA 2'])
    pcoa_df_temp['Soil Sample'] = samples_data.T.index

    pcoa_df_temp['Treatment'] = pcoa_df_temp['Soil Sample'].apply(lambda x: x.split('_')[0] if '_' in x else x)
    pcoa_df_temp['Replicate'] = pcoa_df_temp['Soil Sample'].apply(lambda x: x.split('_')[1] if '_' in x else 'NA')

    ax = sns.scatterplot(x='PCoA 1', y='PCoA 2', hue='Replicate', style='Treatment',
                         data=pcoa_df_temp, s=200, palette='viridis')

    for line in range(0, pcoa_df_temp.shape[0]):
         ax.text(pcoa_df_temp["PCoA 1"][line]+0.02, pcoa_df_temp["PCoA 2"][line],
                 pcoa_df_temp["Soil Sample"][line], horizontalalignment='left',
                 size='medium', color='black', weight='semibold')

    plt.title('PCoA of Microbial Composition of Soil Samples', fontsize=16)
    plt.close(fig)
    print('PCoA... Done...')
    return fig


def save_plots(output_choice=output_choice, svg_folder=svg_folder):
    # Calculate alpha diversity indices
    samples_data = data.drop(columns=['name', 'taxRank', 'taxID', 'Max'])
    normalized_data = samples_data.div(samples_data.sum(axis=1), axis=0)
    alpha_diversity = calculate_diversity_indices(normalized_data)
    print(alpha_diversity)

    # Generate plots
    fig_alpha = plot_alpha_diversity(alpha_diversity)
    fig_heatmap1, fig_heatmap2 = plot_heatmaps(samples_data, normalized_data, data)
    fig_pcoa = plot_pcoa(samples_data)

    if output_choice == 'PDF':
        # Save all figures to a single PDF
        with PdfPages(pdf_name) as pdf:
            # Add alpha diversity table
            fig, ax = plt.subplots(figsize=(12, 4))
            ax.axis('off')
            table = ax.table(cellText=alpha_diversity.values, colLabels=alpha_diversity.columns, rowLabels=alpha_diversity.index, cellLoc='center', loc='center')
            table.auto_set_font_size(False)
            table.set_fontsize(10)
            table.scale(1.2, 1.2)
            plt.title('Alpha Diversity Indices Table', fontsize=16)
            pdf.savefig(fig, bbox_inches='tight')
            plt.close()

            pdf.savefig(fig_alpha, bbox_inches='tight')
            pdf.savefig(fig_heatmap1, bbox_inches='tight')
            pdf.savefig(fig_heatmap2, bbox_inches='tight')
            pdf.savefig(fig_pcoa, bbox_inches='tight')

            # PDF Metadata
            d = pdf.infodict()
            d['Title'] = 'Soil Microbiome Analysis Results'
            d['Author'] = 'Your Name'
            d['Subject'] = 'Microbiome Analysis'
            d['Keywords'] = 'PCoA, Alpha Diversity, Beta Diversity, Microbiome'
            d['CreationDate'] = datetime.datetime.today()
            
            print("PDF saved successfully.")
    
    elif output_choice == 'SVG':
        # Save each figure as a separate SVG file
        fig_alpha.savefig(f'{svg_folder}alpha_diversity.svg', format='svg')
        fig_heatmap1.savefig(f'{svg_folder}heatmap_braycurtis.svg', format='svg')
        fig_heatmap2.savefig(f'{svg_folder}heatmap_abundance.svg', format='svg')
        fig_pcoa.savefig(f'{svg_folder}pcoa_plot.svg', format='svg')

        print("SVG files saved successfully.")

# Save all figures to a single PDF file
with PdfPages(pdf_name) as pdf:
    # Adding diversity indices table to the pdf (publishable format)
    fig, ax = plt.subplots(figsize=(12, 4))
    ax.axis('off')
    table = ax.table(cellText=alpha_diversity.values, colLabels=alpha_diversity.columns, rowLabels=alpha_diversity.index, cellLoc='center', loc='center')
    table.auto_set_font_size(False)
    table.set_fontsize(10)
    table.scale(1.2, 1.2)  # Adjust table size
    plt.title('Alpha Diversity Indices Table', fontsize=16)
    pdf.savefig(fig, bbox_inches='tight')
    plt.close()

    pdf.savefig(fig_alpha, bbox_inches='tight')
    pdf.savefig(fig_heatmap1, bbox_inches='tight')
    pdf.savefig(fig_heatmap2, bbox_inches='tight')
    pdf.savefig(fig_pcoa, bbox_inches='tight')

    # Metadata for the PDF
    d = pdf.infodict()
    d['Title'] = 'Soil Microbiome Analysis Results'
    d['Author'] = 'Your Name'
    d['Subject'] = 'Microbiome Analysis'
    d['Keywords'] = 'PCoA, Alpha Diversity, Beta Diversity, Microbiome'
    d['CreationDate'] = datetime.datetime.today()

# Save the plots based on the output choice
save_plots(output_choice)