# Setup and Clearing Previous Files

In [23]:
import os
import csv
import logging
import multiqc
from collections import defaultdict, OrderedDict
from multiqc.plots import bargraph
from multiqc import BaseMultiqcModule

# Logging: reuse MultiQC's own logger and make it emit DEBUG-level messages
log = logging.getLogger('multiqc')
log.setLevel(logging.DEBUG)

# Attach a stream handler only once, so re-running this cell does not
# duplicate every log line
if len(log.handlers) == 0:
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    handler = logging.StreamHandler()
    handler.setFormatter(formatter)
    log.addHandler(handler)


# Drop modules registered by earlier executions so the report does not
# accumulate duplicate plots
multiqc.report.modules.clear()

# Locations of the input TSVs, the generated TSVs and the report
original_tsv_folder = "/path/to/tsv/files/percolator"
output_folder_path = "/path/to/generated/tsv/files"
report_path = "/path/to/report"
plots_path = os.path.join(report_path, 'plots')

# Make sure every directory that gets written to exists
for required_dir in (output_folder_path, plots_path):
    os.makedirs(required_dir, exist_ok=True)

# Helper that removes TSV files left behind in the output folder
def clear_output_folder(output_path):
    """Delete every regular file ending in '.tsv' directly inside `output_path`.

    Sub-directories and files with other extensions are left alone. A debug
    message is logged once the folder has been processed.
    """
    tsv_candidates = [
        os.path.join(output_path, entry_name)
        for entry_name in os.listdir(output_path)
        if entry_name.endswith('.tsv')
    ]
    for candidate in tsv_candidates:
        # A directory may also carry a '.tsv' suffix; only remove real files
        if os.path.isfile(candidate):
            os.remove(candidate)
    log.debug(f"Cleared all TSV files from: {output_path}")  # Changed from print to log.debug

# Clear the output folder before generating new TSV files, so that TSVs left
# over from a previous run are not picked up by the later cells that read
# every '.tsv' file in this folder
clear_output_folder(output_folder_path)

2024-09-17 12:50:26,184 - multiqc - DEBUG - Cleared all TSV files from: /home/evasam/multiqc_skript/feature_tsv_files


In [24]:
# Initialize MultiQC Module and Define Feature Groups

In [25]:
# Create the MultiQC module object that the combined feature section is added to
module = BaseMultiqcModule(name="Combined Percolator Features", anchor="combined_features")

# Feature group dictionary: maps a group name to the column headers that
# belong to it. "psm_file_combined" is the union of the two merged PSM feature
# lists; each name is listed exactly once (in first-seen order) so the
# generated "<group>_features.tsv" files do not get duplicate header columns.
feature_name_dict = {
    "psm_file_combined": [
        "MS:1002255", "MS:1002252", "COMET:deltaCn", "COMET:lnNumSP", "COMET:lnRankSP", "COMET:IonFrac",
        "COMET:deltaLCn", "COMET:lnExpect", "charge1", "mass", "enzC", "charge5", "charge2", "absdm", "peplen",
        "charge4", "enzN", "charge3", "m0", "enzInt", "isotope_error", "dm"
    ],
    "ms2pip": [
        "ionb_min_abs_diff", "ionb_mse", "iony_std_abs_diff_norm", "ionb_mse_norm", "ionb_std_abs_diff_norm",
        "dotprod", "cos_norm", "iony_std_abs_diff", "iony_mean_abs_diff_norm", "dotprod_norm", "abs_diff_Q2_norm",
        "iony_spearman", "ionb_min_abs_diff_norm", "iony_max_abs_diff_norm", "dotprod_iony_norm", "ionb_abs_diff_Q2",
        "max_abs_diff", "dotprod_ionb_norm", "abs_diff_Q3_norm", "iony_min_abs_diff_norm", "cos_iony_norm",
        "spec_pearson", "std_abs_diff", "min_abs_diff_norm", "ionb_max_abs_diff", "dotprod_iony", "ionb_mean_abs_diff_norm",
        "max_abs_diff_iontype", "mean_abs_diff", "iony_max_abs_diff", "cos_iony", "ionb_std_abs_diff", "iony_pearson",
        "ionb_max_abs_diff_norm", "mean_abs_diff_norm", "abs_diff_Q1", "ionb_pearson_norm", "ionb_abs_diff_Q1_norm",
        "min_abs_diff", "ionb_mean_abs_diff", "spec_spearman", "iony_mean_abs_diff", "abs_diff_Q1_norm", "iony_min_abs_diff",
        "ionb_spearman", "iony_abs_diff_Q3_norm", "min_abs_diff_iontype", "cos_ionb_norm", "abs_diff_Q2", "abs_diff_Q3",
        "spec_pearson_norm", "ionb_abs_diff_Q2_norm", "std_abs_diff_norm", "dotprod_ionb", "cos_ionb", "ionb_abs_diff_Q3",
        "iony_mse_norm", "iony_pearson_norm", "spec_mse_norm", "ionb_abs_diff_Q3_norm", "iony_abs_diff_Q1_norm",
        "ionb_pearson", "max_abs_diff_norm", "spec_mse", "iony_mse", "ionb_abs_diff_Q1", "iony_abs_diff_Q2_norm",
        "iony_abs_diff_Q1", "iony_abs_diff_Q3", "cos", "iony_abs_diff_Q2"
    ],
    "deeplc": [
        "rt_diff", "predicted_retention_time", "predicted_retention_time_best", "observed_retention_time_best",
        "rt_diff_best", "observed_retention_time"
    ]
}

# Clean Files and Generate Feature-specific TSV Files

In [26]:
# Function to clean a file to keep only the first value row under each header
def clean_file(file_path, output_folder):
    """Write a reduced copy of `file_path` into `output_folder`.

    The input is expected to contain the same header line several times, each
    followed by one or more value rows. The copy keeps the header once,
    followed by the first value row found after each occurrence of the header.

    Args:
        file_path: Path of the file to clean.
        output_folder: Directory that receives the cleaned copy (same base name).

    Returns:
        Path of the cleaned file. An empty input produces an empty output file.
    """
    with open(file_path, 'r') as file:
        lines = file.readlines()

    cleaned_lines = []
    if lines:  # an empty file has no header to compare against
        header = lines[0].strip()
        cleaned_lines.append(lines[0])  # Keep the first header line
        keep_data = True  # the next non-header line is the row to keep

        for line in lines[1:]:
            if line.strip() == header:
                # Every repeated header starts a new section. Set the flag
                # explicitly instead of toggling it, so a section without any
                # value rows cannot cause the next section's row to be skipped.
                keep_data = True
            elif keep_data:
                cleaned_lines.append(line)  # first value row of this section
                keep_data = False  # ignore the remaining rows of this section

    # Save the cleaned file under the same base name
    cleaned_file_path = os.path.join(output_folder, os.path.basename(file_path))
    with open(cleaned_file_path, 'w') as cleaned_file:
        cleaned_file.writelines(cleaned_lines)

    return cleaned_file_path

# Run the cleaning step on every TSV in the source folder; the cleaned copies
# are written into the output folder
for filename in os.listdir(original_tsv_folder):
    if not filename.endswith('.tsv'):
        continue
    clean_file(os.path.join(original_tsv_folder, filename), output_folder_path)

# Function to write features to separate TSV files
def write_features_to_tsv(group_name, features, data, output_path):
    """Write one feature group to `<output_path>/<group_name>_features.tsv`.

    Args:
        group_name: Name of the feature group; used as the file name prefix.
        features: Column names, written as the header and used as row keys.
        data: Iterable of dict-like rows mapping feature name to a numeric string.
        output_path: Directory in which the TSV file is created.

    Each cell holds the absolute value of the feature as a float. A feature
    that is absent from a row, or whose value is None or blank, is written as
    an empty cell. Any other non-numeric value still raises ValueError.
    """
    def _abs_or_blank(value):
        # csv.DictReader fills short rows with None, and cells can be empty;
        # treat both like a missing feature instead of crashing in float()
        if value is None or str(value).strip() == '':
            return ''
        return abs(float(value))

    output_file_path = os.path.join(output_path, f"{group_name}_features.tsv")
    with open(output_file_path, 'w', newline='') as tsvfile:
        writer = csv.writer(tsvfile, delimiter='\t')
        writer.writerow(features)  # Write header
        for row in data:
            writer.writerow(
                [_abs_or_blank(row[feature] if feature in row else None) for feature in features]
            )

# Read and separate cleaned TSV data into groups
all_group_data = defaultdict(list)

for filename in os.listdir(output_folder_path):
    # Only the cleaned input files are read here. The "<group>_features.tsv"
    # files written at the end of this cell live in the same folder; skipping
    # them keeps a second execution of this cell from feeding its own output
    # back in and counting rows twice.
    if not filename.endswith('.tsv') or filename.endswith('_features.tsv'):
        continue
    with open(os.path.join(output_folder_path, filename), 'r', newline='') as file:
        reader = csv.DictReader(file, delimiter='\t')
        for row in reader:
            for group, features in feature_name_dict.items():
                # Keep only the columns of this row that belong to the group
                filtered_row = {feature: row[feature] for feature in features if feature in row}
                if filtered_row:
                    all_group_data[group].append(filtered_row)

# Write new TSV files for each feature group
for group, data in all_group_data.items():
    write_features_to_tsv(group, feature_name_dict[group], data, output_folder_path)


# Prepare Data and Generate the Plot

In [27]:
# Prepare data for a single unified plot
combined_barplot_data = OrderedDict()
combined_plot_groups = {}

# Set color map for different groups
color_map = {
    "psm_file_combined": "#1f77b4",
    "ms2pip": "#ff7f0e",
    "deeplc": "#2ca02c",
    "Other": "#7f7f7f"
}


# NOTE(review): `calculate_means` and `get_feature_group` are called below but
# were not defined anywhere in this notebook, so a fresh kernel failed with
# NameError. The two definitions that follow are reconstructed from how the
# functions are used in this cell - confirm they match the original helpers.
def calculate_means(file_path):
    """Return the per-column mean of a tab-separated file.

    Args:
        file_path: Path of a TSV file with a header row and numeric cells.

    Returns:
        OrderedDict mapping each header to the mean of its non-empty cells, in
        header order. Columns without any value are omitted.
    """
    totals = OrderedDict()
    counts = {}
    with open(file_path, 'r', newline='') as tsvfile:
        reader = csv.DictReader(tsvfile, delimiter='\t')
        for row in reader:
            for header, value in row.items():
                # header is None for surplus cells; value is None or '' when missing
                if header is None or value is None or value.strip() == '':
                    continue
                totals[header] = totals.get(header, 0.0) + float(value)
                counts[header] = counts.get(header, 0) + 1
    return OrderedDict((header, total / counts[header]) for header, total in totals.items())


def get_feature_group(header):
    """Return the name of the feature group listing `header`, or "Other"."""
    for group_name, group_features in feature_name_dict.items():
        if header in group_features:
            return group_name
    return "Other"


# Combine data from all TSV files into a single plot
for group_file in os.listdir(output_folder_path):
    if group_file.endswith("_features.tsv"):  # only self-generated TSV files are processed
        file_path = os.path.join(output_folder_path, group_file)

        # Read data from the TSV files
        means = calculate_means(file_path)

        # Merge data for the bar plot
        for header, mean in means.items():
            if header not in combined_barplot_data:
                combined_barplot_data[header] = {}
            combined_barplot_data[header][group_file] = abs(mean)

            # Ensure correct color grouping for the unified plot
            group = get_feature_group(header)
            if group not in combined_plot_groups:
                combined_plot_groups[group] = {'name': group, 'color': color_map[group]}
            # Each feature shares the name/colour entry of its group
            combined_plot_groups[header] = combined_plot_groups[group]

# Settings for the single combined bar plot.
# NOTE(review): confirm that the installed MultiQC version honours the
# `stacked` and `plot_groups` keys; they are passed through unchanged here.
mean_pconfig = dict(
    id='mean_values_combined',
    title='Combined Mean Absolute Values from All Feature Groups',
    xlab='Mean Absolute Value',
    ylab='Headers',
    stacked=False,
    plot_groups=combined_plot_groups,  # group name and colour per feature
)

# Build the bar plot and attach it to the module as its own report section
module.add_section(
    name="Combined Feature Weights",
    anchor="combined_feature_weights",
    plot=bargraph.plot(combined_barplot_data, pconfig=mean_pconfig),
)

# Report-level settings, then register the module built above
multiqc.config.module_order = ['custom_data']
multiqc.config.report_title = "Percolator Data QC Report"
multiqc.report.modules.append(module)

# Write the HTML report into the report folder
report_file = os.path.join(report_path, 'multiqc_report_percolator.html')
multiqc.write_report(force=True, title="Percolator QC Report", filename=report_file)

log.debug(f"MultiQC report created successfully at: {report_file}")  # Changed from print to log.debug


2024-09-17 12:50:26,829 - multiqc.core.update_config - DEBUG - This is MultiQC v1.23
2024-09-17 12:50:26,830 - multiqc.core.update_config - DEBUG - Running Python 3.9.19 (main, May  6 2024, 19:43:03)  [GCC 11.2.0]
2024-09-17 12:50:26,832 - multiqc.core.update_config - INFO - Report title: Percolator QC Report
[34m     update_config[0m | Report title: Percolator QC Report


[38;5;208m///[0m [1mhttps://multiqc.info[0m 🔍 [2mv1.23[0m


[34m     version_check[0m | [33mMultiQC Version v1.25 now available![0m
2024-09-17 12:50:27,224 - multiqc.core.version_check - DEBUG - Latest MultiQC version is v1.25, released 2024-09-17
2024-09-17 12:50:27,227 - multiqc.core.write_results - DEBUG - Rendering plots
2024-09-17 12:50:27,233 - multiqc.core.write_results - DEBUG - Exporting plot data to files
2024-09-17 12:50:27,238 - multiqc.report - DEBUG - Wrote data file mean_values_combined-4.txt
2024-09-17 12:50:27,241 - multiqc.core.write_results - DEBUG - Moving data file from '/tmp/tmpyhi9om2n/multiqc_data' to '/home/evasam/multiqc_skript/multiqc_report_percolator/multiqc_report_percolator_data'
2024-09-17 12:50:27,282 - multiqc.core.write_results - INFO - Data        : multiqc_report_percolator/multiqc_report_percolator_data   (overwritten)
[34m     write_results[0m | Data        : multiqc_report_percolator/multiqc_report_percolator_data   (overwritten)
2024-09-17 12:50:27,463 - multiqc.core.write_results - DEBUG - Compres