In [1]:
import io
import subprocess
import os
import glob

import pandas as pd
import numpy as np
import scipy
import scipy.stats as stats
from sklearn.decomposition import PCA
import pickle
import re
import math
import plotnine as pn
import seaborn as sns
import glob
import subprocess
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from numpy import log
from collections import defaultdict,Counter
import re
import statsmodels
import warnings

# !pip install statannotations==0.4.0
# #!pip uninstall statannotations --yes
# from statannotations.Annotator import Annotator

# --- Notebook-wide configuration -------------------------------------------
# Suppress every warning for the rest of the session (blanket "ignore" filter).
warnings.filterwarnings(action="ignore")

# Lift pandas' display truncation: show all rows, all columns, and full cell text.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)

# Root folder of this QC project; later cells build input paths from it.
work_dir = "/home/hazhang/projects/QC_vendor_samples_11162023"

In [10]:
from collect_qc import reformat_qc

# Define the input parameters: a single sample's QC TSV located directly under
# `work_dir`, and the release version string handed to `reformat_qc`.
qc_tsv_file = f"{work_dir}/IMD024.autoqc_sample_qc.hdr.tsv"
release_version = "Sirius-1.1.0"

# Run the function on that one file and display the result as the cell output.
# NOTE(review): `reformat_qc` comes from the project-local `collect_qc` module;
# it presumably returns a pandas DataFrame (one row per sample) -- confirm there.
qc_ann_df = reformat_qc(qc_tsv_file, release_version)
qc_ann_df

Unnamed: 0,runid,run_sample_id,chry_nonsingleton,contam_vscore,dinucleosome_peak,dm_probes,estimated_contam_level,hotspot_non_singleton_coverage,hotspot_umol_gc_iqr,hypo_on_target_rate,mapd,methyl_95_lower,methyl_95_upper,methyl_hyper_pos_highcg_count,methyl_neg_capture,methyl_pos_capture_max,sample_contamination_pct,sample_coverage_exceptions_ldt,warning
0,231010_A01835_0147_AHF22VDSX7,IMD024,0.0,1.0,309.0,1.0,0.00573,3637.0,1.726,0.73,0.04,11.0,11.0,7854.0,2.9e-05,1.27162,0.02,0.0,methyl_hyper_pos_highcg_count


In [11]:
def reformat_and_merge_qc(directories, release_version):
    """
    Apply ``reformat_qc`` to every sample's QC file and merge the results.

    For each ``(directory, samples)`` pair the expected file is
    ``<directory>/<sample>/<sample>.autoqc_sample_qc.hdr.tsv``. Missing files and
    files that fail to process are reported on stdout and skipped, so one bad
    sample does not abort the whole batch.

    :param directories: A dictionary where keys are directory paths and values are
        lists of sample subfolders. The information is provided by Tara and Shile
        (Slack conversation).
    :param release_version: The release version for the QC data formatting;
        passed through unchanged to ``reformat_qc``.
    :return: A merged DataFrame (fresh 0..n-1 index) containing QC data for all
        samples that were processed successfully. If no sample could be processed
        (empty input, all files missing, or all files failing) an empty DataFrame
        is returned instead of raising.
    """
    all_qc_data = []

    for dir_path, samples in directories.items():
        for sample in samples:
            qc_tsv_file = os.path.join(dir_path, sample, f"{sample}.autoqc_sample_qc.hdr.tsv")

            # Guard clause: report and skip samples whose QC file is absent.
            if not os.path.exists(qc_tsv_file):
                print(f"File not found: {qc_tsv_file}")
                continue

            # Deliberately best-effort: a failure on one file is reported and the
            # remaining samples are still processed.
            try:
                all_qc_data.append(reformat_qc(qc_tsv_file, release_version))
            except Exception as e:
                print(f"Error processing file {qc_tsv_file}: {e}")

    # pd.concat raises "ValueError: No objects to concatenate" on an empty list,
    # so return an empty frame when nothing was collected.
    if not all_qc_data:
        return pd.DataFrame()

    return pd.concat(all_qc_data, ignore_index=True)

# Run directory -> sample subfolders to collect QC from.
# The information is provided by Tara and Shile (Slack conversation).
directories = {
    # IMD023 .. IMD027
    "/ghds/ivd/flowcentral/231010_A01835_0147_AHF22VDSX7.ef400660-8ba5-4bca-b4c7-cfc4c7e80c75.20231012030436": [
        f"IMD{sample_no:03d}" for sample_no in range(23, 28)
    ],
    # IMDPilot001 .. IMDPilot022
    "/ghds/ivd/flowcentral/230921_A02048_0032_BHF2FFDSX7.fb6c8f6e-5005-426d-a5b9-6ff6dbf85ecc.20230923213419": [
        f"IMDPilot{sample_no:03d}" for sample_no in range(1, 23)
    ],
    "/ghds/ivd/flowcentral/231013_A01020_0680_BHGL5KDSX7.f425e07e-7a46-4661-a4ab-5bc1e664b2e9.20231015221337": [
        "IMD028"
    ],
}

In [12]:
# Collect and merge the QC tables for every sample listed in `directories`.
# `release_version` is re-assigned here with the same value used in the
# single-sample cell, so this cell does not depend on that cell having run.
release_version = "Sirius-1.1.0"
merged_qc_df = reformat_and_merge_qc(directories, release_version)
# Bare last expression: displays the whole merged frame as the cell output.
merged_qc_df

Unnamed: 0,runid,run_sample_id,chry_nonsingleton,contam_vscore,dinucleosome_peak,dm_probes,estimated_contam_level,hotspot_non_singleton_coverage,hotspot_umol_gc_iqr,hypo_on_target_rate,mapd,methyl_95_lower,methyl_95_upper,methyl_hyper_pos_highcg_count,methyl_neg_capture,methyl_pos_capture_max,sample_contamination_pct,sample_coverage_exceptions_ldt,warning
0,231010_A01835_0147_AHF22VDSX7,IMD023,0.0,0.0,318.0,0.0,0.0,1501.0,1.349,0.82,0.04,12.0,12.0,96494.0,2.1e-05,1.21527,0.02,2.0,
1,231010_A01835_0147_AHF22VDSX7,IMD024,0.0,1.0,309.0,1.0,0.00573,3637.0,1.726,0.73,0.04,11.0,11.0,207854.0,2.9e-05,1.27162,0.02,0.0,
2,231010_A01835_0147_AHF22VDSX7,IMD025,0.0,2.0,311.0,0.0,0.00535,3514.0,1.238,0.81,0.04,12.0,12.0,187337.0,9.6e-05,1.24194,0.01,0.0,
3,231010_A01835_0147_AHF22VDSX7,IMD026,0.0,0.0,321.0,0.0,0.0,2571.0,1.253,0.82,0.04,12.0,12.0,137037.0,4.4e-05,1.22961,0.02,0.0,
4,231010_A01835_0147_AHF22VDSX7,IMD027,0.0,0.0,318.0,0.0,0.0,3353.0,1.264,0.83,0.06,12.0,12.0,163060.0,1.2e-05,1.22835,0.02,0.0,
5,230921_A02048_0032_BHF2FFDSX7,IMDPilot001,0.0,3.0,321.0,0.0,0.00107,3060.0,1.263,0.85,0.04,11.0,11.0,147800.0,2e-05,1.26687,0.02,0.0,
6,230921_A02048_0032_BHF2FFDSX7,IMDPilot002,0.0,2.0,323.0,0.0,0.00155,2300.0,1.106,0.86,0.05,13.0,13.0,100693.0,5e-05,1.23854,0.01,0.0,
7,230921_A02048_0032_BHF2FFDSX7,IMDPilot003,0.0,2.0,323.0,0.0,0.00161,4050.0,1.07,0.85,0.06,13.0,13.0,203756.0,3.9e-05,1.23485,0.02,0.0,
8,230921_A02048_0032_BHF2FFDSX7,IMDPilot004,0.0,0.0,331.0,0.0,0.0,855.0,1.05,0.85,0.06,12.0,12.0,45689.0,2.2e-05,1.23465,0.02,23.0,
9,230921_A02048_0032_BHF2FFDSX7,IMDPilot005,0.0,1.0,332.0,0.0,0.00153,1511.0,0.925,0.84,0.06,12.0,12.0,67509.0,3e-05,1.24277,0.01,9.0,
