In [4]:
import gzip
import os
import time

import numpy as np
import pandas as pd
from scipy.spatial.distance import jensenshannon
from scipy.special import rel_entr
from scipy.stats import entropy


# Function to calculate KL divergence
def kl_divergence(p, q):
    """Return the summed element-wise relative entropy of ``p`` against ``q``.

    Each term is ``scipy.special.rel_entr(p_i, q_i)``; the terms are added
    with ``np.sum``. The inputs are used as given (no normalisation is done
    here), so the result is the Kullback-Leibler divergence only when both
    arguments are already probability vectors.
    """
    pointwise_terms = rel_entr(p, q)
    return np.sum(pointwise_terms)


# Function to calculate geometric Jensen-Shannon divergence
def geometric_jsd(p, q):
    """Return the square root of ``jensenshannon(p, q)``.

    NOTE(review): ``scipy.spatial.distance.jensenshannon`` already returns the
    Jensen-Shannon *distance* (the square root of the divergence), so the
    value produced here is the fourth root of the divergence. Confirm that
    this is the quantity intended by "geometric JSD".
    """
    return np.sqrt(jensenshannon(p, q))


# Laplace smoothing function
def laplace_smoothing(data, alpha=1e-10):
    """Add ``alpha`` to every entry of ``data`` and rescale by the smoothed total.

    The divisor is ``sum(data) + alpha * len(data)``, i.e. the sum of the
    shifted entries, so the returned values add up to one (up to rounding).
    ``data`` must support element-wise ``+`` with a float (e.g. a NumPy
    array).
    """
    smoothed_total = np.sum(data) + alpha * len(data)
    shifted = data + alpha
    return shifted / smoothed_total


# Function to process input files and apply Laplace smoothing
def process_and_analyze(file_path, output_base_dir):
    """Compute per-row divergence metrics for one gzipped coverage file.

    The input is read as a header-less, tab-separated table with the columns
    ``chr, start, end, percentage, methylated, unmethylated``. For every row
    the two counts are smoothed with ``laplace_smoothing`` and normalised to
    a two-element distribution ``p``; ``q`` is ``p`` with its entries
    swapped. The row is then annotated with ``entropy(p)``,
    ``kl_divergence(p, q)``, ``jensenshannon(p, q)``, ``geometric_jsd(p, q)``
    and the time spent computing them.

    Args:
        file_path: Path to the gzip-compressed input table.
        output_base_dir: Root directory for results. The output is written to
            a sub-directory named after the input file's parent directory.

    Returns:
        The results ``DataFrame`` (input columns plus ``entropy``,
        ``relative_entropy``, ``jsd``, ``geometric_jsd`` and ``time``). The
        same table is saved as
        ``<output_base_dir>/<parent dir>/output_<input name without .gz>.csv.gz``.
    """
    input_columns = [
        "chr",
        "start",
        "end",
        "percentage",
        "methylated",
        "unmethylated",
    ]
    metric_columns = ["entropy", "relative_entropy", "jsd", "geometric_jsd", "time"]

    with gzip.open(file_path, "rt") as f:
        df = pd.read_csv(f, sep="\t", header=None, names=input_columns)

    results_list = []

    # itertuples() is used instead of iterrows(): iterrows() builds a Series
    # per row, which is slow and does not preserve column dtypes (integer
    # coordinates/counts are upcast to float when every column is numeric).
    for row in df.itertuples(index=False):
        # perf_counter() is the monotonic, high-resolution clock meant for
        # measuring short intervals such as a single row's computation.
        start_time = time.perf_counter()

        counts = np.array([row.methylated, row.unmethylated])
        smoothed = laplace_smoothing(counts)

        # p is the normalised (methylated, unmethylated) distribution; q is
        # the same distribution with the two entries swapped.
        p = smoothed / smoothed.sum()
        q = p[::-1].copy()

        ent = entropy(p)
        kl = kl_divergence(p, q)
        js = jensenshannon(p, q)
        gjs = geometric_jsd(p, q)

        time_elapsed = time.perf_counter() - start_time

        results_list.append([*row, ent, kl, js, gjs, time_elapsed])

    results_df = pd.DataFrame(results_list, columns=input_columns + metric_columns)

    # Results are grouped by sample: the sample name is taken from the
    # directory that directly contains the input file.
    sample_name = os.path.basename(os.path.dirname(file_path))
    sample_dir = os.path.join(output_base_dir, sample_name)
    os.makedirs(sample_dir, exist_ok=True)

    # Drop only a trailing ".gz" so a ".gz" occurring elsewhere in the file
    # name is left intact.
    stem = os.path.basename(file_path)
    if stem.endswith(".gz"):
        stem = stem[: -len(".gz")]

    output_filename = os.path.join(sample_dir, f"output_{stem}.csv.gz")
    results_df.to_csv(output_filename, index=False, compression="gzip")

    return results_df


# Root of the input tree to scan, and root under which results are written
base_dir = "/home/eharpu/methylation_analysis/samples_testing"
output_base_dir = "/shares/grossniklaus.botinst.uzh/eharputluoglu/outputTest"

# Walk the input tree and analyse every file whose name ends in ".cov.gz"
for root, _subdirs, filenames in os.walk(base_dir):
    for filename in filenames:
        if not filename.endswith(".cov.gz"):
            continue
        file_path = os.path.join(root, filename)
        print(f"Processing file: {file_path}")
        process_and_analyze(file_path, output_base_dir)

Processing file: /home/eharpu/methylation_analysis/samples_testing/SRX1664497_se/CHG_SRX1664497_se.bismark_chr_4.cov.gz
Processing file: /home/eharpu/methylation_analysis/samples_testing/SRX1664497_se/CpG_SRX1664497_se.bismark_chr_5.cov.gz


KeyboardInterrupt: 