In [20]:
import os
import glob
import logging
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import ks_2samp, entropy
from scipy.signal import convolve
from statsmodels.nonparametric.smoothers_lowess import lowess
from scipy.ndimage import gaussian_filter1d
from scipy.spatial.distance import jensenshannon

# Route INFO-and-above records to a log file; filemode 'w' truncates the
# file each time this configuration is applied.
logging.basicConfig(
    filename='methylation_smoothing.log',
    filemode='w',
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

def calculate_divergences_and_ks(df, smoothing_method="none", window_size=5, smoothing_factor=1e-5, frac=0.1):
    """Compute divergence scores and a KS test from per-row methylation counts.

    Rows where both counts are zero are dropped. On a copy of the remaining
    rows the ratio ``methylated / (methylated + unmethylated)`` is computed,
    optionally smoothed, and the resulting vector ``p`` is compared against
    ``q = 1 - p``. The caller's DataFrame is not modified.

    Args:
        df (pandas.DataFrame): Must contain 'methylated' and 'unmethylated' columns.
        smoothing_method (str, optional): One of "none", "moving_average",
            "lowess", "gaussian", "laplace". Any other value is handled like
            "none". Defaults to "none".
        window_size (int, optional): Length of the box kernel for
            "moving_average"; the same value is passed as ``sigma`` to the
            gaussian filter for "gaussian". Defaults to 5.
        smoothing_factor (float, optional): Pseudo-count for "laplace"; added
            once to the methylated count and twice to the total. Defaults to 1e-5.
        frac (float, optional): ``frac`` argument forwarded to LOWESS. Defaults to 0.1.

    Returns:
        tuple: ``(js, kl, gjs, sgjs, ks_statistic, ks_pvalue)``. The first four
        are Python floats; the KS values are returned as produced by
        ``ks_2samp``. All six entries are None when no row has a non-zero count.
    """
    # Keep only rows with at least one non-zero count; work on a private copy.
    nonzero_rows = (df['methylated'] != 0) | (df['unmethylated'] != 0)
    sites = df[nonzero_rows].copy()
    if len(sites) == 0:
        return (None,) * 6

    read_total = sites["methylated"] + sites["unmethylated"]
    sites["methylation_ratio"] = (sites["methylated"] / read_total).fillna(0)
    raw_ratio = sites["methylation_ratio"]

    # Pick the smoothed ratio; filter-based methods are clipped back into [0, 1].
    if smoothing_method == "laplace":
        smoothed = (sites["methylated"] + smoothing_factor) / (read_total + 2 * smoothing_factor)
    elif smoothing_method == "moving_average":
        box_kernel = np.ones(window_size) / window_size
        smoothed = convolve(raw_ratio, box_kernel, mode="same").clip(0, 1)
    elif smoothing_method == "lowess":
        row_positions = np.arange(len(sites))
        smoothed = lowess(raw_ratio, row_positions, frac=frac)[:, 1].clip(0, 1)
    elif smoothing_method == "gaussian":
        smoothed = gaussian_filter1d(raw_ratio, sigma=window_size).clip(0, 1)
    else:
        # "none" and any unrecognised method: use the raw ratio unchanged.
        smoothed = raw_ratio
    sites["smoothed_methylation_ratio"] = smoothed

    # p is the (smoothed) ratio per row, q its complement.
    p = sites["smoothed_methylation_ratio"]
    q = 1 - p
    kl_p_q = entropy(p, q, base=2)
    kl_q_p = entropy(q, p, base=2)
    log2_scale = np.sqrt(np.log(2))

    # Jensen-Shannon value as returned by scipy's jensenshannon (base 2).
    js_score = float(jensenshannon(p, q, base=2))

    # entropy(p, q) minus the entropy of p itself.
    kl_score = float(kl_p_q - entropy(p, base=2))

    # "Geometric" JS: root of the mean of the rooted directed KL terms.
    gjs_score = float(np.sqrt(0.5 * (np.sqrt(kl_p_q) + np.sqrt(kl_q_p))) / log2_scale)

    # "Symmetric geometric" JS: each side is compared with the midpoint of p and q.
    midpoint = (p + q) / 2
    sgjs_score = float(0.5 * (
        np.sqrt(jensenshannon(p, midpoint, base=2)) + np.sqrt(jensenshannon(q, midpoint, base=2))
    ) / log2_scale)

    # Two-sample KS test on the raw counts (not affected by the smoothing choice).
    ks_result = ks_2samp(sites["methylated"], sites["unmethylated"])

    return js_score, kl_score, gjs_score, sgjs_score, ks_result[0], ks_result[1]


def read_bismark_file(filename):
    """Load a gzip-compressed, tab-separated, header-less coverage file.

    The six columns are named chr, start, end, coverage, methylated and
    unmethylated; every column except 'chr' is passed through
    ``pd.to_numeric``.

    Args:
        filename (str): Path to the gzip-compressed file.

    Returns:
        pandas.DataFrame: One row per input line with the six named columns.
    """
    header = ["chr", "start", "end", "coverage", "methylated", "unmethylated"]
    table = pd.read_csv(filename, sep='\t', header=None, names=header, compression='gzip')

    # Coerce every column after 'chr' to a numeric dtype.
    for numeric_column in header[1:]:
        table[numeric_column] = pd.to_numeric(table[numeric_column])

    return table

def clean_data(results_df):
    """Strip the literal suffix '.bedgraph.gz' from the 'Sample' column.

    The column is converted to strings first. The DataFrame is modified in
    place and the same object is returned.
    """
    sample_labels = results_df['Sample'].astype(str)
    results_df['Sample'] = sample_labels.str.replace('.bedgraph.gz', '', regex=False)
    return results_df



# Data processing pipeline: for every per-chromosome coverage file, compute the
# divergence/KS metrics under each smoothing method, save a comparison plot,
# and collect one result row per file.
data_directory = "/shares/grossniklaus.botinst.uzh/dkt/projects/meth1000/analysis/09_split_cov_chr/output"
# NOTE(review): placeholder carried over from the original cell; point this at
# a real, writable directory before running.
plot_directory = "/path/to/your/plot/directory"
smoothing_methods = ["none", "moving_average", "lowess", "gaussian", "laplace"]
smoothing_window_size = 3
# Names of the six values returned by calculate_divergences_and_ks, in order.
metric_names = ["JS Divergence", "KL Divergence", "GJS Divergence", "SGJS Divergence", "KS Statistic", "KS P-value"]
results = []


def _ratios_for_plot(df, smoothing_method, window_size=5, smoothing_factor=1e-5, frac=0.1):
    """Return (raw_ratio, smoothed_ratio) Series for plotting.

    calculate_divergences_and_ks works on a private copy and only returns the
    metrics, so the ratios it derives are not visible to the caller. This
    helper recomputes them with the same filtering and smoothing rules (which
    means each smoothing runs a second time). ``smoothed_ratio`` is None for
    the "none" method.
    """
    covered = df[(df["methylated"] != 0) | (df["unmethylated"] != 0)]
    total = covered["methylated"] + covered["unmethylated"]
    raw_ratio = (covered["methylated"] / total).fillna(0)

    if smoothing_method == "none":
        return raw_ratio, None
    if raw_ratio.empty:
        # Nothing to smooth; avoid feeding empty arrays to the filters.
        return raw_ratio, raw_ratio.copy()

    if smoothing_method == "moving_average":
        kernel = np.ones(window_size) / window_size
        values = convolve(raw_ratio, kernel, mode="same").clip(0, 1)
    elif smoothing_method == "lowess":
        values = lowess(raw_ratio, np.arange(len(raw_ratio)), frac=frac)[:, 1].clip(0, 1)
    elif smoothing_method == "gaussian":
        values = gaussian_filter1d(raw_ratio, sigma=window_size).clip(0, 1)
    elif smoothing_method == "laplace":
        values = (covered["methylated"] + smoothing_factor) / (total + 2 * smoothing_factor)
    else:
        values = raw_ratio
    # Re-attach the row index so smoothed curves line up with the raw curve.
    return raw_ratio, pd.Series(np.asarray(values), index=raw_ratio.index)


# Get all sample subdirectories
sample_directories = [d for d in os.listdir(data_directory) if os.path.isdir(os.path.join(data_directory, d))]


# Iterate through files
for sample_dir in sample_directories:
    sample_path = os.path.join(data_directory, sample_dir)

    logging.info(f"Processing sample directory: {sample_path}")
    print(sample_path)

    # Iterate through files in each sample directory
    for file_path in glob.glob(os.path.join(sample_path, "*_chr_*.cov.gz")):
        # Bound before the try so the error handlers can always name the file.
        filename = os.path.basename(file_path)

        try:
            # File names are split on "_": first part -> context, second and
            # third -> sample name, last part (before the first ".") -> chromosome.
            parts = filename.split("_")

            context_type = parts[0]
            sample_name = parts[1] + "_" + parts[2].split(".")[0]
            chromosome = parts[-1].split(".")[0]

            df = read_bismark_file(file_path)

            record = {
                "Sample": sample_name,
                "Chromosome": chromosome,
                "Context": context_type,
            }
            smoothing_results = {}

            for smoothing_method in smoothing_methods:
                divergences = calculate_divergences_and_ks(
                    df, smoothing_method, window_size=smoothing_window_size
                )
                raw_ratio, smoothed_ratio = _ratios_for_plot(
                    df, smoothing_method, window_size=smoothing_window_size
                )

                smoothing_results[smoothing_method] = {
                    "Divergences": divergences,
                    "Methylation Ratio": raw_ratio,
                    "Smoothed Methylation Ratio": smoothed_ratio,
                }

                # One column per (method, metric); a shared key per method
                # would keep only the last metric.
                for metric_name, value in zip(metric_names, divergences):
                    record[f"{smoothing_method} {metric_name}"] = value

            # Store the numeric results before plotting so that a plotting
            # failure cannot discard them.
            results.append(record)

        except Exception as e:
            logging.error(f"Error processing file {filename}: {e}")
            continue

        # Plotting (best effort: failures are logged, the figure is always closed)
        try:
            os.makedirs(plot_directory, exist_ok=True)

            plt.figure(figsize=(12, 6))
            plt.plot(smoothing_results["none"]["Methylation Ratio"], label="Original", alpha=0.5)

            for smoothing_method, data in smoothing_results.items():
                if data["Smoothed Methylation Ratio"] is not None:
                    plt.plot(data["Smoothed Methylation Ratio"], label=smoothing_method)

            plt.title(f"Methylation Ratios with Different Smoothing Methods - Sample: {sample_name} - Chromosome: {chromosome}")
            plt.xlabel("Position")
            plt.ylabel("Methylation Ratio")
            plt.legend()
            plt.tight_layout()
            plt.savefig(os.path.join(plot_directory, f"{sample_name}_{chromosome}_{context_type}_smoothing_comparison.png"))
        except Exception as e:
            logging.error(f"Error plotting file {filename}: {e}")
        finally:
            plt.close()

# Create and save results DataFrame
results_df = pd.DataFrame(results)
results_df.to_csv("/shares/grossniklaus.botinst.uzh/eharputluoglu/calculation_output/methylation_analysis_results_Smooting_all.csv", index=False)


### OLD: the cell below appears to be an earlier, single-method (gaussian only) version of this pipeline


In [None]:


def read_bismark_file(filename):
    """Load a gzip-compressed, tab-separated, header-less coverage file.

    The six columns are named chr, start, end, coverage, methylated and
    unmethylated; every column except 'chr' is passed through
    ``pd.to_numeric``.

    Args:
        filename (str): Path to the gzip-compressed file.

    Returns:
        pandas.DataFrame: One row per input line with the six named columns.
    """
    header = ["chr", "start", "end", "coverage", "methylated", "unmethylated"]
    table = pd.read_csv(filename, sep='\t', header=None, names=header, compression='gzip')

    # Coerce every column after 'chr' to a numeric dtype.
    for numeric_column in header[1:]:
        table[numeric_column] = pd.to_numeric(table[numeric_column])

    return table

def clean_data(results_df):
    """Strip the literal suffix '.bedgraph.gz' from the 'Sample' column.

    The column is converted to strings first. The DataFrame is modified in
    place and the same object is returned.
    """
    sample_labels = results_df['Sample'].astype(str)
    results_df['Sample'] = sample_labels.str.replace('.bedgraph.gz', '', regex=False)
    return results_df


# Data processing pipeline (single smoothing method: gaussian, window_size=3)
data_directory = "/shares/grossniklaus.botinst.uzh/dkt/projects/meth1000/analysis/09_split_cov_chr/output"
results = []

# Collect the names of all sub-directories of the data directory
sample_directories = []
for entry in os.listdir(data_directory):
    if os.path.isdir(os.path.join(data_directory, entry)):
        sample_directories.append(entry)

# Walk every sample directory and every per-chromosome coverage file in it
for sample_dir in sample_directories:
    sample_path = os.path.join(data_directory, sample_dir)

    logging.info(f"Processing sample directory: {sample_path}")

    coverage_pattern = os.path.join(sample_path, "*_chr_*.cov.gz")
    for file_path in glob.glob(coverage_pattern):
        try:
            # File names are split on "_": first part -> context, second and
            # third -> sample name, last part (before the first ".") -> chromosome.
            filename = os.path.basename(file_path)
            parts = filename.split("_")

            context_type = parts[0]
            sample_name = "_".join((parts[1], parts[2].split(".")[0]))
            chromosome = parts[-1].split(".")[0]

            df = read_bismark_file(file_path)
            divergences = calculate_divergences_and_ks(df, smoothing_method="gaussian", window_size=3)

            # One row per file: identifiers first, then the six metrics in
            # the order the function returns them.
            record = {
                "Sample": sample_name,
                "Chromosome": chromosome,
                "Context": context_type,
            }
            metric_labels = (
                "JS Divergence",
                "KL Divergence",
                "GJS Divergence",
                "SGJS Divergence",
                "KS Statistic",
                "KS P-value",
            )
            for position, label in enumerate(metric_labels):
                record[label] = divergences[position]
            results.append(record)

        except Exception as e:
            logging.error(f"Error processing file {filename}: {e}")

# Create a DataFrame from results
results_df = pd.DataFrame(results)

# Sample-name clean-up is currently disabled
#results_df = clean_data(results_df)

# Save the results to a CSV file
results_df.to_csv("/shares/grossniklaus.botinst.uzh/eharputluoglu/calculation_output/methylation_analysis_results_Smooting_1e-5.csv", index=False)

In [4]:
# Show the results DataFrame (a bare last expression is rendered by the notebook).
results_df

In [24]:
# Load a single coverage file into `df` for manual inspection.
column_names = ["chr", "start", "end", "coverage", "methylated", "unmethylated"]
coverage_file = "/shares/grossniklaus.botinst.uzh/eharputluoglu/test_run_datasets/SRX1664464_se/CHG_SRX1664464_se_chr2.bismark.cov.gz"
df = pd.read_csv(coverage_file, sep='\t', header=None, names=column_names, compression='gzip')

# Coerce every column after 'chr' to a numeric dtype
for numeric_column in column_names[1:]:
    df[numeric_column] = pd.to_numeric(df[numeric_column])

In [25]:
# Show the coverage table loaded in the previous cell (the recorded output below is truncated with "...").
df

Unnamed: 0,chr,start,end,coverage,methylated,unmethylated
0,2,1013,1014,0.0,0,1
1,2,1034,1035,0.0,0,2
2,2,1084,1085,0.0,0,2
3,2,1087,1088,0.0,0,1
4,2,1108,1109,0.0,0,1
...,...,...,...,...,...,...
849857,2,19697521,19697522,0.0,0,4
849858,2,19697523,19697524,0.0,0,8
849859,2,19697536,19697537,0.0,0,3
849860,2,19697538,19697539,0.0,0,5
