In [17]:
import pandas as pd
import numpy as np
import glob
import os
import logging
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.spatial.distance import jensenshannon
from scipy.stats import entropy, ks_2samp, f_oneway
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


def good_turing_smooth(counts):
    """Applies Good-Turing smoothing to a list of counts.

    Args:
        counts (list or Counter): List of observed counts of items.

    Returns:
        dict: Dictionary mapping original counts to smoothed counts.
    """
    
    # Convert to Counter if it's a list
    if isinstance(counts, list):
        counts = Counter(counts)
    
    n1 = sum(1 for count in counts.values() if count == 1)  # Items seen once
    n = sum(counts.values())  # Total items

    p_zero = n1 / n  # Good-Turing estimate for zero count

    smoothed_counts = {}
    for count in counts:
        if count == 0:
            smoothed_counts[count] = p_zero
        else:
            n_next = sum(1 for c in counts.values() if c == count + 1)
            smoothed_counts[count] = (count + 1) * n_next / n

    return smoothed_counts



def calculate_divergences_and_ks(df):

    # Drop rows where both methylated and unmethylated are zero 
    df = df[(df['methylated'] != 0) | (df['unmethylated'] != 0)].copy()

    if len(df) == 0:
        return None, None, None, None, None, None
    
    #df["methylated"] += smoothing
    #df["unmethylated"] += smoothing
    
    # Calculate probabilities
    total = df["methylated"] + df["unmethylated"]
    p = df["methylated"] / total
    q = df["unmethylated"] / total
    
    # Divergence calculations (with normalizations)
    normalized_js_divergence = float(jensenshannon(p, q, base=2))  # JSD

    normalized_kl_divergence = float(entropy(p, q, base=2) - entropy(p, base=2))  # KLD

    # Geometric Jensen-Shannon Divergence
    normalized_gjs_divergence = float(np.sqrt(
        0.5 * (np.sqrt(entropy(p, q, base=2)) + np.sqrt(entropy(q, p, base=2)))
    ) / np.sqrt(np.log(2)))
    
    # Symmetric Geometric Jensen-Shannon Divergence
    m = (p + q) / 2
    normalized_sgjs_divergence = float( 0.5 * (
        np.sqrt(jensenshannon(p, m, base=2)) + np.sqrt(jensenshannon(q, m, base=2))
    ) / np.sqrt(np.log(2)))

    # Kolmogorov-Smirnov Test
    ks_statistic, ks_pvalue = ks_2samp(df["methylated"], df["unmethylated"])  # KST

    return normalized_js_divergence, normalized_kl_divergence, normalized_gjs_divergence, normalized_sgjs_divergence, ks_statistic, ks_pvalue
    

def read_bismark_file(filename):
    """Reads Bismark methylation data from a file."""
    column_names = ["chr", "start", "end", "coverage", "methylated", "unmethylated"]
    df = pd.read_csv(filename, sep='\t', header=None, names=column_names, compression='gzip')

    # Convert columns to correct data types
    df['start'] = pd.to_numeric(df['start'])
    df['end'] = pd.to_numeric(df['end'])
    df['coverage'] = pd.to_numeric(df['coverage'])
    df['methylated'] = pd.to_numeric(df['methylated'])
    df['unmethylated'] = pd.to_numeric(df['unmethylated'])

    return df

def clean_data(results_df):
    """Cleans the results DataFrame by removing file extensions from sample names."""
    results_df['Sample'] = results_df['Sample'].astype(str).str.replace('.bedgraph.gz', '', regex=False)
    return results_df



data_directory = "/shares/grossniklaus.botinst.uzh/eharputluoglu/test_run_datasets"
results_file_path = "/shares/grossniklaus.botinst.uzh/eharputluoglu/calculation_output/methylation_analysis_results.csv"  # Choose an appropriate output location



results = []

# Get all sample subdirectories
sample_directories = [d for d in os.listdir(data_directory) if os.path.isdir(os.path.join(data_directory, d))]

# Iterate through files
for sample_dir in sample_directories:
    sample_path = os.path.join(data_directory, sample_dir)

    logging.info(f"Processing sample directory: {sample_path}")

    # Iterate through files in each sample directory
    for file_path in glob.glob(os.path.join(sample_path, "*.bismark.cov.gz")):
        try:
            filename = os.path.basename(file_path)
            parts = filename.split("_")

            context_type = parts[0]
            sample_name = parts[1] + "_" + parts[2].split(".")[0] 
            chromosome = parts[-1].split(".")[0]

            #logging.info(f"Processing file: {filename}")



            df = read_bismark_file(file_path)
            divergences = calculate_divergences_and_ks(df)
    
            results.append(
                {
                    "Sample": sample_name,
                    "Chromosome": chromosome,
                    "Context": context_type,
                    "JS Divergence": divergences[0],  
                    "KL Divergence": divergences[1],
                    "GJS Divergence": divergences[2],
                    "SGJS Divergence": divergences[3],
                    "KS Statistic": divergences[4],
                    "KS P-value": divergences[5]
                }
            )
    
        except Exception as e:
            logging.error(f"Error processing file {filename}: {e}")

# Create a DataFrame from results
results_df = pd.DataFrame(results)


2024-06-11 15:57:27,427 - INFO - Processing sample directory: /shares/grossniklaus.botinst.uzh/eharputluoglu/test_run_datasets/SRX1664464_se
2024-06-11 15:57:27,438 - ERROR - Error processing file CHG_SRX1664464_se_chrPt.bismark.cov.gz: name 'p' is not defined
2024-06-11 15:57:28,435 - ERROR - Error processing file CHG_SRX1664464_se_chr2.bismark.cov.gz: name 'p' is not defined


KeyboardInterrupt: 

In [4]:
results_df

In [24]:
column_names = ["chr", "start", "end", "coverage", "methylated", "unmethylated"]
df = pd.read_csv("/shares/grossniklaus.botinst.uzh/eharputluoglu/test_run_datasets/SRX1664464_se/CHG_SRX1664464_se_chr2.bismark.cov.gz", sep='\t', header=None, names=column_names, compression='gzip')

# Convert columns to correct data types
df['start'] = pd.to_numeric(df['start'])
df['end'] = pd.to_numeric(df['end'])
df['coverage'] = pd.to_numeric(df['coverage'])
df['methylated'] = pd.to_numeric(df['methylated'])
df['unmethylated'] = pd.to_numeric(df['unmethylated'])

In [25]:
df

Unnamed: 0,chr,start,end,coverage,methylated,unmethylated
0,2,1013,1014,0.0,0,1
1,2,1034,1035,0.0,0,2
2,2,1084,1085,0.0,0,2
3,2,1087,1088,0.0,0,1
4,2,1108,1109,0.0,0,1
...,...,...,...,...,...,...
849857,2,19697521,19697522,0.0,0,4
849858,2,19697523,19697524,0.0,0,8
849859,2,19697536,19697537,0.0,0,3
849860,2,19697538,19697539,0.0,0,5


In [49]:
from collections import Counter

def smooth_good_turing(counts_list):

    counts_counter = Counter(counts_list)
    Nr = Counter(counts_counter.values())

    smoothed_counts = {}
    for r, N_r in Nr.items():
        if r + 1 in Nr:
            N_r_plus_1 = Nr[r + 1]
            smoothed_counts[r] = (r + 1) * N_r_plus_1 / N_r
        else:
            if r == max(Nr):
                smoothed_counts[r] = (r * N_r) / (N_r - Nr.get(r - 1, 0))  
            else:
                smoothed_counts[r] = r

    smoothed_list = [smoothed_counts.get(x, x) for x in counts_list]

    return smoothed_list

# Example usage
data = list(df["methylated"])
smoothed_data = smooth_good_turing(data)

data2 = list(df["unmethylated"])
smoothed_data2 = smooth_good_turing(data2)



In [50]:
print(f"Original data: {data[:100]}")
print(f"Smoothed data: {smoothed_data[:100]}")

Original data: [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 1, 12, 2, 5, 42, 8, 52, 7, 20, 45, 9, 9, 0, 37, 6, 5, 18, 0, 8, 0, 8, 0, 0, 0, 0, 0, 1, 6, 10, 3, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 2, 2, 0, 0, 0, 0, 1, 0, 0, 0, 0, 46, 95, 138, 122, 3, 58, 152, 148, 290, 69, 117, 71, 71, 27, 69, 66, 23, 0, 0, 1, 1, 1, 9, 4, 23, 25, 7, 23, 60, 57]
Smoothed data: [0, 0, 0, 0, 0, 0, 0, 0, 0.35, 0, 1.7142857142857142, 0.35, 12, 1.7142857142857142, 3.0, 42, 9.0, 52, 1.6, 20, 45, 9, 9, 0, 37, 35.0, 3.0, 18, 0, 9.0, 0, 9.0, 0, 0, 0, 0, 0, 0.35, 35.0, 10, 2.0, 1.7142857142857142, 0, 0, 0.35, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.35, 0, 0.35, 0, 1.7142857142857142, 1.7142857142857142, 0, 0, 0, 0, 0.35, 0, 0, 0, 0, 46, 95, 138, 122, 2.0, 58, 152, 148, 290, 69, 117, 71, 71, 27, 69, 66, 23, 0, 0, 0.35, 0.35, 0.35, 9, 5.0, 23, 25, 1.6, 23, 60, 57]


In [51]:
print(f"Original data: {data2[:100]}")
print(f"Smoothed data: {smoothed_data2[:100]}")

Original data: [1, 2, 2, 1, 1, 1, 1, 2, 8, 12, 8, 7, 501, 209, 622, 1748, 660, 2489, 893, 2772, 3490, 903, 938, 973, 3766, 833, 419, 1748, 130, 862, 71, 606, 1, 1, 1, 1, 1, 0, 6, 5, 9, 14, 12, 24, 22, 13, 1, 1, 1, 6, 1, 5, 1, 1, 1, 0, 1, 0, 2, 1, 0, 3, 3, 2, 1, 0, 1, 1, 1, 1, 404, 761, 1730, 1962, 18, 3103, 2954, 3243, 3010, 2951, 2345, 1804, 1792, 1702, 1180, 846, 428, 1, 1, 0, 0, 0, 68, 154, 240, 1177, 852, 1472, 1076, 1193]
Smoothed data: [0.4470588235294118, 1.5789473684210527, 1.5789473684210527, 0.4470588235294118, 0.4470588235294118, 0.4470588235294118, 0.4470588235294118, 1.5789473684210527, 8, 6.5, 8, 8.0, 501, 209, 622, 1748, 660, 2489, 893, 2772, 3490, 903, 938, 973, 3766, 833, 419, 1748, 130, 862, 71, 606, 0.4470588235294118, 0.4470588235294118, 0.4470588235294118, 0.4470588235294118, 0.4470588235294118, 0, 7.0, 2.5, 9, 15.0, 6.5, 8.333333333333334, 23.0, 56.0, 0.4470588235294118, 0.4470588235294118, 0.4470588235294118, 7.0, 0.4470588235294118, 2.5, 0.4470588235294118, 0.44

In [52]:
p_data = [x / y for x, y in zip(data, data2) if y != 0]
p_data_2 = [x / y for x, y in zip(smoothed_data, smoothed_data2) if y != 0] 



In [53]:
p_data_2

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.04375,
 0.0,
 0.21428571428571427,
 0.04375,
 0.023952095808383235,
 0.008202323991797676,
 0.00482315112540193,
 0.02402745995423341,
 0.013636363636363636,
 0.020891924467657693,
 0.0017917133258678612,
 0.007215007215007215,
 0.012893982808022923,
 0.009966777408637873,
 0.009594882729211088,
 0.0,
 0.009824747742963356,
 0.04201680672268908,
 0.007159904534606206,
 0.010297482837528604,
 0.0,
 0.010440835266821345,
 0.0,
 0.01485148514851485,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 5.0,
 4.0,
 0.2222222222222222,
 0.11428571428571428,
 0.0,
 0.0,
 0.015217391304347825,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 3.834586466165413,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.11386138613861387,
 0.12483574244415244,
 0.07976878612716763,
 0.06218144750254842,
 0.1111111111111111,
 0.018691588785046728,
 0.05145565335138795,
 0.04563675609004009,
 0.09634551495016612,
 0.02338190443917316,
 0.049893390191897

In [54]:
p_data

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.125,
 0.0,
 0.25,
 0.14285714285714285,
 0.023952095808383235,
 0.009569377990430622,
 0.008038585209003215,
 0.02402745995423341,
 0.012121212121212121,
 0.020891924467657693,
 0.007838745800671893,
 0.007215007215007215,
 0.012893982808022923,
 0.009966777408637873,
 0.009594882729211088,
 0.0,
 0.009824747742963356,
 0.007202881152460984,
 0.011933174224343675,
 0.010297482837528604,
 0.0,
 0.009280742459396751,
 0.0,
 0.013201320132013201,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 2.0,
 0.3333333333333333,
 0.14285714285714285,
 0.0,
 0.0,
 0.045454545454545456,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 2.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.11386138613861387,
 0.12483574244415244,
 0.07976878612716763,
 0.06218144750254842,
 0.16666666666666666,
 0.018691588785046728,
 0.05145565335138795,
 0.04563675609004009,
 0.09634551495016612,
 0.02338190443917316,
 0.04989339019189765,
 0.039356984

In [56]:
def additive_smoothing(counts_list, alpha=1):

    num_categories = len(set(counts_list))  

    smoothed_list = [(count + alpha) / (sum(counts_list) + alpha * num_categories) 
                     for count in counts_list]

    return smoothed_list

In [57]:
data = list(df["methylated"])
smoothed_add_data = additive_smoothing(data)


In [58]:
data

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 2,
 1,
 12,
 2,
 5,
 42,
 8,
 52,
 7,
 20,
 45,
 9,
 9,
 0,
 37,
 6,
 5,
 18,
 0,
 8,
 0,
 8,
 0,
 0,
 0,
 0,
 0,
 1,
 6,
 10,
 3,
 2,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 2,
 2,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 46,
 95,
 138,
 122,
 3,
 58,
 152,
 148,
 290,
 69,
 117,
 71,
 71,
 27,
 69,
 66,
 23,
 0,
 0,
 1,
 1,
 1,
 9,
 4,
 23,
 25,
 7,
 23,
 60,
 57,
 29,
 84,
 61,
 146,
 199,
 133,
 40,
 106,
 7,
 125,
 48,
 126,
 149,
 146,
 100,
 254,
 167,
 162,
 163,
 54,
 68,
 1,
 19,
 25,
 7,
 35,
 4,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 12,
 3,
 17,
 7,
 10,
 6,
 1,
 8,
 3,
 6,
 1,
 0,
 4,
 2,
 1,
 0,
 3,
 3,
 5,
 1,
 2,
 2,
 3,
 1,
 2,
 3,
 0,
 0,
 3,
 0,
 1,
 0,
 1,
 1,
 3,
 1,
 4,
 1,
 1,
 0,
 4,
 1,
 5,
 1,
 1,
 2,
 4,
 2,
 3,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 2,
 1,
 2,
 0,
 1,
 1,
 3,
 5,
 4,
 3,
 6,
 1,
 2,
 5,
 5,
 2,
 3,
 2,
 1,
 2,
 3,
 0

In [59]:
smoothed_add_data

[1.7380694567316299e-06,
 1.7380694567316299e-06,
 1.7380694567316299e-06,
 1.7380694567316299e-06,
 1.7380694567316299e-06,
 1.7380694567316299e-06,
 1.7380694567316299e-06,
 1.7380694567316299e-06,
 3.4761389134632598e-06,
 1.7380694567316299e-06,
 5.21420837019489e-06,
 3.4761389134632598e-06,
 2.259490293751119e-05,
 5.21420837019489e-06,
 1.042841674038978e-05,
 7.473698663946008e-05,
 1.5642625110584668e-05,
 9.211768120677638e-05,
 1.3904555653853039e-05,
 3.649945859136423e-05,
 7.995119500965497e-05,
 1.73806945673163e-05,
 1.73806945673163e-05,
 1.7380694567316299e-06,
 6.604663935580194e-05,
 1.2166486197121409e-05,
 1.042841674038978e-05,
 3.302331967790097e-05,
 1.7380694567316299e-06,
 1.5642625110584668e-05,
 1.7380694567316299e-06,
 1.5642625110584668e-05,
 1.7380694567316299e-06,
 1.7380694567316299e-06,
 1.7380694567316299e-06,
 1.7380694567316299e-06,
 1.7380694567316299e-06,
 3.4761389134632598e-06,
 1.2166486197121409e-05,
 1.9118764024047928e-05,
 6.95227782692651