# In [None]:
import json
import os
import zipfile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.spatial.distance import jensenshannon
from scipy.special import rel_entr
from scipy.stats import entropy, ks_2samp
from sklearn.mixture import GaussianMixture
from statsmodels.nonparametric.smoothers_lowess import lowess

def kl_divergence(p, q):
    """Return the Kullback-Leibler divergence of ``p`` from ``q``.

    The value is the sum of the element-wise relative entropy terms computed
    by ``scipy.special.rel_entr``. The inputs are used as given; no
    normalisation is applied here.

    Args:
        p: Array-like of probabilities.
        q: Array-like of probabilities, same shape as ``p``.

    Returns:
        The summed relative entropy as a NumPy scalar.
    """
    pointwise_terms = rel_entr(p, q)
    return np.sum(pointwise_terms)

def geometric_jsd(p, q):
    """Return the square root of SciPy's Jensen-Shannon distance between p and q.

    ``scipy.spatial.distance.jensenshannon`` is documented as returning the
    Jensen-Shannon *distance*, i.e. the square root of the divergence, so the
    value returned here is the fourth root of the divergence.

    NOTE(review): the function name suggests a "geometric" Jensen-Shannon
    divergence; confirm that a second square root is really what is wanted.

    Args:
        p: Array-like probability vector.
        q: Array-like probability vector, same length as ``p``.

    Returns:
        ``sqrt(jensenshannon(p, q))`` as a NumPy scalar.
    """
    js_distance = jensenshannon(p, q)
    return np.sqrt(js_distance)

def ks_test(p, q):
    """Return the two-sample Kolmogorov-Smirnov statistic for ``p`` and ``q``.

    Only the test statistic from ``scipy.stats.ks_2samp`` is returned; the
    p-value is discarded.

    Args:
        p: First sample (array-like).
        q: Second sample (array-like).

    Returns:
        The KS statistic.
    """
    outcome = ks_2samp(p, q)
    return outcome.statistic

def moving_average(data, window_size=3):
    """Average ``data`` over a sliding window using a uniform kernel.

    The convolution runs in ``'valid'`` mode, so only fully overlapping
    positions are produced: for ``len(data) >= window_size`` the result has
    ``len(data) - window_size + 1`` elements.

    NOTE(review): when ``data`` is shorter than the window, ``np.convolve``
    swaps its operands, so the output still has
    ``window_size - len(data) + 1`` elements but is no longer a windowed mean
    of ``data``.

    Args:
        data: 1-D array-like of values.
        window_size: Number of consecutive elements in each average.

    Returns:
        1-D NumPy array of averaged values.
    """
    uniform_kernel = np.full(window_size, 1.0 / window_size)
    return np.convolve(data, uniform_kernel, mode='valid')

def gaussian_smoothing(data, sigma=1):
    """Map each element to a normal density centred on the mean of ``data``.

    Despite the name, neighbouring values are not blended: every element
    ``x`` is replaced by the normal pdf evaluated at ``x``, using
    ``np.mean(data)`` as the mean and ``sigma`` as the standard deviation.

    Args:
        data: Array of values.
        sigma: Standard deviation of the normal density.

    Returns:
        Array of density values, one per input element.
    """
    deviation = data - np.mean(data)
    exponent = -np.square(deviation) / (2 * sigma**2)
    normaliser = sigma * np.sqrt(2 * np.pi)
    return np.exp(exponent) / normaliser

def laplace_smoothing(data, alpha=1):
    """Apply additive (Laplace) smoothing and normalise the result.

    ``alpha`` is added to every count, and the result is divided by the
    original total plus ``alpha`` for each element, so the output sums to 1.

    Args:
        data: Array of counts.
        alpha: Pseudo-count added to each element.

    Returns:
        Array of smoothed proportions, same length as ``data``.
    """
    padded_counts = data + alpha
    padded_total = np.sum(data) + alpha * len(data)
    return padded_counts / padded_total

def good_turing_smoothing(data):
    """Re-weight the occurrence counts of the distinct values in ``data``.

    For a value seen ``c`` times, the adjusted count is
    ``(c + 1) * N[c + 1] / N[c]``, where ``N[k]`` is the number of distinct
    values seen exactly ``k`` times. When ``c + 1`` is beyond the largest
    observed count, the raw count ``c`` is kept. The adjusted counts are then
    normalised to sum to 1.

    The output has one entry per *distinct* value in ``data`` (in the sorted
    order produced by ``np.unique``), not one per input element.

    Args:
        data: Array-like of observations.

    Returns:
        1-D array of normalised adjusted counts.
    """
    _, counts = np.unique(data, return_counts=True)
    count_histogram = np.bincount(counts)
    histogram_size = len(count_histogram)

    adjusted = []
    for c in counts:
        successor = c + 1
        if successor < histogram_size:
            ratio = count_histogram[successor] / count_histogram[c]
            adjusted.append(successor * ratio)
        else:
            adjusted.append(c)

    adjusted = np.array(adjusted)
    return adjusted / np.sum(adjusted)

def lowess_smoothing(data, frac=0.3):
    """Smooth ``data`` with statsmodels' LOWESS against its positional index.

    The x-coordinates are ``0 .. len(data) - 1``. Column 1 of the array
    returned by ``lowess`` is handed back (presumably the fitted values; see
    the statsmodels documentation).

    Args:
        data: 1-D sequence of values to smooth.
        frac: Forwarded to ``lowess`` as its ``frac`` argument.

    Returns:
        1-D array taken from column 1 of the LOWESS output.
    """
    positions = np.arange(len(data))
    fitted = lowess(data, positions, frac=frac)
    return fitted[:, 1]

def hmm_smoothing(data, n_components=2):
    """Return each value's membership probability for mixture component 1.

    Despite the name, no hidden Markov model is involved: a
    ``GaussianMixture`` is fitted to ``data`` (treated as a single feature
    column), and column 1 of ``predict_proba`` is returned.

    NOTE(review): no ``random_state`` is passed to the mixture, so which
    component ends up at index 1 may differ between runs; confirm whether
    that matters. Indexing column 1 also needs ``n_components >= 2``.

    Args:
        data: 1-D NumPy array of values.
        n_components: Number of mixture components to fit.

    Returns:
        1-D array with one probability per input value.
    """
    feature_column = data.reshape(-1, 1)
    mixture = GaussianMixture(n_components=n_components)
    mixture.fit(feature_column)
    membership = mixture.predict_proba(feature_column)
    return membership[:, 1]

def apply_smoothing(data, method):
    """Run the smoothing function selected by ``method`` on ``data``.

    Each smoother is called with its default parameters.

    Args:
        data: Array of values passed unchanged to the chosen smoother.
        method: One of ``'moving_average'``, ``'gaussian'``, ``'laplace'``,
            ``'good_turing'``, ``'lowess'`` or ``'hmm'``.

    Returns:
        Whatever the selected smoothing function returns.

    Raises:
        ValueError: If ``method`` is not one of the names above.
    """
    smoothers = (
        ('moving_average', moving_average),
        ('gaussian', gaussian_smoothing),
        ('laplace', laplace_smoothing),
        ('good_turing', good_turing_smoothing),
        ('lowess', lowess_smoothing),
        ('hmm', hmm_smoothing),
    )
    for name, smoother in smoothers:
        if method == name:
            return smoother(data)
    raise ValueError(f"Unknown smoothing method: {method}")

def _divergence_metrics(p, q, smoothing):
    """Smooth and normalise two count vectors, then score them against each other.

    Shared by both branches of ``divergence_calculations`` so the per-row and
    whole-dataset paths compute exactly the same metrics.

    Args:
        p: 1-D float array of counts.
        q: 1-D float array of counts compared against ``p``.
        smoothing: Method name forwarded to ``apply_smoothing``.

    Returns:
        dict with the keys ``entropy``, ``relative_entropy``, ``jsd``,
        ``geometric_jsd`` and ``kolmogorov_smirnov``.
    """
    p = apply_smoothing(p, smoothing)
    q = apply_smoothing(q, smoothing)

    # Rescale so each vector sums to 1. Plain division (instead of ``/=``)
    # also works should a smoother ever hand back an integer array.
    p = p / p.sum()
    q = q / q.sum()

    return {
        'entropy': entropy(p),
        'relative_entropy': kl_divergence(p, q),  # KL divergence is relative entropy
        # SciPy documents jensenshannon() as returning the Jensen-Shannon
        # distance (square root of the divergence); stored as returned.
        'jsd': jensenshannon(p, q),
        'geometric_jsd': geometric_jsd(p, q),
        'kolmogorov_smirnov': ks_test(p, q),
    }


def divergence_calculations(data, each_row=True, smoothing='laplace'):
    """Compute entropy and divergence metrics from methylation counts.

    ``p`` is built from the (methylated, unmethylated) counts and ``q`` from
    the same counts in swapped order. Both are smoothed and normalised before
    the metrics are computed.

    Args:
        data: DataFrame with ``'methylated'`` and ``'unmethylated'`` columns.
        each_row: If true, compute the metrics separately for every row and
            store them in ``results.zip``; otherwise compute one set of
            metrics over the flattened columns of the whole frame.
        smoothing: Name of the smoothing method, forwarded to
            ``apply_smoothing``.

    Returns:
        With ``each_row`` true: a list with one dict per row, holding the
        row's ``index`` plus the metric keys. The same list is written as
        ``results.json`` inside ``results.zip`` in the current directory.
        (Earlier versions returned ``None`` here.)
        With ``each_row`` false: a single dict of metrics; nothing is
        written to disk.

    Raises:
        ValueError: If ``smoothing`` is not a known method name.
    """
    if not each_row:
        # Calculate the statistics for the entire dataset
        p = data[['methylated', 'unmethylated']].values.flatten().astype(np.float64)
        q = data[['unmethylated', 'methylated']].values.flatten().astype(np.float64)
        return _divergence_metrics(p, q, smoothing)

    results = []
    for index, row in data.iterrows():
        p = np.array([row['methylated'], row['unmethylated']], dtype=np.float64)
        q = np.array([row['unmethylated'], row['methylated']], dtype=np.float64)
        results.append({'index': index, **_divergence_metrics(p, q, smoothing)})

    # Write the JSON straight into the archive. This avoids the temporary
    # results.json on disk, which could be left behind on failure and would
    # overwrite (then delete) an unrelated file of the same name.
    with zipfile.ZipFile('results.zip', 'w') as zipf:
        zipf.writestr('results.json', json.dumps(results))

    return results

# Sample data: eleven single-base sites, all with chr = 1.
# 'start' and 'end' hold the same coordinate for every site, and 'percentage'
# matches 100 * methylated / (methylated + unmethylated) to two decimals.
_positions = [
    5694710, 5694712, 5694725, 5694727, 5694740, 5694742,
    5694743, 5694745, 5694751, 5694753, 5694776,
]
data = pd.DataFrame({
    'chr': [1] * len(_positions),
    'start': list(_positions),
    'end': list(_positions),
    'percentage': [
        33.33, 0, 0, 66.67, 33.33, 33.33, 0, 66.67, 33.33, 0, 44.44,
    ],
    'methylated': [1, 0, 0, 2, 1, 1, 0, 2, 1, 0, 4],
    'unmethylated': [2, 2, 2, 1, 2, 2, 3, 1, 2, 4, 5],
})

# Run the per-row analysis once for every smoothing method and collect the
# resulting metric records, keyed by method name.
smoothing_methods = ['moving_average', 'gaussian', 'laplace', 'good_turing', 'lowess', 'hmm']
results = {}

for method in smoothing_methods:
    print(f"Testing method: {method}")
    divergence_calculations(data, each_row=True, smoothing=method)
    # The per-row results end up inside results.zip and results.json is not
    # left on disk, so load the archive member instead of a loose file.
    with zipfile.ZipFile('results.zip') as archive:
        results[method] = json.loads(archive.read('results.json'))

# Plot the results: one figure per metric, with one line per smoothing method
# drawn against the position of each record in that method's result list.
metrics = ['entropy', 'relative_entropy', 'jsd', 'geometric_jsd', 'kolmogorov_smirnov']
for metric in metrics:
    plt.figure(figsize=(12, 8))
    for method in smoothing_methods:
        records = results[method]
        values = [record[metric] for record in records]
        positions = range(len(values))
        plt.plot(positions, values, label=method)
    plt.title(f'{metric} for different smoothing methods')
    plt.xlabel('Index')
    plt.ylabel(metric)
    plt.legend()
    plt.show()
