In [21]:
import pandas as pd
import numpy as np
from scipy.stats import entropy, ks_2samp
from scipy.special import rel_entr
import json
import zipfile
import os
import glob
import os
import logging
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.spatial.distance import jensenshannon
from scipy.stats import entropy, ks_2samp, f_oneway
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from scipy.special import kl_div

# Route INFO-and-above records to a log file; mode 'w' truncates the file each
# time logging is configured. basicConfig() does nothing when the root logger
# already has handlers, so re-running this cell in a live kernel will not
# reconfigure logging.
logging.basicConfig(
    filename='methylation_analysis_3.log',
    filemode='w',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

def divergence_calculations(data, each_row=True, smoothing=1e-10):
    """Compare methylated/unmethylated counts with their column-swapped copy.

    A distribution ``p`` is built from the ``methylated`` and ``unmethylated``
    columns and ``q`` from the same columns in swapped order. ``smoothing`` is
    added to every entry and both vectors are normalised to sum to 1 before
    the metrics are computed.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with ``methylated`` and ``unmethylated`` columns. It is not
        modified.
    each_row : bool, default True
        * ``True``  -- one comparison per row, ``p = [methylated, unmethylated]``.
          The results are additionally stored as ``results.json`` inside
          ``results.zip`` in the current working directory.
        * ``False`` -- a single comparison in which ``p`` is the whole table
          flattened row by row.
    smoothing : float, default 1e-10
        Constant added to every count before normalisation so that zero
        counts do not produce infinite/undefined terms.

    Returns
    -------
    list of dict or dict
        ``each_row=True``: a list with one dict per row (keys ``index``,
        ``entropy``, ``relative_entropy``, ``dkl_divergence``, ``jsd``,
        ``geometric_jsd``, ``kolmogorov_smirnov``). Earlier versions returned
        ``None`` here; the list is the same data that is written to the zip.
        ``each_row=False``: one dict with the same keys minus ``index``.

    Notes
    -----
    * ``jsd`` is the value returned by ``scipy.spatial.distance.jensenshannon``,
      which SciPy documents as the Jensen-Shannon *distance* (square root of
      the divergence). ``geometric_jsd`` is the square root of that value.
      NOTE(review): both are kept numerically unchanged for compatibility;
      confirm these are the intended definitions.
    * NOTE(review): ``q`` holds the same values as ``p`` in a different order,
      so the two-sample KS statistic is expected to be ~0 for every input.
    """

    def _compare(p_counts, q_counts):
        # Adding the smoothing constant creates fresh float64 arrays, so the
        # caller's data is never modified in place.
        p = np.asarray(p_counts, dtype=np.float64) + smoothing
        q = np.asarray(q_counts, dtype=np.float64) + smoothing
        p /= p.sum()
        q /= q.sum()

        kl = np.sum(rel_entr(p, q))   # KL divergence D(p || q)
        js = jensenshannon(p, q)      # Jensen-Shannon distance (see Notes)
        metrics = {
            'entropy': entropy(p),
            'relative_entropy': kl,   # relative entropy and KL are the same quantity
            'dkl_divergence': kl,
            'jsd': js,
            'geometric_jsd': np.sqrt(js),
            'kolmogorov_smirnov': ks_2samp(p, q).statistic,
        }
        return p, q, metrics

    if each_row:
        methylated = data['methylated'].to_numpy(dtype=np.float64)
        unmethylated = data['unmethylated'].to_numpy(dtype=np.float64)

        # zip over plain arrays avoids building a Series per row (iterrows).
        results = []
        for index, m, u in zip(data.index, methylated, unmethylated):
            _, _, metrics = _compare([m, u], [u, m])
            results.append({'index': index, **metrics})

        # Write the JSON straight into the archive: no scratch results.json is
        # created in (or deleted from) the working directory.
        with zipfile.ZipFile('results.zip', 'w') as zipf:
            zipf.writestr('results.json', json.dumps(results))

        return results

    # Whole-table comparison: flatten row by row, once in each column order.
    p, q, dataset_result = _compare(
        data[['methylated', 'unmethylated']].to_numpy(dtype=np.float64).ravel(),
        data[['unmethylated', 'methylated']].to_numpy(dtype=np.float64).ravel(),
    )

    logging.info(f"HERE IS THE P: {p}")
    logging.info(f"HERE IS THE Q: {q}")

    kl = dataset_result['dkl_divergence']
    js = dataset_result['jsd']
    gjs = dataset_result['geometric_jsd']
    ks = dataset_result['kolmogorov_smirnov']
    logging.info(f"Reult: {kl, js, gjs, ks}")

    return dataset_result


def read_bismark_file(filename):
    """Load a gzip-compressed, tab-separated, header-less table into a DataFrame.

    The six columns are labelled chr, start, end, coverage, methylated and
    unmethylated; every column except ``chr`` is passed through
    ``pd.to_numeric``.

    NOTE(review): in Bismark ``.cov`` output the fourth column is usually the
    methylation percentage rather than read coverage -- confirm the label.
    """
    numeric_fields = ["start", "end", "coverage", "methylated", "unmethylated"]

    frame = pd.read_csv(
        filename,
        sep='\t',
        header=None,
        names=["chr", *numeric_fields],
        compression='gzip',
    )

    # Coerce each numeric column in turn, in the same order as the file layout.
    for field in numeric_fields:
        frame[field] = pd.to_numeric(frame[field])

    return frame

def clean_data(results_df):
    """Remove every literal '.bedgraph.gz' occurrence from the 'Sample' column.

    Values are cast to ``str`` first. The DataFrame is updated in place and
    the same object is returned.
    """
    sample_labels = results_df['Sample'].astype(str)
    # regex=False: the dots are matched literally, not as wildcards.
    results_df['Sample'] = sample_labels.str.replace('.bedgraph.gz', '', regex=False)
    return results_df

In [20]:

# Data processing pipeline
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
data_directory = "/home/eharpu/methylation_analysis/samples_testing"
results = []

# To process every sample sub-directory instead of the single one below:
#sample_directories = [d for d in os.listdir(data_directory) if os.path.isdir(os.path.join(data_directory, d))]

# Iterate through sample directories
for sample_dir in ["CHH_SRX1664497"]:
    sample_path = os.path.join(data_directory, sample_dir)

    logging.info(f"Processing sample directory: {sample_path}")

    # Sorted so that the row order of the results table is reproducible.
    for file_path in sorted(glob.glob(os.path.join(sample_path, "*.cov.gz"))):
        # Resolved before the try block so the error handler can always name the file.
        filename = os.path.basename(file_path)
        try:
            # File names are split on "_": first part is the context, the next
            # two form the sample name, the last part (before ".") the chromosome.
            parts = filename.split("_")

            context_type = parts[0]
            sample_name = parts[1] + "_" + parts[2].split(".")[0]
            chromosome = parts[-1].split(".")[0]

            logging.info(f"Processing file: {filename}")

            df = read_bismark_file(file_path)
            # With each_row=False the function returns a dict keyed by metric
            # name, so the values must be looked up by key, not by position.
            divergences = divergence_calculations(df, False)

            logging.info(f"divergence calculated: {filename}")

            results.append(
                {
                    "Sample": sample_name,
                    "Chromosome": chromosome,
                    "Context": context_type,
                    "JS Divergence": divergences["jsd"],
                    "KL Divergence": divergences["dkl_divergence"],
                    "GJS Divergence": divergences["geometric_jsd"],
                    # NOTE(review): divergence_calculations produces neither an
                    # SGJS value nor a KS p-value; the columns are kept as NaN
                    # placeholders so the table layout stays as intended.
                    "SGJS Divergence": np.nan,
                    "KS Statistic": divergences["kolmogorov_smirnov"],
                    "KS P-value": np.nan,
                }
            )

        except Exception as e:
            # Best-effort: keep going with the remaining files, but record the
            # full traceback so failures are diagnosable from the log.
            logging.error(f"Error processing file {filename}: {e}", exc_info=True)

if not results:
    logging.warning("No files were processed successfully; results table is empty")

# Create a DataFrame from results
results_df = pd.DataFrame(results)


In [None]:
# Write the results table to test.csv in the working directory, without the index column.
results_df.to_csv("test.csv", index=False)


In [14]:
# Show the results table as this cell's output.
results_df