In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.cluster import KMeans
from scipy.stats import norm
from sklearn.mixture import GaussianMixture



In [None]:
# 1. Calculate average DNA methylation level
def average_methylation(methylation_data, start, end):
    """Return the mean 'methylation' of rows whose 'position' lies in [start, end].

    Parameters
    ----------
    methylation_data : pandas.DataFrame
        Must provide 'position' and 'methylation' columns.
    start, end : number
        Interval bounds; both are inclusive.

    Returns
    -------
    The mean of the selected 'methylation' values. When no row falls inside
    the interval this is the mean of an empty selection (NaN for numeric data).
    """
    positions = methylation_data['position']
    inside_interval = (start <= positions) & (positions <= end)
    return methylation_data.loc[inside_interval, 'methylation'].mean()



In [None]:
# 2. Identify differentially methylated regions
def identify_dmrs(condition1, condition2, window_size=1000, step_size=100, p_threshold=0.05):
    """Slide a window along the position axis and report windows whose
    methylation values differ between two conditions.

    For every window [start, start + window_size) an independent two-sample
    t-test (scipy.stats.ttest_ind) is run on the 'methylation' values of the
    two conditions. Windows with p_value < p_threshold are reported.
    No multiple-testing correction is applied to the p-values.

    Parameters
    ----------
    condition1, condition2 : pandas.DataFrame
        Each must provide 'position' and 'methylation' columns.
    window_size : int
        Width of each window (half-open interval).
    step_size : int
        Distance between consecutive window starts.
    p_threshold : float
        Windows are kept when their p-value is strictly below this value.

    Returns
    -------
    pandas.DataFrame
        Columns 'start', 'end', 't_statistic', 'p_value'; one row per
        reported window. Empty when nothing is reported.
    """
    columns = ['start', 'end', 't_statistic', 'p_value']

    # A window is only tested when both conditions contribute data, so an
    # empty input can never yield a result (and has no maximum position).
    if len(condition1) == 0 or len(condition2) == 0:
        return pd.DataFrame([], columns=columns)

    last_position = int(max(condition1['position'].max(), condition2['position'].max()))

    dmrs = []
    # "+ 1" makes the scan inclusive of the largest position: without it a
    # window starting exactly at last_position is never generated, and data
    # located there is lost whenever no earlier window reaches it.
    for start in range(0, last_position + 1, step_size):
        end = start + window_size
        region1 = condition1[(condition1['position'] >= start) & (condition1['position'] < end)]
        region2 = condition2[(condition2['position'] >= start) & (condition2['position'] < end)]

        if len(region1) == 0 or len(region2) == 0:
            continue
        # With one observation per group the t-test has zero degrees of
        # freedom and yields NaN, which can never pass the threshold.
        if len(region1) + len(region2) < 3:
            continue

        t_stat, p_value = stats.ttest_ind(region1['methylation'], region2['methylation'])
        if p_value < p_threshold:
            dmrs.append((start, end, t_stat, p_value))

    return pd.DataFrame(dmrs, columns=columns)



In [None]:
# 3. Visualize DNA methylation patterns
def plot_methylation_heatmap(methylation_data, genes, window=1000):
    """Draw a heatmap with one row of methylation values per gene.

    For each gene the rows of `methylation_data` whose 'position' lies in
    [gene['start'] - window, gene['end'] + window] (inclusive) are selected
    and their 'methylation' values become one heatmap row, in the order they
    appear in `methylation_data`. Columns are therefore the ordinal of the
    measurement inside the region, not a genomic coordinate.

    Regions usually contain different numbers of measurements; shorter rows
    are right-padded with NaN so the matrix is rectangular. seaborn leaves
    NaN cells blank.

    Parameters
    ----------
    methylation_data : pandas.DataFrame
        Must provide 'position' and 'methylation' columns.
    genes : iterable
        Items indexable by 'start' and 'end'.
    window : number
        Flank added on both sides of every gene.

    Returns
    -------
    None. The figure is shown with plt.show().
    """
    rows = []
    for gene in genes:
        start = gene['start'] - window
        end = gene['end'] + window
        region_data = methylation_data[(methylation_data['position'] >= start) &
                                       (methylation_data['position'] <= end)]
        rows.append(region_data['methylation'].to_numpy(dtype=float))

    # A list of unequal-length arrays cannot be turned into a 2-D matrix,
    # so pad every row with NaN up to the longest one.
    width = max((len(row) for row in rows), default=0)
    matrix = np.full((len(rows), width), np.nan)
    for row_index, row in enumerate(rows):
        matrix[row_index, :len(row)] = row

    plt.figure(figsize=(10, 8))
    sns.heatmap(matrix, cmap='coolwarm', center=0.5)
    plt.title('DNA Methylation Patterns')
    plt.xlabel('Position relative to gene')
    plt.ylabel('Genes')
    plt.show()



In [None]:
# 4. Classify CpG islands
def classify_cpg_islands(cpg_data, threshold_low=0.3, threshold_high=0.7):
    """Label every row of `cpg_data` by its 'mean_methylation' value.

    A value strictly below `threshold_low` is 'Hypomethylated', a value
    strictly above `threshold_high` is 'Hypermethylated', and anything else
    (including values equal to a threshold, and NaN, for which both
    comparisons are false) is 'Intermediate'.

    Returns a pandas.Series of labels sharing the index of `cpg_data`.
    """
    def label_for(level):
        # Guard-clause form of the three-way split described above.
        if level < threshold_low:
            return 'Hypomethylated'
        if level > threshold_high:
            return 'Hypermethylated'
        return 'Intermediate'

    labels = [label_for(row['mean_methylation']) for _, row in cpg_data.iterrows()]
    return pd.Series(labels, index=cpg_data.index)




In [None]:
# 5. Integrate methylation and gene expression data
def integrate_methylation_expression(methylation_data, expression_data, window=1000,
                                     gene_coordinates=None):
    """Pair each gene's expression value with the mean methylation around it.

    For every (gene, expression) item of `expression_data`, rows of
    `methylation_data` whose 'position' lies in
    [start - window, end + window] (inclusive) are averaged.

    Parameters
    ----------
    methylation_data : pandas.DataFrame
        Must provide 'position' and 'methylation' columns.
    expression_data : mapping
        Anything with .items() yielding (gene, expression) pairs.
    window : number
        Flank added on both sides of every gene.
    gene_coordinates : mapping, optional
        Maps each gene key of `expression_data` to an object indexable by
        'start' and 'end'. When omitted, the gene key itself must be
        indexable by 'start' and 'end' (the original behaviour); ordinary
        string gene names are not, which is why this parameter exists.

    Returns
    -------
    pandas.DataFrame
        Columns 'gene', 'mean_methylation', 'expression'. 'mean_methylation'
        is NaN for genes with no methylation rows in range.

    Raises
    ------
    TypeError
        If no 'start'/'end' coordinates can be read for a gene.
    KeyError
        If `gene_coordinates` is given but lacks a gene or a coordinate key.
    """
    integrated_data = []
    for gene, expr in expression_data.items():
        coordinates = gene if gene_coordinates is None else gene_coordinates[gene]
        try:
            start = coordinates['start']
            end = coordinates['end']
        except (TypeError, IndexError) as exc:
            # e.g. a plain string key: 'gene1'['start'] is not valid indexing.
            raise TypeError(
                "no 'start'/'end' coordinates available for gene %r; "
                "pass gene_coordinates={gene: {'start': ..., 'end': ...}}" % (gene,)
            ) from exc

        in_range = ((methylation_data['position'] >= start - window) &
                    (methylation_data['position'] <= end + window))
        mean_methylation = methylation_data.loc[in_range, 'methylation'].mean()
        integrated_data.append((gene, mean_methylation, expr))

    return pd.DataFrame(integrated_data, columns=['gene', 'mean_methylation', 'expression'])



In [None]:
# 6. Peak calling for ChIP-seq data
def call_peaks(chip_data, control_data, window_size=1000, fold_enrichment=4, p_threshold=1e-5):
    """Report fixed-size windows where ChIP counts are enriched over control.

    The position axis is cut into non-overlapping windows
    [start, start + window_size). In each window the 'counts' of both
    inputs are summed; the raw sums are compared directly (no scaling
    between the two inputs is performed here). A window is reported when

    * both sums are positive,
    * chip / control >= fold_enrichment, and
    * the Poisson upper-tail probability of seeing at least the ChIP sum
      under a rate equal to the control sum is <= p_threshold.

    Parameters
    ----------
    chip_data, control_data : pandas.DataFrame
        Each must provide 'position' and 'counts' columns.
    window_size : int
        Width of each window.
    fold_enrichment : number
        Minimum chip/control ratio.
    p_threshold : float
        Maximum p-value.

    Returns
    -------
    pandas.DataFrame
        Columns 'start', 'end', 'fold_change', 'p_value'.
    """
    columns = ['start', 'end', 'fold_change', 'p_value']

    # No ChIP rows means no window can have a positive ChIP sum.
    if len(chip_data) == 0:
        return pd.DataFrame([], columns=columns)

    last_position = int(chip_data['position'].max())

    peaks = []
    # "+ 1" keeps the window that starts exactly at the largest position;
    # a bare range(0, last_position, ...) never generates it.
    for start in range(0, last_position + 1, window_size):
        end = start + window_size
        chip_in_window = (chip_data['position'] >= start) & (chip_data['position'] < end)
        control_in_window = (control_data['position'] >= start) & (control_data['position'] < end)
        chip_counts = chip_data.loc[chip_in_window, 'counts'].sum()
        control_counts = control_data.loc[control_in_window, 'counts'].sum()

        # Windows without control signal are skipped: the ratio is undefined.
        if chip_counts > 0 and control_counts > 0:
            fold_change = chip_counts / control_counts
            # poisson.sf(k, mu) is P(X > k). The p-value must include the
            # observed count itself, i.e. P(X >= k) = sf(k - 1, mu).
            p_value = stats.poisson.sf(chip_counts - 1, control_counts)

            if fold_change >= fold_enrichment and p_value <= p_threshold:
                peaks.append((start, end, fold_change, p_value))

    return pd.DataFrame(peaks, columns=columns)



In [None]:
# 7. Calculate histone modification enrichment around TSS
def histone_enrichment_tss(histone_data, tss_positions, window=5000):
    """Average the 'signal' of `histone_data` around a set of TSS positions.

    For every TSS, rows with 'position' in [tss - window, tss + window]
    (inclusive) are accumulated into a profile of length 2 * window + 1,
    where index `window` corresponds to the TSS itself. The accumulated
    profile is divided by the number of TSS positions.

    Parameters
    ----------
    histone_data : pandas.DataFrame
        Must provide 'position' (integer-valued, as required by
        numpy.bincount) and 'signal' columns. It is not modified.
    tss_positions : sized iterable of int
        Centre positions.
    window : int
        Half-width of the profile.

    Returns
    -------
    numpy.ndarray
        Float array of length 2 * window + 1. All NaN when `tss_positions`
        is empty (the mean over zero sites is undefined).
    """
    profile_length = 2 * window + 1

    # Same result as 0 / 0 element-wise, but without the RuntimeWarning.
    if len(tss_positions) == 0:
        return np.full(profile_length, np.nan)

    positions = histone_data['position']
    enrichment = np.zeros(profile_length)
    for tss in tss_positions:
        in_window = (positions >= tss - window) & (positions <= tss + window)
        # Work on plain arrays rather than adding a column to a filtered
        # slice, which triggers pandas' SettingWithCopy machinery.
        relative_position = (positions[in_window] - tss + window).to_numpy()
        signal = histone_data.loc[in_window, 'signal'].to_numpy()
        # The filter guarantees 0 <= relative_position <= 2 * window, so the
        # bincount result has exactly profile_length entries.
        enrichment += np.bincount(relative_position,
                                  weights=signal,
                                  minlength=profile_length)
    return enrichment / len(tss_positions)



In [None]:
# 8. Identify bivalent chromatin domains
def identify_bivalent_domains(activating_marks, repressing_marks, window_size=5000):
    """Report fixed-size windows that carry both kinds of signal.

    The position axis is cut into non-overlapping windows
    [start, start + window_size). A window is reported when the summed
    'signal' of `activating_marks` and the summed 'signal' of
    `repressing_marks` inside it are both strictly positive.

    Parameters
    ----------
    activating_marks, repressing_marks : pandas.DataFrame
        Each must provide 'position' and 'signal' columns.
    window_size : int
        Width of each window.

    Returns
    -------
    pandas.DataFrame
        Columns 'start', 'end'; one row per reported window.
    """
    columns = ['start', 'end']

    # A window needs signal from both inputs, so an empty input means no hits
    # (and it has no maximum position to scan up to).
    if len(activating_marks) == 0 or len(repressing_marks) == 0:
        return pd.DataFrame([], columns=columns)

    last_position = int(max(activating_marks['position'].max(),
                            repressing_marks['position'].max()))

    bivalent_domains = []
    # "+ 1" keeps the window that starts exactly at the largest position;
    # a bare range(0, last_position, ...) never generates it.
    for start in range(0, last_position + 1, window_size):
        end = start + window_size
        activating = activating_marks[(activating_marks['position'] >= start) &
                                      (activating_marks['position'] < end)]
        repressing = repressing_marks[(repressing_marks['position'] >= start) &
                                      (repressing_marks['position'] < end)]

        if activating['signal'].sum() > 0 and repressing['signal'].sum() > 0:
            bivalent_domains.append((start, end))

    return pd.DataFrame(bivalent_domains, columns=columns)



In [None]:
# 9. Predict enhancer regions
def predict_enhancers(h3k4me1_data, h3k27ac_data, window_size=1000, threshold=0.8):
    """Report fixed-size windows where both signals are high on average.

    The position axis is cut into non-overlapping windows
    [start, start + window_size). A window is reported when the mean
    'signal' of `h3k4me1_data` and the mean 'signal' of `h3k27ac_data`
    inside it both strictly exceed `threshold`. A window with no rows in
    one of the inputs has a NaN mean there and is never reported.

    Parameters
    ----------
    h3k4me1_data, h3k27ac_data : pandas.DataFrame
        Each must provide 'position' and 'signal' columns.
    window_size : int
        Width of each window.
    threshold : number
        Minimum (exclusive) mean signal required in both inputs.

    Returns
    -------
    pandas.DataFrame
        Columns 'start', 'end'; one row per reported window.
    """
    columns = ['start', 'end']

    # Both inputs must contribute rows to a window, so an empty input means
    # no hits (and it has no maximum position to scan up to).
    if len(h3k4me1_data) == 0 or len(h3k27ac_data) == 0:
        return pd.DataFrame([], columns=columns)

    last_position = int(max(h3k4me1_data['position'].max(),
                            h3k27ac_data['position'].max()))

    enhancers = []
    # "+ 1" keeps the window that starts exactly at the largest position;
    # a bare range(0, last_position, ...) never generates it.
    for start in range(0, last_position + 1, window_size):
        end = start + window_size
        h3k4me1 = h3k4me1_data[(h3k4me1_data['position'] >= start) &
                               (h3k4me1_data['position'] < end)]
        h3k27ac = h3k27ac_data[(h3k27ac_data['position'] >= start) &
                               (h3k27ac_data['position'] < end)]

        if h3k4me1['signal'].mean() > threshold and h3k27ac['signal'].mean() > threshold:
            enhancers.append((start, end))

    return pd.DataFrame(enhancers, columns=columns)



In [None]:
# 10. Simulate DNA methylation data
def simulate_methylation_data(n_regions=1000, n_cpgs_per_region=20, seed=None, region_size=1000):
    """Generate random methylation measurements laid out in consecutive regions.

    Region i covers positions [i * region_size, (i + 1) * region_size).
    Inside each region `n_cpgs_per_region` distinct positions are drawn
    without replacement and sorted, and each receives a methylation level
    drawn from Beta(1, 1).

    Parameters
    ----------
    n_regions : int
        Number of consecutive regions.
    n_cpgs_per_region : int
        Number of positions drawn per region; must not exceed `region_size`.
    seed : int, optional
        When given, a private numpy RandomState seeded with this value is
        used, making the output reproducible. When None (default) the
        global numpy random state is used, exactly as before, so callers
        relying on np.random.seed(...) are unaffected.
    region_size : int
        Width of every region (default 1000).

    Returns
    -------
    pandas.DataFrame
        Columns 'position' and 'methylation', n_regions * n_cpgs_per_region rows.

    Raises
    ------
    ValueError
        If `n_cpgs_per_region` exceeds `region_size`.
    """
    columns = ['position', 'methylation']
    if n_cpgs_per_region > region_size:
        raise ValueError(
            "cannot draw %d distinct positions from a region of size %d"
            % (n_cpgs_per_region, region_size)
        )

    # np.random (module) and RandomState expose the same choice/beta API.
    rng = np.random if seed is None else np.random.RandomState(seed)

    position_chunks = []
    level_chunks = []
    for i in range(n_regions):
        start = i * region_size
        # Draw order (positions first, then levels) is kept so the random
        # stream matches the per-region sequence of the global-state path.
        candidates = np.arange(start, start + region_size)
        position_chunks.append(np.sort(rng.choice(candidates, n_cpgs_per_region, replace=False)))
        level_chunks.append(rng.beta(1, 1, n_cpgs_per_region))

    if not position_chunks:
        return pd.DataFrame([], columns=columns)

    return pd.DataFrame({'position': np.concatenate(position_chunks),
                         'methylation': np.concatenate(level_chunks)},
                        columns=columns)



In [None]:
# Example usage:
# methylation_data = simulate_methylation_data()
# avg_methylation = average_methylation(methylation_data, 1000, 2000)

# condition1 = simulate_methylation_data()
# condition2 = simulate_methylation_data()
# dmrs = identify_dmrs(condition1, condition2)

# genes = [{'start': 1000, 'end': 2000}, {'start': 3000, 'end': 4000}]
# plot_methylation_heatmap(methylation_data, genes)

# cpg_islands = pd.DataFrame({'mean_methylation': np.random.uniform(0, 1, 100)})
# cpg_classifications = classify_cpg_islands(cpg_islands)

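# NOTE: integrate_methylation_expression needs 'start'/'end' coordinates for every gene.
# Plain string keys such as 'gene1' carry none, so the call below raises TypeError as written.
# expression_data = {'gene1': 10, 'gene2': 20}
# integrated_data = integrate_methylation_expression(methylation_data, expression_data)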

# chip_data = pd.DataFrame({'position': range(10000), 'counts': np.random.poisson(10, 10000)})
# control_data = pd.DataFrame({'position': range(10000), 'counts': np.random.poisson(5, 10000)})
# peaks = call_peaks(chip_data, control_data)

# tss_positions = [1000, 3000, 5000]
# histone_data = pd.DataFrame({'position': range(10000), 'signal': np.random.poisson(5, 10000)})
# enrichment = histone_enrichment_tss(histone_data, tss_positions)

# activating_marks = pd.DataFrame({'position': range(10000), 'signal': np.random.poisson(5, 10000)})
# repressing_marks = pd.DataFrame({'position': range(10000), 'signal': np.random.poisson(5, 10000)})
# bivalent_domains = identify_bivalent_domains(activating_marks, repressing_marks)

# h3k4me1_data = pd.DataFrame({'position': range(10000), 'signal': np.random.uniform(0, 1, 10000)})
# h3k27ac_data = pd.DataFrame({'position': range(10000), 'signal': np.random.uniform(0, 1, 10000)})
# enhancers = predict_enhancers(h3k4me1_data, h3k27ac_data)

# simulated_methylation = simulate_methylation_data()