# Comparing modkit and DSS outputs for human samples
`modkit dmr pair` vs `DMLtest(.., smoothing=TRUE)`

##### Modules

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

##### Functions

##### Inputs

Column names from https://nanoporetech.github.io/modkit/

In [3]:
# Column names applied positionally to the header-less `modkit dmr pair` BED file.
modkit_dmr_cols = [
    # locus fields
    'chrom', 'start', 'end', 'name', 'score', 'strand',
    # per-sample counts and totals (samples a and b)
    'a_counts', 'a_total', 'b_counts', 'b_total',
    # per-sample modification percentages
    'a_mod_percentages', 'b_mod_percentages',
    'a_pct_modified', 'b_pct_modified',
    # test statistics
    'map_pvalue', 'effect_size',
    'cohen_h', 'cohen_h_low', 'cohen_h_high',
]

# Column names applied to the DSS table. Every DSS statistic carries a `DSS_`
# prefix so it stays distinguishable from the modkit columns after merging;
# `chrom` and `end` are left unprefixed because they are the merge keys.
dss_cols = ['chrom', 'end'] + [
    f'DSS_{field}'
    for field in ('mu1', 'mu2', 'diff', 'diff.se', 'stat', 'phi1', 'phi2', 'pval', 'fdr')
]

Load dmr `.bed` (modkit) and `.tsv` (DSS) files.

In [4]:
%%bash
# cell line (HG002 vs HG005): fetch the modkit BED and the DSS TSV.
# Stop at the first failing command; without this the cell only reports the
# exit status of the last command, so an earlier failed download goes unnoticed.
set -euo pipefail
# -f allows the cell to be re-run when the files already exist locally
# (dx download otherwise refuses to touch an existing path).
dx download -f "/analysis/Sarah_analysis/human_ONT/single_base_dmr_HG002_vs_HG005_mincov5.bed"
dx download -f "/analysis/Sarah_analysis/human_ONT/DSS_DMLtestwSmoothing_HG002_vs_HG005_mincov5.tsv"

In [5]:
%%bash
# lung, liver, colon
# Stop at the first failing command; without this the cell only reports the
# exit status of the last command, so an earlier failed download goes unnoticed.
set -euo pipefail
# Paths are absolute (leading "/"): the upload cell at the end of this notebook
# runs `dx cd`, which moves the dx working directory, so relative paths would
# stop resolving when this cell is run again afterwards.
# -f allows the cell to be re-run when the files already exist locally.
dx download -f "/analysis/Jon_analysis/human/st001/single_base_dmr_st001_liver_vs_lung_mincov5.bed"
dx download -f "/analysis/Jon_analysis/human/st001/DSS_DMLtestwSmoothing_st001_liver_vs_lung_mincov5.tsv"
dx download -f "/analysis/Jon_analysis/human/st002/single_base_dmr_st002_colon_vs_lung_mincov5.bed"
dx download -f "/analysis/Jon_analysis/human/st002/DSS_DMLtestwSmoothing_st002_colon_vs_lung_mincov5.tsv"

In [6]:
# HG002 vs HG005: modkit results (BED, read without a header row) and DSS results (TSV).
# `chrom` is forced to str in both tables so the ('chrom', 'end') merge keys
# always share one dtype instead of depending on pandas' type inference.
df_dmr1 = pd.read_csv(
    "single_base_dmr_HG002_vs_HG005_mincov5.bed",
    sep='\t', header=None, names=modkit_dmr_cols, dtype={'chrom': str},
)
# skiprows=1 discards the first line of the DSS file (presumably its own header
# row -- confirm) so that the names in dss_cols are used instead.
df_dss1 = pd.read_csv(
    "DSS_DMLtestwSmoothing_HG002_vs_HG005_mincov5.tsv",
    sep='\t', skiprows=1, header=None, names=dss_cols, dtype={'chrom': str},
)

In [8]:
# st001 liver vs lung: modkit results (BED, read without a header row) and DSS results (TSV).
# `chrom` is forced to str in both tables so the ('chrom', 'end') merge keys
# always share one dtype instead of depending on pandas' type inference.
df_dmr2 = pd.read_csv(
    "single_base_dmr_st001_liver_vs_lung_mincov5.bed",
    sep='\t', header=None, names=modkit_dmr_cols, dtype={'chrom': str},
)
# skiprows=1 discards the first line of the DSS file (presumably its own header
# row -- confirm) so that the names in dss_cols are used instead.
df_dss2 = pd.read_csv(
    "DSS_DMLtestwSmoothing_st001_liver_vs_lung_mincov5.tsv",
    sep='\t', skiprows=1, header=None, names=dss_cols, dtype={'chrom': str},
)

In [9]:
# st002 colon vs lung: modkit results (BED, read without a header row) and DSS results (TSV).
# `chrom` is forced to str in both tables so the ('chrom', 'end') merge keys
# always share one dtype instead of depending on pandas' type inference.
df_dmr3 = pd.read_csv(
    "single_base_dmr_st002_colon_vs_lung_mincov5.bed",
    sep='\t', header=None, names=modkit_dmr_cols, dtype={'chrom': str},
)
# skiprows=1 discards the first line of the DSS file (presumably its own header
# row -- confirm) so that the names in dss_cols are used instead.
df_dss3 = pd.read_csv(
    "DSS_DMLtestwSmoothing_st002_colon_vs_lung_mincov5.tsv",
    sep='\t', skiprows=1, header=None, names=dss_cols, dtype={'chrom': str},
)

Merge the results.

In [12]:
# (rows, columns) of the modkit table, then of the DSS table, one per line.
print(df_dmr1.shape, df_dss1.shape, sep='\n')

(587561, 19)
(587561, 11)


In [17]:
# (rows, columns) of the modkit table, then of the DSS table, one per line.
print(df_dmr2.shape, df_dss2.shape, sep='\n')

(589000, 19)
(589000, 11)


In [18]:
# (rows, columns) of the modkit table, then of the DSS table, one per line.
print(df_dmr3.shape, df_dss3.shape, sep='\n')

(588242, 19)
(588242, 11)


In [15]:
# Join the modkit and DSS results site by site on ('chrom', 'end').
# how='outer' keeps sites present in only one of the two tables; the columns
# coming from the other table are NaN for those rows.
# validate='one_to_one' makes pandas raise a MergeError if a ('chrom', 'end')
# key occurs more than once in either table, instead of silently multiplying
# rows in the merged result.
df1 = df_dmr1.merge(df_dss1, how='outer', on=['chrom', 'end'], validate='one_to_one')
df2 = df_dmr2.merge(df_dss2, how='outer', on=['chrom', 'end'], validate='one_to_one')
df3 = df_dmr3.merge(df_dss3, how='outer', on=['chrom', 'end'], validate='one_to_one')

### Multiple-testing correction (Bonferroni)

In [19]:
# Show the column names of the merged HG002 vs HG005 table.
df1.columns

Index(['chrom', 'start', 'end', 'name', 'score', 'strand', 'a_counts',
       'a_total', 'b_counts', 'b_total', 'a_mod_percentages',
       'b_mod_percentages', 'a_pct_modified', 'b_pct_modified', 'map_pvalue',
       'effect_size', 'cohen_h', 'cohen_h_low', 'cohen_h_high', 'DSS_mu1',
       'DSS_mu2', 'DSS_diff', 'DSS_diff.se', 'DSS_stat', 'DSS_phi1',
       'DSS_phi2', 'DSS_pval', 'DSS_fdr'],
      dtype='object')

In [23]:
# Bonferroni correction, applied to each p-value column on its own:
# multiply by m, the number of non-missing p-values in that column, and cap
# the result at 1. Missing p-values stay NaN.
# This is the vectorised form of a per-row min(p * m, 1.0).
for pval_col in ('map_pvalue', 'DSS_pval'):
    m = df1[pval_col].notna().sum()
    df1[f'{pval_col}_bonf'] = (df1[pval_col] * m).clip(upper=1.0)

In [25]:
# Bonferroni correction, applied to each p-value column on its own:
# multiply by m, the number of non-missing p-values in that column, and cap
# the result at 1. Missing p-values stay NaN.
# This is the vectorised form of a per-row min(p * m, 1.0).
for pval_col in ('map_pvalue', 'DSS_pval'):
    m = df2[pval_col].notna().sum()
    df2[f'{pval_col}_bonf'] = (df2[pval_col] * m).clip(upper=1.0)

In [26]:
# Bonferroni correction, applied to each p-value column on its own:
# multiply by m, the number of non-missing p-values in that column, and cap
# the result at 1. Missing p-values stay NaN.
# This is the vectorised form of a per-row min(p * m, 1.0).
for pval_col in ('map_pvalue', 'DSS_pval'):
    m = df3[pval_col].notna().sum()
    df3[f'{pval_col}_bonf'] = (df3[pval_col] * m).clip(upper=1.0)

### Find DMRs that are detected by modkit (no smoothing) but not by DSS (with smoothing)

In [27]:
# Flag sites whose Bonferroni-adjusted modkit p-value is significant (< alpha)
# while the Bonferroni-adjusted DSS p-value is not (>= alpha).
# Comparisons against NaN evaluate to False, so a site with a missing adjusted
# p-value in either column is never flagged.
alpha = 0.05
df1['NoSmooth_sig_only'] = (df1['map_pvalue_bonf'] < alpha) & (df1['DSS_pval_bonf'] >= alpha)

In [28]:
# Count flagged (True) versus unflagged (False) sites for HG002 vs HG005.
df1['NoSmooth_sig_only'].value_counts()

NoSmooth_sig_only
False    582019
True       5542
Name: count, dtype: int64

In [29]:
# Flag sites whose Bonferroni-adjusted modkit p-value is significant (< alpha)
# while the Bonferroni-adjusted DSS p-value is not (>= alpha).
# Comparisons against NaN evaluate to False, so a site with a missing adjusted
# p-value in either column is never flagged.
alpha = 0.05
df2['NoSmooth_sig_only'] = (df2['map_pvalue_bonf'] < alpha) & (df2['DSS_pval_bonf'] >= alpha)

In [30]:
# Count flagged (True) versus unflagged (False) sites for st001 liver vs lung.
df2['NoSmooth_sig_only'].value_counts()

NoSmooth_sig_only
False    588982
True         18
Name: count, dtype: int64

In [31]:
# Flag sites whose Bonferroni-adjusted modkit p-value is significant (< alpha)
# while the Bonferroni-adjusted DSS p-value is not (>= alpha).
# Comparisons against NaN evaluate to False, so a site with a missing adjusted
# p-value in either column is never flagged.
alpha = 0.05
df3['NoSmooth_sig_only'] = (df3['map_pvalue_bonf'] < alpha) & (df3['DSS_pval_bonf'] >= alpha)

In [32]:
# Count flagged (True) versus unflagged (False) sites for st002 colon vs lung.
df3['NoSmooth_sig_only'].value_counts()

NoSmooth_sig_only
False    588241
True          1
Name: count, dtype: int64

In [34]:
# Save the merged HG002 vs HG005 table as tab-separated text, without the row index.
out1 = 'HG002_vs_HG005_mincov5_modkit_and_DSS_results.tsv'
df1.to_csv(
    path_or_buf=out1,
    index=False,
    sep='\t',
)

In [35]:
# Save the merged st001 liver vs lung table as tab-separated text, without the row index.
out2 = 'st001_liver_vs_lung_mincov5_modkit_and_DSS_results.tsv'
df2.to_csv(
    path_or_buf=out2,
    index=False,
    sep='\t',
)

In [36]:
# Save the merged st002 colon vs lung table as tab-separated text, without the row index.
out3 = 'st002_colon_vs_lung_mincov5_modkit_and_DSS_results.tsv'
df3.to_csv(
    path_or_buf=out3,
    index=False,
    sep='\t',
)

In [40]:
%%bash
# Upload the three merged result tables to the results folder of the project.
# Stop at the first failing command: if `dx cd` fails, the uploads must not
# run, because they would land in whatever the current dx folder happens to be.
set -euo pipefail
dx cd "/analysis/Sarah_analysis/human_ONT/modkit_and_DSS_results"
dx upload 'HG002_vs_HG005_mincov5_modkit_and_DSS_results.tsv'
dx upload 'st001_liver_vs_lung_mincov5_modkit_and_DSS_results.tsv'
dx upload 'st002_colon_vs_lung_mincov5_modkit_and_DSS_results.tsv'

ID                                file-J2jxjG002FYp6kZ1V4pPF36K
Class                             file
Project                           project-J2g1vg002FYx1k0pkQv4QXQZ
Folder                            /analysis/Sarah_analysis/human_ONT/modkit_and_DSS_results
Name                              HG002_vs_HG005_mincov5_modkit_and_DSS_results.tsv
State                             closing
Visibility                        visible
Types                             -
Properties                        -
Tags                              -
Outgoing links                    -
Created                           Fri Aug 29 16:58:24 2025
Created by                        saraheger
 via the job                      job-J2jvyqj02FYVPvXJkJ4Q1538
Last modified                     Fri Aug 29 16:58:26 2025
Media type                        
archivalState                     "live"
cloudAccount                      "cloudaccount-dnanexus"
ID                                file-J2jxjGQ02FYp6kZ1V4pPF36Q
Cla