# Loading modkit outputs as dataframes
* `modkit pileup`
* `modkit dmr pair`

##### Modules

In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

##### Inputs

Column names from https://nanoporetech.github.io/modkit/

In [3]:
pileup_cols = [
    'chrom',
    'start0',
    'end0',
    'name',
    'score',
    'strand',
    'start1',
    'end1',
    'color',
    'valid_coverage',
    'percent_modified',
    'count_modified',
    'count_canonical',
    'count_other_mod',
    'count_delete',
    'count_fail',
    'count_diff',
    'count_nocall'
]

dmr_cols = [
    'chrom',
    'start',
    'end',
    'name',
    'score',
    'strand',
    'a_counts',
    'a_total',
    'b_counts',
    'b_total',
    'a_mod_percentages',
    'b_mod_percentages',
    'a_pct_modified',
    'b_pct_modified',
    'map_pvalue',
    'effect_size',
    'cohen_h', 
    'cohen_h_low',
    'cohen_h_high'
]

Load `.bed` files.

In [13]:
%%bash
dx download "/analysis/Sarah_analysis/human_ONT/HG002.ont.chr22.cpg.bed.gz"
dx download "/analysis/Sarah_analysis/human_ONT/single_base_dmr_HG002_and_HG005.bed"

Error: path "/opt/notebooks/HG002.ont.chr22.cpg.bed.gz" already exists but
-f/--overwrite was not set


In [14]:
df1 = pd.read_csv("HG002.ont.chr22.cpg.bed.gz", header=None, sep='\t', names=pileup_cols)
df2 = pd.read_csv("single_base_dmr_HG002_and_HG005.bed", header=None, sep='\t', names=dmr_cols)

In [18]:
df1

Unnamed: 0,chrom,start0,end0,name,score,strand,start1,end1,color,valid_coverage,percent_modified,count_modified,count_canonical,count_other_mod,count_delete,count_fail,count_diff,count_nocall
0,chr22,10512832,10512833,m,1,.,10512832,10512833,25500,1,100.00,1,0,0,0,0,0,0
1,chr22,10513852,10513853,m,1,.,10513852,10513853,25500,1,100.00,1,0,0,0,0,0,0
2,chr22,10513864,10513865,m,1,.,10513864,10513865,25500,1,100.00,1,0,0,0,0,0,0
3,chr22,10513906,10513907,m,1,.,10513906,10513907,25500,1,0.00,0,1,0,0,0,0,0
4,chr22,10514367,10514368,m,2,.,10514367,10514368,25500,2,50.00,1,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
653079,chr22,50808239,50808240,m,21,.,50808239,50808240,25500,21,4.76,1,20,0,0,2,0,0
653080,chr22,50808245,50808246,a,11,.,50808245,50808246,25500,11,18.18,2,9,0,0,7,5,0
653081,chr22,50808245,50808246,m,20,.,50808245,50808246,25500,20,5.00,1,19,0,0,3,0,0
653082,chr22,50808251,50808252,a,13,.,50808251,50808252,25500,13,15.38,2,11,0,1,8,1,0


In [16]:
df1_min5 = df1[df1['valid_coverage'] >= 5]

In [17]:
df1_min5

Unnamed: 0,chrom,start0,end0,name,score,strand,start1,end1,color,valid_coverage,percent_modified,count_modified,count_canonical,count_other_mod,count_delete,count_fail,count_diff,count_nocall
26,chr22,10519493,10519494,m,5,.,10519493,10519494,25500,5,0.00,0,5,0,0,0,0,0
27,chr22,10519616,10519617,m,6,.,10519616,10519617,25500,6,66.67,4,2,0,0,0,0,0
28,chr22,10519751,10519752,m,6,.,10519751,10519752,25500,6,16.67,1,5,0,0,0,0,0
29,chr22,10519768,10519769,m,5,.,10519768,10519769,25500,5,0.00,0,5,0,0,1,0,0
30,chr22,10519810,10519811,m,6,.,10519810,10519811,25500,6,0.00,0,6,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
653079,chr22,50808239,50808240,m,21,.,50808239,50808240,25500,21,4.76,1,20,0,0,2,0,0
653080,chr22,50808245,50808246,a,11,.,50808245,50808246,25500,11,18.18,2,9,0,0,7,5,0
653081,chr22,50808245,50808246,m,20,.,50808245,50808246,25500,20,5.00,1,19,0,0,3,0,0
653082,chr22,50808251,50808252,a,13,.,50808251,50808252,25500,13,15.38,2,11,0,1,8,1,0
