# Step 2: Calculate binned methylation around MEs

Given the list of non-polymorphic mobile elements (MEs) generated in step 1 (or a custom list), calculate the binned methylation level around each one. This notebook is flexible: it can also be used to calculate methylation levels around other features, such as transcription start sites (TSSs), but the subsequent plotting notebook should only be used for MEs. Input ALLC files must be named using the format `{celltype}_{sample}.allc.tsv.gz`.

## Load MEs

In [1]:
import pandas as pd
import pyarrow
import numpy as np
from glob import glob
import os,re
from multiprocessing import Pool
from tqdm import tqdm
from itertools import repeat

import sys
# add path to src module - note this is not a Pythonic solution
sys.path.insert(1, '../') 

from src import calculate_mC

In [2]:
# --- Analysis parameters ---
# Label for the feature set: used in intermediate/output file names and to
# pick the binning mode (names containing 'L1' or 'Alu' get flanking bins).
feature = 'npL1'
# Passed to the calculate_mC bin builders; presumably bin width (bp) and the
# number of bins -- confirm against src/calculate_mC.
binsize, nbins = 20, 100
# Passed to calculate_mC.allc2bins / get_binc; presumably the cytosine context.
ctxt = 'CHN'

# --- Inputs ---
genome = '/home/AD/rkgadde/L1IP/hg38_data/hg38.sorted.genome'
allc_path = '/home/AD/rkgadde/L1IP/celltype_allc'  # searched for *.allc.tsv.gz

# --- Project directory and reference feature list ---
pdir = '/home/AD/rkgadde/L1IP/mC_data/CZI/type'
ref_file = f'{pdir}/vars/non-polymorphic_refL1.bed'

In [3]:
# Working directories, all rooted at the project directory.
bindir, tmpdir, outdir = (f'{pdir}/{subdir}' for subdir in ('bins', 'tmp', 'mC'))
# Directory that contains the reference feature list.
datadir = os.path.dirname(ref_file)

In [4]:
# The reference list is read as a tab-separated table without a header row,
# so the column names are supplied here.
ref_columns = ['chrom','start','end','id','length','strand','subfamily']
ref_df = pd.read_csv(ref_file, sep='\t', header=None, names=ref_columns)
ref_df.head()

Unnamed: 0,chrom,start,end,id,length,strand,subfamily
0,chr1,64425,64666,L1PA4_69,241,+,L1PA4
1,chr1,400705,400825,L1PA4_508,122,-,L1PA4
2,chr1,423484,423750,L1PA5_548,270,-,L1PA5
3,chr1,435752,435850,L1PA5_564,98,+,L1PA5
4,chr1,636458,636578,L1PA4_810,122,-,L1PA4


In [5]:
# Keep only the columns handed to the bin builders.
bed_columns = ['chrom','start','end','id','strand']
bedfile = ref_df[bed_columns]

## Make bins

In [6]:
# Repetitive elements (feature name contains 'L1' or 'Alu') get bins that
# exclude the repeat sequence itself; every other feature gets bins that
# include the feature.
is_repeat_feature = any(tag in feature for tag in ('L1', 'Alu'))
make_bins = (calculate_mC.make_flanking_bins if is_repeat_feature
             else calculate_mC.make_inclusive_bins)
bed_bins_df = make_bins(bedfile, nbins, binsize)

In [7]:
# Build the bin file prefix from the reference file name: take the last path
# component, keep everything before its last '.bed', and append '.bins'.
ref_basename = ref_file.rsplit('/', 1)[-1]
ref_stem = re.match(r'(.*)\.bed', ref_basename).group(1)
bin_prefix = f'{ref_stem}.bins'
# Index and merge bins
calculate_mC.process_bins(bed_bins_df, bin_prefix, bindir)

## Calculate methylation

In [8]:
# Make the scratch directories for the extracted ALLC regions and the binned
# counts. os.makedirs(..., exist_ok=True) is the pure-Python equivalent of
# `mkdir -p`: it needs no shell, works for paths containing spaces, and raises
# if a directory cannot be created instead of only printing a shell error.
for scratch_dir in (f'{tmpdir}/allc_regions-{binsize}', f'{tmpdir}/binc-{binsize}'):
    os.makedirs(scratch_dir, exist_ok=True)

In [9]:
# Collect the input ALLC tables. glob returns files in arbitrary,
# filesystem-dependent order, so sort them to make the run reproducible.
allc_tables = sorted(glob(f'{allc_path}/*.allc.tsv.gz'))
if not allc_tables:
    # Fail here rather than much later, when there is nothing to concatenate.
    raise FileNotFoundError(f'No *.allc.tsv.gz files found in {allc_path}')
# Prefix for the per-sample region-restricted ALLC files.
allc_prefix = f'{tmpdir}/allc_regions-{binsize}/allc_{feature}'
# Merged bin regions; expected to exist once the "Make bins" section has run.
merge_bed = f'{bindir}/{bin_prefix}.merged.bed'

In [10]:
# Use tabix to extract the portions of each ALLC table that fall in the
# regions listed in merge_bed (the merged bins).
# This will speed up subsequent processing to calculate the binned data.
#
# Jobs are submitted with apply_async and collected one by one so that tqdm
# advances as results come in. Wrapping p.starmap(...) in tqdm does not show
# progress: starmap only returns once every job has finished, so the bar
# jumps straight to 100%.
with Pool(16) as p:
    jobs = [p.apply_async(calculate_mC.tabixallc,
                          (allc_table, allc_prefix, merge_bed))
            for allc_table in allc_tables]
    # get() re-raises any worker exception; results keep the submission order.
    x = [job.get() for job in tqdm(jobs)]

100%|██████████| 209/209 [00:00<00:00, 723275.19it/s]


In [11]:
# Prefix for the per-sample binned-count outputs.
binc_prefix = f'{tmpdir}/binc-{binsize}/binc'
# Bin definitions; expected to exist once the "Make bins" section has run.
bed_bins_file = f'{bindir}/{bin_prefix}.bed.gz'
# Region-restricted ALLC files (presumably written by the tabix step).
allcdir = f'{tmpdir}/allc_regions-{binsize}'
allc_files = glob(f'{allcdir}/allc_{feature}.*.tsv.gz')

In [12]:
# Use allcools to calculate the binned mC counts
#
# Jobs are submitted with apply_async and collected one by one so that tqdm
# advances as results come in. Wrapping p.starmap(...) in tqdm does not show
# progress: starmap only returns once every job has finished.
with Pool(16) as p:
    jobs = [p.apply_async(calculate_mC.allc2bins,
                          (allc_file, binc_prefix, feature, genome,
                           bed_bins_file, ctxt))
            for allc_file in allc_files]
    # get() re-raises any worker exception; results keep the submission order.
    x = [job.get() for job in tqdm(jobs)]

Sample already processed, skipping L6-AF1
Sample already processed, skipping CGE_PAX6-YM2
Sample already processed, skipping L6CT_TLE4_FAM95C-YM1
Sample already processed, skipping MGE_SST_CLMP-AF2
Sample already processed, skipping CGE_LAMP5_LHX6-AM2
Sample already processed, skipping MGE_PVALB-AF3
Sample already processed, skipping L6-YM2
Sample already processed, skipping CGE_LAMP5-AF2
Sample already processed, skipping L4-5IT_RORB_TSHZ2-YM1
Sample already processed, skipping L4-5IT_RORB_ARHGAP15-AF3
Sample already processed, skipping MGE_SST_CLMP-YM1
Sample already processed, skipping L4-5IT_RORB_TSHZ2-AF2
Sample already processed, skipping CGE_ADARB2_ADAM33-YF1
Sample already processed, skipping CGE_ADARB2_ADAM33-AM2














Sample already processed, skipping L6IT_THEMIS_LINC00343-YM3
Sample already processed, skipping CGE_LAMP5_LHX6-AF3
Sample already processed, skipping CGE_VIP-AF2
Sample already processed, skipping L56NP_TLE4_TSHZ2-YF1
Sample already processed, skipping C

100%|██████████| 209/209 [00:00<00:00, 1145894.82it/s]


In [13]:
# Load the binned mC data
# cn is the first space-separated token of ctxt; it appears in the file names.
cn = ctxt.split(' ')[0]
mc_suffix = f'{feature}_{cn}-Both.sparse.bed.gz'
# Sort so that the sample order (and hence the row order of the combined
# table) does not depend on the filesystem's listing order.
mc_files = sorted(glob(f'{binc_prefix}*{mc_suffix}'))
# Escape the interpolated part so characters in feature/cn are matched literally.
sample_re = re.compile(r'.*binc\.(.*)\.' + re.escape(mc_suffix))

samples_celltypes = []
for mc_file in mc_files:
    hit = sample_re.match(mc_file)
    if hit is None:
        # Clear error in place of the AttributeError from calling .group on None.
        raise ValueError(f'Cannot parse a sample name from {mc_file}')
    samples_celltypes.append(hit.group(1))
len(samples_celltypes)

209

In [14]:
# Combine binned mC data: load one table per sample in parallel.
#
# Jobs are submitted with apply_async and collected one by one so that tqdm
# advances as results come in. Wrapping p.starmap(...) in tqdm does not show
# progress: starmap only returns once every job has finished.
with Pool(16) as p:
    jobs = [p.apply_async(calculate_mC.get_binc,
                          (sample_celltype, binc_prefix, feature, ctxt))
            for sample_celltype in samples_celltypes]
    # get() re-raises any worker exception; results keep the submission order.
    mc_dfs = [job.get() for job in tqdm(jobs)]

100%|██████████| 209/209 [00:00<00:00, 403038.87it/s]


In [15]:
# Stack the per-sample tables row-wise into one table (original indices kept).
mc_df = pd.concat(mc_dfs, axis=0, ignore_index=False)

## Export data

In [16]:
%%time

filepath = f'{outdir}/binc-{binsize}.{feature}.all_samples.parquet.gz'

if not os.path.isfile(filepath):
    mc_df.to_parquet(filepath, compression='gzip', index=False)

CPU times: user 3min 42s, sys: 32.1 s, total: 4min 14s
Wall time: 4min 14s
