# Step 2: Calculate binned methylation around MEs

Given the lists of MEVs generated in step 1, calculate methylation levels in bins around each MEV.

## Load variants

In [109]:
import pandas as pd
import pyarrow
import numpy as np
from glob import glob
import os,re
from multiprocessing import Pool
from tqdm import tqdm
from itertools import repeat

import sys
# add path to src module - note this is not a Pythonic solution
sys.path.insert(1, '../') 

from src import calculate_mC

In [110]:
me_type = 'L1'
var_type = 'absence'
binsize = 100
nbins = 20
ctxt = 'CGN CHN'

allc_path = '/home/AD/rkgadde/L1IP/celltype_allc'
genome = '/home/AD/rkgadde/L1IP/hg38_data/hg38.sorted.genome'

pdir = '/home/AD/rkgadde/L1IP/mC_data/CZI/type'
me_file = f'{pdir}/vars/all_{me_type}_{var_type[:3]}.tsv'

In [111]:
var = var_type[:3]
variant = f'{me_type}_{var}'

bindir = f'{pdir}/bins'
outdir = f'{pdir}/mC'
tmpdir = f'{pdir}/tmp'

In [112]:
me_df = pd.read_csv(me_file, sep='\t', 
                    names=['chrom','start','end','id','svlen','strand','mei','het','alt'])
me_df.head()

Unnamed: 0,chrom,start,end,id,svlen,strand,mei,het,alt
0,chr1,56365451,56369289,CZI_abs_1543,3837,+,L1HS,"AM1,AM2,AM3,YF2,YM3","AF1,AF2,YF1,YM1,YM2"
1,chr1,65443954,65444209,CZI_abs_1545,254,-,L1HS,"AF1,YM1,YM3","AF2,AF3,AM1,AM2,AM3,YF1,YF2"
2,chr1,80939086,80945263,CZI_abs_1546,6176,-,L1HS,"AF2,AF3,AM1,YF1,YM1,YM3","AF1,AM2,AM3,YF2,YM2"
3,chr1,82660292,82661886,CZI_abs_1547,1593,+,L1HS,"AF1,AF2,AM2,YF1","AF3,AM1,AM3,YF2,YM1,YM2,YM3"
4,chr1,86275302,86276014,CZI_abs_1548,711,+,L1HS,"AF1,AM1,YF1,YM1,YM2,YM3","AF2,AF3,AM2,AM3,YF2"


In [113]:
bedfile = me_df[['chrom','start','end','id','strand']]

## Make bins

In [114]:
# Create bins
bed_bins_df = calculate_mC.make_flanking_bins(bedfile, nbins, binsize)

In [115]:
# Index and merge bins
bin_prefix = me_file.split('/')[-1]
bin_prefix = re.match(r'(.*)\.tsv', bin_prefix).group(1) + '.bins'
calculate_mC.process_bins(bed_bins_df, bin_prefix, bindir)

## Calculate methylation for each bin

In [116]:
# Make directories
!mkdir -p {tmpdir}/allc_regions-{binsize}
!mkdir -p {tmpdir}/binc-{binsize}

In [117]:
allc_tables = glob(f'{allc_path}/*.allc.tsv.gz')
allc_prefix = f'{tmpdir}/allc_regions-{binsize}/allc_{me_type}_{var}'
merge_bed = f'{bindir}/{bin_prefix}.merged.bed'

In [118]:
# Use tabix to extract the portions of allc tables from the deletion regions.
# This will speed up subsequent processing to calculate the binned data
with Pool(16) as p:
    x = list(tqdm(p.starmap(calculate_mC.tabixallc, 
                            zip(allc_tables, 
                                repeat(allc_prefix), 
                                repeat(merge_bed)))))

command	tabix-allc
command	tabix-allc
allc_path	/home/AD/rkgadde/L1IP/mC_data/CZI/type/tmp/allc_regions-100/allc_L1_abs.L6IT_THEMIS_CUX1-AF3.tsv.gz
command	tabix-allc
reindex	False
allc_path	/home/AD/rkgadde/L1IP/mC_data/CZI/type/tmp/allc_regions-100/allc_L1_abs.L6b_TLE4_NXPH4-AM2.tsv.gz
reindex	False
allc_path	/home/AD/rkgadde/L1IP/mC_data/CZI/type/tmp/allc_regions-100/allc_L1_abs.CGE_ADARB2_ADAM33-YF1.tsv.gz
reindex	False
command	tabix-allc
allc_path	/home/AD/rkgadde/L1IP/mC_data/CZI/type/tmp/allc_regions-100/allc_L1_abs.L3-5IT_RORB_PLCH1-AF1.tsv.gz
reindex	False
command	tabix-allc
allc_path	/home/AD/rkgadde/L1IP/mC_data/CZI/type/tmp/allc_regions-100/allc_L1_abs.L6IT_THEMIS_LINC00343-YM3.tsv.gz
reindex	False
command	tabix-allc
allc_path	/home/AD/rkgadde/L1IP/mC_data/CZI/type/tmp/allc_regions-100/allc_L1_abs.L6b_TLE4_NXPH4-YM3.tsv.gz
reindex	False
command	tabix-allc
allc_path	/home/AD/rkgadde/L1IP/mC_data/CZI/type/tmp/allc_regions-100/allc_L1_abs.L3-5IT_RORB_PLCH1-AF3.tsv.gz
reindex	F

100%|██████████| 209/209 [00:00<00:00, 1277856.47it/s]


In [119]:
allcdir = f'{tmpdir}/allc_regions-{binsize}'
allc_files = glob(f'{allcdir}/allc_{variant}.*.tsv.gz')
bed_bins_file = f'{bindir}/{bin_prefix}.bed.gz'
binc_prefix = f'{tmpdir}/binc-{binsize}/binc'

In [120]:
# Use allcools to calculate the binned mC counts 
with Pool(16) as p:
    x = list(tqdm(p.starmap(calculate_mC.allc2bins, 
                            zip(allc_files, 
                                repeat(binc_prefix), 
                                repeat(variant), 
                                repeat(genome), 
                                repeat(bed_bins_file),
                                repeat(ctxt)))))

100%|██████████| 209/209 [00:00<00:00, 679542.28it/s]


In [121]:
# Load the binned mC data
cn = ctxt.split(' ')[0]
mc_files = glob(f'{binc_prefix}*{variant}_{cn}-Both.sparse.bed.gz')
samples_celltypes = []

for mc_file in mc_files:
    sample = re.match(fr'.*binc\.(.*)\.{variant}_{cn}-Both\.sparse\.bed\.gz', mc_file).group(1)
    samples_celltypes.append(sample)
len(samples_celltypes)

209

In [122]:
# Combine binned mC data
with Pool(16) as p:
    mc_dfs = list(tqdm(p.starmap(calculate_mC.get_binc, 
                              zip(samples_celltypes, 
                                  repeat(binc_prefix), 
                                  repeat(variant),
                                  repeat(ctxt)))))

100%|██████████| 209/209 [00:00<00:00, 722678.92it/s]


In [123]:
mc_df = pd.concat(mc_dfs)

## Mark genotype for each sample at each locus

In [124]:
ids = me_df['id'].unique()

with Pool(16) as p:
    gt_dfs = list(tqdm(p.starmap(calculate_mC.groupby_gt, 
                              zip(ids, 
                                  repeat(me_df), 
                                  repeat(mc_df), 
                                  repeat(me_type)))))

100%|██████████| 122/122 [00:00<00:00, 485949.75it/s]


In [125]:
gt_df = pd.concat(gt_dfs)

## Export data

In [127]:
%%time

# TODO: Change to rewrite file if tmp files are newer
filepath = f'{outdir}/binc-{binsize}.{variant}.all_samples.parquet.gz'

if not os.path.isfile(filepath):
    gt_df.to_parquet(filepath, compression='gzip', index=False)

CPU times: user 987 ms, sys: 71.9 ms, total: 1.06 s
Wall time: 1.05 s
