# Calculate binned methylation around MEVs overlapping genes

## Load variants

In [1]:
import pandas as pd
import pyarrow
import numpy as np
from glob import glob
import os,re
from multiprocessing import Pool
from tqdm import tqdm
from itertools import repeat

import sys
# add path to src module - note this is not a Pythonic solution
sys.path.insert(1, '../') 

from src import calculate_mC

In [2]:
me_type = 'Alu'
binsize = 60
nbins = 3

In [3]:
allc_path = '/home/AD/rkgadde/L1IP/celltype_allc'
genome = '/home/AD/rkgadde/L1IP/hg38_data/hg38.sorted.genome'

genedir = '/home/AD/rkgadde/L1IP/gene_data'
id_file = f'{genedir}/{me_type}_intersect_mev_ids.txt'

me_path = f'/home/AD/rkgadde/L1IP/mC_data/CZI/class/vars'
me_files = [f'{me_path}/all_{me_type}_abs.tsv', f'{me_path}/all_{me_type}_ins.tsv']

In [4]:
pdir = '/home/AD/rkgadde/L1IP/mC_data/CZI/type'
bindir = f'{pdir}/bins'
outdir = f'{pdir}/mC'
tmpdir = f'{pdir}/tmp'

In [5]:
!grep -f {id_file} {me_files[0]} > {genedir}/{me_type}_intersect_mevs.tsv
!grep -f {id_file} {me_files[1]} >> {genedir}/{me_type}_intersect_mevs.tsv

In [6]:
me_df = pd.read_csv(f'{genedir}/{me_type}_intersect_mevs.tsv', sep='\t', 
                    names=['chrom','start','end','id','svlen','strand','mei','het','alt'])
me_df.head()

Unnamed: 0,chrom,start,end,id,svlen,strand,mei,het,alt
0,chr1,17358284,17358591,CZI_abs_6,306.0,+,AluYe5,"AF1,AF3,YF1,YM1,YM3","AF2,AM1,AM2,AM3,YF2,YM2"
1,chr1,24076520,24076812,CZI_abs_7,291.0,-,AluYa5,"AM3,YF2",
2,chr1,42277515,42277821,CZI_abs_12,305.0,+,AluYa5,"AF2,AM3","AF1,AF3,AM2,YF1,YF2,YM1,YM2"
3,chr1,42542541,42542852,CZI_abs_13,310.0,-,AluY,"AF1,AM2,YM1",
4,chr1,43391798,43392120,CZI_abs_14,321.0,+,AluY,"AF1,AM3,YF2","AF2,AM1,AM2,YF1,YM1,YM3"


In [7]:
bedfile = me_df[['chrom','start','end','id','strand']]

## Make bins

In [8]:
# Create bins
bed_bins_df = calculate_mC.make_flanking_bins(bedfile, nbins, binsize)

In [9]:
# Index and merge bins
bin_prefix = f'{me_type}_intersect.bins'
calculate_mC.process_bins(bed_bins_df, bin_prefix, bindir)

## Calculate methylation for each bin

In [10]:
# Make directories
!mkdir -p {tmpdir}/allc_regions-{binsize}
!mkdir -p {tmpdir}/binc-{binsize}

In [11]:
allc_tables = glob(f'{allc_path}/*.allc.tsv.gz')
allc_prefix = f'{tmpdir}/allc_regions-{binsize}/allc_{me_type}_intersect'
merge_bed = f'{bindir}/{bin_prefix}.merged.bed'

In [12]:
# Use tabix to extract the portions of allc tables from the deletion regions.
# This will speed up subsequent processing to calculate the binned data
with Pool(16) as p:
    x = list(tqdm(p.starmap(calculate_mC.tabixallc, 
                            zip(allc_tables, 
                                repeat(allc_prefix), 
                                repeat(merge_bed)))))

100%|██████████| 231/231 [00:00<00:00, 977683.37it/s]


In [13]:
allcdir = f'{tmpdir}/allc_regions-{binsize}'
allc_files = glob(f'{allcdir}/allc_{me_type}_intersect.*.tsv.gz')
bed_bins_file = f'{bindir}/{bin_prefix}.bed.gz'
binc_prefix = f'{tmpdir}/binc-{binsize}/binc'

In [14]:
# Use allcools to calculate the binned mC counts 
with Pool(16) as p:
    x = list(tqdm(p.starmap(calculate_mC.allc2bins, 
                            zip(allc_files, 
                                repeat(binc_prefix), 
                                repeat(f'{me_type}_intersect'), 
                                repeat(genome), 
                                repeat(bed_bins_file)))))

Sample already processed, skipping CGE_LAMP5_LHX6-AF1
Sample already processed, skipping L6IT_THEMIS_CUX1-AM2
Sample already processed, skipping L2-4IT_CUX2-YM3
Sample already processed, skipping CGE_ADARB2_ADAM33-YM1
Sample already processed, skipping L3-5IT_RORB_PLCH1-YM1
Sample already processed, skipping L3-5IT_RORB_PLCH1-AF2
Sample already processed, skipping L4-5IT_RORB_TSHZ2-YF1
Sample already processed, skipping Glia_Astro-AM3
Sample already processed, skipping L6b_TLE4_NXPH4-YM1
Sample already processed, skipping L6IT_THEMIS_CUX1-YF1
Sample already processed, skipping Glia_Astro-YF2


Sample already processed, skipping CGE_VIP-AF3
Sample already processed, skipping CGE_ADARB2_ADAM33-YF2

Sample already processed, skipping CGE_ADARB2_ADAM33-AF2








Sample already processed, skipping Glia_Astro-AM1
Sample already processed, skipping CGE_PAX6-AF1

Sample already processed, skipping L4-5IT_RORB_LRRK1-AM2


Sample already processed, skipping L6CT_TLE4_FAM95C-AF1
Sample already 

100%|██████████| 231/231 [00:00<00:00, 622676.24it/s]


In [15]:
# Load the binned mC data
mc_files = glob(f'{binc_prefix}*{me_type}_intersect_CGN-Both.sparse.bed.gz')
samples_celltypes = []
for mc_file in mc_files:
    sample = re.match(fr'.*binc\.(.*)\.{me_type}_intersect_CGN-Both\.sparse\.bed\.gz', mc_file).group(1)
    samples_celltypes.append(sample)
len(samples_celltypes)

231

In [16]:
# Combine binned mC data
with Pool(16) as p:
    mc_dfs = list(tqdm(p.starmap(calculate_mC.get_binc, 
                              zip(samples_celltypes, 
                                  repeat(binc_prefix), 
                                  repeat(f'{me_type}_intersect')))))

100%|██████████| 231/231 [00:00<00:00, 880003.84it/s]


In [17]:
mc_df = pd.concat(mc_dfs)

## Mark genotype for each sample at each locus

In [18]:
ids = me_df['id'].unique()

with Pool(16) as p:
    gt_dfs = list(tqdm(p.starmap(calculate_mC.groupby_gt, 
                              zip(ids, 
                                  repeat(me_df), 
                                  repeat(mc_df), 
                                  repeat(me_type)))))

100%|██████████| 2882/2882 [00:00<00:00, 1599997.90it/s]


In [19]:
gt_df = pd.concat(gt_dfs)

## Export data

In [20]:
%%time

filepath = f'{outdir}/binc-{binsize}.{me_type}_intersect.gt.all_samples.parquet.gz'

if not os.path.isfile(filepath):
    gt_df.to_parquet(filepath, compression='gzip', index=False)

CPU times: user 3.51 s, sys: 243 ms, total: 3.75 s
Wall time: 4.22 s
