# Step 3: Plot binned methylation around MEs

In [19]:
import pandas as pd
import numpy as np
import cmasher as cmr
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
from statsmodels.stats.multitest import fdrcorrection

import sys
# add path to src module - note this is not a Pythonic solution
sys.path.insert(1, '../') 

from src import plot_mC

# Save PDF text with font type 42 (TrueType) rather than matplotlib's default
mpl.rcParams.update({'pdf.fonttype': 42})
# plt.rc('axes', labelsize=12)

In [20]:
from warnings import filterwarnings
filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)

In [21]:
# --- Analysis parameters ---
binsize = 20          # used in the binned-mC file name and in the x-axis limits
feature = "refAlu"    # element set to analyse; later cells branch on 'L1' / 'Alu'

# --- Input / output locations ---
pdir = "/home/AD/rkgadde/L1IP/mC_data/CZI/type"
outdir = "/home/AD/rkgadde/L1IP/results/CZI/plots/reference"

# Reference genome FASTA and the subsampled element table for this feature
fa = "/home/AD/rkgadde/L1IP/RepeatMasker/hg38_noALT.fa.masked"
ref_file = f"{pdir}/vars/subsample_{feature}.bed"

In [22]:
# Subfamilies to plot (`names`) and the minimum length in bp that counts as
# full-length (`fl`), chosen from the feature name.
# 'Alu' is tested first so that a name matching both patterns keeps the Alu
# settings, exactly as when the two checks were independent `if` statements.
if 'Alu' in feature:
    names = ['AluYb8','AluYa5','AluY','AluSx','AluJo']
    fl = 300
elif 'L1' in feature:
    names = ['L1HS','L1PA2','L1PA3','L1PA4','L1PA5']
    fl = 6000
else:
    # Fail here instead of with a NameError several cells later, or, worse,
    # silently reusing `names`/`fl` left in the kernel by a previous run.
    raise ValueError(
        f"Unsupported feature {feature!r}: expected a name containing 'L1' or 'Alu'"
    )

## Load data

In [23]:
# The reference table is tab-separated with no header row, so name the columns here
ref_columns = ['chrom','start','end','id','length','strand','subfamily']
ref_df = pd.read_csv(ref_file, sep='\t', header=None, names=ref_columns)
ref_df.head()

Unnamed: 0,chrom,start,end,id,length,strand,subfamily
0,chr5,118255287,118255594,AluSx_3230318,306,+,AluSx
1,chr1,159087573,159087870,AluSx_235927,299,-,AluSx
2,chr5,154434678,154434968,AluSx_3287082,295,+,AluSx
3,chr6,76811405,76811700,AluSx_3449453,299,-,AluSx
4,chr8,3541806,3542027,AluSx_3822382,297,+,AluSx


In [24]:
# Number of reference rows per subfamily
ref_df.loc[:, 'subfamily'].value_counts()

subfamily
AluSx     1868
AluJo     1868
AluY      1868
AluYb8    1868
AluYa5    1868
Name: count, dtype: int64

In [25]:
%%time

# Read the binned mC table for this feature and bin size
mc_path = f'{pdir}/mC/binc-{binsize}.{feature}.all_samples.parquet.gz'
df = pd.read_parquet(mc_path)

# `mc` as a percentage of `cov`
df['percent_mC'] = df['mc'].div(df['cov']).mul(100)

# Label bins by the sign of their coordinate; a bin of exactly 0 is left unlabelled
df.loc[df['bin'].gt(0), 'location'] = 'downstream'
df.loc[df['bin'].lt(0), 'location'] = 'upstream'

# Subfamily is the part of the id before its last '_<digits>' suffix
df['subfamily'] = df['id'].str.extract(r'(.*)_[0-9]+')

df.head()

CPU times: user 3min 51s, sys: 33.1 s, total: 4min 24s
Wall time: 3min 59s


Unnamed: 0,id,bin,sample,celltype,mc,cov,ctxt,percent_mC,location,subfamily
0,AluJo_214,-990,AF2,L6b_TLE4_NXPH4,2,26,mCH,7.692308,upstream,AluJo
1,AluJo_214,-970,AF2,L6b_TLE4_NXPH4,1,36,mCH,2.777778,upstream,AluJo
2,AluJo_214,-950,AF2,L6b_TLE4_NXPH4,1,18,mCH,5.555556,upstream,AluJo
3,AluJo_214,-930,AF2,L6b_TLE4_NXPH4,0,39,mCH,0.0,upstream,AluJo
4,AluJo_214,-910,AF2,L6b_TLE4_NXPH4,0,27,mCH,0.0,upstream,AluJo


In [26]:
# Elements whose reference `length` is at least `fl` count as full-length;
# everything else is treated as a truncated (or transducted) variant.
fl_ids = ref_df.loc[ref_df['length'].ge(fl), 'id'].unique()
is_fl = df['id'].isin(fl_ids)
df_fl = df[is_fl]
df_tr = df[~is_fl]

In [27]:
# df_fam_all = df.groupby(['ctxt','celltype','location','bin','subfamily'])['percent_mC'].mean()
# df_fam_fl = df_fl.groupby(['ctxt','celltype','location','bin','subfamily'])['percent_mC'].mean()
# df_fam_tr = df_tr.groupby(['ctxt','celltype','location','bin','subfamily'])['percent_mC'].mean()

In [28]:
# Aggregate by context / cell type / bin / subfamily for all elements and for
# the full-length and non-full-length subsets.
# NOTE(review): aggregate_mC_by_sum lives in src/plot_mC; presumably it sums
# mc and cov within each group before recomputing the percentage — confirm.
fam_keys = ['ctxt','celltype','bin','subfamily']
df_fam_all, df_fam_fl, df_fam_tr = [
    plot_mC.aggregate_mC_by_sum(subset, list(fam_keys))  # fresh list per call
    for subset in (df, df_fl, df_tr)
]

In [29]:
# df_fam_sum.head()

In [30]:
# Distinct element ids in the full-length subset vs. the whole mC table
n_fl = df_fl['id'].nunique()
print(f"Full-length:  {n_fl}")
n_all = df['id'].nunique()
print(f"All:  {n_all}")

Full-length:  5129
All:  9338


## Plot mC for each subfamily

In [31]:
# L1 and Alu features share the same x-axis layout: two x-ranges, one on each
# side of position 0, each stopping half a bin short of it.
if 'L1' in feature or 'Alu' in feature:
    xticks = np.arange(-900, 1080, 180)
    xlims = [(-1000, -binsize/2), (binsize/2, 1000)]

In [32]:
# Distinct cell types and methylation contexts in the table, in order of first appearance
celltypes, contexts = (df[col].unique() for col in ('celltype', 'ctxt'))

### Full-length variants

In [33]:
# One PDF per (context, cell type) for the full-length elements.
# The loop variables stay named `ctxt` / `ct` because later cells read them.
for ctxt in contexts:
    for ct in celltypes:
        # Rows for this context / cell type, with the remaining index levels as columns
        df_plot_fl = df_fam_fl.loc[(ctxt, ct)].reset_index()

        pdf = f'{outdir}/mC_by_subfamily/fl_{feature}_{ctxt}_{ct}_fam_bin{binsize}.pdf'
        labels = dict(xlabel=f'Position relative to {feature} element (bp)',
                      ylabel=f'%{ctxt}',
                      title=ct)

        fig = plot_mC.plot_mC_by_subfamily(df_plot_fl, names, xlims, xticks, labels, pdf)

### Non-full-length variants

In [34]:
# One PDF per (cell type, context) for the non-full-length elements.
# The loop variables stay named `ct` / `ctxt` because later cells read them.
for ct in celltypes:
    for ctxt in contexts:
        # Rows for this context / cell type, with the remaining index levels as columns
        df_plot_tr = df_fam_tr.loc[(ctxt, ct)].reset_index()

        pdf = f'{outdir}/mC_by_subfamily/tr_{feature}_{ctxt}_{ct}_fam_bin{binsize}.pdf'
        labels = dict(xlabel=f'Position relative to {feature} element (bp)',
                      ylabel=f'%{ctxt}',
                      title=ct)

        fig = plot_mC.plot_mC_by_subfamily(df_plot_tr, names, xlims, xticks, labels, pdf)

### All variants

In [35]:
# One PDF per (cell type, context) using every element.
# The loop variables stay named `ct` / `ctxt` because later cells read them.
for ct in celltypes:
    for ctxt in contexts:
        # Rows for this context / cell type, with the remaining index levels as columns
        df_plot_all = df_fam_all.loc[(ctxt, ct)].reset_index()

        pdf = f'{outdir}/mC_by_subfamily/all_{feature}_{ctxt}_{ct}_fam_bin{binsize}.pdf'
        labels = dict(xlabel=f'Position relative to {feature} element (bp)',
                      ylabel=f'%{ctxt}',
                      title=ct)

        fig = plot_mC.plot_mC_by_subfamily(df_plot_all, names, xlims, xticks, labels, pdf)

## Test periodicity of each subfamily

In [36]:
# Aggregate per bin, keeping the upstream/downstream split, then test each
# context / cell type / location / subfamily profile for periodicity.
df_test = plot_mC.aggregate_mC_by_sum(df, ['ctxt','celltype','location','bin','subfamily'])
df_res = plot_mC.test_periodicity(df_test, ['ctxt','celltype','location','subfamily'])

# Write one table per context. The file name carries a context, so each file
# now holds only that context's rows; previously the full table was saved under
# whichever context an earlier plotting loop happened to leave in `ctxt`.
# A separate loop variable is used so the global `ctxt` is not rebound here.
for res_ctxt, res_rows in df_res.groupby('ctxt', sort=False, observed=True):
    res_rows.to_csv(f'{outdir}/mC_period/{feature}_{res_ctxt}_fam_bin{binsize}.tsv',
                    sep='\t', index=None)

In [37]:
# Bar plot of periodicity amplitude per subfamily, upstream vs. downstream,
# saved as one PDF per (cell type, context).
for ct in celltypes:
    for ctxt in contexts:
        df_bar = df_res[(df_res['celltype'] == ct) & (df_res['ctxt'] == ctxt)]

        # Nothing to draw (and no legend for move_legend to relocate)
        if df_bar.empty:
            print(f'No periodicity results for {ct} / {ctxt}; skipping bar plot')
            continue

        # Draw on a dedicated figure instead of whatever pyplot considers "current",
        # so bars can never land on axes left open by an earlier cell.
        fig, ax = plt.subplots()
        sns.barplot(df_bar,
                    x='location', y='amplitude',
                    order=['upstream', 'downstream'],
                    hue='subfamily',
                    hue_order=names,
                    palette='viridis',
                    ax=ax)

        # Place the legend outside the axes, to the right
        sns.move_legend(ax, 'center left', bbox_to_anchor=(1,0.5))
        ax.set_title(ct)

        pdf = f'{outdir}/mC_period/{feature}_{ctxt}_{ct}_fam_bin{binsize}.pdf'
        fig.savefig(pdf, transparent=True, bbox_inches='tight')
        plt.close(fig)

## Rank all MEs by sequence length

The analysis above only splits MEs into full-length and non-full-length groups. Here we take a closer look at how methylation differs with sequence length by binning the elements of each subfamily into length quantiles.

In [20]:
# Assign every element to a length quantile computed within its own subfamily.
df_len = ref_df[['id', 'subfamily', 'length']].copy()
# duplicates='drop': when many elements share the same length, several quantile
# edges coincide and pd.qcut would otherwise raise "Bin edges must be unique".
# Subfamilies with tied edges simply end up with fewer than 6 bins.
df_len['quantile'] = (
    df_len.groupby('subfamily')['length']
          .transform(lambda x: pd.qcut(x, 6, duplicates='drop'))
)

In [21]:
# Attach each element's length and length quantile to its mC rows. 'id' and
# 'subfamily' are the columns the two frames share, i.e. the keys an
# unqualified inner merge joins on.
df_quant = df.merge(df_len, how='inner', on=['id', 'subfamily'])

In [22]:
# Aggregate within each length quantile in addition to the usual keys.
# NOTE: this rebinds df_quant, so re-running this cell on its own feeds the
# already-aggregated result back into the aggregation.
quant_keys = ['ctxt','celltype','location','bin','subfamily','quantile']
df_quant = plot_mC.aggregate_mC_by_sum(df_quant, quant_keys)

In [23]:
# Move the index back into columns and expose the quantile interval as 'length'
df_quant = df_quant.reset_index().rename(columns={'quantile': 'length'})

In [24]:
# dfq = plot_mC.aggregate_mC_by_quantile(df, ref_df, 'length', 8)

# Heatmap of %mC by position (columns) and element-length quantile (rows),
# one PDF per (cell type, subfamily).

# Context shown in the heatmaps. The output file name says mCH, so restrict the
# data to that context; otherwise the pivot below would average across contexts.
hm_ctxt = 'mCH'

for ctype in celltypes:
    for fam in names:
        # Filter on the loop variable `ctype`. Filtering on a `ct` left over from
        # an earlier cell made every heatmap show the same cell type.
        df_hm = df_quant[(df_quant['celltype'] == ctype)
                         & (df_quant['subfamily'] == fam)
                         & (df_quant['ctxt'] == hm_ctxt)]
        fn = f'{outdir}/mC_by_length/{feature}_{hm_ctxt}_{ctype}_{fam}_len_rank_bin{binsize}.pdf'

        # Keep bins in [-1000, 1000) and reshape to length-quantile x bin
        hm = df_hm[(df_hm['bin'] >= -1000) & (df_hm['bin'] < 1000)]
        hm = hm.pivot_table(index='length',
                            columns='bin',
                            values='percent_mC',
                            observed=False)

        # Nothing to plot for this combination; min()/max() would fail below
        if hm.empty:
            print(f'No {hm_ctxt} data for {ctype} / {fam}; skipping heatmap')
            continue

        # Colour scale spans the observed range, rounded outwards to whole percents
        scale_min = np.floor(hm.min().min())
        scale_max = np.ceil(hm.max().max())

        # Label each row with the rounded-up midpoint of its length interval
        yticks = [int(np.ceil(intv.mid)) for intv in hm.index]

        fig, ax = plt.subplots(figsize=(12,6))
        sns.heatmap(hm,
                    vmin=scale_min,
                    vmax=scale_max,
                    cmap=cmr.torch,
                    xticklabels=10,
                    cbar_kws={'label': '%mC', 'ticks': [scale_min, scale_max]},
                    ax=ax)
        ax.set_yticklabels(labels=yticks, rotation=0)

        # Name the configured feature in the axis labels (these used to say "L1"
        # regardless of which feature was being plotted)
        ax.set_xlabel(f'Position relative to {feature} element (bp)', labelpad=15)
        ax.set_ylabel(f'Length of {feature} (bp)', labelpad=15)

        fig.savefig(fn, transparent=True, bbox_inches='tight')
        plt.close(fig)

        # ax = plot_mC.plot_mC_by_ME_length(df_hm, fn=fn, bound=1000, bs=binsize, me=feature)

In [26]:
# Test each context / cell type / location / subfamily / length-quantile
# profile for periodicity.
df_res = plot_mC.test_periodicity(df_quant, ['ctxt','celltype','location','subfamily','length'])

# Write one table per context so that the context in the file name matches the
# rows inside; previously the full table was saved under whatever value an
# earlier loop had left in `ctxt`.
# assumes the result keeps 'ctxt' as a column, as the grouping keys suggest — TODO confirm
for res_ctxt, res_rows in df_res.groupby('ctxt', sort=False, observed=True):
    res_rows.to_csv(f'{outdir}/mC_period/{feature}_{res_ctxt}_quant_bin{binsize}.tsv',
                    sep='\t', index=None)