# Step 3: Plot binned methylation around MEs

In [122]:
import pandas as pd
# import pyarrow
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
from statsmodels.stats.multitest import fdrcorrection

import sys
# add path to src module - note this is not a Pythonic solution
sys.path.insert(1, '../') 

from src import plot_mC

# Font type 42 makes the PDF backend embed TrueType fonts instead of the
# default Type 3, so text in saved PDFs stays editable in vector editors.
mpl.rcParams['pdf.fonttype'] = 42
# plt.rc('axes', labelsize=12)

In [123]:
# Run parameters read by the cells below.
# me_type: mobile-element family; later cells handle 'L1' and 'Alu'.
me_type = 'L1'
var_type = 'absence' # 'absence' or 'insertion'; only the first three letters are used in file names
# binsize: bin width used to pick the binned input file and the plot limits
# (presumably in bp - confirm against the binning step).
binsize = 100

# Input and output roots (absolute paths on the analysis host).
pdir = '/home/AD/rkgadde/L1IP/mC_data/CZI/type'
outdir = '/home/AD/rkgadde/L1IP/results/CZI/plots/polymorphic'

# Tab-separated variant table for the chosen element family / variant type.
me_file = f'{pdir}/vars/all_{me_type}_{var_type[:3]}.tsv'

In [124]:
# Short tag used in input/output file names, e.g. 'L1_abs'.
var = var_type[:3]
variant = f'{me_type}_{var}'

# Minimum svlen for a variant to be treated as full-length, per element family.
full_length_min = {'L1': 6000, 'Alu': 280}

# Fail loudly on an unsupported family: otherwise `fl` would be left undefined
# (or, in a live kernel, silently keep a stale value from an earlier run).
if me_type not in full_length_min:
    raise ValueError(
        f"Unsupported me_type {me_type!r}; expected one of {sorted(full_length_min)}"
    )
fl = full_length_min[me_type]

## Load data

In [125]:
# Read the variant table with explicit column names; because names are
# supplied, pandas treats the first line of the file as data.
me_columns = ['chrom','start','end','id','svlen','strand','mei','het','alt']
me_df = pd.read_csv(me_file, sep='\t', names=me_columns)

In [126]:
%%time

# Read the binned mC table for the selected bin size and variant set
binned_mc_path = f'{pdir}/mC/binc-{binsize}.{variant}.all_samples.parquet.gz'
df = pd.read_parquet(binned_mc_path)
# df['percent_mC'] = (df['mc'] / df['cov']) * 100

# Label each row by the sign of its bin coordinate; rows with bin == 0
# are not touched by either assignment.
is_upstream = df['bin'].lt(0)
is_downstream = df['bin'].gt(0)
df.loc[is_upstream, 'location'] = 'upstream'
df.loc[is_downstream, 'location'] = 'downstream'
df.head()

CPU times: user 519 ms, sys: 130 ms, total: 649 ms
Wall time: 280 ms


Unnamed: 0,id,bin,sample,celltype,mc,cov,ctxt,genotype,location
0,CZI_abs_1543,-1950,YM2,L4-5IT_RORB_ARHGAP15,3,3,mCG,NoL1,upstream
1,CZI_abs_1543,-1850,YM2,L4-5IT_RORB_ARHGAP15,6,6,mCG,NoL1,upstream
2,CZI_abs_1543,-1450,YM2,L4-5IT_RORB_ARHGAP15,6,6,mCG,NoL1,upstream
3,CZI_abs_1543,-1350,YM2,L4-5IT_RORB_ARHGAP15,7,7,mCG,NoL1,upstream
4,CZI_abs_1543,-650,YM2,L4-5IT_RORB_ARHGAP15,1,1,mCG,NoL1,upstream


In [127]:
# Split loci into full-length (svlen >= fl) and everything else (truncated)
fl_ids = me_df.loc[me_df['svlen'] >= fl, 'id'].unique()
is_full_length = df['id'].isin(fl_ids)
df_fl = df[is_full_length]
df_tr = df[~is_full_length]

In [128]:
# Pool counts across loci for every (context, cell type, bin, genotype)
# combination, separately for all / full-length / truncated variants.
# TODO: Decide whether to aggregate by summing or averaging
# TODO: Decide whether to implement coverage filter
gt_group_cols = ['ctxt','celltype','bin','genotype']
df_gt_all, df_gt_fl, df_gt_tr = (
    # a fresh list per call, as in the three separate calls this replaces
    plot_mC.aggregate_mC_by_sum(subset, list(gt_group_cols))
    for subset in (df, df_fl, df_tr)
)

In [129]:
# Preview the aggregated table (indexed by ctxt, celltype, bin, genotype).
df_gt_all.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,mc,cov,percent_mC
ctxt,celltype,bin,genotype,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
mCG,CGE_ADARB2_ADAM33,-1950,Het,1830,2624,69.740854
mCG,CGE_ADARB2_ADAM33,-1950,L1,1378,1558,88.446727
mCG,CGE_ADARB2_ADAM33,-1950,NoL1,1300,1523,85.357846
mCG,CGE_ADARB2_ADAM33,-1850,Het,1898,2182,86.984418
mCG,CGE_ADARB2_ADAM33,-1850,L1,1922,2110,91.090047


In [130]:
# Number of distinct loci in the full-length subset vs. the whole table
n_loci_fl = df_fl['id'].nunique()
n_loci_all = df['id'].nunique()
print('Full-length: ', n_loci_fl)
print('All: ', n_loci_all)

Full-length:  50
All:  122


In [131]:
# Unique (locus, sample, genotype) combinations, then a tally per genotype
gt_per_loci = df.loc[:, ['id','sample','genotype']].drop_duplicates()
gt_per_loci['genotype'].value_counts()

genotype
Het     506
NoL1    490
L1      346
Name: count, dtype: int64

## Plot mC for each genotype

In [132]:
# Plot window and tick spacing per element family.
# 'Alu' is checked first so that, as with the two independent `if`s this
# replaces, a name containing both substrings ends up with the Alu settings.
if 'Alu' in me_type:
    flank, tick_step = 1000, 180
elif 'L1' in me_type:
    flank, tick_step = 2000, 360
else:
    # Without this, xlims/xticks would be undefined (or stale from an earlier
    # run in a live kernel) and the plotting cells would fail or mis-plot.
    raise ValueError(f"Unsupported me_type {me_type!r}; expected a name containing 'L1' or 'Alu'")

# Two x-ranges, one per side, leaving a gap of one bin width around position 0.
xlims = [(-flank, -binsize/2), (binsize/2, flank)]
# Eleven ticks centred on 0: -5*step ... +5*step
# (same values as np.arange(-1800, 2160, 360) for L1 and np.arange(-900, 1080, 180) for Alu).
xticks = np.arange(-5 * tick_step, 6 * tick_step, tick_step)

In [133]:
# Distinct cell types and methylation contexts present in the binned table
celltypes, contexts = (df[col].unique() for col in ('celltype', 'ctxt'))

### Full-length variants

In [134]:
# for ct in celltypes:
#     for ctxt in contexts:
#         labels = {'xlabel': f'Position relative to {me_type} {var_type} (bp)',
#                   'ylabel': f'%{ctxt}',
#                   'title': ct}
#         pdf = f'{outdir}/mC_by_genotype/fl_{variant}_{ctxt}_{ct}_gt_bin{binsize}.pdf'
#         df_plot_fl = df_gt_fl.loc[ctxt, ct].reset_index()

#         fig = plot_mC.plot_mC_by_genotype(df_plot_fl, me_type, binsize, xlims, xticks, labels, pdf)

### Non-full-length variants

In [135]:
# for ct in celltypes:
#     for ctxt in contexts:
#         labels = {'xlabel': f'Position relative to {me_type} {var_type} (bp)',
#                   'ylabel': f'%{ctxt}',
#                   'title': ct}
#         pdf = f'{outdir}/mC_by_genotype/tr_{variant}_{ctxt}_{ct}_gt_bin{binsize}.pdf'
#         df_plot_tr = df_gt_tr.loc[ctxt, ct].reset_index()

#         fig = plot_mC.plot_mC_by_genotype(df_plot_tr, me_type, binsize, xlims, xticks, labels, pdf)

### All variants

In [136]:
# One genotype-stratified profile per (cell type, context), saved as a PDF
# under {outdir}/mC_by_genotype/.
for ct in celltypes:
    for ctxt in contexts:
        labels = dict(
            xlabel=f'Position relative to {me_type} {var_type} (bp)',
            ylabel=f'%{ctxt}',
            title=ct,
        )
        pdf = f'{outdir}/mC_by_genotype/all_{variant}_{ctxt}_{ct}_gt_bin{binsize}.pdf'

        # Select this context / cell type; bin and genotype become columns
        df_plot_all = df_gt_all.loc[ctxt, ct].reset_index()

        fig = plot_mC.plot_mC_by_genotype(
            df_plot_all, me_type, binsize, xlims, xticks, labels, pdf
        )

## Test periodicity of each genotype

In [104]:
# Aggregate per (context, cell type, side, bin, genotype), then run the
# periodicity test for each (context, cell type, side, genotype) series.
df_test = plot_mC.aggregate_mC_by_sum(df, ['ctxt','celltype','location','bin','genotype'])
df_res = plot_mC.test_periodicity(df_test, ['ctxt','celltype','location','genotype'])

# Write one table per context. The file name carries the context, so take it
# from the results themselves rather than from a loop variable left over from
# an earlier cell (which named the file after whichever context was iterated
# last while the table held every context, and raised NameError if that cell
# had not been run).
for res_ctxt, df_res_ctxt in df_res.groupby('ctxt', sort=False):
    df_res_ctxt.to_csv(f'{outdir}/mC_period/{variant}_{res_ctxt}_gt_bin{binsize}.tsv', sep='\t', index=None)

In [105]:
# Bar plot of periodicity amplitude per side and genotype, one PDF per
# (cell type, context) under {outdir}/mC_period/.
genotype_order = [f'No{me_type}','Het',me_type]
genotype_colors = ['turquoise','orange','mediumvioletred']

for ct in celltypes:
    for ctxt in contexts:
        in_panel = (df_res['celltype'] == ct) & (df_res['ctxt'] == ctxt)
        df_bar = df_res[in_panel]

        fig, ax = plt.subplots()
        sns.barplot(df_bar,
                    x='location', y='amplitude',
                    order=['upstream', 'downstream'],
                    hue='genotype',
                    hue_order=genotype_order,
                    palette=genotype_colors,
                    ax=ax)

        # Park the legend outside the axes, to the right
        sns.move_legend(ax, 'center left', bbox_to_anchor=(1,0.5))
        ax.set_title(ct)

        pdf = f'{outdir}/mC_period/{variant}_{ctxt}_{ct}_gt_bin{binsize}.pdf'
        fig.savefig(pdf, transparent=True, bbox_inches='tight')
        # Release the figure so the loop does not accumulate open figures
        plt.close(fig)

## Plot paired mCH for each locus

There are fewer CG sites than CH sites in the genome, so the mCG signal is too noisy to examine at individual loci.

In [106]:
# Paired per-locus mCH comparison across genotypes; runs only when the
# table contains the mCH context.
if 'mCH' in contexts:
    df_ch = df[df['ctxt'] == 'mCH']
    df_gt_locus = plot_mC.aggregate_mC_by_sum(df_ch, ['id','ctxt','bin','celltype','genotype'])
    # Keep only groups whose summed coverage exceeds 20% of the bin size
    df_gt_locus = df_gt_locus[df_gt_locus['cov'] > (binsize*0.2)]

    # Rows for the bin labelled -binsize/2 (presumably the bin immediately
    # upstream of the element - confirm against the binning step), reshaped
    # to one column per genotype.
    bin_up = df_gt_locus.xs(-binsize/2, level='bin', drop_level=False)['percent_mC']
    bin_up = bin_up.unstack('genotype')

    # Cell types that still have at least one locus after the coverage filter
    celltypes_with_loci = set(bin_up.index.get_level_values('celltype'))

    for ct in celltypes:
        # .xs raises KeyError for a cell type with no remaining rows, which
        # would abort every plot after it; skip such cell types instead.
        if ct not in celltypes_with_loci:
            print(f'Skipping {ct}: no mCH loci pass the coverage filter in bin {-binsize/2}')
            continue

        df_gt_pair = bin_up.xs(ct, level='celltype', drop_level=False)
        pdf = f'{outdir}/mC_by_locus/paired_{variant}_mCH_{ct}_bin{binsize}.pdf'
        fig = plot_mC.plot_paired_mC(df_gt_pair, me_type, 'mCH', ct, pdf)