# Process RepeatMasker annotation file

## Merge hits corresponding to the same element

In [1]:
import pandas as pd
import numpy as np
import pybedtools
import seaborn as sns
import matplotlib.pyplot as plt

import sys
sys.path.insert(1, '../') 

from src import process_rmsk

In [2]:
# RepeatMasker interspersed-repeat annotation (BED6, autosomes only, no ALT contigs).
rmsk = "/home/AD/rkgadde/L1IP/RepeatMasker/hg38_noALT.interspersed.autosomes.bed"

# The file has no header row, so the BED6 column names are supplied here.
cols = ['chrom', 'start', 'end', 'name', 'score', 'strand']
df = pd.read_csv(rmsk, sep="\t", header=None, names=cols)

In [3]:
# Collapse hits that share a RepeatMasker ID (project helper; presumably one
# output row per element -- confirm in src/process_rmsk), then keep only the
# BED6 columns, in BED6 order.
df_merge = process_rmsk.merge_on_rmsk_id(df)[cols]

## Investigate poly-A/T tails in RepeatMasker annotation

RepeatMasker is not consistent about whether a poly-A tail is annotated separately as a simple repeat or included in the coordinates of the interspersed repeat it belongs to. From our understanding, poly-A tails surpassing a certain length are annotated separately, as `(A)n` or `(T)n` simple repeats. Here, we merge mobile elements with their separately annotated poly-A/T tails.

### Find elements with long poly-A tails and merge coordinates

In [4]:
# Simple repeats named like "(A)n", "(AAA)n", "(T)n", ... are candidate poly-A/T tails;
# every other row is treated as a mobile element (ME).
is_tail = df_merge['name'].str.contains(r'\(A+\)n|\(T+\)n', regex=True)
pA_df = df_merge[is_tail]
me_df = df_merge[~is_tail]

In [5]:
def get_3prime_region(df, n):
    """Return the n-bp window immediately downstream of each feature's 3' end.

    Parameters
    ----------
    df : pandas.DataFrame
        Features with at least 'start', 'end' and 'strand' columns.
        The input frame is not modified.
    n : int
        Length of the downstream window, in bp.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which '+' features span [end, end + n) and '-'
        features span [start - n, start). The minus-strand start is clamped
        at 0 so a feature near the beginning of a chromosome never yields a
        negative (invalid BED) coordinate. Rows whose strand is neither '+'
        nor '-' are returned unchanged.
    """
    df_out = df.copy()

    # Work from the untouched input coordinates and assign positionally, so the
    # result depends neither on statement order nor on the index being unique.
    start = df['start'].to_numpy()
    end = df['end'].to_numpy()
    on_plus = (df['strand'] == '+').to_numpy()
    on_minus = (df['strand'] == '-').to_numpy()

    # Plus strand: the 3' end is `end`, downstream is to the right.
    df_out.loc[on_plus, 'start'] = end[on_plus]
    df_out.loc[on_plus, 'end'] = end[on_plus] + n

    # Minus strand: the 3' end is `start`, downstream is to the left.
    df_out.loc[on_minus, 'end'] = start[on_minus]
    df_out.loc[on_minus, 'start'] = (start[on_minus] - n).clip(min=0)

    return df_out

In [6]:
# Candidate poly-A/T tails (simple repeats) as a BedTool.
simple = pybedtools.BedTool.from_dataframe(pA_df)

# 12 bp immediately downstream of each ME's 3' end.
me3 = pybedtools.BedTool.from_dataframe(get_3prime_region(me_df, 12))

# Tails overlapping an ME's 3' flank; each output line is the tail record
# (wa) followed by the flank record it overlaps (wb).
me3_simple = simple.intersect(me3, wa=True, wb=True)

In [None]:
# Each intersection row is a tail record (columns 0-5) followed by the 3' flank
# record of the ME it overlaps (columns 6-11).
me_intersect = me3_simple.to_dataframe()

# Tail coordinates and tail score, labelled with the ME's name and strand.
me_tails = me_intersect.iloc[:, [0, 1, 2, 9, 4, 11]] # tails that intersect 3' region of an ME
me_tails.columns = cols

# Original records of the MEs that have a tail. The (name, score) pairs are
# de-duplicated first: an ME overlapping more than one tail would otherwise be
# repeated by the merge and its own score counted several times when ME and
# tail rows are later summed per name.
tailed_mes = me_intersect.iloc[:, [9, 10]].drop_duplicates()
tailed_mes.columns = ['name', 'score']
tailed_mes = me_df.merge(tailed_mes, on=['name', 'score'])

In [8]:
print(len(me_tails), len(tailed_mes)) # check if each ME has only 1 tail

# Both frames are built from the same intersection rows, so matching lengths do
# not by themselves rule out an ME with several tails. Count tail rows whose ME
# name has already been seen and report them explicitly.
extra_tails = int(me_tails['name'].duplicated().sum())
if extra_tails:
    print(f"WARNING: {extra_tails} tail(s) belong to an ME that already has a tail")

6403 6403


In [9]:
# Stack tail rows and ME rows; a tail row carries the name and strand of its ME,
# so grouping on (name, chrom, strand) puts each ME together with its tail(s).
me_concat = pd.concat([me_tails, tailed_mes])

# merge ME with tail: widest span, summed score
me_merged = (
    me_concat
    .groupby(['name', 'chrom', 'strand'])
    .agg(start=('start', 'min'), end=('end', 'max'), score=('score', 'sum'))
    .reset_index()
)

In [None]:
# Tail-extended elements, back in BED6 column order.
with_tails = me_merged[cols]

# Elements that were not extended keep their original coordinates.
no_tails = me_df.loc[~me_df['name'].isin(with_tails['name'])]

# Full element set, ordered by chromosome name and then by coordinates.
me_coords = pd.concat([no_tails, with_tails]).sort_values(by=['chrom', 'start', 'end'])

In [11]:
# Tail-merged element coordinates, written as headerless tab-separated BED.
me_file = '/home/AD/rkgadde/L1IP/RepeatMasker/hg38_noALT.interspersed.merged.bed'
me_coords.to_csv(me_file, sep='\t', header=False, index=False)

# The tails themselves, each labelled with the name of the element it was merged into.
tail_file = '/home/AD/rkgadde/L1IP/RepeatMasker/hg38_noALT.tails.bed'
me_tails.to_csv(tail_file, sep='\t', header=False, index=False)

## Filter elements by length and subfamily

In [12]:
# Keep elements whose score column exceeds 80 (per the section heading the score
# presumably holds element length -- confirm against process_rmsk).
me_coords = me_coords.loc[me_coords['score'].gt(80)]

In [13]:
# Subfamily is the element name minus its trailing "_<number>" suffix.
# `assign` builds a new frame instead of writing into a filtered one (avoids
# SettingWithCopyWarning), and expand=False makes str.extract return a Series
# rather than a one-column DataFrame.
me_coords = me_coords.assign(
    subfamily=me_coords['name'].str.extract(r'(.*)_[0-9]+', expand=False)
)

In [14]:
# Subfamilies retained for the analysis.
subfamilies = [
    'L1HS', 'L1PA2', 'L1PA3', 'L1PA4', 'L1PA5',
    'AluJo', 'AluSx', 'AluY', 'AluYa5', 'AluYb8',
]
me_coords = me_coords.loc[me_coords['subfamily'].isin(subfamilies)]

In [15]:
# Number of retained elements per subfamily, most frequent first.
me_coords['subfamily'].value_counts()

subfamily
AluSx     298768
AluY      101744
AluJo      12713
L1PA4      10085
L1PA5       9181
L1PA3       7982
L1PA2       4400
AluYa5      3570
AluYb8      3168
L1HS        1442
Name: count, dtype: int64

## Remove elements within 1kb of another element

In [16]:
# Split by family using the element name: names containing "L1" vs names containing "Alu".
l1 = me_coords.loc[me_coords['name'].str.contains('L1')]
alu = me_coords.loc[me_coords['name'].str.contains('Alu')]

In [17]:
# Apply the project's proximity filter with a 1000 bp distance, first to the L1
# set and then to the Alu set (each family is filtered on its own).
l1_rm, alu_rm = (
    process_rmsk.remove_nearby_MEs(family, 1000) for family in (l1, alu)
)

In [18]:
# L1 elements left after the proximity filter, counted per subfamily
# (subfamily re-derived from the name by dropping the trailing "_<number>").
l1_rm['name'].str.extract(r'(.*)_[0-9]+').value_counts()

0    
L1PA4    7569
L1PA5    7146
L1PA3    5391
L1PA2    3052
L1HS      870
Name: count, dtype: int64

In [19]:
# Alu elements left after the proximity filter, counted per subfamily
# (subfamily re-derived from the name by dropping the trailing "_<number>").
alu_rm['name'].str.extract(r'(.*)_[0-9]+').value_counts()

0     
AluSx     160243
AluY       58199
AluJo       6471
AluYa5      2461
AluYb8      2075
Name: count, dtype: int64

## Remove polymorphic elements

In [20]:
# Directory used for the polymorphic-element lists and the BED files derived from them.
vardir = '/home/AD/rkgadde/L1IP/mC_data/CZI/type/vars'

In [21]:
# Names of reference elements reported as polymorphic, read as strings.
# ndmin=1 keeps the result one-dimensional even when a file holds a single
# name; without it np.loadtxt returns a 0-d array, which Series.isin rejects.
poly_l1 = np.loadtxt(f'{vardir}/polymorphic_refL1.txt', dtype=str, ndmin=1)
poly_alu = np.loadtxt(f'{vardir}/polymorphic_refAlu.txt', dtype=str, ndmin=1)

In [22]:
# Non-polymorphic sets: drop every element whose name is in the polymorphic list.
np_l1 = l1_rm.loc[~l1_rm['name'].isin(poly_l1)]
np_alu = alu_rm.loc[~alu_rm['name'].isin(poly_alu)]

In [23]:
# Save the non-polymorphic sets as headerless, tab-separated files.
np_l1.to_csv(
    f'{vardir}/non-polymorphic_refL1.bed', sep='\t', header=False, index=False
)
np_alu.to_csv(
    f'{vardir}/non-polymorphic_refAlu.bed', sep='\t', header=False, index=False
)

## Subsample non-polymorphic elements

In [24]:
def subsample_elements(df, random_state=0):
    """Draw the same number of rows from every subfamily.

    Every subfamily is sampled down, without replacement, to the size of the
    smallest subfamily in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Elements with a 'subfamily' column.
    random_state : int, optional
        Seed passed to ``DataFrame.sample`` for each subfamily. The default
        of 0 reproduces the previously hard-coded seed.

    Returns
    -------
    pandas.DataFrame
        Sampled rows, grouped by subfamily in order of first appearance in
        ``df``. Rows with a missing subfamily are ignored. If ``df`` has no
        rows with a subfamily, an empty frame with the same columns is
        returned.
    """
    counts = df['subfamily'].value_counts()
    if counts.empty:
        # Nothing to sample; pd.concat would raise on an empty list.
        return df.iloc[0:0].copy()

    n_sample = int(counts.min())

    # unique() keeps first-appearance order; dropna() skips missing labels,
    # which can never match an equality test and so would select no rows.
    samples = [
        df[df['subfamily'] == fam].sample(n=n_sample, random_state=random_state)
        for fam in df['subfamily'].dropna().unique()
    ]
    return pd.concat(samples)

In [25]:
# Balance subfamily sizes within each family (L1 first, then Alu).
sample_l1, sample_alu = (
    subsample_elements(elements) for elements in (np_l1, np_alu)
)

In [27]:
# Save the balanced subsamples as headerless, tab-separated files.
sample_l1.to_csv(
    f'{vardir}/subsample_refL1.bed', sep='\t', header=False, index=False
)
sample_alu.to_csv(
    f'{vardir}/subsample_refAlu.bed', sep='\t', header=False, index=False
)