In [57]:
import os
import subprocess
from glob import glob
from multiprocessing import Pool

import pandas as pd, numpy as np
from tqdm import tqdm

# Get the coverage of RNA reads around each L1 element

In [58]:
# Load the TE annotation BED; it is read as a headerless tab-separated table,
# with the column names supplied here.
TEs = pd.read_csv(
    '/cndd3/dburrows/DATA/annotations/rmsk/rmsk.hg38.filt-5ptrim.merge.rerr.bed',
    sep='\t',
    names=['chr', 'start', 'end', 'TEtype', 'TEfamily', 'strand', 'TEclass', 'length', 'TSS_start', 'TSS_end'],
)

In [59]:
# Number of annotated elements whose family is exactly L1HS
TEs.loc[TEs['TEfamily'] == 'L1HS', 'TEfamily'].value_counts()
# TEs[(TEs['TEtype']=='L1')&(TEs['TEfamily'].str.contains('L1PA') | TEs['TEfamily'].str.contains('L1HS'))]['TEfamily'].value_counts()

L1HS    177
Name: TEfamily, dtype: int64

In [66]:
# L1 = TEs.loc[TEs['TEfamily'].isin(['L1HS']) & (TEs['chr']!='chrY') & (~TEs['chr'].str.contains('_'))].reset_index()
# Keep L1HS and L1PA* elements, dropping chrY and any sequence whose name contains '_'.
# reset_index() keeps the original TEs row label as an 'index' column.
L1 = (
    TEs
    .loc[lambda te: (te['TEfamily'].eq('L1HS') | te['TEfamily'].str.startswith('L1PA'))
                    & te['chr'].ne('chrY')
                    & ~te['chr'].str.contains('_')]
    .reset_index()
)
L1

Unnamed: 0,index,chr,start,end,TEtype,TEfamily,strand,TEclass,length,TSS_start,TSS_end
0,0,chr1,440936,447357,L1,L1PA7,+,LINE,6463,440886,440986
1,1,chr1,675912,682333,L1,L1PA7,+,LINE,6463,675862,675962
2,2,chr1,25506586,25512707,L1,L1PA2,+,LINE,6019,25506536,25506636
3,4,chr1,30493390,30499556,L1,L1PA4,+,LINE,6151,30493340,30493440
4,5,chr1,33671998,33678118,L1,L1PA4,+,LINE,6151,33671948,33672048
...,...,...,...,...,...,...,...,...,...,...,...
5668,474909,chrX,155299576,155305601,L1,L1PA2,-,LINE,6031,155305551,155305651
5669,474910,chrX,155481358,155487841,L1,L1PA13,-,LINE,7597,155487791,155487891
5670,474911,chrX,155516017,155522048,L1,L1HS,-,LINE,6031,155521998,155522098
5671,474912,chrX,155912364,155918771,L1,L1PA7,-,LINE,6465,155918721,155918821


In [61]:
binsize=1000   # bin width (bp)
flank=50000    # distance (bp) tiled on each side of the L1 anchor coordinate

# Bin offsets relative to the anchor, in element orientation: -flank, ..., flank-binsize
offsets=np.arange(-flank,flank,binsize)
n_bins=len(offsets)

# Genomic start of every bin: one row per L1 element, one column per offset.
# '+' elements are anchored on `start` and tiled left-to-right; every other element is
# anchored on `end` and tiled right-to-left, so offset 0 is always the first bin inside
# the element: [start, start+binsize) or [end-binsize, end).
# NOTE(review): bins are not clipped to chromosome bounds; an element closer than `flank`
# to a chromosome start would produce negative coordinates -- confirm none occur.
is_plus=(L1['strand']=='+').to_numpy()[:,None]
bin_starts=np.where(is_plus,
                    L1['start'].to_numpy()[:,None]+offsets,
                    L1['end'].to_numpy()[:,None]-offsets-binsize).ravel()

# Per-bin copies of the element-level fields, in the same (element, offset) order as bin_starts
bin_chr=pd.Series(np.repeat(L1['chr'].to_numpy(),n_bins))
bin_L1_start=pd.Series(np.repeat(L1['start'].to_numpy(),n_bins))
bin_offset=pd.Series(np.tile(offsets,len(L1)))

# BED6-style layout: chr, start, end, name, '-' placeholder, strand
L1_bins=pd.DataFrame({
  'chr':bin_chr,
  'start':bin_starts,
  'end':bin_starts+binsize,
  # id = <chr>_<L1 start>_bin<offset>, e.g. chr1_440936_bin-50000
  'L1_id':bin_chr+'_'+bin_L1_start.astype(str)+'_bin'+bin_offset.astype(str),
  '-':'-',
  'strand':np.repeat(L1['strand'].to_numpy(),n_bins),
})

# Every element must contribute exactly n_bins rows
assert len(L1_bins)==len(L1)*n_bins

In [62]:
# Order the bins by chromosome name, then by bin start coordinate
L1_bins = L1_bins.sort_values(by=['chr', 'start'])

In [63]:
# Write the bins as a tab-separated file with no header row and no index column.
L1_bins.to_csv('L1_bins.tsv',sep='\t',header=False,index=False)
# Re-sort with bedtools, passing hg38.chroms (read from the working directory) as the genome file.
!bedtools sort -g hg38.chroms -i L1_bins.tsv > L1_bins.sorted.tsv
# Merge the sorted bins; my_bedcov passes the merged file to `samtools view -L`.
!bedtools merge -i L1_bins.sorted.tsv > L1_bins.merged.bed

In [64]:
def my_bedcov(samplepath,qthresh=30):
  """Count strand-specific read coverage over the L1 bins for one sample.

  Two files are written to the working directory:
  ``L1_bins.{sample}.q{qthresh}.sense.coverage.bed`` and
  ``L1_bins.{sample}.q{qthresh}.antisense.coverage.bed``.
  Each file is the concatenation of two ``bedtools coverage`` passes over
  ``L1_bins.sorted.tsv``: first-in-pair reads (``samtools view -f64``) followed by
  second-in-pair reads (``-f128``). The bins therefore appear once per pass in each file.

  "sense" counts read 1 on the same strand as the bin (``-s``) plus read 2 on the
  opposite strand (``-S``); "antisense" swaps the two strand options.
  NOTE(review): this mapping assumes read 1 has the orientation of the transcript --
  confirm against the library prep.

  Parameters
  ----------
  samplepath : str
      Path whose last component is the sample name. Only the name is used; the BAM
      path is rebuilt under /cndd3/dburrows/DATA/te/rna/PE.bam/. A trailing '/' is ignored.
  qthresh : int, default 30
      Minimum mapping quality handed to ``samtools view -q``.

  Raises
  ------
  RuntimeError
      If any samtools | bedtools pipeline exits with a non-zero status.
  """
  sample=samplepath.rstrip('/').split('/')[-1]
  bam=f'/cndd3/dburrows/DATA/te/rna/PE.bam/{sample}/Aligned.sortedByCoord.out.bam'

  # orientation -> ((samtools mate filter, bedtools strand option), ...) for its two passes
  layout=(
    ('sense',    (('-f64','-s'),('-f128','-S'))),
    ('antisense',(('-f64','-S'),('-f128','-s'))),
  )
  for orientation,mate_passes in layout:
    outfile=f'L1_bins.{sample}.q{qthresh}.{orientation}.coverage.bed'
    for k,(mate_flag,strand_opt) in enumerate(mate_passes):
      # first pass truncates the output file, second pass appends to it
      redirect='>' if k==0 else '>>'
      cmd=(f'/usr/bin/samtools view -q {qthresh} -M {mate_flag} -b -L L1_bins.merged.bed {bam} | '
           f'bedtools coverage -sorted -c {strand_opt} -a L1_bins.sorted.tsv -b - {redirect} {outfile}')
      # pipefail: a samtools failure must not be masked by bedtools exiting cleanly
      status=subprocess.run(['bash','-o','pipefail','-c',cmd]).returncode
      if status!=0:
        raise RuntimeError(f'coverage pipeline failed with exit status {status} for sample {sample!r}: {cmd}')


In [65]:
# Run my_bedcov over every Sample* directory with a worker pool; imap yields results
# in input order, so tqdm advances as each sample's result is collected.
samples = glob('/cndd3/dburrows/DATA/te/rna/PE.bam/Sample*')
with Pool() as p:
  x = list(
    tqdm(
      p.imap(my_bedcov, samples),
      total=len(samples),
    )
  )

100%|██████████| 116/116 [07:32<00:00,  3.90s/it]


In [50]:
samples=glob('/cndd3/dburrows/DATA/te/rna/PE.bam/Sample*')
qthresh=30

def my_libsize(sample):
  """Write the count of alignments passing the MAPQ threshold for one sample.

  Runs ``samtools view -q {qthresh} -c`` on ``{sample}/Aligned.sortedByCoord.out.bam``
  and stores its output in ``{sample_name}.q{qthresh}.libsize.txt`` in the working
  directory. ``qthresh`` is read from the module-level variable.
  NOTE(review): ``samtools view -c`` counts alignment records rather than fragments --
  confirm this is the intended library-size definition.

  Parameters
  ----------
  sample : str
      Path to the sample directory; its last component is used as the sample name.

  Raises
  ------
  RuntimeError
      If samtools exits with a non-zero status.
  """
  sample_name=sample.rstrip('/').split('/')[-1]
  bam=f'{sample}/Aligned.sortedByCoord.out.bam'
  outfile=f'{sample_name}.q{qthresh}.libsize.txt'
  # Argument list plus a Python-side redirect: paths are never parsed by a shell
  with open(outfile,'w') as out:
    status=subprocess.run(['samtools','view','-q',str(qthresh),'-c',bam],stdout=out).returncode
  if status!=0:
    raise RuntimeError(f'samtools view -c failed with exit status {status} for {bam}')

with Pool() as p:
  list(tqdm(p.imap(my_libsize, samples),total=len(samples)))

100%|██████████| 116/116 [1:50:04<00:00, 56.94s/it] 
