In [2]:
#CONDA ENV base (python 3.9.12)
#Import packages
#---------------------------------------
import json
import sys

import numpy as np
import pandas as pd
import pyranges as pr

#Import your modules
#---------------------------------------
sys.path.insert(1, '/cndd3/dburrows/CODE/te_ageing/')
import te_rna_f as te
sys.path.insert(1, '/cndd3/dburrows/CODE/admin_tools/')
from admin_tools import admin_functions as adm



'3.9.12 (main, Apr  5 2022, 06:56:58) \n[GCC 7.5.0]'

In [23]:
#Read in required files for filtering
#Column layout shared by the plus- and minus-strand annotation bed files
bed_cols = ['Chromosome', 'Start', 'End', 'Strand', 'transcript_id', 'gene_id', 'family_id', 'class_id']

#Use a context manager so the config file handle is closed once parsed
with open('config.json') as config_file:
    js = json.load(config_file)

bed_pl = pd.read_csv(js['bed_plus_path'], sep='\t', header=None)
bed_pl.columns = bed_cols
bed_mi = pd.read_csv(js['bed_minus_path'], sep='\t', header=None)
bed_mi.columns = bed_cols

#Read alignments for each strand as plain DataFrames
bam_pl = pr.read_bam(snakemake.input.bam_pl, as_df=True)
bam_mi = pr.read_bam(snakemake.input.bam_mi, as_df=True)
#Swap Start + End for minus strand
#Going through an array copy makes the swap independent of whether pandas
#overwrites column data in place, and keeps both columns at their original positions
bam_mi[['Start', 'End']] = bam_mi[['End', 'Start']].to_numpy()

#File checks: every row of each strand-specific file must carry the matching strand label
assert (bam_pl['Strand'] == '+').all(), 'Some non plus strands assigned to plus bam'
assert (bam_mi['Strand'] == '-').all(), 'Some non minus strands assigned to minus bam'
assert (bed_pl['Strand'] == '+').all(), 'Some non plus strands assigned to plus bed'
assert (bed_mi['Strand'] == '-').all(), 'Some non minus strands assigned to minus bed'


In [24]:
#Define + and - strand files
#Each entry bundles: [bed annotations, bam reads, UMI metadata path, strand label]
par_list = [
    [bed_pl, bam_pl, snakemake.input.meta_pl, 'plus'],
    [bed_mi, bam_mi, snakemake.input.meta_mi, 'minus'],
]
pl_pars, mi_pars = par_list


In [26]:
pd.options.mode.chained_assignment = None  # default='warn'
count_df = pd.DataFrame() #empty count matrix
bam_ll = [[],[]] #empty list of lists to store curr_bam indices

# Filter out reads that do not overlap with 5' portion of insertion
# Each par_list entry is [bed annotations, bam reads, UMI metadata path, strand label]
for x, (curr_bed, curr_bam, meta_path, strand_name) in enumerate(par_list):
    curr_name = pd.read_csv(meta_path, sep='\t', header=None)
    assert len(curr_bam) == len(curr_name), 'Bam and metadata files not the same length'
    curr_bam['UMI'] = curr_name[0].values #Add UMI column to bam file

    #Loop through each chromosome
    #(named `chrom` so the builtin chr() is not shadowed in the kernel namespace)
    for chrom in np.unique(curr_bam['Chromosome'].values):
        print('Aligning to chromosome ' + chrom + ' for ' + strand_name + ' strand')

        #Slice bed/bam files by chromosome
        chr_bam = curr_bam[curr_bam['Chromosome'] == chrom]
        chr_bed = curr_bed[curr_bed['Chromosome'] == chrom]
        #count_df and bam_ll[x] are passed in and replaced by the returned values each iteration
        count_df, bam_ll[x] = te.five_prime_align(chr_bam, chr_bed, count_df, bam_ll[x])

#Add in CPMs as a column
#The snakemake object exposes inputs as `snakemake.input` (singular), as used everywhere else here
total_reads = pd.read_csv(snakemake.input.n_reads, sep=" ", header=None)[0].values[0]
count_df['CPM'] = count_df['Count'].values / total_reads * 1000000

Aligning to chromosome chr1 for plus strand
Aligning to chromosome chr10 for plus strand
Aligning to chromosome chr11 for plus strand
Aligning to chromosome chr12 for plus strand
Aligning to chromosome chr13 for plus strand
Aligning to chromosome chr14 for plus strand
Aligning to chromosome chr15 for plus strand
Aligning to chromosome chr16 for plus strand
Aligning to chromosome chr17 for plus strand
Aligning to chromosome chr18 for plus strand
Aligning to chromosome chr19 for plus strand
Aligning to chromosome chr2 for plus strand
Aligning to chromosome chr20 for plus strand
Aligning to chromosome chr21 for plus strand
Aligning to chromosome chr22 for plus strand
Aligning to chromosome chr3 for plus strand
Aligning to chromosome chr4 for plus strand
Aligning to chromosome chr5 for plus strand
Aligning to chromosome chr6 for plus strand
Aligning to chromosome chr7 for plus strand
Aligning to chromosome chr8 for plus strand
Aligning to chromosome chr9 for plus strand
Aligning to chromos

In [30]:
#Save counts matrix
#Tab-separated output, row index omitted
count_df.to_csv(
    path_or_buf=snakemake.output.count_mat,
    index=False,
    sep='\t',
)

Unnamed: 0,Chromosome,Start,End,Strand,transcript_id,gene_id,family_id,class_id,Count,CPM
59460,chr1,240195735,240195755,+,AluJo_dup6673,AluJo,Alu,SINE,1.0,0.003852
21778,chr1,1357526,1357546,+,AluSx1_dup79,AluSx1,Alu,SINE,1.0,0.003852
49347,chr1,172601207,172601227,+,AluSz_dup6301,AluSz,Alu,SINE,1.0,0.003852
34049,chr1,46158806,46158826,+,AluSx3_dup949,AluSx3,Alu,SINE,1.0,0.003852
55356,chr1,216510603,216510623,+,AluY_dup7600,AluY,Alu,SINE,1.0,0.003852
...,...,...,...,...,...,...,...,...,...,...
473243,chrY,14747792,14747812,-,AluJr_dup46073,AluJr,Alu,SINE,1.0,0.003852
472889,chrY,12405168,12405188,-,AluJr_dup45997,AluJr,Alu,SINE,1.0,0.003852
473140,chrY,13759996,13760016,-,AluSx_dup63231,AluSx,Alu,SINE,1.0,0.003852
472578,chrY,8521782,8521802,-,AluYe5_dup763,AluYe5,Alu,SINE,1.0,0.003852


In [27]:
# Write out the UMI metadata rows whose read indices were NOT collected in bam_ll
# NOTE(review): the original comment called these "start sites to remove" - confirm
# against te.five_prime_align what the indices stored in bam_ll represent.
def _rows_not_listed(meta_path, n_reads, listed_idx):
    """Return the metadata rows whose positions are absent from listed_idx.

    meta_path  : path to a tab-separated, header-less metadata file
    n_reads    : number of reads in the matching bam table (positions 0..n_reads-1)
    listed_idx : positions collected during alignment; may be a numpy array or
                 the initial empty Python list if no chromosome was processed
    """
    #np.asarray accepts both lists and arrays, so an untouched [] no longer breaks on .astype
    listed_idx = np.asarray(listed_idx, dtype=int)
    #Symmetric difference with the full range of positions; this is the complement
    #of listed_idx as long as every listed position lies inside that range
    unlisted_idx = np.setxor1d(np.arange(0, n_reads), listed_idx)
    return pd.read_csv(meta_path, sep='\t', header=None).iloc[unlisted_idx]

pl_umi = _rows_not_listed(snakemake.input.meta_pl, len(bam_pl), bam_ll[0])
mi_umi = _rows_not_listed(snakemake.input.meta_mi, len(bam_mi), bam_ll[1])
np.savetxt(snakemake.output.meta_pl,  pl_umi, fmt='%s')
np.savetxt(snakemake.output.meta_mi,  mi_umi, fmt='%s')
