In [1]:
import gzip
import itertools as it
import os

import pysam
import numpy as np
import pandas as pd
from intervaltree import Interval, IntervalTree

In [2]:
# Every (read_class, stage_reached) label pair this notebook can assign,
# listed in the same order the cells below assign them.
CLASS_AND_STEP = (
	('Lariat', 'Lariat filtering'),
	('Circularized intron', 'Lariat filtering'),
	('Template-switching', 'Lariat filtering'),
	('In repetitive region', 'Lariat filtering'),
	("Unmapped, with 5'ss alignment", 'Trimmed alignment filtering'),
	("Unmapped, with 5'ss alignment", 'Trimmed read mapping'),
	("Unmapped, with 5'ss alignment", "5'ss alignment filtering"),
	("Unmapped", "5'ss mapping"),
)

In [3]:
def exclude_classed_reads(read_ids: np.ndarray, read_classes) -> np.ndarray:
	"""Return the entries of ``read_ids`` that have not been classified yet.

	Prints the number of IDs before and after filtering.

	Args:
		read_ids: 1-D array-like of read IDs to filter.
		read_classes: iterable of rows whose first element is an
			already-classified read ID (e.g. ``[read_id, read_class, stage]``).

	Returns:
		A numpy array holding the elements of ``read_ids``, in their original
		order, whose ID is not the first element of any row in ``read_classes``.
	"""
	# Accept any array-like (list, pandas array, ...) so boolean indexing below always works
	read_ids = np.asarray(read_ids)
	print('Starting:', len(read_ids))

	# Set membership is linear in the number of IDs and, unlike
	# np.isin(..., assume_unique=True), stays correct when either side holds duplicates
	classed_rids = {row[0] for row in read_classes}
	keep = np.fromiter(
		(rid not in classed_rids for rid in read_ids),
		dtype=bool,
		count=len(read_ids),
	)
	read_ids = read_ids[keep]
	print('Post-filter:', len(read_ids))

	return read_ids

In [4]:
# Disabled command-line argument parsing and logger setup (appears to be left over
# from a script version of this step — confirm before re-enabling):
# # Get args
# ref_exons, ref_introns, output_base, log_level = sys.argv[1:]
# # Get logger
# log = functions.get_logger(log_level)
# log.debug(f'Args received: {sys.argv[1:]}')

# Hard-coded inputs used instead of the command-line arguments above.
# NOTE(review): these are absolute paths on one machine; the notebook will not run elsewhere without editing them.
# NOTE(review): ref_exons, ref_introns and single_end are not referenced in the cells visible here.
ref_exons = '/Users/trumanmooney/Library/CloudStorage/OneDrive-BrownUniversity/Documents/Projects/Lariat_mapping/reference_data/hg38_ref/exons.tsv.gz'
ref_introns = '/Users/trumanmooney/Library/CloudStorage/OneDrive-BrownUniversity/Documents/Projects/Lariat_mapping/reference_data/hg38_ref/introns.tsv.gz'
single_end = False
# Must end with a path separator: the cells below build file names as f'{output_base}<file name>'
output_base = '/Users/trumanmooney/Library/CloudStorage/OneDrive-BrownUniversity/Documents/Projects/Lariat_mapping/output/pipeline/C22-1_100k/'

In [5]:
# Accumulator of [read_id, read_class, stage_reached] rows, extended by each of the following cells
read_classes = list()

In [6]:
# Every unique read ID listed in lariat_reads.tsv is labelled 'Lariat'
lariat_ids = pd.read_csv(f'{output_base}lariat_reads.tsv', sep='\t', usecols=[0])['read_id'].unique()
lariat = [[rid, 'Lariat', 'Lariat filtering'] for rid in lariat_ids]
read_classes += lariat

In [7]:
# Every unique read ID listed in circularized_intron_reads.tsv is labelled 'Circularized intron'
circular_ids = pd.read_csv(f'{output_base}circularized_intron_reads.tsv', sep='\t', usecols=[0])['read_id'].unique()
circular = [[rid, 'Circularized intron', 'Lariat filtering'] for rid in circular_ids]
read_classes += circular

In [8]:
# Every unique read ID listed in template_switching_reads.tsv is labelled 'Template-switching'
temp_switch_ids = pd.read_csv(f'{output_base}template_switching_reads.tsv', sep='\t', usecols=[0])['read_id'].unique()
temp_switch = [[rid, 'Template-switching', 'Lariat filtering'] for rid in temp_switch_ids]
read_classes += temp_switch

In [9]:
# Load all failed lariat alignments and keep the read IDs whose filter_failed value is 'in_repeat'
failed_lariat_alignments = pd.read_csv(f'{output_base}failed_lariat_alignments.tsv', sep='\t')
is_in_repeat = failed_lariat_alignments['filter_failed'] == 'in_repeat'
in_repeat_ids = failed_lariat_alignments.loc[is_in_repeat, 'read_id'].unique()
# Skip reads that were already given a class by an earlier cell
in_repeat_ids = exclude_classed_reads(in_repeat_ids, read_classes)
lariat_failed = [[rid, 'In repetitive region', 'Lariat filtering'] for rid in in_repeat_ids]
read_classes += lariat_failed

Starting: 0
Post-filter: 0


In [10]:
# Read IDs from failed_trimmed_alignments.tsv
trim_failed_ids = pd.read_csv(f'{output_base}failed_trimmed_alignments.tsv', sep='\t', usecols=[0])['read_id']
# Drop the last 6 characters of each ID (presumably a mate/orientation suffix — TODO confirm) before de-duplicating
trim_failed_ids = trim_failed_ids.str[:-6].unique()
# Skip reads that were already given a class by an earlier cell
trim_failed_ids = exclude_classed_reads(trim_failed_ids, read_classes)
trim_failed = [[rid, "Unmapped, with 5'ss alignment", 'Trimmed alignment filtering'] for rid in trim_failed_ids]
read_classes += trim_failed

Starting: 2567
Post-filter: 1462


In [11]:
# Read IDs from fivep_info_table.tsv
fivep_passed_ids = pd.read_csv(f'{output_base}fivep_info_table.tsv', sep='\t', usecols=[0])['read_id']
# Drop the last 6 characters of each ID (presumably a mate/orientation suffix — TODO confirm) before de-duplicating
fivep_passed_ids = fivep_passed_ids.str[:-6].unique()
# Skip reads that were already given a class by an earlier cell
fivep_passed_ids = exclude_classed_reads(fivep_passed_ids, read_classes)
fivep_passed = [[rid, "Unmapped, with 5'ss alignment", 'Trimmed read mapping'] for rid in fivep_passed_ids]
read_classes += fivep_passed

Starting: 5769
Post-filter: 2868


In [12]:
# Read IDs from failed_fivep_alignments.tsv
fivep_failed_ids = pd.read_csv(f'{output_base}failed_fivep_alignments.tsv', sep='\t', usecols=[0])['read_id']
# Drop the last 2 characters of each ID (presumably a mate suffix — TODO confirm) before de-duplicating
fivep_failed_ids = fivep_failed_ids.str[:-2].unique()
# Skip reads that were already given a class by an earlier cell
fivep_failed_ids = exclude_classed_reads(fivep_failed_ids, read_classes)
fivep_failed = [[rid, "Unmapped, with 5'ss alignment", "5'ss alignment filtering"] for rid in fivep_failed_ids]
read_classes += fivep_failed

Starting: 4606
Post-filter: 160


In [13]:
# Collect the unique read IDs found in the FASTA of unmapped reads
unmapped_reads = set()
# os.path.join gives the same file whether or not output_base ends with a separator
with open(os.path.join(output_base, 'unmapped_reads.fa')) as r:
	for line in r:
		# Only header lines carry a read ID; selecting them by the '>' prefix keeps
		# wrapped (multi-line) sequences and blank lines from being read as IDs
		if not line.startswith('>'):
			continue
		# Drop the leading '>' and the last 2 characters of the header
		# (presumably the mate suffix — TODO confirm)
		rid = line.rstrip('\n')[1:-2]
		unmapped_reads.add(rid)

unmapped_reads = np.asarray(list(unmapped_reads))
# Total number of unique unmapped reads; a later cell asserts that every one of them received a class
og_count = len(unmapped_reads)
# Whatever has not been classified by an earlier cell is labelled plain 'Unmapped'
unmapped_reads = exclude_classed_reads(unmapped_reads, read_classes)
unmapped_reads = [[read_id, "Unmapped", "5'ss mapping"] for read_id in unmapped_reads]
read_classes.extend(unmapped_reads)

Starting: 45116
Post-filter: 39187


In [14]:
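# NOTE(review): disabled pairwise-overlap check. It calls read_classes.keys(), but read_classes
# is a list of rows in this notebook, so it would not run as written.
# for group_a, group_b in it.combinations(read_classes.keys(), 2):
	# assert any(np.isin(read_classes[group_a], read_classes[group_b], assume_unique=True)) is False, f'{group_a} and {group_b}'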

In [15]:
# Turn the accumulated rows into a table and add an all-NaN 'spliced' column
read_classes = pd.DataFrame(
	read_classes,
	columns=['read_id', 'read_class', 'stage_reached'],
).assign(spliced=np.nan)

# Summarise how many reads fall in each class and each stage
for summary_col in ('read_class', 'stage_reached'):
	print(read_classes[summary_col].value_counts())
read_classes

read_class
Unmapped                         39187
Unmapped, with 5'ss alignment     4490
Template-switching                1433
Lariat                               6
Name: count, dtype: int64
stage_reached
5'ss mapping                   39187
Trimmed read mapping            2868
Trimmed alignment filtering     1462
Lariat filtering                1439
5'ss alignment filtering         160
Name: count, dtype: int64


Unnamed: 0,read_id,read_class,stage_reached,spliced
0,NGSNJ-086:229:GW200110425th:1:1101:15582:16830,Lariat,Lariat filtering,
1,NGSNJ-086:229:GW200110425th:1:1101:2166:13448,Lariat,Lariat filtering,
2,NGSNJ-086:229:GW200110425th:1:1101:24551:5713,Lariat,Lariat filtering,
3,NGSNJ-086:229:GW200110425th:1:1101:30291:23797,Lariat,Lariat filtering,
4,NGSNJ-086:229:GW200110425th:1:1101:5394:15374,Lariat,Lariat filtering,
...,...,...,...,...
45111,NGSNJ-086:229:GW200110425th:1:1101:7726:17284,Unmapped,5'ss mapping,
45112,NGSNJ-086:229:GW200110425th:1:1101:25518:27712,Unmapped,5'ss mapping,
45113,NGSNJ-086:229:GW200110425th:1:1101:15745:10535,Unmapped,5'ss mapping,
45114,NGSNJ-086:229:GW200110425th:1:1101:4119:15013,Unmapped,5'ss mapping,


In [16]:
# Sanity checks before anything is written to disk
assert read_classes.read_id.is_unique, 'At least one read ID was assigned more than one class'
assert len(read_classes) == og_count, (
	f'{len(read_classes)} classified reads but {og_count} unique reads in unmapped_reads.fa'
)
# The rows are appended without a header (mode='a', header=False), so the target file must already exist
if not os.path.isfile(f'{output_base}read_classes.tsv'):
	raise RuntimeError(f'{output_base}read_classes.tsv does not exist; refusing to append rows to a file with no header')

In [17]:
# Append the classified reads to the existing read_classes.tsv: no header, no index, NaN written as 'N/A'
read_classes.to_csv(
	f'{output_base}read_classes.tsv',
	sep='\t',
	mode='a',
	header=False,
	index=False,
	na_rep='N/A',
)