In [16]:
import sys
import itertools as it
import os

import pysam
import numpy as np
import pandas as pd
from intervaltree import Interval, IntervalTree

import functions

In [2]:
LINEAR_COLUMNS = ('read_id', 
				  'chrom', 
				#   'start', 
				#   'end', 
				  'align_is_reverse',
				  'blocks', 
				  'cigar',
				  'read',
				  )
CIGARTUPLE_CODES = {0: 'M',
					1: 'I',
					2: 'D',
					3: 'N',
					4: 'S',
					5: 'H',
					6: 'P',
					7: '=',
					8: 'X',
					9: 'B',
					}

In [3]:
def infer_segments(row):
	if len(row['blocks']) == 1:
		return row['blocks']
	
	segs = []
	current_block = row['blocks'][0]
	for blocks_index in range(1, len(row['blocks'])):
		next_block = row['blocks'][blocks_index]
		cigar_index = 2*blocks_index - 1
		gap_type = row['cigar'][cigar_index][0]

		if gap_type == 'N':
			segs.append(current_block)
			current_block = next_block
		elif gap_type in ('I', 'D'):
			current_block = (current_block[0], next_block[1])
		else:
			raise RuntimeError(f"{row['blocks']}\n{row['cigar']}\n{blocks_index}")
	
	segs.append(current_block)
		
	return tuple(segs)


def infer_common_genes(df):
	gene_ids = []
	for i, seg_row in df.iterrows():
		features = seg_row['exons'].union(seg_row['introns'])
		seg_genes = set()
		for feat in features:
			seg_genes.update(feat.data['gene_id'])
		gene_ids.append(seg_genes)

	common_genes = set(gid for gid in gene_ids[0] if all(gid in set_genes for set_genes in gene_ids))
	return common_genes


def tree_covers_interval(tree:IntervalTree, interval:Interval) -> bool:
	total_coverage = False
	merged_tree = tree.copy()
	merged_tree.merge_overlaps(strict=False)
	for merged_interval in merged_tree:
		if merged_interval.contains_interval(interval):
			total_coverage = True
	
	return total_coverage

In [4]:
# # Get args
# ref_exons, ref_introns, output_base, log_level = sys.argv[1:]
# # Get logger
# log = functions.get_logger(log_level)
# log.debug(f'Args recieved: {sys.argv[1:]}')

ref_exons = '/Users/trumanmooney/Library/CloudStorage/OneDrive-BrownUniversity/Documents/Projects/Lariat_mapping/reference_data/hg38_ref/exons.tsv.gz'
ref_introns = '/Users/trumanmooney/Library/CloudStorage/OneDrive-BrownUniversity/Documents/Projects/Lariat_mapping/reference_data/hg38_ref/introns.tsv.gz'
output_base = '/Users/trumanmooney/Library/CloudStorage/OneDrive-BrownUniversity/Documents/Projects/Lariat_mapping/output/pipeline/C22-1_100k/'

In [5]:
introns = pd.read_csv(ref_introns, sep='\t')
introns['gene_id'] = introns.gene_id.transform(lambda gid: set(gid.split(',')))
introns['interval'] = introns.apply(lambda row: Interval(row['start'], row['end'], {'gene_id': row['gene_id'], 'strand': row['strand']}), axis=1)
introns = introns.groupby('chrom').interval.agg(IntervalTree).to_dict()

In [6]:
exons = pd.read_csv(ref_exons, sep='\t')
exons['gene_id'] = exons.gene_id.transform(lambda gid: set(gid.split(',')))
exons['interval'] = exons.apply(lambda row: Interval(row['start'], row['end'], {'gene_id': row['gene_id'], 'strand': row['strand']}), axis=1)
exons = exons.groupby('chrom').interval.agg(IntervalTree).to_dict()

In [7]:
linear_reads = []
for align in pysam.AlignmentFile(f'{output_base}mapped_reads.bam', 'rb'):
	linear_reads.append([
					align.query_name, 
					align.reference_name, 
					# align.reference_start,
					# align.reference_end,
					align.is_reverse,
					align.get_blocks(),
					align.cigartuples,
					align.is_read1,
					])

linear_reads = pd.DataFrame(linear_reads, columns=LINEAR_COLUMNS)

# Fix starting columns
linear_reads.read_id = linear_reads.read_id.astype('string')
linear_reads.chrom = linear_reads.chrom.astype('category')
linear_reads.cigar = linear_reads.cigar.transform(lambda cigar: tuple((CIGARTUPLE_CODES[op],length) for op, length in cigar))
linear_reads.read = linear_reads.read.map({True: '1', False: '2'}).astype('category')
linear_reads = linear_reads.sort_values(['read_id'])

linear_reads

Unnamed: 0,read_id,chrom,align_is_reverse,blocks,cigar,read
41499,NGSNJ-086:229:GW200110425th:1:1101:10004:11303,chr22,True,"[(43161160, 43161190), (43162802, 43162922)]","((M, 30), (N, 1612), (M, 120))",1
41500,NGSNJ-086:229:GW200110425th:1:1101:10004:11303,chr22,False,"[(43161094, 43161190), (43162802, 43162856)]","((M, 96), (N, 1612), (M, 54))",2
44307,NGSNJ-086:229:GW200110425th:1:1101:10004:11992,chr9,False,"[(9442096, 9442246)]","((M, 150),)",1
44308,NGSNJ-086:229:GW200110425th:1:1101:10004:11992,chr9,True,"[(9442107, 9442257)]","((M, 150),)",2
44642,NGSNJ-086:229:GW200110425th:1:1101:10004:12085,chr14,True,"[(49862683, 49862833)]","((M, 150),)",2
...,...,...,...,...,...,...
105542,NGSNJ-086:229:GW200110425th:1:1101:9995:27352,chr14,True,"[(32163959, 32164109)]","((M, 150),)",1
15935,NGSNJ-086:229:GW200110425th:1:1101:9995:4961,chr13,False,"[(48233242, 48233392)]","((M, 150),)",2
15934,NGSNJ-086:229:GW200110425th:1:1101:9995:4961,chr13,True,"[(48233331, 48233481)]","((M, 150),)",1
34263,NGSNJ-086:229:GW200110425th:1:1101:9995:9502,chr14,True,"[(49586704, 49586854)]","((M, 150),)",1


In [8]:
linear_reads['spliced'] = linear_reads.blocks.transform(lambda blocks: len(blocks)>1)
print(linear_reads.spliced.value_counts())
linear_reads['segs'] = linear_reads.apply(infer_segments, axis=1, result_type='reduce')
linear_reads = linear_reads.explode('segs', ignore_index=True)
linear_reads['seg'] = linear_reads.segs.transform(lambda segs: Interval(*segs))
linear_reads = linear_reads.drop(columns=['blocks', 'segs', 'cigar'])
linear_reads

spliced
False    82975
True     29645
Name: count, dtype: int64


Unnamed: 0,read_id,chrom,align_is_reverse,read,spliced,seg
0,NGSNJ-086:229:GW200110425th:1:1101:10004:11303,chr22,True,1,True,"(43161160, 43161190, None)"
1,NGSNJ-086:229:GW200110425th:1:1101:10004:11303,chr22,True,1,True,"(43162802, 43162922, None)"
2,NGSNJ-086:229:GW200110425th:1:1101:10004:11303,chr22,False,2,True,"(43161094, 43161190, None)"
3,NGSNJ-086:229:GW200110425th:1:1101:10004:11303,chr22,False,2,True,"(43162802, 43162856, None)"
4,NGSNJ-086:229:GW200110425th:1:1101:10004:11992,chr9,False,1,False,"(9442096, 9442246, None)"
...,...,...,...,...,...,...
142693,NGSNJ-086:229:GW200110425th:1:1101:9995:4961,chr13,False,2,False,"(48233242, 48233392, None)"
142694,NGSNJ-086:229:GW200110425th:1:1101:9995:4961,chr13,True,1,False,"(48233331, 48233481, None)"
142695,NGSNJ-086:229:GW200110425th:1:1101:9995:9502,chr14,True,1,False,"(49586704, 49586854, None)"
142696,NGSNJ-086:229:GW200110425th:1:1101:9995:9502,chr14,False,2,True,"(49563422, 49563424, None)"


In [9]:
for chrom in linear_reads.chrom.unique():
	if chrom not in exons.keys():
		print(f'No exons in {chrom}')
		continue
	chrom_exons = exons[chrom]
	linear_reads.loc[linear_reads.chrom==chrom, 'exons'] = linear_reads.loc[linear_reads.chrom==chrom, 'seg'].transform(chrom_exons.overlap)
	
	if chrom not in introns.keys():
		print(f'No introns in {chrom}')
		continue
	chrom_introns = introns[chrom]
	linear_reads.loc[linear_reads.chrom==chrom, 'introns'] = linear_reads.loc[linear_reads.chrom==chrom, 'seg'].transform(chrom_introns.overlap)

linear_reads.exons = linear_reads.exons.fillna('').transform(set)
linear_reads.introns = linear_reads.introns.fillna('').transform(set)
linear_reads

No introns in chrM


Unnamed: 0,read_id,chrom,align_is_reverse,read,spliced,seg,exons,introns
0,NGSNJ-086:229:GW200110425th:1:1101:10004:11303,chr22,True,1,True,"(43161160, 43161190, None)","{(43161051, 43161190, {'gene_id': {'ENSG000001...",{}
1,NGSNJ-086:229:GW200110425th:1:1101:10004:11303,chr22,True,1,True,"(43162802, 43162922, None)","{(43162802, 43163210, {'gene_id': {'ENSG000001...",{}
2,NGSNJ-086:229:GW200110425th:1:1101:10004:11303,chr22,False,2,True,"(43161094, 43161190, None)","{(43161051, 43161190, {'gene_id': {'ENSG000001...",{}
3,NGSNJ-086:229:GW200110425th:1:1101:10004:11303,chr22,False,2,True,"(43162802, 43162856, None)","{(43162802, 43163210, {'gene_id': {'ENSG000001...",{}
4,NGSNJ-086:229:GW200110425th:1:1101:10004:11992,chr9,False,1,False,"(9442096, 9442246, None)","{(9442059, 9442380, {'gene_id': {'ENSG00000265...","{(9397482, 9574731, {'gene_id': {'ENSG00000153..."
...,...,...,...,...,...,...,...,...
142693,NGSNJ-086:229:GW200110425th:1:1101:9995:4961,chr13,False,2,False,"(48233242, 48233392, None)","{(48233202, 48233477, {'gene_id': {'ENSG000001...","{(48232698, 48253806, {'gene_id': {'ENSG000001..."
142694,NGSNJ-086:229:GW200110425th:1:1101:9995:4961,chr13,True,1,False,"(48233331, 48233481, None)","{(48233202, 48233477, {'gene_id': {'ENSG000001...","{(48232698, 48253806, {'gene_id': {'ENSG000001..."
142695,NGSNJ-086:229:GW200110425th:1:1101:9995:9502,chr14,True,1,False,"(49586704, 49586854, None)","{(49586579, 49586878, {'gene_id': {'ENSG000002...","{(49586049, 49598399, {'gene_id': {'ENSG000002..."
142696,NGSNJ-086:229:GW200110425th:1:1101:9995:9502,chr14,False,2,True,"(49563422, 49563424, None)",{},{}


In [10]:
linear_reads['Intergenic'] = (linear_reads.exons.transform(len)==0) & (linear_reads.introns.transform(len)==0)
linear_reads['common_genes'] = linear_reads.read_id.map(linear_reads.groupby('read_id').apply(infer_common_genes))
print(linear_reads.common_genes.transform(len).value_counts())
linear_reads['exons_f'] = linear_reads.apply(lambda row: IntervalTree(exon for exon in row['exons'] if len(exon.data['gene_id'].intersection(row['common_genes']))>0), axis=1)
linear_reads['introns_f'] = linear_reads.apply(lambda row: IntervalTree(intron for intron in row['introns'] if len(intron.data['gene_id'].intersection(row['common_genes']))>0), axis=1)
linear_reads

  linear_reads['common_genes'] = linear_reads.read_id.map(linear_reads.groupby('read_id').apply(infer_common_genes))


common_genes
1    90038
2    39312
0    11667
3     1578
4       91
5       12
Name: count, dtype: int64


Unnamed: 0,read_id,chrom,align_is_reverse,read,spliced,seg,exons,introns,Intergenic,common_genes,exons_f,introns_f
0,NGSNJ-086:229:GW200110425th:1:1101:10004:11303,chr22,True,1,True,"(43161160, 43161190, None)","{(43161051, 43161190, {'gene_id': {'ENSG000001...",{},False,{ENSG00000100300.18},"[(43161051, 43161190, {'gene_id': {'ENSG000001...",[]
1,NGSNJ-086:229:GW200110425th:1:1101:10004:11303,chr22,True,1,True,"(43162802, 43162922, None)","{(43162802, 43163210, {'gene_id': {'ENSG000001...",{},False,{ENSG00000100300.18},"[(43162802, 43163210, {'gene_id': {'ENSG000001...",[]
2,NGSNJ-086:229:GW200110425th:1:1101:10004:11303,chr22,False,2,True,"(43161094, 43161190, None)","{(43161051, 43161190, {'gene_id': {'ENSG000001...",{},False,{ENSG00000100300.18},"[(43161051, 43161190, {'gene_id': {'ENSG000001...",[]
3,NGSNJ-086:229:GW200110425th:1:1101:10004:11303,chr22,False,2,True,"(43162802, 43162856, None)","{(43162802, 43163210, {'gene_id': {'ENSG000001...",{},False,{ENSG00000100300.18},"[(43162802, 43163210, {'gene_id': {'ENSG000001...",[]
4,NGSNJ-086:229:GW200110425th:1:1101:10004:11992,chr9,False,1,False,"(9442096, 9442246, None)","{(9442059, 9442380, {'gene_id': {'ENSG00000265...","{(9397482, 9574731, {'gene_id': {'ENSG00000153...",False,"{ENSG00000153707.19, ENSG00000265735.2}","[(9442059, 9442380, {'gene_id': {'ENSG00000265...","[(9397482, 9574731, {'gene_id': {'ENSG00000153..."
...,...,...,...,...,...,...,...,...,...,...,...,...
142693,NGSNJ-086:229:GW200110425th:1:1101:9995:4961,chr13,False,2,False,"(48233242, 48233392, None)","{(48233202, 48233477, {'gene_id': {'ENSG000001...","{(48232698, 48253806, {'gene_id': {'ENSG000001...",False,{ENSG00000136156.15},"[(48233202, 48233477, {'gene_id': {'ENSG000001...","[(48232698, 48253806, {'gene_id': {'ENSG000001..."
142694,NGSNJ-086:229:GW200110425th:1:1101:9995:4961,chr13,True,1,False,"(48233331, 48233481, None)","{(48233202, 48233477, {'gene_id': {'ENSG000001...","{(48232698, 48253806, {'gene_id': {'ENSG000001...",False,{ENSG00000136156.15},"[(48233202, 48233477, {'gene_id': {'ENSG000001...","[(48232698, 48253806, {'gene_id': {'ENSG000001..."
142695,NGSNJ-086:229:GW200110425th:1:1101:9995:9502,chr14,True,1,False,"(49586704, 49586854, None)","{(49586579, 49586878, {'gene_id': {'ENSG000002...","{(49586049, 49598399, {'gene_id': {'ENSG000002...",False,{},[],[]
142696,NGSNJ-086:229:GW200110425th:1:1101:9995:9502,chr14,False,2,True,"(49563422, 49563424, None)",{},{},True,{},[],[]


In [11]:
def classify_seg(row):
	if row['Intergenic'] is True:
		return 'Intergenic'
	
	if len(row['exons_f'])==0 and len(row['introns_f'])==0:
		return 'Ambiguous'
	
	if len(row['exons_f'])==0:
		if row['align_is_reverse'] is False and any(row['seg'].begin==intron.begin for intron in row['introns_f'] if intron.data['strand']=='+'):
			return "Starts at 5'ss"
		elif row['align_is_reverse'] is True and any(row['seg'].end==intron.end for intron in row['introns_f'] if intron.data['strand']=='-'):
			return "Starts at 5'ss"
		else:
			return 'Intronic'
	
	#TODO: Maybe move this inside next if and add after pre-mRNA check since segment could cross intron-exon junction AND get spliced at another exon junction
	if any([row['seg'].begin==exon.begin for exon in row['exons_f']]) or any([row['seg'].end==exon.end for exon in row['exons_f']]):
		return 'Exon junction'

	if len(row['introns_f'])==0:
		return 'Exonic'
	
	# At this point there's at least 1 filtered exon and 1 filtered intron
	# We need to see if it's pre-mRNA
	for exon, intron in it.product(row['exons_f'], row['introns_f']):
		if len(exon.data['gene_id'].intersection(intron.data['gene_id']))==0:
			continue
		
		# Check if we have an exon-to-intron junction
		if exon.end-1 == intron.begin:
			junc_spot = intron.begin
			exon_5bp = junc_spot-5
			intron_5bp = junc_spot+4

			# # Disregard if segment doesn't have at least 5bp on each side
			# if exon_5bp < row['seg'].begin or intron_5bp >= row['seg'].end:
			# 	continue
			# Disregard if last 5bp of exon overlaps any introns
			# Disregard if first 5bp of intron overlaps any exons
			# This accounts for alternative splice sites
			if len(row['introns_f'].overlap(exon_5bp, junc_spot)) > 0:
				continue
			if len(row['exons_f'].overlap(junc_spot, intron_5bp+1)) > 0:
				continue

			return 'pre-mRNA'
		
		# Check if we have an intron-to-exon junction
		elif intron.end-1 == exon.begin:
			junc_spot = exon.begin
			intron_5bp = junc_spot-5
			exon_5bp = junc_spot+4

			# # Disregard if segment doesn't have at least 5bp on each side
			# if intron_5bp < row['seg'].begin or exon_5bp >= row['seg'].end:
				# continue
			# Disregard if last 5bp of exon overlaps any introns
			# Disregard if first 5bp of intron overlaps any exons
			# This accounts for alternative splice sites
			if len(row['exons_f'].overlap(intron_5bp, junc_spot)) > 0:
				continue
			if len(row['introns_f'].overlap(junc_spot, exon_5bp+1)) > 0:
				continue

			return 'pre-mRNA'
		
	return 'Ambiguous'



linear_reads['seg_class'] = linear_reads.apply(classify_seg, axis=1)
print(linear_reads[['read_id', 'read', 'seg_class']].seg_class.value_counts())
linear_reads

seg_class
Exon junction     49030
Ambiguous         40435
Exonic            25832
Intronic          22269
Intergenic         5124
Starts at 5'ss        8
Name: count, dtype: int64


Unnamed: 0,read_id,chrom,align_is_reverse,read,spliced,seg,exons,introns,Intergenic,common_genes,exons_f,introns_f,seg_class
0,NGSNJ-086:229:GW200110425th:1:1101:10004:11303,chr22,True,1,True,"(43161160, 43161190, None)","{(43161051, 43161190, {'gene_id': {'ENSG000001...",{},False,{ENSG00000100300.18},"[(43161051, 43161190, {'gene_id': {'ENSG000001...",[],Exon junction
1,NGSNJ-086:229:GW200110425th:1:1101:10004:11303,chr22,True,1,True,"(43162802, 43162922, None)","{(43162802, 43163210, {'gene_id': {'ENSG000001...",{},False,{ENSG00000100300.18},"[(43162802, 43163210, {'gene_id': {'ENSG000001...",[],Exon junction
2,NGSNJ-086:229:GW200110425th:1:1101:10004:11303,chr22,False,2,True,"(43161094, 43161190, None)","{(43161051, 43161190, {'gene_id': {'ENSG000001...",{},False,{ENSG00000100300.18},"[(43161051, 43161190, {'gene_id': {'ENSG000001...",[],Exon junction
3,NGSNJ-086:229:GW200110425th:1:1101:10004:11303,chr22,False,2,True,"(43162802, 43162856, None)","{(43162802, 43163210, {'gene_id': {'ENSG000001...",{},False,{ENSG00000100300.18},"[(43162802, 43163210, {'gene_id': {'ENSG000001...",[],Exon junction
4,NGSNJ-086:229:GW200110425th:1:1101:10004:11992,chr9,False,1,False,"(9442096, 9442246, None)","{(9442059, 9442380, {'gene_id': {'ENSG00000265...","{(9397482, 9574731, {'gene_id': {'ENSG00000153...",False,"{ENSG00000153707.19, ENSG00000265735.2}","[(9442059, 9442380, {'gene_id': {'ENSG00000265...","[(9397482, 9574731, {'gene_id': {'ENSG00000153...",Ambiguous
...,...,...,...,...,...,...,...,...,...,...,...,...,...
142693,NGSNJ-086:229:GW200110425th:1:1101:9995:4961,chr13,False,2,False,"(48233242, 48233392, None)","{(48233202, 48233477, {'gene_id': {'ENSG000001...","{(48232698, 48253806, {'gene_id': {'ENSG000001...",False,{ENSG00000136156.15},"[(48233202, 48233477, {'gene_id': {'ENSG000001...","[(48232698, 48253806, {'gene_id': {'ENSG000001...",Ambiguous
142694,NGSNJ-086:229:GW200110425th:1:1101:9995:4961,chr13,True,1,False,"(48233331, 48233481, None)","{(48233202, 48233477, {'gene_id': {'ENSG000001...","{(48232698, 48253806, {'gene_id': {'ENSG000001...",False,{ENSG00000136156.15},"[(48233202, 48233477, {'gene_id': {'ENSG000001...","[(48232698, 48253806, {'gene_id': {'ENSG000001...",Ambiguous
142695,NGSNJ-086:229:GW200110425th:1:1101:9995:9502,chr14,True,1,False,"(49586704, 49586854, None)","{(49586579, 49586878, {'gene_id': {'ENSG000002...","{(49586049, 49598399, {'gene_id': {'ENSG000002...",False,{},[],[],Ambiguous
142696,NGSNJ-086:229:GW200110425th:1:1101:9995:9502,chr14,False,2,True,"(49563422, 49563424, None)",{},{},True,{},[],[],Intergenic


In [12]:
def classify_read(seg_rows:pd.DataFrame) -> str:
	classes_set = set(seg_rows['seg_class'])
	if len(classes_set)==1:
		if classes_set == {'Exon junction'}:
			if seg_rows.spliced.any():
				return 'mRNA'
			else:
				return 'Exonic'
		else:
			return classes_set.pop()
		
	# Now we know it's 2 classes at minimum
	
	if 'Intergenic' in classes_set:
		return 'Ambiguous'
	
	if classes_set in (
					{'Exon junction', 'Exonic',},
					{'Exon junction', 'Exonic', 'Intronic'},
					{'Exon junction', 'Ambiguous'},
					{'Exon junction', 'Exonic', 'Ambiguous'},
					{'Exon junction', 'Exonic', 'Intronic', 'Ambiguous'},
					):
		return 'mRNA'
	
	if classes_set in (
					{'pre-mRNA', 'Exonic'},
					{'pre-mRNA', 'Intronic'},
					{'pre-mRNA', 'Exonic', 'Intronic'},
					{'pre-mRNA', 'Ambiguous'},
					{'pre-mRNA', 'Exonic', 'Ambiguous'},
					{'pre-mRNA', 'Intronic', 'Ambiguous'},
					{'pre-mRNA', 'Exonic', 'Intronic', 'Ambiguous'},
					):
		return 'pre-mRNA'
	
	if classes_set in (
					{"Starts at 5'ss", 'Intronic'},
					{"Starts at 5'ss", 'Ambiguous'},
					{"Starts at 5'ss", 'Intronic', 'Ambiguous'},
					):
		return "Starts at 5'ss"
	
	return tuple(classes_set)
	# return 'Ambiguous'


# linear_reads.seg_class = linear_reads.seg_class.replace(['Ambiguous m', 'Ambiguous f'], 'Ambiguous')
linear_reads['read_class'] = linear_reads.read_id.map(linear_reads.groupby('read_id').apply(classify_read))
print(linear_reads[['read_id', 'read_class']].drop_duplicates().read_class.value_counts())
print(linear_reads.read_id.nunique())

read_class
Ambiguous                               19182
mRNA                                    13291
Exonic                                  11810
Intronic                                10594
Intergenic                               1639
(Intronic, Ambiguous)                     413
(Intronic, Exonic)                        302
(Ambiguous, Exonic)                       211
(Exonic, Ambiguous)                       202
(Intronic, Exon junction)                  77
Starts at 5'ss                              8
(Intronic, Exon junction, Ambiguous)        7
Name: count, dtype: int64
57736


  linear_reads['read_class'] = linear_reads.read_id.map(linear_reads.groupby('read_id').apply(classify_read))


In [13]:
linear_reads.read_class = linear_reads.read_class.transform(lambda rc: 'Ambiguous' if isinstance(rc, tuple) else rc)

In [14]:
linear_reads_sum = linear_reads.groupby(['read_id', 'read_class'], as_index=False).agg({'spliced': any})
linear_reads_sum['stage_reached'] = 'linear_map'
linear_reads_sum = linear_reads_sum[['read_id', 'read_class', 'stage_reached', 'spliced']]
assert linear_reads_sum.read_id.is_unique
print(linear_reads_sum.spliced.value_counts())
print(linear_reads_sum.read_class.value_counts())
linear_reads_sum

spliced
False    38653
True     19083
Name: count, dtype: int64
read_class
Ambiguous         20394
mRNA              13291
Exonic            11810
Intronic          10594
Intergenic         1639
Starts at 5'ss        8
Name: count, dtype: int64


Unnamed: 0,read_id,read_class,stage_reached,spliced
0,NGSNJ-086:229:GW200110425th:1:1101:10004:11303,mRNA,linear_map,True
1,NGSNJ-086:229:GW200110425th:1:1101:10004:11992,Ambiguous,linear_map,False
2,NGSNJ-086:229:GW200110425th:1:1101:10004:12085,Ambiguous,linear_map,False
3,NGSNJ-086:229:GW200110425th:1:1101:10004:13244,Ambiguous,linear_map,False
4,NGSNJ-086:229:GW200110425th:1:1101:10004:13589,Ambiguous,linear_map,False
...,...,...,...,...
57731,NGSNJ-086:229:GW200110425th:1:1101:9995:25723,Intronic,linear_map,False
57732,NGSNJ-086:229:GW200110425th:1:1101:9995:26318,Ambiguous,linear_map,False
57733,NGSNJ-086:229:GW200110425th:1:1101:9995:27352,Intergenic,linear_map,False
57734,NGSNJ-086:229:GW200110425th:1:1101:9995:4961,Ambiguous,linear_map,False


In [18]:
if os.path.isfile(f'{output_base}read_classes.tsv'):
	raise RuntimeError()
linear_reads_sum.to_csv(f'{output_base}read_classes.tsv', sep='\t', index=False)

RuntimeError: 