In [1]:
from dataclasses import dataclass, field
import os, sys, subprocess
import copy
import statistics
import math
import time
import datetime as dt
import itertools as it
import random
import string
import gzip

import pysam
import numpy as np
import pandas as pd
import pyfaidx
from intervaltree import Interval, IntervalTree

In [2]:
##### Constants #####

# Column names for the per-alignment DataFrame built from mapped_reads.bam.
# The order must match the order of the fields appended for each alignment.
# 'read' initially receives the alignment's is_read1 flag and is later mapped to '1'/'2'.
LINEAR_COLUMNS = ('read_id', 
				  'chrom', 
				#   'start', 
				#   'end', 
				  'blocks', 
				  'cigar',
				  'read',
				  )
# Translates the integer operation codes found in pysam `cigartuples`
# into their single-letter CIGAR symbols.
CIGARTUPLE_CODES = {0: 'M',
					1: 'I',
					2: 'D',
					3: 'N',
					4: 'S',
					5: 'H',
					6: 'P',
					7: '=',
					8: 'X',
					9: 'B',
					}

In [3]:
##### Functions #####
def parse_attributes(attribute_string:str, anno_type:str) -> dict:
	'''
	Parse the attributes column (9th field) of a GTF or GFF-style annotation line.

	Args:
		attribute_string: raw attribute text, e.g. 'gene_id "X"; gene_name "Y";' (GTF)
			or 'ID=X;Name=Y' (GFF-style).
		anno_type: 'gtf' selects the GTF parser; any other value selects the 'key=value;' parser.

	Returns:
		dict mapping attribute name -> value (str). The key 'tags' always holds a list:
		every repeated GTF 'tag' attribute, or the comma-split GFF 'tag' value
		(an empty list when there is none). In the GFF branch the raw 'tag' string is kept as well.
	'''
	if anno_type == 'gtf':
		attributes, tags = {}, []
		for attr in attribute_string.rstrip('";').split('; '):
			# Split on the FIRST space only so quoted values may themselves contain spaces
			attr_name, _, attr_val = attr.lstrip().partition(' ')
			if not attr_name:
				# Empty piece (e.g. empty attribute string)
				continue
			attr_val = attr_val.strip().strip('"')
			if attr_name == 'tag':
				tags.append(attr_val)
			else:
				attributes[attr_name] = attr_val
		attributes['tags'] = tags
	else:
		attributes = {}
		for attr in attribute_string.split(';'):
			# Split on the FIRST '=' only so values containing '=' are kept whole
			attr_name, _, attr_val = attr.partition('=')
			attr_name = attr_name.lstrip()
			if not attr_name:
				# Empty piece produced by a trailing or doubled ';'
				continue
			attributes[attr_name] = attr_val
		attributes['tags'] = attributes['tag'].split(',') if 'tag' in attributes else []

	return attributes


def parse_gene_info(ref_anno:str) -> dict:
	'''
	Collect gene name and gene type for every gene that has a 'transcript' feature.

	Args:
		ref_anno: path to an annotation file, optionally gzipped ('.gz').
			The extension before '.gz' (or the last extension) is passed to
			parse_attributes as the annotation type.

	Returns:
		dict mapping gene_id -> [gene_name, gene_type].

	Raises:
		KeyError: if a transcript line lacks 'gene_id', 'gene_name' or 'gene_type'.
		ValueError: if a non-comment, non-blank line does not have exactly 9 tab-separated fields.
	'''
	name_parts = ref_anno.split('.')
	if name_parts[-1] == 'gz':
		opener = gzip.open
		anno_type = name_parts[-2] if len(name_parts) > 1 else ''
	else:
		opener, anno_type = open, name_parts[-1]

	genes = {}
	# The context manager closes the file even if a malformed line raises
	with opener(ref_anno, 'rt') as in_file:
		for line in in_file:
			# Skip comment/header lines and blank lines
			if line[0] == '#' or not line.strip():
				continue
			_, _, feature, _, _, _, _, _, attributes = line.strip().split('\t')
			if feature == 'transcript':
				attributes = parse_attributes(attributes, anno_type)
				genes[attributes['gene_id']] = [attributes[e] for e in ['gene_name', 'gene_type']]

	return genes


def parse_introns(ref_introns:str, min_length:int=20) -> dict:
	'''
	Parse a 6-column, tab-separated BED file of introns (optionally gzipped).

	Args:
		ref_introns: path to the BED file; a '.gz' extension triggers gzip decoding.
		min_length: only introns strictly longer than this (end - start) are kept.

	Returns a dict formatted as follows:
	{Chromosome: IntervalTree(Interval(StartPosition(int), EndPosition(int), {"strand": "+" or "-", "gene_id": [GeneID, ...]}))}

	Gene IDs are taken from the name column: the last ';'-separated field is split on '|'
	and each piece is cut at its first '-'. Introns with identical
	chromosome/strand/start/end are added only once.
	'''
	opener = gzip.open if ref_introns.split('.')[-1] == 'gz' else open

	introns, introns_done = {}, set()
	# The context manager closes the file even if a malformed line raises
	with opener(ref_introns, 'rt') as intron_file:
		for line in intron_file:
			line = line.strip()
			if not line:
				continue
			chrom, start, end, intron_info, _, strand = line.split('\t')
			if chrom not in introns:
				introns[chrom] = IntervalTree()
			intron_id = '{}_{}_{}_{}'.format(chrom, strand, start, end)
			if intron_id not in introns_done:
				start, end = int(start), int(end)
				if end-start > min_length:
					intron_genes = [i.split('-')[0] for i in intron_info.split(';')[-1].split('|')]
					introns[chrom].add(Interval(start, end, {'strand': strand, 'gene_id':intron_genes}))
					introns_done.add(intron_id)

	return introns


def parse_exons(ref_exons:str, min_length:int=20) -> dict:
	'''
	Parse a 6-column, tab-separated BED file of exons (optionally gzipped).

	Args:
		ref_exons: path to the BED file; a '.gz' extension triggers gzip decoding.
		min_length: only exons strictly longer than this (end - start) are kept.

	Returns a dict formatted as follows:
	{Chromosome: IntervalTree(Interval(StartPosition(int), EndPosition(int), {"strand": "+" or "-", "gene_id": [GeneID, ...]}))}

	Gene IDs are taken from the name column: the last ';'-separated field is split on '|'
	and each piece is cut at its first '-'. Exons with identical
	chromosome/strand/start/end are added only once.
	'''
	opener = gzip.open if ref_exons.split('.')[-1] == 'gz' else open

	exons, exons_done = {}, set()
	# The context manager closes the file even if a malformed line raises
	with opener(ref_exons, 'rt') as exon_file:
		for line in exon_file:
			line = line.strip()
			if not line:
				continue
			chrom, start, end, exon_info, _, strand = line.split('\t')
			if chrom not in exons:
				exons[chrom] = IntervalTree()
			exon_id = '{}_{}_{}_{}'.format(chrom, strand, start, end)
			if exon_id not in exons_done:
				start, end = int(start), int(end)
				if end-start > min_length:
					exon_genes = [i.split('-')[0] for i in exon_info.split(';')[-1].split('|')]
					exons[chrom].add(Interval(start, end, {'strand': strand, 'gene_id':exon_genes}))
					exons_done.add(exon_id)

	return exons


def tree_covers_interval(tree:IntervalTree, interval:Interval) -> bool:
	'''
	Report whether `interval` is contained in a single span of `tree`
	once the tree's intervals have been merged with merge_overlaps(strict=False).

	The input tree is not modified; merging is done on a copy.
	'''
	flattened = tree.copy()
	flattened.merge_overlaps(strict=False)

	return any(span.contains_interval(interval) for span in flattened)

In [4]:
##### Settings #####
# Show up to 100 columns/rows when a DataFrame is displayed
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

# Reference inputs: gene annotation plus BED files of introns and exons
ref_anno='/datasets2/lariat_mapping/reference_files/human/ref_dir_a/annotation.gtf'
ref_introns='/datasets2/lariat_mapping/reference_files/human/ref_dir_a/introns.bed'
ref_exons='/datasets2/lariat_mapping/reference_files/human/ref_dir_a/exons.bed'
# Directory holding the output of one pipeline run; exactly one line should be active.
# NOTE: keep the trailing '/', later cells append file names directly (f'{output_base}lariat_reads.tsv').
# output_base='/datasets2/lariat_mapping/testing/output/C22-1/'
# output_base='/datasets2/lariat_mapping/testing/output/HEK293T-1/'
# output_base='/datasets2/lariat_mapping/testing/output/C22-1_10m/'
# output_base='/datasets2/lariat_mapping/testing/output/HEK293T-1_10m/'
output_base='/datasets2/lariat_mapping/testing/100k_truncs/C22-1/'
# output_base='/datasets2/lariat_mapping/testing/100k_truncs/HEK293T-1/'
print(output_base)

/datasets2/lariat_mapping/testing/100k_truncs/C22-1/


In [5]:
# genes: {gene_id: [gene_name, gene_type]}
genes = parse_gene_info(ref_anno)
print(list(genes.items())[0])
# introns / exons: {chromosome: IntervalTree}; peek at a few chr1 entries as a sanity check
introns = parse_introns(ref_introns)
print(list(introns['chr1'])[:3])
exons = parse_exons(ref_exons)
print(list(exons['chr1'])[:3])

('ENSG00000290825.1', ['DDX11L2', 'lncRNA'])
[Interval(15118904, 15120971, {'strand': '-', 'gene_id': ['ENSG00000175147.13']}), Interval(111977008, 111981620, {'strand': '-', 'gene_id': ['ENSG00000171385.11']}), Interval(61251369, 61253410, {'strand': '-', 'gene_id': ['ENSG00000237853.6', 'ENSG00000237853.6', 'ENSG00000237853.6', 'ENSG00000237853.6']})]
[Interval(86569023, 86571408, {'strand': '-', 'gene_id': ['ENSG00000236915.3']}), Interval(35018512, 35019602, {'strand': '-', 'gene_id': ['ENSG00000163867.17']}), Interval(155562072, 155562286, {'strand': '+', 'gene_id': ['ENSG00000235919.5']})]


In [6]:
# Add gene types to introns, then to exons: each interval's data dict gains a
# 'gene_type' set with the types of all genes listed under its 'gene_id'
for feature_trees in (introns, exons):
	for chrom_tree in feature_trees.values():
		for feature in chrom_tree:
			feature.data['gene_type'] = {genes[gene_id][1] for gene_id in feature.data['gene_id']}
	print(list(feature_trees['chr1'])[:3])

[Interval(15118904, 15120971, {'strand': '-', 'gene_id': ['ENSG00000175147.13'], 'gene_type': {'lncRNA'}}), Interval(111977008, 111981620, {'strand': '-', 'gene_id': ['ENSG00000171385.11'], 'gene_type': {'protein_coding'}}), Interval(61251369, 61253410, {'strand': '-', 'gene_id': ['ENSG00000237853.6', 'ENSG00000237853.6', 'ENSG00000237853.6', 'ENSG00000237853.6'], 'gene_type': {'lncRNA'}})]
[Interval(86569023, 86571408, {'strand': '-', 'gene_id': ['ENSG00000236915.3'], 'gene_type': {'lncRNA'}}), Interval(35018512, 35019602, {'strand': '-', 'gene_id': ['ENSG00000163867.17'], 'gene_type': {'protein_coding'}}), Interval(155562072, 155562286, {'strand': '+', 'gene_id': ['ENSG00000235919.5'], 'gene_type': {'lncRNA'}})]


In [7]:
# Load every mapped alignment as one record; field order must match LINEAR_COLUMNS.
# The context manager closes the BAM handle once iteration finishes (or fails).
linear_reads = []
with pysam.AlignmentFile(os.path.join(output_base, 'mapped_reads.bam'), 'rb') as bam:
	for align in bam:
		if align.is_unmapped:
			continue
		linear_reads.append([
						align.query_name, 
						align.reference_name, 
						align.get_blocks(),
						align.cigartuples,
						align.is_read1,
						])

linear_reads = pd.DataFrame(linear_reads, columns=LINEAR_COLUMNS)
# Translate numeric CIGAR op codes into letters: ((0, 30), (3, 1612)) -> (('M', 30), ('N', 1612))
linear_reads['cigar'] = linear_reads['cigar'].transform(lambda cigar: tuple((CIGARTUPLE_CODES[op],length) for op, length in cigar))
# Turn the is_read1 flag into a mate label and append it so each mate has its own ID
linear_reads['read'] = linear_reads['read'].map({True: '1', False: '2'})
linear_reads['read_id'] = linear_reads.read_id + '/' + linear_reads.read
# Guards against multiple alignment records for the same mate
assert linear_reads.read_id.is_unique
linear_reads = linear_reads.sort_values(['read_id'])
linear_reads

Unnamed: 0,read_id,chrom,blocks,cigar,read
38756,NGSNJ-086:229:GW200110425th:1:1101:10004:11303/1,chr22,"[(43161160, 43161190), (43162802, 43162922)]","((M, 30), (N, 1612), (M, 120))",1
38757,NGSNJ-086:229:GW200110425th:1:1101:10004:11303/2,chr22,"[(43161094, 43161190), (43162802, 43162856)]","((M, 96), (N, 1612), (M, 54))",2
41412,NGSNJ-086:229:GW200110425th:1:1101:10004:11992/1,chr9,"[(9442096, 9442246)]","((M, 150),)",1
41413,NGSNJ-086:229:GW200110425th:1:1101:10004:11992/2,chr9,"[(9442107, 9442257)]","((M, 150),)",2
41720,NGSNJ-086:229:GW200110425th:1:1101:10004:12085/1,chr14,"[(49862579, 49862729)]","((M, 150),)",1
...,...,...,...,...,...
98699,NGSNJ-086:229:GW200110425th:1:1101:9995:27352/2,chr14,"[(32163953, 32164103)]","((M, 150),)",2
14870,NGSNJ-086:229:GW200110425th:1:1101:9995:4961/1,chr13,"[(48233331, 48233481)]","((M, 150),)",1
14871,NGSNJ-086:229:GW200110425th:1:1101:9995:4961/2,chr13,"[(48233242, 48233392)]","((M, 150),)",2
31980,NGSNJ-086:229:GW200110425th:1:1101:9995:9502/1,chr14,"[(49586704, 49586854)]","((M, 150),)",1


In [8]:
def infer_segments(row):
	'''
	Collapse an alignment's aligned blocks into segments separated by splice gaps.

	Args:
		row: mapping (e.g. DataFrame row) with
			'blocks': sequence of (start, end) tuples, one per M/=/X CIGAR operation, and
			'cigar': sequence of (op_letter, length) tuples.

	Returns:
		tuple of (start, end) tuples. Consecutive blocks are merged unless an 'N'
		(reference skip) lies between them; I, D, S, H and P operations never split a segment.

	Raises:
		RuntimeError: on an unsupported CIGAR operation, or when the number of
			M/=/X operations does not match the number of blocks.
	'''
	blocks = row['blocks']
	segs = []
	current_seg = None
	next_block = 0
	crossed_splice = False

	# Walk the CIGAR itself rather than assuming blocks and gaps strictly alternate,
	# so leading clips and consecutive gap operations are handled
	for op, _ in row['cigar']:
		if op in ('M', '=', 'X'):
			if next_block >= len(blocks):
				raise RuntimeError(f"{row['blocks']}\n{row['cigar']}\n{next_block}")
			block = blocks[next_block]
			next_block += 1
			if current_seg is None:
				current_seg = tuple(block)
			elif crossed_splice:
				segs.append(current_seg)
				current_seg = tuple(block)
			else:
				# Only indels/padding since the previous block: extend the segment
				current_seg = (current_seg[0], block[1])
			crossed_splice = False
		elif op == 'N':
			crossed_splice = True
		elif op not in ('I', 'D', 'S', 'H', 'P'):
			raise RuntimeError(f"{row['blocks']}\n{row['cigar']}\n{next_block}")

	if current_seg is None or next_block != len(blocks):
		raise RuntimeError(f"{row['blocks']}\n{row['cigar']}\n{next_block}")
	segs.append(current_seg)

	return tuple(segs)

# One sequence of (start, end) segments per alignment; result_type='reduce' keeps
# each returned sequence as a single cell instead of expanding it into columns
linear_reads['segs'] = linear_reads.apply(infer_segments, axis=1, result_type='reduce')
linear_reads

Unnamed: 0,read_id,chrom,blocks,cigar,read,segs
38756,NGSNJ-086:229:GW200110425th:1:1101:10004:11303/1,chr22,"[(43161160, 43161190), (43162802, 43162922)]","((M, 30), (N, 1612), (M, 120))",1,"((43161160, 43161190), (43162802, 43162922))"
38757,NGSNJ-086:229:GW200110425th:1:1101:10004:11303/2,chr22,"[(43161094, 43161190), (43162802, 43162856)]","((M, 96), (N, 1612), (M, 54))",2,"((43161094, 43161190), (43162802, 43162856))"
41412,NGSNJ-086:229:GW200110425th:1:1101:10004:11992/1,chr9,"[(9442096, 9442246)]","((M, 150),)",1,"[(9442096, 9442246)]"
41413,NGSNJ-086:229:GW200110425th:1:1101:10004:11992/2,chr9,"[(9442107, 9442257)]","((M, 150),)",2,"[(9442107, 9442257)]"
41720,NGSNJ-086:229:GW200110425th:1:1101:10004:12085/1,chr14,"[(49862579, 49862729)]","((M, 150),)",1,"[(49862579, 49862729)]"
...,...,...,...,...,...,...
98699,NGSNJ-086:229:GW200110425th:1:1101:9995:27352/2,chr14,"[(32163953, 32164103)]","((M, 150),)",2,"[(32163953, 32164103)]"
14870,NGSNJ-086:229:GW200110425th:1:1101:9995:4961/1,chr13,"[(48233331, 48233481)]","((M, 150),)",1,"[(48233331, 48233481)]"
14871,NGSNJ-086:229:GW200110425th:1:1101:9995:4961/2,chr13,"[(48233242, 48233392)]","((M, 150),)",2,"[(48233242, 48233392)]"
31980,NGSNJ-086:229:GW200110425th:1:1101:9995:9502/1,chr14,"[(49586704, 49586854)]","((M, 150),)",1,"[(49586704, 49586854)]"


In [9]:
# One row per segment (the original row index is repeated for multi-segment reads)
linear_reads = linear_reads.explode('segs')
# Wrap each (start, end) pair in an intervaltree Interval (its data field stays None)
linear_reads['seg'] = linear_reads.segs.transform(lambda segs: Interval(*segs))
# NOTE: this cell is not idempotent, the dropped columns are required to re-run it
linear_reads = linear_reads.drop(columns=['blocks', 'read', 'segs', 'cigar'])
linear_reads

Unnamed: 0,read_id,chrom,seg
38756,NGSNJ-086:229:GW200110425th:1:1101:10004:11303/1,chr22,"(43161160, 43161190, None)"
38756,NGSNJ-086:229:GW200110425th:1:1101:10004:11303/1,chr22,"(43162802, 43162922, None)"
38757,NGSNJ-086:229:GW200110425th:1:1101:10004:11303/2,chr22,"(43161094, 43161190, None)"
38757,NGSNJ-086:229:GW200110425th:1:1101:10004:11303/2,chr22,"(43162802, 43162856, None)"
41412,NGSNJ-086:229:GW200110425th:1:1101:10004:11992/1,chr9,"(9442096, 9442246, None)"
...,...,...,...
14870,NGSNJ-086:229:GW200110425th:1:1101:9995:4961/1,chr13,"(48233331, 48233481, None)"
14871,NGSNJ-086:229:GW200110425th:1:1101:9995:4961/2,chr13,"(48233242, 48233392, None)"
31980,NGSNJ-086:229:GW200110425th:1:1101:9995:9502/1,chr14,"(49586704, 49586854, None)"
31981,NGSNJ-086:229:GW200110425th:1:1101:9995:9502/2,chr14,"(49563422, 49563424, None)"


In [10]:
def add_exons(row):
	'''
	Return an IntervalTree of the annotated exons that overlap the row's segment
	and belong to at least one 'protein_coding' gene.

	Reads the global `exons` dict; row needs 'chrom' and 'seg' (an Interval).
	Chromosomes without any annotated exon yield an empty tree.
	'''
	# Same guard as add_introns: not every reference sequence has annotated features
	if row['chrom'] not in exons:
		return IntervalTree()

	overlap_exons = exons[row['chrom']].overlap(row['seg'].begin, row['seg'].end)
	overlap_exons = {exon for exon in overlap_exons if 'protein_coding' in exon.data['gene_type']}

	return IntervalTree(overlap_exons)


def add_introns(row):
	'''
	Return an IntervalTree of the annotated introns that overlap the row's segment
	and belong to at least one 'protein_coding' gene.

	Reads the global `introns` dict; row needs 'chrom' and 'seg' (an Interval).
	Chromosomes without any annotated intron yield an empty tree.
	'''
	chrom_introns = introns.get(row['chrom'])
	if chrom_introns is None:
		return IntervalTree()

	seg = row['seg']
	coding_introns = set()
	for hit in chrom_introns.overlap(seg.begin, seg.end):
		if 'protein_coding' in hit.data['gene_type']:
			coding_introns.add(hit)

	return IntervalTree(coding_introns)

# Attach to every segment the protein-coding exons and introns it overlaps
linear_reads['exons'] = linear_reads.apply(add_exons, axis=1)
linear_reads['introns'] = linear_reads.apply(add_introns, axis=1)
linear_reads

Unnamed: 0,read_id,chrom,seg,exons,introns
38756,NGSNJ-086:229:GW200110425th:1:1101:10004:11303/1,chr22,"(43161160, 43161190, None)","[(43161051, 43161190, {'strand': '+', 'gene_id...",[]
38756,NGSNJ-086:229:GW200110425th:1:1101:10004:11303/1,chr22,"(43162802, 43162922, None)","[(43162802, 43163210, {'strand': '+', 'gene_id...",[]
38757,NGSNJ-086:229:GW200110425th:1:1101:10004:11303/2,chr22,"(43161094, 43161190, None)","[(43161051, 43161190, {'strand': '+', 'gene_id...",[]
38757,NGSNJ-086:229:GW200110425th:1:1101:10004:11303/2,chr22,"(43162802, 43162856, None)","[(43162802, 43163210, {'strand': '+', 'gene_id...",[]
41412,NGSNJ-086:229:GW200110425th:1:1101:10004:11992/1,chr9,"(9442096, 9442246, None)",[],"[(9397482, 9574731, {'strand': '-', 'gene_id':..."
...,...,...,...,...,...
14870,NGSNJ-086:229:GW200110425th:1:1101:9995:4961/1,chr13,"(48233331, 48233481, None)","[(48233202, 48233477, {'strand': '+', 'gene_id...","[(48232698, 48253807, {'strand': '+', 'gene_id..."
14871,NGSNJ-086:229:GW200110425th:1:1101:9995:4961/2,chr13,"(48233242, 48233392, None)","[(48233202, 48233477, {'strand': '+', 'gene_id...","[(48232698, 48253807, {'strand': '+', 'gene_id..."
31980,NGSNJ-086:229:GW200110425th:1:1101:9995:9502/1,chr14,"(49586704, 49586854, None)",[],"[(49586049, 49598399, {'strand': '-', 'gene_id..."
31981,NGSNJ-086:229:GW200110425th:1:1101:9995:9502/2,chr14,"(49563422, 49563424, None)",[],[]


In [11]:
# def pull_gene_types(row):
# 	out = set()
# 	for exon in row['exons']:
# 		out = out.union(exon.data['gene_type'])
# 	for intron in row['introns']:
# 		out = out.union(intron.data['gene_type'])
		
# 	return out

# linear_reads['gene_types'] = linear_reads.apply(pull_gene_types, axis=1) 
# print(linear_reads.gene_types.value_counts())

In [12]:
def classify_seg(row):
	'''
	Label one aligned segment from the exons and introns it overlaps.

	row needs 'seg' (with .begin/.end) plus 'exons' and 'introns' collections.
	Checks run in this order, first match wins:
		'Exon junction' - segment starts at an exon start or ends at an exon end
		'Inter-genic'   - no overlapping exon or intron
		'Intronic'      - introns only
		'Exonic'        - exons only
		'Ambiguous'     - both present, and the merged exons (or merged introns) cover the whole segment
		'pre-mRNA'      - both present, neither covers the whole segment
	'''
	seg, seg_exons, seg_introns = row['seg'], row['exons'], row['introns']

	shares_exon_boundary = any(exon.begin==seg.begin or exon.end==seg.end for exon in seg_exons)
	if shares_exon_boundary:
		return 'Exon junction'

	has_exons, has_introns = len(seg_exons)>0, len(seg_introns)>0
	if not has_exons:
		#TODO: Check for start at 5'ss
		return 'Intronic' if has_introns else 'Inter-genic'
	if not has_introns:
		return 'Exonic'

	# Both an exon and an intron overlap from here on
	for features in (seg_exons, seg_introns):
		if tree_covers_interval(IntervalTree(features), seg) is True:
			return 'Ambiguous'

	return 'pre-mRNA'

# Label every segment row, then tally the labels
linear_reads['seg_class'] = linear_reads.apply(classify_seg, axis=1)
print(linear_reads.seg_class.value_counts())

seg_class
Exon junction    46749
Intronic         31234
Inter-genic      26931
Exonic           16882
Ambiguous        10123
pre-mRNA          1118
Name: count, dtype: int64


In [13]:
def classify_read(seg_classes):
	'''
	Combine the segment labels of one read into a single read label.

	Args:
		seg_classes: iterable of segment labels produced by classify_seg.

	Returns:
		str label:
			'Ambiguous'  - every segment is 'Ambiguous'
			'Weird mix'  - 'Inter-genic' together with any other label
			'mRNA'       - only 'Exon junction', optionally with 'Exonic'
			'pre-mRNA'   - two or more of 'Exon junction'/'Exonic'/'Intronic'/'pre-mRNA'
			               (other than the mRNA combination)
			otherwise the single remaining label once 'Ambiguous' is ignored.
		Any other combination is returned as a sorted tuple of labels, which stays
		hashable so the result can still be counted and grouped.
	'''
	genic_classes = {'Exon junction', 'Exonic', 'Intronic', 'pre-mRNA'}
	seg_classes_set = set(seg_classes)

	if seg_classes_set == {'Ambiguous', }:
		return 'Ambiguous'

	# Checked before 'Ambiguous' is dropped, so Ambiguous + Inter-genic is also a mix
	if len(seg_classes_set)>1 and 'Inter-genic' in seg_classes_set:
		return 'Weird mix'

	seg_classes_set.discard('Ambiguous')

	if seg_classes_set in ({'Exon junction', }, {'Exon junction', 'Exonic'}):
		return 'mRNA'

	# Every remaining multi-label combination of genic classes mixes exonic and intronic signal
	if len(seg_classes_set)>1 and seg_classes_set <= genic_classes:
		return 'pre-mRNA'

	if len(seg_classes_set) == 1:
		return seg_classes_set.pop()

	return tuple(sorted(seg_classes_set))


# Aggregate segment labels per read_id, then broadcast the read-level label back to every segment row
linear_reads['class_'] = linear_reads.read_id.map(linear_reads.groupby('read_id').seg_class.agg(classify_read))
print(linear_reads.class_.value_counts())

class_
mRNA           46689
Intronic       30351
Inter-genic    26381
Exonic         16560
Ambiguous      10036
pre-mRNA        1898
Weird mix       1122
Name: count, dtype: int64


In [29]:
def classify_pe(classes):
	'''
	Combine the read-level labels of the mates of one fragment into a single label.

	Args:
		classes: iterable of read labels (as produced by classify_read).

	Returns:
		The shared label when all mates agree; 'mRNA', 'pre-mRNA' or 'Weird mix'
		for the known combinations below; otherwise a tuple of the distinct labels.
		The tuple is sorted so the same combination always yields the same label
		(set iteration order for strings varies between interpreter runs).
	'''
	classes_set = set(classes)

	if len(classes_set)==1:
		return classes_set.pop()

	if classes_set in (
					{'mRNA', 'Exonic'},
					{'mRNA', 'Ambiguous'},
					):
		return 'mRNA'

	if classes_set in (
					{'pre-mRNA', 'Exonic'},
					{'pre-mRNA', 'Intronic'},
					{'pre-mRNA', 'mRNA'},
					{'pre-mRNA', 'Ambiguous'},
					{'mRNA', 'Intronic'},
					{'Exonic', 'Intronic'},
					):
		return 'pre-mRNA'

	if 'Weird mix' in classes_set:
		return 'Weird mix'

	return tuple(sorted(classes_set))

# Strip the mate suffix ('<id>/1', '<id>/2' -> '<id>') so both mates share one fragment ID
linear_reads.read_id = linear_reads.read_id.transform(lambda rid: rid.split('/')[0])
# Classify each fragment once and reuse the counts for both printouts
pe_class_counts = linear_reads.groupby('read_id').class_.agg(classify_pe).value_counts()
print(pe_class_counts.sum())
print(pe_class_counts)

52672
class_
Intronic                    14580
Inter-genic                 12520
mRNA                        12123
Exonic                       7055
Ambiguous                    4121
pre-mRNA                     1231
Weird mix                     499
(Ambiguous, Exonic)           246
(Intronic, Ambiguous)         225
(Inter-genic, Exonic)          38
(Inter-genic, Intronic)        20
(Inter-genic, mRNA)             9
(Inter-genic, Ambiguous)        3
(Inter-genic, pre-mRNA)         2
Name: count, dtype: int64


In [30]:
# Sanity check: number of distinct fragment IDs, to compare with the total of the per-fragment class counts
print(linear_reads.read_id.nunique())

52672


In [17]:
# IDs listed in the read_id column of the pipeline's lariat_reads.tsv
lariat_table = pd.read_csv(f'{output_base}lariat_reads.tsv', sep='\t')
lariat_rids = set(lariat_table.read_id)
print(len(lariat_rids))
lariat_rids

7


{'NGSNJ-086:229:GW200110425th:1:1101:15582:16830',
 'NGSNJ-086:229:GW200110425th:1:1101:15700:19695',
 'NGSNJ-086:229:GW200110425th:1:1101:2166:13448',
 'NGSNJ-086:229:GW200110425th:1:1101:24551:5713',
 'NGSNJ-086:229:GW200110425th:1:1101:30291:23797',
 'NGSNJ-086:229:GW200110425th:1:1101:5394:15374',
 'NGSNJ-086:229:GW200110425th:1:1101:8314:28354'}

In [18]:
# IDs listed in the read_id column of circularized_intron_reads.tsv
circular_table = pd.read_csv(f'{output_base}circularized_intron_reads.tsv', sep='\t')
circular_rids = set(circular_table.read_id)
print(len(circular_rids))
circular_rids

0


set()

In [19]:
# IDs listed in the read_id column of template_switching_alignments.tsv
temp_switch_table = pd.read_csv(f'{output_base}template_switching_alignments.tsv', sep='\t')
temp_switch_rids = set(temp_switch_table.read_id)
print(len(temp_switch_rids))
temp_switch_rids

1856


{'NGSNJ-086:229:GW200110425th:1:1101:18530:1141',
 'NGSNJ-086:229:GW200110425th:1:1101:11722:20791',
 'NGSNJ-086:229:GW200110425th:1:1101:26440:13714',
 'NGSNJ-086:229:GW200110425th:1:1101:13295:19069',
 'NGSNJ-086:229:GW200110425th:1:1101:29225:1501',
 'NGSNJ-086:229:GW200110425th:1:1101:3052:12602',
 'NGSNJ-086:229:GW200110425th:1:1101:27019:6793',
 'NGSNJ-086:229:GW200110425th:1:1101:31042:1454',
 'NGSNJ-086:229:GW200110425th:1:1101:20609:26256',
 'NGSNJ-086:229:GW200110425th:1:1101:30608:25974',
 'NGSNJ-086:229:GW200110425th:1:1101:21703:25488',
 'NGSNJ-086:229:GW200110425th:1:1101:24415:13526',
 'NGSNJ-086:229:GW200110425th:1:1101:11849:24706',
 'NGSNJ-086:229:GW200110425th:1:1101:3558:2300',
 'NGSNJ-086:229:GW200110425th:1:1101:22336:11741',
 'NGSNJ-086:229:GW200110425th:1:1101:1090:17628',
 'NGSNJ-086:229:GW200110425th:1:1101:16396:7811',
 'NGSNJ-086:229:GW200110425th:1:1101:24442:1047',
 'NGSNJ-086:229:GW200110425th:1:1101:3902:14638',
 'NGSNJ-086:229:GW200110425th:1:1101:21043

In [20]:
# The three read-ID sets must be pairwise disjoint; on failure the shared IDs are reported
for left_rids, right_rids in (
						(lariat_rids, circular_rids),
						(lariat_rids, temp_switch_rids),
						(temp_switch_rids, circular_rids),
						):
	shared_rids = left_rids.intersection(right_rids)
	assert len(shared_rids) == 0, shared_rids

In [21]:
# Lariat alignments that failed a filter, reduced to the read ID and the name of the failed filter
lariat_failed = pd.read_csv(f'{output_base}failed_lariat_alignments.tsv', sep='\t')
lariat_failed = lariat_failed[['read_id', 'filter_failed']]
# Keep a SET of read IDs: `x in DataFrame` tests column labels, not cell values,
# so a DataFrame here would make every later `read_id in repeat_rids` check False
repeat_rids = set(lariat_failed.loc[lariat_failed.filter_failed.isin(('in_repeat', 'ubiquitin_gene')), 'read_id'])
print(len(repeat_rids))
lariat_failed

0


Unnamed: 0,read_id,filter_failed
0,NGSNJ-086:229:GW200110425th:1:1101:5394:15374,not_chosen
1,NGSNJ-086:229:GW200110425th:1:1101:24551:5713,not_chosen
2,NGSNJ-086:229:GW200110425th:1:1101:8314:28354,not_chosen


In [22]:
# Trimmed-read alignments that failed a filter
trim_failed = pd.read_csv(f'{output_base}failed_trimmed_alignments.tsv', sep='\t')
# Drop the last 4 characters, then keep the text before the first '/'.
# NOTE(review): presumably the 4 characters are a suffix appended by the pipeline - confirm
trim_failed.read_id = trim_failed.read_id.transform(lambda rid: rid[:-4].split('/')[0])
# One row per distinct (read, failed filter) pair
trim_failed = trim_failed[['read_id', 'filter_failed']].drop_duplicates(ignore_index=True)
trim_failed

Unnamed: 0,read_id,filter_failed
0,NGSNJ-086:229:GW200110425th:1:1101:32877:4319,overlaps_intron
1,NGSNJ-086:229:GW200110425th:1:1101:27706:14152,overlaps_intron
2,NGSNJ-086:229:GW200110425th:1:1101:4264:21433,overlaps_intron
3,NGSNJ-086:229:GW200110425th:1:1101:31222:17080,overlaps_intron
4,NGSNJ-086:229:GW200110425th:1:1101:9335:17816,overlaps_intron
...,...,...
4748,NGSNJ-086:229:GW200110425th:1:1101:32434:18364,fivep_intron_match
4749,NGSNJ-086:229:GW200110425th:1:1101:14407:17613,fivep_intron_match
4750,NGSNJ-086:229:GW200110425th:1:1101:13783:24706,fivep_intron_match
4751,NGSNJ-086:229:GW200110425th:1:1101:30879:11913,fivep_intron_match


In [23]:
# IDs from the read_id column of fivep_info_table.tsv, with the last 4 characters
# removed and only the text before the first '/' kept
fivep_table = pd.read_csv(f'{output_base}fivep_info_table.tsv', sep='\t')
fivep_passed_rids = {rid[:-4].split('/')[0] for rid in fivep_table.read_id}
print(len(fivep_passed_rids))
fivep_passed_rids

6862


{'NGSNJ-086:229:GW200110425th:1:1101:11577:16626',
 'NGSNJ-086:229:GW200110425th:1:1101:18530:1141',
 'NGSNJ-086:229:GW200110425th:1:1101:21169:3364',
 'NGSNJ-086:229:GW200110425th:1:1101:20681:25347',
 'NGSNJ-086:229:GW200110425th:1:1101:29225:1501',
 'NGSNJ-086:229:GW200110425th:1:1101:26060:9236',
 'NGSNJ-086:229:GW200110425th:1:1101:31042:1454',
 'NGSNJ-086:229:GW200110425th:1:1101:1172:20901',
 'NGSNJ-086:229:GW200110425th:1:1101:24415:13526',
 'NGSNJ-086:229:GW200110425th:1:1101:24939:11522',
 'NGSNJ-086:229:GW200110425th:1:1101:16396:7811',
 'NGSNJ-086:229:GW200110425th:1:1101:5095:16517',
 'NGSNJ-086:229:GW200110425th:1:1101:14091:1313',
 'NGSNJ-086:229:GW200110425th:1:1101:30237:25488',
 'NGSNJ-086:229:GW200110425th:1:1101:4562:22545',
 'NGSNJ-086:229:GW200110425th:1:1101:14362:11303',
 'NGSNJ-086:229:GW200110425th:1:1101:23845:7529',
 'NGSNJ-086:229:GW200110425th:1:1101:13783:12179',
 'NGSNJ-086:229:GW200110425th:1:1101:10357:16172',
 'NGSNJ-086:229:GW200110425th:1:1101:11216

In [24]:
# Record names in unmapped_reads.fa, reduced to the text before the first '/'.
# The context manager closes the FASTA handle once the names are collected.
with pyfaidx.Fasta(f'{output_base}unmapped_reads.fa') as unmapped_fasta:
	genome_unmapped_rids = {read_id.split('/')[0] for read_id in unmapped_fasta.keys()}
print(len(genome_unmapped_rids))

47328


In [25]:
# Ordered priority table: a read gets the label of the FIRST collection that contains its ID.
# Each collection must support `in` on read-ID strings; note that `in` on a pandas
# DataFrame tests column labels rather than cell values.
class_priority = (
	(lariat_rids, 'Lariat'),
	(repeat_rids, 'Repeat region'),
	(circular_rids, 'Circularized intron'),
	(temp_switch_rids, 'Template-switching'),
	(fivep_passed_rids, "Has 5'ss alignment"),
)

read_class = []
for read_id in genome_unmapped_rids:
	# Falls back to 'Unmapped' when no collection claims the read
	label = next((name for rids, name in class_priority if read_id in rids), 'Unmapped')
	read_class.append((read_id, label))

read_class = pd.DataFrame(read_class, columns=('read_id', 'class_'))
print(read_class.class_.value_counts())
read_class

class_
Unmapped              40466
Has 5'ss alignment     4999
Template-switching     1856
Lariat                    7
Name: count, dtype: int64


Unnamed: 0,read_id,class_
0,NGSNJ-086:229:GW200110425th:1:1101:11577:16626,Has 5'ss alignment
1,NGSNJ-086:229:GW200110425th:1:1101:3721:29105,Unmapped
2,NGSNJ-086:229:GW200110425th:1:1101:22200:27226,Unmapped
3,NGSNJ-086:229:GW200110425th:1:1101:21133:15013,Unmapped
4,NGSNJ-086:229:GW200110425th:1:1101:27679:5431,Unmapped
...,...,...
47323,NGSNJ-086:229:GW200110425th:1:1101:5014:5228,Unmapped
47324,NGSNJ-086:229:GW200110425th:1:1101:4246:26287,Unmapped
47325,NGSNJ-086:229:GW200110425th:1:1101:17960:26929,Has 5'ss alignment
47326,NGSNJ-086:229:GW200110425th:1:1101:10420:12211,Unmapped


In [26]:
# Plain {label: count} dict of the classes assigned to the unmapped reads
read_class_counts = read_class.class_.value_counts().to_dict()
print(read_class_counts)

{'Unmapped': 40466, "Has 5'ss alignment": 4999, 'Template-switching': 1856, 'Lariat': 7}


##### Code snippets

In [27]:
# plot = (p9.ggplot(, p9.aes(x="", y="")) +
# 		# p9.geom_() +
# 		# p9.geom_() +
# 		# p9.scale_x_() +
# 		# p9.scale_y_() +
# 		# p9.scale_color_brewer(type='', palette=1, direction=1) +
# 		# p9.scale_fill_brewer(type='', palette=1, direction=1) +
# 		p9.labs(title="") +
# 		p9.theme()
# )
# plot.show()
# # plot.save(PLOTS_DIR + '/.png', dpi=500)

##### Discarded code

In [28]:
%%script false --no-raise-error


# # linear_reads['indels'] = linear_reads.cigar.transform(lambda cigar: sum(length for op, length in cigar if op in ('I', 'D')))
# # linear_reads['mismatches'] = linear_reads.tags.transform(lambda tags: tags['XM'])
# # linear_reads['edit_dist'] = linear_reads.tags.transform(lambda tags: tags['NM'])
# # print(linear_reads.indels.value_counts())
# # print(linear_reads.mismatches.value_counts())
# # print(linear_reads.edit_dist.value_counts())
# linear_reads['spliced'] = linear_reads.n_segs>1
# print(linear_reads.spliced.value_counts())
# linear_reads