In [1]:
from dataclasses import dataclass, field
import os, sys, subprocess
import copy
import statistics
import math
import time
import datetime as dt
import itertools as it
import random

import scalene
import requests
import gzip
import pysam
from defusedxml import ElementTree
import numpy as np
import pandas as pd
from scipy import stats
from sklearn import cluster, decomposition, linear_model, metrics, model_selection, preprocessing, tree, utils 
from statsmodels.api import qqplot
import plotnine as p9
from mizani.formatters import percent_format, comma_format
from Bio import SeqIO, Seq, SeqFeature, SeqRecord, AlignIO, Align, Entrez, Graphics
from Bio.Blast import NCBIWWW
from intervaltree import Interval, IntervalTree

In [2]:
##### Constants #####
DNA_BASES = ('A', 'T', 'G', 'C')
HUMAN_CHROMOSOMES = ('chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr21', 'chr22', 'chrX', 'chrY', 'chrM')

KEYS_FILE = '/home/tmooney/Misc/keys.txt'
GENCODE_v19_GENOME_DIR = '/datasets2/genomes/fasta_files/hg19.gencode.v19.primary_genome_by_chromosome'
GENCODE_v19_ANNOTATIONS_FILE = '/datasets2/genomes/annotations/hg19.gencode.v19.annotation.gtf'
INTROPOLIS_HG19_FILE = '/datasets2/genomes/intropolis/intropolis.v1.hg19.tsv'
INTROPOLIS_LIFTOVER_FILE = '/datasets2/genomes/intropolis/intropolis.v1.hg19_with_liftover_to_hg38.tsv'
BOWTIE2_HG19_INDEX = '/datasets2/genomes/indices/bowtie2/hg38.fa'
SPLICEAI_HG19_DIR = '/datasets2/genomes/SpliceAI/hg19.gencode.v19.primary_genome'
GENCODE_v44_GENOME_FILE = '/datasets2/genomes/fasta_files/hg38.gencode.v44.primary_assembly_standard_chromosomes.fa'
GENCODE_v44_BASIC_ANNOTATIONS_FILE = '/datasets2/genomes/annotations/hg38.gencode.v44.basic.annotation.gtf'
GENCODE_v44_COMP_ANNOTATIONS_FILE = '/datasets2/genomes/annotations/hg38.gencode.v44.comprehensive.annotation.gtf'
HGNC_ANNOTATIONS_FILE = '/datasets2/genomes/annotations/hgnc_complete_set_2023-07-01.tsv'
BOWTIE2_HG38_INDEX = '/datasets2/genomes/indices/bowtie2/hg19.fa'
SPLICEAI_HG38_DIR = '/datasets2/genomes/SpliceAI/hg38.gencode.v41.primary_genome'
SALMON_HG38_INDEX = '/datasets2/genomes/indices/salmon/hg38.gencode.v44'

WORKING_DIR = '/home/tmooney/Lariat_mapping'
COLLECTED_DATA_DIR = WORKING_DIR + '/collected_data'
FINAL_RESULTS_DIR = WORKING_DIR + '/final_results'
LOG_DIR = WORKING_DIR + '/logs'
OUTPUT_DIR = WORKING_DIR + '/output'
REFERENCE_DIR = WORKING_DIR + '/reference_data'
SEQUENCES_DIR = REFERENCE_DIR + '/sequences'
ANNOTATIONS_DIR = REFERENCE_DIR + '/annotations'
SCRIPTS_DIR= WORKING_DIR + '/scripts'
VISUALS_DIR = WORKING_DIR + '/visuals'
PLOTS_DIR = VISUALS_DIR + '/plots'

# INTRONS_BED = '/home/tmooney/Lariat_mapping/testing/references/hg38.gencode.v44.comprehensive.introns_mod.bed'
INTRONS_BED = '/home/tmooney/Lariat_mapping/testing/new_references/introns.bed'
GTF_FILE = '/home/tmooney/Lariat_mapping/testing/new_references/annotation.gtf'
NCRNA_TYPES = ('lncRNA',
			'Mt_rRNA',
			'Mt_tRNA',
			'miRNA',
			'misc_RNA',
			'rRNA',
			'scRNA',
			'snRNA',
			'snoRNA',
			'ribozyme',
			'sRNA',
			'scaRNA')

IG_TcR_TYPES = ('IG_C_gene',
				'IG_D_gene',
				'IG_J_gene',
				'IG_LV_gene',
				'IG_V_gene',
				'TR_C_gene',
				'TR_J_gene',
				'TR_V_gene',
				'TR_D_gene')

READ_SPECIES =  ('Lariat', 'Repeat region', 'Circular intron', 'Template switching', "Unmapped, has a 5'ss alignment", "Unmapped", "Ends at a 5'ss", 'pre-mRNA', 'mRNA', 'Intronic', 'Exonic', 'Genic, ambiguous', 'Inter-genic')

In [3]:
##### Functions #####
def tree_covers_interval(tree:IntervalTree, interval:Interval) -> bool:
	total_coverage = False
	merged_tree = tree.copy()
	merged_tree.merge_overlaps(strict=False)
	for merged_interval in merged_tree:
		if merged_interval.contains_interval(interval):
			total_coverage = True
	
	return total_coverage


def process_fivep_sites(sites:str):
	out = []
	for site in sites.split(','):
		chrom, start, end, strand = site.split(';')
		start = int(start)
		end = int(end)
		out.append((chrom, start, end, strand))

	return tuple(out)


def parse_attributes(attribute_string:str) -> dict:
	attributes = attribute_string.rstrip('";').split('; ')
	attributes = [attr.split(' ') for attr in attributes]
	tags = [attr_val.strip('"') for attr_name, attr_val in attributes if attr_name=='tag']
	attributes = {attr_name: attr_val.strip('"') for attr_name, attr_val in attributes if attr_name!='tag'}
	attributes['tags'] = tags

	return attributes

def parse_transcript_info(ref_gtf:str):
	'''
	
	'''
	# Count header lines in GTF
	header_lines = 0
	with open(ref_gtf) as r:
		for line in r:
			if line.startswith('##'):
				header_lines += 1
			else:
				break

	# Load GTF
	transcripts = pd.read_csv(ref_gtf, names=['chrom', 'source', 'feature', 'start', 'end', 'score', 'strand', 'phase', 'attributes'], skiprows=header_lines, sep='\t')
	transcripts = transcripts.loc[transcripts.feature=='transcript'].reset_index(drop=True)
	
	# Pull out transcript x gene info
	transcripts.attributes = transcripts.attributes.transform(parse_attributes)
	transcripts['transcript_id'] = transcripts.attributes.transform(lambda attributes: attributes['transcript_id'])
	transcripts['gene_id'] = transcripts.attributes.transform(lambda attributes: attributes['gene_id'])
	transcripts['gene_name'] = transcripts.attributes.transform(lambda attributes: attributes['gene_name'])
	transcripts['gene_type'] = transcripts.attributes.transform(lambda attributes: attributes['gene_type'])

	return transcripts[['transcript_id', 'gene_id', 'gene_name', 'gene_type']]


def parse_feature_name(name:str, transcripts):
	feat, chrom, start, end, strand, annotations = name.split(';')

	start = int(start)
	end = int(end)
	annotations = [transcript_and_num.split('-') for transcript_and_num in annotations.split('|')]
	annotations = tuple([ (transcript_id, int(num)) for transcript_id, num in annotations ])
	gene_types = tuple(set([transcripts[transcript_id][2] for transcript_id, num in annotations]))
	
	return (feat, chrom, start, end, strand, annotations, gene_types)



In [4]:
##### Settings #####
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

p9.theme_set(p9.theme_classic)

test_dir = '/home/tmooney/Lariat_mapping/output/100k_truncs'
sources = (('C22-1', 'R1_100'), ('C22-1', 'R2_100'), ('C22-3', 'R1_100'), ('C22-3', 'R2_100'), ('C22-5', 'R1_100'), ('C22-5', 'R2_100'), 
		('HEK293T-1', 'R1_100'), ('HEK293T-1', 'R2_100'), ('HEK293T-4', 'R1_100'), ('HEK293T-4', 'R2_100'), ('HEK293T-5', 'R1_100'), ('HEK293T-5', 'R2_100'))
sources = [(sample, r_dir, f'{sample}|{r_dir}', f'{test_dir}/{sample}_{r_dir}k_lariat_mapping') for sample, r_dir in sources]

samp, r_dir, source, out_dir = sources[0]
input_fastq = f'/home/tmooney/Lariat_mapping/testing/input_data/{samp}_{r_dir}k.fastq.gz'
print(samp, r_dir, source, out_dir)

C22-1 R1_100 C22-1|R1_100 /home/tmooney/Lariat_mapping/output/100k_truncs/C22-1_R1_100k_lariat_mapping


In [26]:
LINEAR_COLUMNS = ('read_id', 'chrom', 'forward', 'start', 'end', 'blocks', 'cigar', 'tags', 'is_paired', 'mate_is_mapped', 'read',)
linear_reads = []
for align in pysam.AlignmentFile('/Users/trumanmooney/Library/CloudStorage/OneDrive-BrownUniversity/Documents/Lariat_mapping/output/100k_truncs/mapped_reads.bam', 'rb'):
	if align.is_unmapped:
		continue
	linear_reads.append([align.query_name, 
					align.reference_name, 
					align.is_forward,
					align.reference_start, 
					align.reference_end, 
					align.get_blocks(),
					align.cigarstring,
					align.get_tags(),
					align.is_paired,
					align.mate_is_mapped,
					align.is_read1,
					])

linear_reads = pd.DataFrame(linear_reads, columns=LINEAR_COLUMNS)
linear_reads

Unnamed: 0,read_id,chrom,forward,start,end,blocks,cigar,tags,is_paired,mate_is_mapped,read
0,NGSNJ-086:229:GW200110425th:1:1101:1868:1000,chr2,False,76672340,76672490,"[(76672340, 76672490)]",150M,"[(AS, -10), (XN, 0), (XM, 2), (XO, 0), (XG, 0)...",True,False,False
1,NGSNJ-086:229:GW200110425th:1:1101:3857:1000,chr3,False,15780069,15780219,"[(15780069, 15780219)]",150M,"[(AS, -30), (XN, 0), (XM, 6), (XO, 0), (XG, 0)...",True,False,False
2,NGSNJ-086:229:GW200110425th:1:1101:5430:1000,chr3,True,15780035,15780185,"[(15780035, 15780185)]",150M,"[(AS, -15), (XN, 0), (XM, 3), (XO, 0), (XG, 0)...",True,False,False
3,NGSNJ-086:229:GW200110425th:1:1101:11632:1000,chr1,True,109642834,109642984,"[(109642834, 109642984)]",150M,"[(AS, -6), (XN, 0), (XM, 2), (XO, 0), (XG, 0),...",True,False,False
4,NGSNJ-086:229:GW200110425th:1:1101:11668:1000,chr1,False,183083780,183084730,"[(183083780, 183083854), (183084654, 183084730)]",74M800N76M,"[(AS, 0), (XN, 0), (XM, 0), (XO, 0), (XG, 0), ...",True,False,False
...,...,...,...,...,...,...,...,...,...,...,...
38497,NGSNJ-086:229:GW200110425th:1:1101:14570:29136,chr1,True,156287269,156288739,"[(156287269, 156287338), (156288658, 156288739)]",69M1320N81M,"[(AS, 0), (XN, 0), (XM, 0), (XO, 0), (XG, 0), ...",True,True,False
38498,NGSNJ-086:229:GW200110425th:1:1101:14913:29136,chr3,False,127771281,127771706,"[(127771281, 127771402), (127771677, 127771706)]",121M275N29M,"[(AS, -30), (XN, 0), (XM, 6), (XO, 0), (XG, 0)...",True,False,False
38499,NGSNJ-086:229:GW200110425th:1:1101:14950:29136,chr1,True,200833115,200833265,"[(200833115, 200833265)]",150M,"[(AS, -30), (XN, 0), (XM, 6), (XO, 0), (XG, 0)...",True,False,True
38500,NGSNJ-086:229:GW200110425th:1:1101:16559:29136,chr3,False,15780034,15780179,"[(15780034, 15780042), (15780042, 15780179)]",8M5I137M,"[(AS, -35), (XN, 0), (XM, 3), (XO, 1), (XG, 5)...",True,False,False


In [36]:
linear_reads.sort_values('read_id')

Unnamed: 0,read_id,chrom,forward,start,end,blocks,cigar,tags,is_paired,mate_is_mapped,read,pa
15367,NGSNJ-086:229:GW200110425th:1:1101:10004:11992,chr3,True,15773247,15780208,"[(15773247, 15773254), (15780065, 15780208)]",7M6811N143M,"{'AS': -15, 'ZS': -15, 'XN': 0, 'XM': 3, 'XO':...",True,True,True,CP
15368,NGSNJ-086:229:GW200110425th:1:1101:10004:11992,chr3,False,15773258,15780219,"[(15773258, 15773260), (15780071, 15780219)]",2M6811N148M,"{'AS': -15, 'ZS': -15, 'XN': 0, 'XM': 3, 'XO':...",True,True,False,CP
15498,NGSNJ-086:229:GW200110425th:1:1101:10004:12085,chr3,True,15780037,15780187,"[(15780037, 15780187)]",150M,"{'AS': 0, 'XN': 0, 'XM': 0, 'XO': 0, 'XG': 0, ...",True,False,False,UP
17515,NGSNJ-086:229:GW200110425th:1:1101:10004:13589,chr3,False,141595425,141595575,"[(141595425, 141595575)]",150M,"{'AS': -4, 'XN': 0, 'XM': 1, 'XO': 0, 'XG': 0,...",True,True,False,DP
17514,NGSNJ-086:229:GW200110425th:1:1101:10004:13589,chr3,True,141595427,141595577,"[(141595427, 141595577)]",150M,"{'AS': -8, 'XN': 0, 'XM': 2, 'XO': 0, 'XG': 0,...",True,True,True,DP
...,...,...,...,...,...,...,...,...,...,...,...,...
31072,NGSNJ-086:229:GW200110425th:1:1101:9995:23719,chr3,True,15780037,15780187,"[(15780037, 15780187)]",150M,"{'AS': 0, 'XN': 0, 'XM': 0, 'XO': 0, 'XG': 0, ...",True,True,False,CP
31071,NGSNJ-086:229:GW200110425th:1:1101:9995:23719,chr3,False,15780073,15780223,"[(15780073, 15780223)]",150M,"{'AS': 0, 'XN': 0, 'XM': 0, 'XO': 0, 'XG': 0, ...",True,True,True,CP
34666,NGSNJ-086:229:GW200110425th:1:1101:9995:26318,chr1,True,167905021,167905171,"[(167905021, 167905171)]",150M,"{'AS': 0, 'XN': 0, 'XM': 0, 'XO': 0, 'XG': 0, ...",True,True,True,CP
34667,NGSNJ-086:229:GW200110425th:1:1101:9995:26318,chr1,False,167905021,167905171,"[(167905021, 167905171)]",150M,"{'AS': 0, 'XN': 0, 'XM': 0, 'XO': 0, 'XG': 0, ...",True,True,False,CP


In [29]:
print(linear_reads.read.value_counts())

read
True     19379
False    19123
Name: count, dtype: int64


In [30]:
linear_reads.tags = linear_reads.tags.apply(lambda tags: {tag: val for tag, val in tags})

In [33]:
linear_reads['pa'] = linear_reads.tags.transform(lambda tags: tags['YT'])
# print(linear_reads.read_id.nunique())
# print(linear_reads.read_id_r.nunique())
linear_reads.pa.value_counts()

pa
CP    26432
UP     9044
DP     3026
Name: count, dtype: int64

In [34]:
linear_reads[['mate_is_mapped', 'pa']].value_counts().sort_index()

mate_is_mapped  pa
False           UP     8066
True            CP    26432
                DP     3026
                UP      978
Name: count, dtype: int64

In [21]:
print(linear_reads.read_id.nunique())
print(linear_reads.loc[linear_reads.mate_is_mapped].read_id.nunique())
print(linear_reads.is_paired.value_counts())
print(linear_reads[['mate_is_mapped', 'mate_is_forward']].value_counts().sort_index())

23284
15218
is_paired
True    38502
Name: count, dtype: int64
mate_is_mapped  mate_is_forward
False           True                8066
True            False              15212
                True               15224
Name: count, dtype: int64


In [58]:
transcripts = parse_transcript_info(GTF_FILE)
transcripts['gene_info'] = transcripts.apply(lambda row: (row['gene_id'], row['gene_name'], row['gene_type']), axis=1)
transcripts = transcripts.set_index('transcript_id')
transcripts = transcripts['gene_info'].to_dict()
transcripts

{'ENST00000456328.2': ('ENSG00000290825.1', 'DDX11L2', 'lncRNA'),
 'ENST00000450305.2': ('ENSG00000223972.6',
  'DDX11L1',
  'transcribed_unprocessed_pseudogene'),
 'ENST00000488147.1': ('ENSG00000227232.5',
  'WASH7P',
  'unprocessed_pseudogene'),
 'ENST00000619216.1': ('ENSG00000278267.1', 'MIR6859-1', 'miRNA'),
 'ENST00000473358.1': ('ENSG00000243485.5', 'MIR1302-2HG', 'lncRNA'),
 'ENST00000469289.1': ('ENSG00000243485.5', 'MIR1302-2HG', 'lncRNA'),
 'ENST00000607096.1': ('ENSG00000284332.1', 'MIR1302-2', 'miRNA'),
 'ENST00000417324.1': ('ENSG00000237613.2', 'FAM138A', 'lncRNA'),
 'ENST00000461467.1': ('ENSG00000237613.2', 'FAM138A', 'lncRNA'),
 'ENST00000606857.1': ('ENSG00000268020.3',
  'OR4G4P',
  'unprocessed_pseudogene'),
 'ENST00000642116.1': ('ENSG00000290826.1', 'ENSG00000290826', 'lncRNA'),
 'ENST00000492842.2': ('ENSG00000240361.3',
  'OR4G11P',
  'transcribed_unprocessed_pseudogene'),
 'ENST00000641515.2': ('ENSG00000186092.7', 'OR4F5', 'protein_coding'),
 'ENST0000046643

In [59]:
introns = functions.load_bed(INTRONS_BED)
introns = introns.rename(columns={'feat': 'chrom'})

introns = introns.groupby(['chrom', 'start', 'end', 'strand'], as_index=False).name.agg(','.join)
introns['coords'] = introns[['chrom', 'start', 'end', 'strand']].apply(lambda row: tuple(row.to_list()), axis=1)
introns['fivep_pos'] = introns.apply(lambda row: row['start'] if row['strand']=='+' else row['end']-1, axis=1)
introns['annotations'] = introns.name.transform(lambda name: parse_feature_name(name)[-1])
introns['adj_exons'] = introns.annotations.transform(lambda annos: tuple([ ((tid, num), (tid, num+1)) for tid, num in annos]))
# introns['interval'] = introns.apply(lambda row: Interval(row['start'], row['end']), axis=1)
print(introns.info())
introns

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 389738 entries, 0 to 389737
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   chrom        389738 non-null  object
 1   start        389738 non-null  int64 
 2   end          389738 non-null  int64 
 3   strand       389738 non-null  object
 4   name         389738 non-null  object
 5   coords       389738 non-null  object
 6   fivep_pos    389738 non-null  int64 
 7   annotations  389738 non-null  object
 8   adj_exons    389738 non-null  object
dtypes: int64(3), object(6)
memory usage: 26.8+ MB
None


Unnamed: 0,chrom,start,end,strand,name,coords,fivep_pos,annotations,adj_exons
0,chr1,12721,13220,+,intron;chr1;12721;13220;+;ENST00000456328.2-2,"(chr1, 12721, 13220, +)",12721,"((ENST00000456328.2, 2),)","(((ENST00000456328.2, 2), (ENST00000456328.2, ..."
1,chr1,30039,30563,+,intron;chr1;30039;30563;+;ENST00000473358.1-1,"(chr1, 30039, 30563, +)",30039,"((ENST00000473358.1, 1),)","(((ENST00000473358.1, 1), (ENST00000473358.1, ..."
2,chr1,30667,30975,+,intron;chr1;30667;30975;+;ENST00000469289.1-1|...,"(chr1, 30667, 30975, +)",30667,"((ENST00000469289.1, 1), (ENST00000473358.1, 2))","(((ENST00000469289.1, 1), (ENST00000469289.1, ..."
3,chr1,35174,35276,-,intron;chr1;35174;35276;-;ENST00000417324.1-2,"(chr1, 35174, 35276, -)",35275,"((ENST00000417324.1, 2),)","(((ENST00000417324.1, 2), (ENST00000417324.1, ..."
4,chr1,35481,35720,-,intron;chr1;35481;35720;-;ENST00000417324.1-1|...,"(chr1, 35481, 35720, -)",35719,"((ENST00000417324.1, 1), (ENST00000461467.1, 1))","(((ENST00000417324.1, 1), (ENST00000417324.1, ..."
...,...,...,...,...,...,...,...,...,...
389733,chrY,57209733,57209821,+,intron;chrY;57209733;57209821;+;ENST0000071127...,"(chrY, 57209733, 57209821, +)",57209733,"((ENST00000711272.1, 6), (ENST00000711275.1, 5...","(((ENST00000711272.1, 6), (ENST00000711272.1, ..."
389734,chrY,57209980,57210639,+,intron;chrY;57209980;57210639;+;ENST0000071127...,"(chrY, 57209980, 57210639, +)",57209980,"((ENST00000711272.1, 7), (ENST00000711276.1, 5...","(((ENST00000711272.1, 7), (ENST00000711272.1, ..."
389735,chrY,57210792,57211551,+,intron;chrY;57210792;57211551;+;ENST0000071127...,"(chrY, 57210792, 57211551, +)",57210792,"((ENST00000711272.1, 8), (ENST00000711276.1, 6...","(((ENST00000711272.1, 8), (ENST00000711272.1, ..."
389736,chrY,57211569,57211760,+,intron;chrY;57211569;57211760;+;ENST0000071128...,"(chrY, 57211569, 57211760, +)",57211569,"((ENST00000711285.1, 2),)","(((ENST00000711285.1, 2), (ENST00000711285.1, ..."


In [60]:
exploded_introns = introns.explode('adj_exons', ignore_index=True)
exploded_introns

Unnamed: 0,chrom,start,end,strand,name,coords,fivep_pos,annotations,adj_exons
0,chr1,12721,13220,+,intron;chr1;12721;13220;+;ENST00000456328.2-2,"(chr1, 12721, 13220, +)",12721,"((ENST00000456328.2, 2),)","((ENST00000456328.2, 2), (ENST00000456328.2, 3))"
1,chr1,30039,30563,+,intron;chr1;30039;30563;+;ENST00000473358.1-1,"(chr1, 30039, 30563, +)",30039,"((ENST00000473358.1, 1),)","((ENST00000473358.1, 1), (ENST00000473358.1, 2))"
2,chr1,30667,30975,+,intron;chr1;30667;30975;+;ENST00000469289.1-1|...,"(chr1, 30667, 30975, +)",30667,"((ENST00000469289.1, 1), (ENST00000473358.1, 2))","((ENST00000469289.1, 1), (ENST00000469289.1, 2))"
3,chr1,30667,30975,+,intron;chr1;30667;30975;+;ENST00000469289.1-1|...,"(chr1, 30667, 30975, +)",30667,"((ENST00000469289.1, 1), (ENST00000473358.1, 2))","((ENST00000473358.1, 2), (ENST00000473358.1, 3))"
4,chr1,35174,35276,-,intron;chr1;35174;35276;-;ENST00000417324.1-2,"(chr1, 35174, 35276, -)",35275,"((ENST00000417324.1, 2),)","((ENST00000417324.1, 2), (ENST00000417324.1, 3))"
...,...,...,...,...,...,...,...,...,...
1378153,chrY,57211620,57211760,+,intron;chrY;57211620;57211760;+;ENST0000071127...,"(chrY, 57211620, 57211760, +)",57211620,"((ENST00000711272.1, 9), (ENST00000711276.1, 7...","((ENST00000711272.1, 9), (ENST00000711272.1, 10))"
1378154,chrY,57211620,57211760,+,intron;chrY;57211620;57211760;+;ENST0000071127...,"(chrY, 57211620, 57211760, +)",57211620,"((ENST00000711272.1, 9), (ENST00000711276.1, 7...","((ENST00000711276.1, 7), (ENST00000711276.1, 8))"
1378155,chrY,57211620,57211760,+,intron;chrY;57211620;57211760;+;ENST0000071127...,"(chrY, 57211620, 57211760, +)",57211620,"((ENST00000711272.1, 9), (ENST00000711276.1, 7...","((ENST00000711277.1, 5), (ENST00000711277.1, 6))"
1378156,chrY,57211620,57211760,+,intron;chrY;57211620;57211760;+;ENST0000071127...,"(chrY, 57211620, 57211760, +)",57211620,"((ENST00000711272.1, 9), (ENST00000711276.1, 7...","((ENST00000711278.1, 7), (ENST00000711278.1, 8))"


In [78]:
star_aligns = pd.read_csv(f'{out_dir}/STAR/overlaps.bed', sep='\t', header=None)
star_aligns.columns = ['chrom', 'align_start', 'align_end', 'read_id', 'quality', 'align_orient', '_', '_', '_', 'n_segments', 'block_lengths', 'block_starts', '_', '_', '_', '_', 'features', '_', '_']
star_aligns = star_aligns.drop(columns='_')
# No multimapping
star_aligns['align_loc'] = star_aligns.apply(lambda row: f"{row['chrom']}_{row['align_start']}_{row['align_end']}_{row['align_orient']}", axis=1)
assert star_aligns.groupby('read_id').align_loc.apply(lambda id_: id_.nunique()==1).all()
star_aligns

Unnamed: 0,chrom,align_start,align_end,read_id,quality,align_orient,n_segments,block_lengths,block_starts,features,align_loc
0,chr2,73249031,73249180,NGSNJ-086:229:GW200110425th:1:1101:16459:24298,255,+,1,149,0,intron;chr2;73249179;73249818;+;ENST0000025809...,chr2_73249031_73249180_+
1,chr2,73249031,73249180,NGSNJ-086:229:GW200110425th:1:1101:16459:24298,255,+,1,149,0,exon;chr2;73248990;73249061;+;ENST00000488856.1-4,chr2_73249031_73249180_+
2,chr2,73249031,73249180,NGSNJ-086:229:GW200110425th:1:1101:16459:24298,255,+,1,149,0,exon;chr2;73248990;73249179;+;ENST00000540468....,chr2_73249031_73249180_+
3,chr19,40284680,40284817,NGSNJ-086:229:GW200110425th:1:1101:16984:24298,255,+,1,137,0,intron;chr19;40282304;40284863;-;ENST000004564...,chr19_40284680_40284817_+
4,chr19,40284680,40284817,NGSNJ-086:229:GW200110425th:1:1101:16984:24298,255,+,1,137,0,intron;chr19;40282776;40284764;-;ENST000004231...,chr19_40284680_40284817_+
...,...,...,...,...,...,...,...,...,...,...,...
340111,chr5,133959933,133968508,NGSNJ-086:229:GW200110425th:1:1101:14977:24298,255,+,2,8763,08512,intron;chr5;133960020;133968525;-;ENST00000507...,chr5_133959933_133968508_+
340112,chr5,133959933,133968508,NGSNJ-086:229:GW200110425th:1:1101:14977:24298,255,+,2,8763,08512,exon;chr5;133959493;133960020;-;ENST0000023151...,chr5_133959933_133968508_+
340113,chr5,133959933,133968508,NGSNJ-086:229:GW200110425th:1:1101:14977:24298,255,+,2,8763,08512,exon;chr5;133959534;133960020;-;ENST0000050719...,chr5_133959933_133968508_+
340114,chr5,133959933,133968508,NGSNJ-086:229:GW200110425th:1:1101:14977:24298,255,+,2,8763,08512,exon;chr5;133959777;133960020;-;ENST0000050991...,chr5_133959933_133968508_+


In [79]:
def infer_segments(row):
	starts = row['block_starts'].split(',')[:-1]
	starts = [row['align_start']+int(start) for start in starts]
	lengths = row['block_lengths'].split(',')[:-1]
	lengths = [int(length) for length in lengths]

	segments = []
	for i in range(len(starts)):
		segment_start = starts[i]
		segment_end = starts[i] + lengths[i] 
		segments.append(Interval(segment_start, segment_end))

	return tuple(segments)



star_aligns.features = star_aligns.features.transform(lambda features: parse_feature_name(features, transcripts) if features!='.' else '.')
star_aligns = star_aligns.groupby(['read_id', 'chrom', 'align_start', 'align_end', 'align_orient', 'quality', 'n_segments', 'block_lengths', 'block_starts']).agg({'features':tuple}).reset_index()
print(star_aligns.at[0, 'features'])
star_aligns['segment'] = star_aligns.apply(infer_segments, axis=1)
star_aligns

(('exon', 'chr22', 43161051, 43161190, '+', (('ENST00000583777.5', 2), ('ENST00000337554.8', 3), ('ENST00000428336.5', 3), ('ENST00000329563.8', 3), ('ENST00000396265.4', 3)), ('protein_coding',)), ('exon', 'chr22', 43162802, 43163210, '+', (('ENST00000329563.8', 4),), ('protein_coding',)), ('exon', 'chr22', 43162802, 43163241, '+', (('ENST00000583777.5', 3), ('ENST00000396265.4', 4)), ('protein_coding',)), ('exon', 'chr22', 43162802, 43163242, '+', (('ENST00000337554.8', 4),), ('protein_coding',)))


Unnamed: 0,read_id,chrom,align_start,align_end,align_orient,quality,n_segments,block_lengths,block_starts,features,segment
0,NGSNJ-086:229:GW200110425th:1:1101:10004:11303,chr22,43161160,43162922,-,255,2,30120,01642,"((exon, chr22, 43161051, 43161190, +, (('ENST0...","((43161160, 43161190, None), (43162802, 431629..."
1,NGSNJ-086:229:GW200110425th:1:1101:10004:11992,chr9,9442096,9442246,+,3,1,150,0,"((exon, chr9, 9442059, 9442380, +, (('ENST0000...","((9442096, 9442246, None),)"
2,NGSNJ-086:229:GW200110425th:1:1101:10004:12085,chr14,49586699,49586849,-,3,1,150,0,"((exon, chr14, 49586579, 49586878, +, (('ENST0...","((49586699, 49586849, None),)"
3,NGSNJ-086:229:GW200110425th:1:1101:10004:12117,chr14,49862679,49862792,+,3,1,113,0,"((intron, chr14, 49862316, 49863828, -, (('ENS...","((49862679, 49862792, None),)"
4,NGSNJ-086:229:GW200110425th:1:1101:10004:13244,chr14,49862600,49862750,-,3,1,150,0,"((intron, chr14, 49862316, 49863828, -, (('ENS...","((49862600, 49862750, None),)"
...,...,...,...,...,...,...,...,...,...,...,...
83001,NGSNJ-086:229:GW200110425th:1:1101:9995:27352,chr14,32163959,32164109,-,255,1,150,0,"(.,)","((32163959, 32164109, None),)"
83002,NGSNJ-086:229:GW200110425th:1:1101:9995:3928,chr11,65546991,65547095,+,255,1,104,0,"((intron, chr11, 65546920, 65547438, -, (('ENS...","((65546991, 65547095, None),)"
83003,NGSNJ-086:229:GW200110425th:1:1101:9995:4961,chr13,48233331,48253811,-,255,2,1464,020476,"((exon, chr13, 48233202, 48233477, +, (('ENST0...","((48233331, 48233477, None), (48253807, 482538..."
83004,NGSNJ-086:229:GW200110425th:1:1101:9995:5807,chr1,114750152,114757991,-,255,2,5567,07772,"((exon, chr1, 114749816, 114750207, -, (('ENST...","((114750152, 114750207, None), (114757924, 114..."


In [80]:
def infer_seg_features(row):
	if row['features'] == ('.',):
		return ('.',)
	
	out = []
	for feat in row['features']:
		if row['segment'].overlaps(feat[2], feat[3]):
			out.append(feat)

	# If any protein-coding gene features, remove non-protein-coding gene features
	protein_coding = [feat for feat in row['features'] if 'protein_coding' in feat[6]]
	if len(protein_coding) > 0:
		out = tuple(protein_coding)

	return tuple(out)


star_aligns = star_aligns.explode('segment')
star_aligns['seg_features'] = star_aligns.apply(infer_seg_features, axis=1)
star_aligns

Unnamed: 0,read_id,chrom,align_start,align_end,align_orient,quality,n_segments,block_lengths,block_starts,features,segment,seg_features
0,NGSNJ-086:229:GW200110425th:1:1101:10004:11303,chr22,43161160,43162922,-,255,2,30120,01642,"((exon, chr22, 43161051, 43161190, +, (('ENST0...","(43161160, 43161190, None)","((exon, chr22, 43161051, 43161190, +, (('ENST0..."
0,NGSNJ-086:229:GW200110425th:1:1101:10004:11303,chr22,43161160,43162922,-,255,2,30120,01642,"((exon, chr22, 43161051, 43161190, +, (('ENST0...","(43162802, 43162922, None)","((exon, chr22, 43161051, 43161190, +, (('ENST0..."
1,NGSNJ-086:229:GW200110425th:1:1101:10004:11992,chr9,9442096,9442246,+,3,1,150,0,"((exon, chr9, 9442059, 9442380, +, (('ENST0000...","(9442096, 9442246, None)","((intron, chr9, 9397482, 9574731, -, (('ENST00..."
2,NGSNJ-086:229:GW200110425th:1:1101:10004:12085,chr14,49586699,49586849,-,3,1,150,0,"((exon, chr14, 49586579, 49586878, +, (('ENST0...","(49586699, 49586849, None)","((intron, chr14, 49586049, 49598399, -, (('ENS..."
3,NGSNJ-086:229:GW200110425th:1:1101:10004:12117,chr14,49862679,49862792,+,3,1,113,0,"((intron, chr14, 49862316, 49863828, -, (('ENS...","(49862679, 49862792, None)","((intron, chr14, 49862316, 49863828, -, (('ENS..."
...,...,...,...,...,...,...,...,...,...,...,...,...
83003,NGSNJ-086:229:GW200110425th:1:1101:9995:4961,chr13,48233331,48253811,-,255,2,1464,020476,"((exon, chr13, 48233202, 48233477, +, (('ENST0...","(48233331, 48233477, None)","((exon, chr13, 48233202, 48233477, +, (('ENST0..."
83003,NGSNJ-086:229:GW200110425th:1:1101:9995:4961,chr13,48233331,48253811,-,255,2,1464,020476,"((exon, chr13, 48233202, 48233477, +, (('ENST0...","(48253807, 48253811, None)","((exon, chr13, 48233202, 48233477, +, (('ENST0..."
83004,NGSNJ-086:229:GW200110425th:1:1101:9995:5807,chr1,114750152,114757991,-,255,2,5567,07772,"((exon, chr1, 114749816, 114750207, -, (('ENST...","(114750152, 114750207, None)","((exon, chr1, 114749816, 114750207, -, (('ENST..."
83004,NGSNJ-086:229:GW200110425th:1:1101:9995:5807,chr1,114750152,114757991,-,255,2,5567,07772,"((exon, chr1, 114749816, 114750207, -, (('ENST...","(114757924, 114757991, None)","((exon, chr1, 114749816, 114750207, -, (('ENST..."


In [81]:
def identify_segment(row):
	if row['seg_features']==('.',):
		return 'Inter-genic'
	
	exons = [feat for feat in row['seg_features'] if feat[0]=='exon']
	introns = [feat for feat in row['seg_features'] if feat[0]=='intron']

	if len(exons) > 0 and len(introns) == 0:
		return 'Exonic'

	if len(exons) == 0 and len(introns) > 0:
		# Check for start at 5'ss
		for intron in introns:
			if intron[5]=='+' and intron[2]==row['segment'].begin:
				return "Ends at a 5'ss"
			elif intron[5]=='-' and intron[3]==row['segment'].end:
				return "Ends at a 5'ss"
		return 'Intronic'

	# We can deduce that there's at least 1 exon and 1 intron at this point
	exons_interval = IntervalTree.from_tuples([(exon[2], exon[3]) for exon in exons])
	if tree_covers_interval(exons_interval, row['segment']) is True:
		return 'Genic, ambiguous'
	introns_interval = IntervalTree.from_tuples([(intron[2], intron[3]) for intron in introns])
	if tree_covers_interval(introns_interval, row['segment']) is True:
		return 'Genic, ambiguous'

	return 'pre-mRNA'

star_aligns['seg_species'] = star_aligns.apply(identify_segment, axis=1)
star_aligns

Unnamed: 0,read_id,chrom,align_start,align_end,align_orient,quality,n_segments,block_lengths,block_starts,features,segment,seg_features,seg_species
0,NGSNJ-086:229:GW200110425th:1:1101:10004:11303,chr22,43161160,43162922,-,255,2,30120,01642,"((exon, chr22, 43161051, 43161190, +, (('ENST0...","(43161160, 43161190, None)","((exon, chr22, 43161051, 43161190, +, (('ENST0...",Exonic
0,NGSNJ-086:229:GW200110425th:1:1101:10004:11303,chr22,43161160,43162922,-,255,2,30120,01642,"((exon, chr22, 43161051, 43161190, +, (('ENST0...","(43162802, 43162922, None)","((exon, chr22, 43161051, 43161190, +, (('ENST0...",Exonic
1,NGSNJ-086:229:GW200110425th:1:1101:10004:11992,chr9,9442096,9442246,+,3,1,150,0,"((exon, chr9, 9442059, 9442380, +, (('ENST0000...","(9442096, 9442246, None)","((intron, chr9, 9397482, 9574731, -, (('ENST00...",Intronic
2,NGSNJ-086:229:GW200110425th:1:1101:10004:12085,chr14,49586699,49586849,-,3,1,150,0,"((exon, chr14, 49586579, 49586878, +, (('ENST0...","(49586699, 49586849, None)","((intron, chr14, 49586049, 49598399, -, (('ENS...",Intronic
3,NGSNJ-086:229:GW200110425th:1:1101:10004:12117,chr14,49862679,49862792,+,3,1,113,0,"((intron, chr14, 49862316, 49863828, -, (('ENS...","(49862679, 49862792, None)","((intron, chr14, 49862316, 49863828, -, (('ENS...","Genic, ambiguous"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
83003,NGSNJ-086:229:GW200110425th:1:1101:9995:4961,chr13,48233331,48253811,-,255,2,1464,020476,"((exon, chr13, 48233202, 48233477, +, (('ENST0...","(48233331, 48233477, None)","((exon, chr13, 48233202, 48233477, +, (('ENST0...","Genic, ambiguous"
83003,NGSNJ-086:229:GW200110425th:1:1101:9995:4961,chr13,48233331,48253811,-,255,2,1464,020476,"((exon, chr13, 48233202, 48233477, +, (('ENST0...","(48253807, 48253811, None)","((exon, chr13, 48233202, 48233477, +, (('ENST0...","Genic, ambiguous"
83004,NGSNJ-086:229:GW200110425th:1:1101:9995:5807,chr1,114750152,114757991,-,255,2,5567,07772,"((exon, chr1, 114749816, 114750207, -, (('ENST...","(114750152, 114750207, None)","((exon, chr1, 114749816, 114750207, -, (('ENST...","Genic, ambiguous"
83004,NGSNJ-086:229:GW200110425th:1:1101:9995:5807,chr1,114750152,114757991,-,255,2,5567,07772,"((exon, chr1, 114749816, 114750207, -, (('ENST...","(114757924, 114757991, None)","((exon, chr1, 114749816, 114750207, -, (('ENST...","Genic, ambiguous"


In [82]:
def identify_read_species(df):
	assigned_species = set(df['seg_species'].values)
	if len(assigned_species) == 1:
		assigned_species = assigned_species.pop()
		if len(df)>1 and assigned_species == 'Exonic':
			return 'mRNA'
		else:
			return assigned_species
	
	if 'pre-mRNA' in assigned_species:
		return 'pre-mRNA'
	if 'Genic, ambiguous' in assigned_species:
		return 'Genic, ambiguous'

	if assigned_species in ({'Intronic', 'Exonic'},
							{'Inter-genic', 'Exonic'},
							{'Inter-genic', 'Intronic'}):
		return 'Weird spliced'

	print(assigned_species)
	return 'Dunno'

# spliced_aligns['species'] = spliced_aligns.read_id.map(spliced_aligns.groupby('read_id').apply(identify_spliced_species))

star_species = star_aligns.groupby('read_id').apply(identify_read_species).reset_index(name='species')
print(star_species.species.value_counts())
star_species

species
Intronic            29983
Genic, ambiguous    20755
Exonic              20182
mRNA                 8960
Inter-genic          2372
pre-mRNA              747
Weird spliced           7
Name: count, dtype: int64




Unnamed: 0,read_id,species
0,NGSNJ-086:229:GW200110425th:1:1101:10004:11303,mRNA
1,NGSNJ-086:229:GW200110425th:1:1101:10004:11992,Intronic
2,NGSNJ-086:229:GW200110425th:1:1101:10004:12085,Intronic
3,NGSNJ-086:229:GW200110425th:1:1101:10004:12117,"Genic, ambiguous"
4,NGSNJ-086:229:GW200110425th:1:1101:10004:13244,"Genic, ambiguous"
...,...,...
83001,NGSNJ-086:229:GW200110425th:1:1101:9995:27352,Inter-genic
83002,NGSNJ-086:229:GW200110425th:1:1101:9995:3928,Intronic
83003,NGSNJ-086:229:GW200110425th:1:1101:9995:4961,"Genic, ambiguous"
83004,NGSNJ-086:229:GW200110425th:1:1101:9995:5807,"Genic, ambiguous"


In [None]:
# unspliced_aligns = star_aligns.loc[star_aligns.n_segments==1].copy().reset_index()
# spliced_aligns = star_aligns.loc[star_aligns.n_segments>1].copy().reset_index()
# print(len(unspliced_aligns), len(spliced_aligns))
# assert unspliced_aligns.read_id.is_unique

In [None]:
# def unspliced_filter_features(row):
# 	out = row['features']

# 	# If any protein-coding gene features, remove non-protein-coding gene features
# 	protein_coding = [feat for feat in row['features'] if feat.data[6]=='protein_coding']
# 	if len(protein_coding) > 0:
# 		out = IntervalTree(protein_coding)
	
# 	return out

def unspliced_filter_features(row):
	if row['features'] == ('.',):
		return IntervalTree([])
	
	out = [Interval(feature[2], feature[3], feature) for feature in row['features']]
	return IntervalTree(out)


def identify_unspliced_species(row):
	if len(row['filtered_features']) == 0:
		return 'Inter-genic'

	exons = IntervalTree([feat for feat in row['filtered_features'] if feat.data[0]=='exon'])
	introns = IntervalTree([feat for feat in row['filtered_features'] if feat.data[0]=='intron'])

	if len(exons) > 0 and len(introns) == 0:
		return 'Exonic'

	if len(exons) == 0 and len(introns) > 0:
		# Check for start at 5'ss
		for intron in introns:
			if intron.data[4]=='+' and intron.begin==row['align_start']:
				return "Ends at a 5'ss"
			elif intron.data[4]=='-' and intron.end==row['align_end']:
				return "Ends at a 5'ss"
		return 'Intronic'

	# We can deduce that there's at least 1 exon and 1 intron at this point
	if tree_covers_interval(exons, Interval(row['align_start'], row['align_end'])) is True:
		return 'Genic, ambiguous'
	if tree_covers_interval(introns, Interval(row['align_start'], row['align_end'])) is True:
		return 'Genic, ambiguous'

	return 'pre-mRNA'



unspliced_aligns = unspliced_aligns.set_index('read_id', drop=False)
unspliced_aligns['filtered_features'] = unspliced_aligns.apply(unspliced_filter_features, axis=1, result_type='reduce')
# unspliced_aligns['filtered_gene_types'] = unspliced_aligns.filtered_features.transform(lambda features: tuple(set([feat.data[6] for feat in features])))
unspliced_aligns['species'] = unspliced_aligns.apply(identify_unspliced_species, axis=1)
print(unspliced_aligns.species.value_counts())
unspliced_aligns

In [None]:
unspliced_aligns['full_coverage'] = unspliced_aligns.apply(lambda row: tree_covers_interval(row['filtered_features'], Interval(row['align_start'], row['align_end'])), axis=1)
print(unspliced_aligns.full_coverage.value_counts())

In [None]:
def spliced_filter_features(group:pd.DataFrame):
	# out = group.features.iloc[0]
	# all_gids = set([feat.data[4] for feat in out])
	# n_segments = len(group.segments.iloc[0])
	
	# # Remove features from genes that don't cover all segments
	# for gid in all_gids:
	# 	segments_in_gene = group.loc[group.seg_gids.transform(str).str.contains(gid)]
	# 	if len(segments_in_gene) < n_segments:
	# 		out = IntervalTree([feat for feat in out if feat.data[4] != gid])
	
	# If any protein-coding gene features, remove non-protein-coding gene features
	out = group.features.iloc[0]
	protein_coding = [feat for feat in out if feat.data[6]=='protein_coding']
	if len(protein_coding) > 0:
		out = IntervalTree(protein_coding)
	
	return out


def infer_seg_features(row, filtered:bool):
	seg_interval = Interval(row['seg_start'], row['seg_end'])
	features = row['filtered_features'] if filtered is True else row['features']

	out = IntervalTree([feat for feat in features if feat.overlaps(seg_interval)])
	return out



spliced_aligns = spliced_aligns.explode('segments').reset_index(drop=True)
spliced_aligns['seg_start'] = spliced_aligns.segments.transform(lambda s: s.begin)
spliced_aligns['seg_end'] = spliced_aligns.segments.transform(lambda s: s.end)
spliced_aligns['seg_features'] = spliced_aligns.apply(infer_seg_features, filtered=False, axis=1)
spliced_aligns['seg_gids'] = spliced_aligns.seg_features.transform(lambda features: tuple(set([feat.data[4] for feat in features])))
spliced_aligns['filtered_features'] = spliced_aligns.read_id.map(spliced_aligns.groupby('read_id').apply(spliced_filter_features))
spliced_aligns['filtered_seg_features'] = spliced_aligns.apply(infer_seg_features, filtered=True, axis=1)
spliced_aligns['filtered_seg_gene_types'] = spliced_aligns.filtered_seg_features.transform(lambda features: tuple(set([feat.data[6] for feat in features])))
spliced_aligns

In [None]:
assert set(unspliced_aligns.filtered_gene_types.unique()).issubset({(), ('ncRNA',), ('protein_coding',), ('Ig or TcR',)})
assert set(spliced_aligns.filtered_seg_gene_types.unique()).issubset({(), ('ncRNA',), ('protein_coding',), ('Ig or TcR',)})

In [None]:
def identify_segment(row):
	if len(row['filtered_seg_features'])==0:
		return 'Inter-genic'
	
	seg_interval = Interval(row['seg_start'], row['seg_end'])
	exons = IntervalTree([feat for feat in row['filtered_seg_features'] if feat.data[0]=='exon'])
	introns = IntervalTree([feat for feat in row['filtered_seg_features'] if feat.data[0]=='intron'])

	if len(exons) > 0 and len(introns) == 0:
		return 'Exonic'

	if len(exons) == 0 and len(introns) > 0:
		# Check for start at 5'ss
		for intron in introns:
			if intron.data[3]=='+' and intron.begin==seg_interval.begin:
				return "Ends at a 5'ss"
			elif intron.data[3]=='-' and intron.end==seg_interval.end:
				return "Ends at a 5'ss"
		return 'Intronic'

	# We can deduce that there's at least 1 exon and 1 intron at this point
	if tree_covers_interval(exons, seg_interval) is True:
		return 'Genic, ambiguous'
	if tree_covers_interval(introns, seg_interval) is True:
		return 'Genic, ambiguous'

	return 'pre-mRNA'

spliced_aligns['seg_species'] = spliced_aligns.apply(identify_segment, axis=1)
print(spliced_aligns.seg_species.value_counts())

In [None]:
def identify_spliced_species(df):	
	assigned_species = set(df['seg_species'].values)
	if len(assigned_species) == 1:
		assigned_species = assigned_species.pop()
		if assigned_species == 'Exonic':
			return 'mRNA'
		else:
			return assigned_species
	
	if 'pre-mRNA' in assigned_species:
		return 'pre-mRNA'
	if 'Genic, ambiguous' in assigned_species:
		return 'Genic, ambiguous'
	if assigned_species in ({'Intronic', 'Exonic'},
							{'Inter-genic', 'Exonic'},
							{'Inter-genic', 'Intronic'}):
		return 'Weird spliced'

	print(assigned_species)
	return 'Dunno'

spliced_aligns['species'] = spliced_aligns.read_id.map(spliced_aligns.groupby('read_id').apply(identify_spliced_species))
print(spliced_aligns.drop_duplicates('read_id').species.value_counts())

In [None]:
# If weird spliced is less than 1% prevalent, lump it with Genic, ambiguous
prop = spliced_aligns.species.eq('Weird spliced').value_counts(normalize=True)[True]*100
assert prop < 1, prop
spliced_aligns.species = spliced_aligns.species.replace('Weird spliced', 'Genic, ambiguous')

In [None]:
print(spliced_aligns.drop_duplicates('read_id').species.value_counts())

In [None]:
assert spliced_aligns.species.eq('Dunno').sum() == 0
# spliced_aligns.loc[spliced_aligns.species.eq('Dunno')]

In [7]:
output_base='/Users/trumanmooney/Downloads/alpha_main_merge/testing/output/C22_test_read1_single_end_hisat2/C22_test_'

In [26]:
lariat_rids = pd.read_csv(f'{output_base}lariat_reads.tsv', sep='\t', usecols=[0]).read_id
lariat_rids = set(lariat_rids)
lariat_rids

{'NGSNJ-086:229:GW200110425th:1:1101:2166:13448',
 'NGSNJ-086:229:GW200110425th:1:1101:24551:5713',
 'NGSNJ-086:229:GW200110425th:1:1101:30291:23797',
 'NGSNJ-086:229:GW200110425th:1:1101:5394:15374',
 'NGSNJ-086:229:GW200110425th:1:1101:8314:28354'}

In [13]:
circular_rids = pd.read_csv(f'{output_base}circularized_introns.tsv', sep='\t').read_id
circular_rids = set(circular_rids)
circular_rids

set()

In [28]:
temp_switch_rids = pd.read_csv(f'{output_base}template_switching_alignments.tsv', sep='\t').read_id
temp_switch_rids = set(temp_switch_rids)
temp_switch_rids

FileNotFoundError: [Errno 2] No such file or directory: '/Users/trumanmooney/Downloads/alpha_main_merge/testing/output/C22_test_read1_single_end_hisat2/C22_test_template_switching_alignments.tsv'

In [15]:
lariat_failed = pd.read_csv(f'{output_base}failed_lariat_alignments.tsv', sep='\t')
lariat_failed = lariat_failed[['read_id', 'filter_failed']]
lariat_failed

Unnamed: 0,read_id,read_is_reverse,read_seq,chrom,strand,head_start,head_end,fivep_pos,bp_pos,read_bp_nt,...,genomic_bp_nt,threep_pos,bp_dist_to_threep,gene_id,gene_name,gene_type,read_num,align_mismatch,total_mapped_reads,filter_failed
0,NGSNJ-086:229:GW200110425th:1:1101:8314:28354,False,GGGGACCCAGTGGAGCACACGGGTGAGCCCAGTGCAGCCGGGTGGC...,chr1,-,1634741,1634763,1634913,1634741,G,...,A,1634708,-33,ENSG00000008128.23_12,CDK11A,protein_coding,1,True,55100,not_chosen


In [20]:
trim_failed = pd.read_csv(f'{output_base}failed_trimmed_alignments.tsv', sep='\t')
trim_failed.read_id = trim_failed.read_id.transform(lambda rid: rid[:-4].split('/')[0])
trim_failed = trim_failed[['read_id', 'filter_failed']].drop_duplicates(ignore_index=True)
trim_failed

Unnamed: 0,read_id,filter_failed
0,NGSNJ-086:229:GW200110425th:1:1101:21630:7106,within_intron
1,NGSNJ-086:229:GW200110425th:1:1101:6072:17143,within_intron
2,NGSNJ-086:229:GW200110425th:1:1101:13648:19335,within_intron
3,NGSNJ-086:229:GW200110425th:1:1101:28818:13416,within_intron
4,NGSNJ-086:229:GW200110425th:1:1101:2700:7952,within_intron
...,...,...
213,NGSNJ-086:229:GW200110425th:1:1101:14172:15546,within_intron
214,NGSNJ-086:229:GW200110425th:1:1101:23439:13902,within_intron
215,NGSNJ-086:229:GW200110425th:1:1101:20618:16470,within_intron
216,NGSNJ-086:229:GW200110425th:1:1101:13295:19069,within_intron


In [24]:
fivep_passed_rids = pd.read_csv(f'{output_base}fivep_info_table.tsv', sep='\t').read_id
fivep_passed_rids = set(fivep_passed_rids.transform(lambda rid: rid[:-4].split('/')[0]))
fivep_passed_rids

{'NGSNJ-086:229:GW200110425th:1:1101:30219:27336',
 'NGSNJ-086:229:GW200110425th:1:1101:12698:9111',
 'NGSNJ-086:229:GW200110425th:1:1101:13856:9643',
 'NGSNJ-086:229:GW200110425th:1:1101:4788:11506',
 'NGSNJ-086:229:GW200110425th:1:1101:12292:17331',
 'NGSNJ-086:229:GW200110425th:1:1101:24361:16031',
 'NGSNJ-086:229:GW200110425th:1:1101:24939:25770',
 'NGSNJ-086:229:GW200110425th:1:1101:26024:5478',
 'NGSNJ-086:229:GW200110425th:1:1101:22010:22733',
 'NGSNJ-086:229:GW200110425th:1:1101:5168:22623',
 'NGSNJ-086:229:GW200110425th:1:1101:23818:10238',
 'NGSNJ-086:229:GW200110425th:1:1101:11460:20149',
 'NGSNJ-086:229:GW200110425th:1:1101:28411:17848',
 'NGSNJ-086:229:GW200110425th:1:1101:9200:26412',
 'NGSNJ-086:229:GW200110425th:1:1101:30942:23171',
 'NGSNJ-086:229:GW200110425th:1:1101:6054:9471',
 'NGSNJ-086:229:GW200110425th:1:1101:5538:9298',
 'NGSNJ-086:229:GW200110425th:1:1101:31295:8688',
 'NGSNJ-086:229:GW200110425th:1:1101:2917:17628',
 'NGSNJ-086:229:GW200110425th:1:1101:3052:1

In [27]:
assert len(lariat_rids.intersection(circular_rids)) == 0, lariat_rids.intersection(circular_rids)
assert len(lariat_rids.intersection(temp_switch_rids)) == 0, lariat_rids.intersection(temp_switch_rids)
assert len(temp_switch_rids.intersection(circular_rids)) == 0, temp_switch_rids.intersection(circular_rids)

NameError: name 'temp_switch_rids' is not defined

In [None]:
read_class = []
for read_id in fivep_passed_rids:
	if read_id in lariat_rids:
		read_class.append((read_id, 'Lariat'))
	elif read_id in lariat_failed.loc[lariat_failed.filter_failed.isin(('in_repeat', 'ubiquitin_gene'))]:
		read_class.append((read_id, 'Repeat region'))
	elif read_id in circular_rids:
		read_class.append((read_id, 'Circularized intron'))
	elif read_id in temp_switch_rids:
		read_class.append((read_id, 'Template-switching'))
	else:
		read_class.append((read_id, "Has 5'ss alignment"))

read_class = pd.DataFrame(read_class, columns='read_id')
read_class

In [None]:
read_class_counts = read_class.value_counts().to_dict()
read_class_counts

In [None]:
lariat_passed = pd.read_csv(f'{out_dir}/lariat_reads.tsv', sep='\t')
lariat_passed = lariat_passed.set_index('read_id', verify_integrity=True)

matched_introns = []
for rid, row in lariat_passed.iterrows():
	matches = introns.loc[(introns.chrom==row['chrom']) & 
						(introns.strand==row['strand']) & 
						(introns.fivep_pos==row['fivep_pos']) & 
						(introns.start<row['bp_pos']) & 
						(introns.end>row['bp_pos'])]
	matched_introns.append(tuple(matches.coords))
lariat_passed['introns'] = matched_introns

lariat_passed

In [None]:
lariat_failed = pd.read_csv(f'{out_dir}/failed_lariat_mappings.tsv', sep='\t')

matched_introns = []
for rid, row in lariat_failed.iterrows():
	matches = introns.loc[(introns.chrom==row['chrom']) & 
						(introns.strand==row['strand']) & 
						(introns.fivep_pos==row['fivep_pos']) & 
						(introns.start<row['bp_pos']) & 
						(introns.end>row['bp_pos'])]
	matched_introns.append(tuple(matches.coords))
lariat_failed['introns'] = matched_introns

lariat_failed

In [None]:
def process_fivep_sites(sites:str):
	out = []
	for site in sites.split(','):
		chrom, start, end, strand = site.split(';')
		start = int(start)
		end = int(end)
		out.append((chrom, start, end, strand))

	return tuple(out)

fivep_passed = pd.read_csv(f'{out_dir}/fivep_info_table.tsv', sep='\t')
fivep_passed.read_id = fivep_passed.read_id.str.slice(0, -4)
fivep_passed.fivep_sites = fivep_passed.fivep_sites.transform(process_fivep_sites)
fivep_passed

In [None]:
fivep_failed = pd.read_csv(f'{out_dir}/failed_fivep_alignments.tsv', sep='\t')
fivep_failed.fivep_site = fivep_failed.fivep_site.transform(process_fivep_sites)
assert fivep_failed.fivep_site.transform(len).eq(1).all()
fivep_failed

In [None]:
trim_failed = pd.read_csv(f'{out_dir}/failed_trimmed_alignments.tsv', sep='\t')
trim_failed.read_id = trim_failed.read_id.str.slice(0, -4)
trim_failed = pd.merge(trim_failed, fivep_passed[['read_id', 'read_is_reverse', 'fivep_sites', 'fivep_seq']], 'left', on=['read_id', 'read_is_reverse'], validate='many_to_one')
print(trim_failed.filter_failed.value_counts())
trim_failed

In [None]:
n=2
print(trim_failed.at[0, 'trim_seq'][-n:])
print(trim_failed.at[0, 'fivep_seq'][:n])

In [None]:
len('GTAAGTAGTGCTTGTGCTCA')

In [None]:
def seq_match(row, n:int):
	if not row['align_is_reverse']:
		return row['trim_seq'][-n:]==row['fivep_seq'][:n]
	else:	
		return Seq.reverse_complement(row['trim_seq'][:n])==row['fivep_seq'][:n]
	

	
trim_failed['match_1'] = trim_failed.apply(seq_match, n=1, axis=1)
print(trim_failed.match_1.value_counts())
trim_failed['match_2'] = trim_failed.apply(seq_match, n=2, axis=1)
print(trim_failed.match_2.value_counts())
trim_failed['match_3'] = trim_failed.apply(seq_match, n=3, axis=1)
print(trim_failed.match_3.value_counts())
trim_failed

In [None]:
unspliced_dict = unspliced_aligns.species.to_dict()
spliced_dict = spliced_aligns[['read_id', 'species']].drop_duplicates().set_index('read_id')['species'].to_dict()

In [None]:
input_rids = []
with gzip.open(input_fastq) as w:
	for read_id_line in w:
		rid = read_id_line.decode()[1:].split(' ')[0]
		input_rids.append(rid)
		w.readline()
		w.readline()
		w.readline()
print(len(input_rids))
input_rids

In [None]:
BAD_TRIM_FILTERS = ('mismatch_number', 'mismatch_percent', 'indel')

read_species = []
for rid in input_rids:
	# Passed lariat filtering
	if rid in lariat_passed.index:
		read_species.append((rid, 'Lariat'))

	# Passed trim filtering, failed final lariat filtering
	elif rid in lariat_failed.read_id.values:
		fails = set(lariat_failed.loc[lariat_failed.read_id==rid, 'filter_failed'])
		assert len(fails)==1, f'{source},{rid},{fails}'
		the_filter = fails.pop()

		if the_filter == 'near_ss':
			read_species.append((rid, 'Circular intron'))
		elif the_filter in ('ubiquitin_gene', 'in_repeat'):
			read_species.append((rid, 'Repeat region'))
		else:
			raise RuntimeError(rid)

	# Trim seq(s) mapped (not counting output alignments that were filtered out for bad quality), failed trim filtering
	# elif rid in trim_failed.loc[~trim_failed.filter_failed.isin(BAD_TRIM_FILTERS), 'read_id'].values:
	elif rid in trim_failed.read_id.values:
		read_species.append((rid, 'Template switching'))
	
	# Mapped linearly, no trim seq(s) mapped
	elif rid in unspliced_dict:
		read_species.append((rid, unspliced_dict[rid]))
	elif rid in spliced_dict:
		read_species.append((rid, spliced_dict[rid]))
	
	# 5'ss mapped to it, no linear or trim seq(s) mapped
	elif rid in fivep_passed.read_id.values or rid in fivep_failed.read_id.values:
		read_species.append((rid, "Unmapped, has a 5'ss alignment"))

	# No 5'ss mapped to it and no linear alignments
	else:
		read_species.append((rid, "Unmapped"))


read_species = pd.DataFrame(read_species, columns=['read_id', 'species'])
assert read_species.read_id.is_unique
read_species = read_species.set_index('read_id')
print(read_species.species.value_counts().sort_index())
read_species

In [None]:
unspliced_aligns['introns'] = unspliced_aligns.apply(lambda row: tuple(set([(row['chrom'], feat.begin, feat.end, feat.data[3]) for feat in row['filtered_features'] if feat.data[0]=='intron'])), axis=1)

unspliced_aligns['exon_annos'] = unspliced_aligns.filtered_features.transform(lambda features: set([(feat.data[1], feat.data[2]) for feat in features if feat.data[0]=='exon']))
merged = pd.merge(unspliced_aligns.loc[unspliced_aligns.species=='Exonic'].explode('exon_annos'), exploded_introns.explode('adj_exons'), left_on='exon_annos', right_on='adj_exons')
merged = merged.groupby('read_id').coords.agg(lambda coords: tuple(set(coords)))
unspliced_aligns.loc[unspliced_aligns.species=='Exonic', 'introns'] = unspliced_aligns.loc[unspliced_aligns.species=='Exonic', 'read_id'].map(merged)

In [None]:
spliced_aligns['introns'] = spliced_aligns.apply(lambda row: tuple(set([(row['chrom'], feat.begin, feat.end, feat.data[3]) for feat in row['filtered_seg_features'] if feat.data[0]=='intron'])), axis=1)

spliced_aligns['exon_annos'] = spliced_aligns.filtered_seg_features.transform(lambda features: set([(feat.data[1], feat.data[2]) for feat in features if feat.data[0]=='exon']))
merged = pd.merge(spliced_aligns.loc[spliced_aligns.seg_species=='Exonic'].explode('exon_annos'), exploded_introns.explode('adj_exons'), left_on='exon_annos', right_on='adj_exons')
merged = merged.groupby('read_id').coords.agg(lambda coords: tuple(set(coords)))
spliced_aligns.loc[spliced_aligns.seg_species=='Exonic', 'introns'] = spliced_aligns.loc[spliced_aligns.seg_species=='Exonic', 'read_id'].map(merged)
spliced_aligns

In [None]:
def filter_ends_at_fivep_introns(row, df:str):
	matches = set()
	start = row['seg_start'] if df=='spliced' else row['align_start']
	end = row['seg_end'] if df=='spliced' else row['align_end']
	for intron in row['introns']:
		if intron[-1]=='+' and intron[1]==start:
			matches.add(intron)
		elif intron[-1]=='-' and intron[2]==end:
			matches.add(intron)

	return tuple(matches)



spliced_aligns.loc[spliced_aligns.species=="Ends at a 5'ss", 'introns'] = spliced_aligns.loc[spliced_aligns.species=="Ends at a 5'ss"].apply(filter_ends_at_fivep_introns,  df='spliced', axis=1)
unspliced_aligns.loc[unspliced_aligns.species=="Ends at a 5'ss", 'introns'] = unspliced_aligns.loc[unspliced_aligns.species=="Ends at a 5'ss"].apply(filter_ends_at_fivep_introns, df='unspliced', axis=1)

In [None]:
matched_introns = []
# def get_introns(, lariat_passed, lariat_failed, unspliced_aligns, spliced_aligns, )
for rid in read_species.index:
	species = read_species.at[rid, 'species']
	
	if species in ('Inter-genic', 'Unmapped'):
		matched_introns.append(())

	elif species == 'Lariat':
		matched_introns.append(lariat_passed.at[rid, 'introns'])

	elif species == 'Circular intron':
		matches = lariat_failed.loc[(lariat_failed.read_id==rid) & (lariat_failed.filter_failed=='near_ss'), 'introns']
		matches = set([feat for feats in matches for feat in feats])
		matched_introns.append(tuple(matches))

	elif species == 'Repeat region':
		matches = lariat_failed.loc[(lariat_failed.read_id==rid) & (lariat_failed.filter_failed.isin(('in_repeat', 'ubiquitin_gene'))), 'introns']
		matches = set([feat for feats in matches for feat in feats])
		matched_introns.append(tuple(matches))

	elif species == 'Template switching':
		# rows = trim_failed.loc[(trim_failed.read_id==rid) & (~trim_failed.filter_failed.isin(BAD_TRIM_FILTERS))]
		# introns_from_rows = set()
		# for i, row in rows.iterrows():
		# 	for fivep_chrom, fivep_start, fivep_end, fivep_strand in row['fivep_sites']:
		# 		matches = introns.loc[(introns.chrom==fivep_chrom) & (introns.strand==fivep_strand)]
		# 		matches = matches.loc[((matches.strand=='+') & (matches.start==fivep_start)) | ((matches.strand=='-') & (matches.end-1==fivep_end))]
		# 		introns_from_rows.update(matches.coords)
		# matched_introns.append(tuple(introns_from_rows))
		fivep_sites = trim_failed.loc[trim_failed.read_id==rid, 'fivep_sites']
		fivep_sites = set([site for sites in fivep_sites for site in sites])
		all_matches = set()
		for fivep_chrom, fivep_start, fivep_end, fivep_strand in fivep_sites:
			matches = introns.loc[(introns.chrom==fivep_chrom) & (introns.strand==fivep_strand)]
			matches = matches.loc[((matches.strand=='+') & (matches.fivep_pos==fivep_start)) | ((matches.strand=='-') & (matches.fivep_pos==fivep_end-1))]
			all_matches.update(matches.coords)
		matched_introns.append(tuple(all_matches))
	
	elif species ==  "Unmapped, has a 5'ss alignment":
		fivep_sites = fivep_failed.loc[(fivep_failed.read_id==rid) & (fivep_failed.fail_reason!='furthest_upstream'), 'fivep_site']
		fivep_sites = pd.concat([fivep_sites, fivep_passed.loc[fivep_passed.read_id==rid, 'fivep_sites']])
		fivep_sites = set([site for sites in fivep_sites for site in sites])
		all_matches = set()
		for fivep_chrom, fivep_start, fivep_end, fivep_strand in fivep_sites:
			matches = introns.loc[(introns.chrom==fivep_chrom) & (introns.strand==fivep_strand)]
			matches = matches.loc[((matches.strand=='+') & (matches.fivep_pos==fivep_start)) | ((matches.strand=='-') & (matches.fivep_pos==fivep_end-1))]
			all_matches.update(matches.coords)
		matched_introns.append(tuple(all_matches))

	elif species == 'Genic, ambiguous':
		if rid in unspliced_aligns.index:
			matched_introns.append(unspliced_aligns.at[rid, 'introns'])
		else:
			matches = spliced_aligns.loc[(spliced_aligns.read_id==rid) & (spliced_aligns.introns.notna()), 'introns']
			matches = set([feat for feats in matches for feat in feats])
			matched_introns.append(tuple(matches))

	# For exonic, match introns adjacent to the exon
	elif species == 'Exonic':
		if pd.isna(unspliced_aligns.at[rid, 'introns']):
			matched_introns.append(())
		else:
			matched_introns.append(unspliced_aligns.at[rid, 'introns'])

	elif species == 'Intronic':
		if rid in unspliced_aligns.index:
			matched_introns.append(unspliced_aligns.at[rid, 'introns'])
		else:
			matches = spliced_aligns.loc[spliced_aligns.read_id==rid, 'introns']
			matches = set([feat for feats in matches for feat in feats])
			matched_introns.append(tuple(matches))

	# For mRNA, match introns for which both adjacent exons are covered
	elif species == 'mRNA':
		# Gather adjacent exons
		exons = spliced_aligns.loc[spliced_aligns.read_id==rid, 'filtered_seg_features']
		exons = [feat for feats in exons for feat in feats]
		exons = [feat for feat in exons if feat.data[0]=='exon']
		exon_pairs = set()
		for exon_1, exon_2 in it.product(exons, exons):
			if exon_1.data[1] == exon_2.data[1] and exon_1.data[2] == exon_2.data[2]-1:
				exon_pairs.add( ((exon_1.data[1], exon_1.data[2]), (exon_2.data[1], exon_2.data[2])) )
				# exon_pairs = introns.loc[introns.adj_exons.transform(lambda adj_exons: exon_1_anno in adj_exons and exon_2_anno in adj_exons)]
				# all_matches.update(matches.coords)
			# exon_1_anno = (exon_1.data[1], exon_1.data[2]) 
			# exon_2_anno = (exon_2.data[1], exon_2.data[2]) 
		matches = exploded_introns.loc[exploded_introns.adj_exons.isin(exon_pairs), 'coords']
		matches = set(matches)
		matched_introns.append(tuple(matches))

	elif species == 'pre-mRNA':
		if rid in unspliced_aligns.index:
			# row = unspliced_aligns.loc[rid]
			# matches = [feat for feat in row['filtered_features'] if feat.data[0]=='intron']
			# matched_introns.append(tuple(matches))
			matched_introns.append(unspliced_aligns.at[rid, 'introns'])
		else:
			# rows = spliced_aligns.loc[(spliced_aligns.read_id==rid) & (spliced_aligns.seg_species=='pre-mRNA')]
			# matches = [feat for feats in rows['filtered_seg_features'] for feat in feats]
			# matches = [feat for feat in matches if feat.data[0]=='intron']
			matches = spliced_aligns.loc[spliced_aligns.read_id==rid, 'introns']
			matches = set([feat for feats in matches for feat in feats])
			matched_introns.append(tuple(matches))

	elif species == "Ends at a 5'ss":
		if rid in unspliced_aligns.index:
			matched_introns.append(unspliced_aligns.at[rid, 'introns'])
		else:
			matches = spliced_aligns.loc[spliced_aligns.read_id==rid, 'introns']
			matches = set([feat for feats in matches for feat in feats])
			matched_introns.append(tuple(matches))

	else:
		raise RuntimeError(rid, species)

read_species['introns'] = matched_introns
read_species

In [None]:
# read_species['intron_count'] = read_species.introns.transform(len)
for species in READ_SPECIES:
	print(species)
	print(read_species.loc[read_species.species==species, ['species', 'intron_count']].value_counts().sort_index(), '\n')

In [None]:
read_species = read_species.explode('introns').rename(columns={'introns': 'intron'}).reset_index()
read_species.loc[read_species.intron.notna(), ['chrom', 'intron_start', 'intron_end', 'strand']] = read_species[read_species.intron.notna()].intron.to_list()
read_species.intron_start = read_species.intron_start.astype(int, errors='ignore')
read_species.intron_end = read_species.intron_end.astype(int, errors='ignore')
read_species.to_csv(f'/home/tmooney/Lariat_mapping/testing/output/100k_truncs/{samp}_{r_dir}k_lariat_mapping/read_species.tsv', sep='\t')
read_species

In [None]:
read_species[read_species.duplicated(keep=False)].sort_values(['read_id', 'species', 'intron'])

In [None]:
# exit()
d = done

In [None]:
# read_species = pd.read_csv('/home/tmooney/Lariat_mapping/testing/output/100k_truncs/C22-1_R1_100k_lariat_mapping/read_species.tsv', sep='\t')
read_species = read_species.drop_duplicates()
print(read_species.info())
read_species

In [None]:
read_total = 100_000
per_intron_counts = read_species.groupby(['intron', 'species'], as_index=False).agg('size')
per_intron_counts = per_intron_counts.rename(columns={'size': 'reads'})
per_intron_counts['read_prop'] = per_intron_counts.reads / read_total
per_intron_counts['reads_total'] = per_intron_counts.intron.map(per_intron_counts.groupby('intron').reads.agg('sum'))
per_intron_counts['species_rep'] = per_intron_counts.reads / per_intron_counts.reads_total
per_intron_counts

In [None]:
per_intron_counts.sort_values('reads_total', ascending=False)[:20]

In [None]:
df = read_species.species.value_counts().reset_index()
df = df.rename(columns={'count': 'reads'})
df.species = pd.Categorical(df.species, READ_SPECIES + ('Dunno',), ordered=True)
df['source'] = source
print([row.to_list() for i, row in df.iterrows()])
df

In [None]:
df = [['Intronic', 29109, 'C22-1|R1_100'], ['Exonic', 20108, 'C22-1|R1_100'], ['Inter-genic', 16477, 'C22-1|R1_100'], ['Unmapped', 15668, 'C22-1|R1_100'], ['Genic, ambiguous', 13954, 'C22-1|R1_100'], ['mRNA', 1619, 'C22-1|R1_100'], ['Template switching', 1490, 'C22-1|R1_100'], ["Unmapped, has a 5'ss alignment", 968, 'C22-1|R1_100'], ['pre-mRNA', 590, 'C22-1|R1_100'], ["Ends at a 5'ss", 10, 'C22-1|R1_100'], ['Lariat', 5, 'C22-1|R1_100'], ['Repeat region', 2, 'C22-1|R1_100']] + \
	[['Intronic', 28879, 'C22-1|R2_100'], ['Exonic', 20098, 'C22-1|R2_100'], ['Inter-genic', 16433, 'C22-1|R2_100'], ['Unmapped', 15848, 'C22-1|R2_100'], ['Genic, ambiguous', 14093, 'C22-1|R2_100'], ['mRNA', 1578, 'C22-1|R2_100'], ['Template switching', 1563, 'C22-1|R2_100'], ["Unmapped, has a 5'ss alignment", 918, 'C22-1|R2_100'], ['pre-mRNA', 577, 'C22-1|R2_100'], ["Ends at a 5'ss", 11, 'C22-1|R2_100'], ['Lariat', 2, 'C22-1|R2_100']] + \
	[['Intronic', 29161, 'C22-3|R1_100'], ['Exonic', 20028, 'C22-3|R1_100'], ['Inter-genic', 16456, 'C22-3|R1_100'], ['Unmapped', 15978, 'C22-3|R1_100'], ['Genic, ambiguous', 13873, 'C22-3|R1_100'], ['mRNA', 1603, 'C22-3|R1_100'], ['Template switching', 1391, 'C22-3|R1_100'], ["Unmapped, has a 5'ss alignment", 919, 'C22-3|R1_100'], ['pre-mRNA', 587, 'C22-3|R1_100'], ["Ends at a 5'ss", 3, 'C22-3|R1_100'], ['Lariat', 1, 'C22-3|R1_100']] + \
	[['Intronic', 28977, 'C22-3|R2_100'], ['Exonic', 20002, 'C22-3|R2_100'], ['Inter-genic', 16585, 'C22-3|R2_100'], ['Unmapped', 16052, 'C22-3|R2_100'], ['Genic, ambiguous', 13943, 'C22-3|R2_100'], ['mRNA', 1505, 'C22-3|R2_100'], ['Template switching', 1416, 'C22-3|R2_100'], ["Unmapped, has a 5'ss alignment", 933, 'C22-3|R2_100'], ['pre-mRNA', 582, 'C22-3|R2_100'], ['Repeat region', 3, 'C22-3|R2_100'], ["Ends at a 5'ss", 2, 'C22-3|R2_100']] + \
	[['Intronic', 28434, 'C22-5|R1_100'], ['Exonic', 20399, 'C22-5|R1_100'], ['Inter-genic', 16801, 'C22-5|R1_100'], ['Unmapped', 15365, 'C22-5|R1_100'], ['Genic, ambiguous', 14349, 'C22-5|R1_100'], ['mRNA', 1716, 'C22-5|R1_100'], ['Template switching', 1349, 'C22-5|R1_100'], ["Unmapped, has a 5'ss alignment", 1034, 'C22-5|R1_100'], ['pre-mRNA', 541, 'C22-5|R1_100'], ["Ends at a 5'ss", 6, 'C22-5|R1_100'], ['Lariat', 3, 'C22-5|R1_100'], ['Repeat region', 2, 'C22-5|R1_100'], ['Circular intron', 1, 'C22-5|R1_100']] + \
	[['Intronic', 28383, 'C22-5|R2_100'], ['Exonic', 20246, 'C22-5|R2_100'], ['Inter-genic', 16776, 'C22-5|R2_100'], ['Unmapped', 15470, 'C22-5|R2_100'], ['Genic, ambiguous', 14378, 'C22-5|R2_100'], ['mRNA', 1738, 'C22-5|R2_100'], ['Template switching', 1409, 'C22-5|R2_100'], ["Unmapped, has a 5'ss alignment", 1045, 'C22-5|R2_100'], ['pre-mRNA', 539, 'C22-5|R2_100'], ["Ends at a 5'ss", 8, 'C22-5|R2_100'], ['Repeat region', 4, 'C22-5|R2_100'], ['Lariat', 4, 'C22-5|R2_100']] + \
	[['Exonic', 22887, 'HEK293T-1|R1_100'], ['Intronic', 22738, 'HEK293T-1|R1_100'], ['Inter-genic', 19406, 'HEK293T-1|R1_100'], ['Unmapped', 16508, 'HEK293T-1|R1_100'], ['Genic, ambiguous', 14170, 'HEK293T-1|R1_100'], ['mRNA', 1816, 'HEK293T-1|R1_100'], ['Template switching', 1102, 'HEK293T-1|R1_100'], ["Unmapped, has a 5'ss alignment", 776, 'HEK293T-1|R1_100'], ['pre-mRNA', 593, 'HEK293T-1|R1_100'], ['Repeat region', 2, 'HEK293T-1|R1_100'], ['Lariat', 1, 'HEK293T-1|R1_100'], ["Ends at a 5'ss", 1, 'HEK293T-1|R1_100']] + \
	[['Exonic', 22616, 'HEK293T-1|R2_100'], ['Intronic', 21850, 'HEK293T-1|R2_100'], ['Inter-genic', 19052, 'HEK293T-1|R2_100'], ['Unmapped', 17342, 'HEK293T-1|R2_100'], ['Genic, ambiguous', 15088, 'HEK293T-1|R2_100'], ['mRNA', 1764, 'HEK293T-1|R2_100'], ["Unmapped, has a 5'ss alignment", 800, 'HEK293T-1|R2_100'], ['Template switching', 797, 'HEK293T-1|R2_100'], ['pre-mRNA', 684, 'HEK293T-1|R2_100'], ["Ends at a 5'ss", 4, 'HEK293T-1|R2_100'], ['Repeat region', 3, 'HEK293T-1|R2_100']] + \
	[['Intronic', 26355, 'HEK293T-4|R1_100'], ['Exonic', 22348, 'HEK293T-4|R1_100'], ['Inter-genic', 17736, 'HEK293T-4|R1_100'], ['Unmapped', 14907, 'HEK293T-4|R1_100'], ['Genic, ambiguous', 14136, 'HEK293T-4|R1_100'], ['mRNA', 1741, 'HEK293T-4|R1_100'], ['Template switching', 1411, 'HEK293T-4|R1_100'], ["Unmapped, has a 5'ss alignment", 787, 'HEK293T-4|R1_100'], ['pre-mRNA', 576, 'HEK293T-4|R1_100'], ["Ends at a 5'ss", 2, 'HEK293T-4|R1_100'], ['Lariat', 1, 'HEK293T-4|R1_100']] + \
	[['Intronic', 26151, 'HEK293T-4|R2_100'], ['Exonic', 22234, 'HEK293T-4|R2_100'], ['Inter-genic', 17563, 'HEK293T-4|R2_100'], ['Unmapped', 15186, 'HEK293T-4|R2_100'], ['Genic, ambiguous', 14297, 'HEK293T-4|R2_100'], ['mRNA', 1769, 'HEK293T-4|R2_100'], ['Template switching', 1406, 'HEK293T-4|R2_100'], ["Unmapped, has a 5'ss alignment", 822, 'HEK293T-4|R2_100'], ['pre-mRNA', 567, 'HEK293T-4|R2_100'], ["Ends at a 5'ss", 5, 'HEK293T-4|R2_100']] + \
	[['Intronic', 26701, 'HEK293T-5|R1_100'], ['Exonic', 21825, 'HEK293T-5|R1_100'], ['Inter-genic', 17413, 'HEK293T-5|R1_100'], ['Unmapped', 15274, 'HEK293T-5|R1_100'], ['Genic, ambiguous', 14210, 'HEK293T-5|R1_100'], ['mRNA', 1780, 'HEK293T-5|R1_100'], ['Template switching', 1371, 'HEK293T-5|R1_100'], ["Unmapped, has a 5'ss alignment", 873, 'HEK293T-5|R1_100'], ['pre-mRNA', 550, 'HEK293T-5|R1_100'], ['Repeat region', 2, 'HEK293T-5|R1_100'], ["Ends at a 5'ss", 1, 'HEK293T-5|R1_100']] + \
	[['Intronic', 26484, 'HEK293T-5|R2_100'], ['Exonic', 21783, 'HEK293T-5|R2_100'], ['Inter-genic', 17425, 'HEK293T-5|R2_100'], ['Unmapped', 15507, 'HEK293T-5|R2_100'], ['Genic, ambiguous', 14293, 'HEK293T-5|R2_100'], ['mRNA', 1744, 'HEK293T-5|R2_100'], ['Template switching', 1373, 'HEK293T-5|R2_100'], ["Unmapped, has a 5'ss alignment", 860, 'HEK293T-5|R2_100'], ['pre-mRNA', 528, 'HEK293T-5|R2_100'], ["Ends at a 5'ss", 2, 'HEK293T-5|R2_100'], ['Repeat region', 1, 'HEK293T-5|R2_100']]

df = pd.DataFrame(df, columns=['species', 'reads', 'source'])
for sou, species in it.product(df.source.unique(), READ_SPECIES):
	if len(df.loc[(df.source==sou) & (df.species==species)]) == 0:
		df.loc[len(df)] = [species, 0, sou]
df

In [None]:
df.species = pd.Categorical(df.species, READ_SPECIES, ordered=True)
df[['samp', 'r_dir']] = df.source.str.split('|', expand=True)
df['cell'] = df.samp.transform(lambda samp: samp.split('-')[0]).map({'HEK293T': 'WT', 'C22': 'DBR1-KO'})
df = df[['cell', 'samp', 'r_dir', 'source', 'species', 'reads']]
df['prop'] = df.reads / 100_000
df = df.sort_values(['cell', 'samp', 'r_dir', 'species'], ignore_index=True)
df

In [None]:
mRNA_ratios = []
for cell in df.cell.unique():
	for s in df.loc[df.cell==cell, 'source'].unique():
		pre_count = df.loc[(df.source==s) & (df.species=='mRNA'), 'reads'].iloc[0]
		m_count = df.loc[(df.source==s) & (df.species=='pre-mRNA'), 'reads'].iloc[0]
		mRNA_ratios.append([cell, pre_count/m_count])

mRNA_ratios = pd.DataFrame(mRNA_ratios, columns=['cell', 'ratio'])
mRNA_ratios

In [None]:
mann_whitney_u = stats.levene(mRNA_ratios.loc[mRNA_ratios.cell=='WT', 'ratio'], mRNA_ratios.loc[mRNA_ratios.cell=='DBR1-KO', 'ratio'])
print(mann_whitney_u)

In [None]:
plot = (p9.ggplot(mRNA_ratios, p9.aes(x="cell", y="ratio", fill='cell')) +
		p9.geom_boxplot() +
		# p9.geom_() +
		# p9.scale_x_() +
		p9.scale_y_continuous(limits=(2,4)) +
		# p9.scale_color_brewer(type='', palette=1, direction=1) +
		# p9.scale_fill_brewer(type='', palette=1, direction=1) +
		p9.scale_fill_brewer(type='qual', palette=2, direction=-1, guide=None) +
		p9.labs(title=f"p={mann_whitney_u.pvalue:.3}", x="", y="mRNA reads / pre-mRNA reads") +
		p9.theme(axis_title=p9.element_text(size=16), axis_text_x=p9.element_text(size=12), axis_text_y=p9.element_text(size=12))
)
plot.show()
plot.save(PLOTS_DIR + '/premRNA_ratio.png', dpi=500)

In [None]:
df_sum = df.groupby(['cell', 'species'])['reads'].agg(['mean', 'sem']).reset_index()
df_sum 

In [None]:
species_ratios = {}
for species in df_sum.species.unique():
	c22_mean = df_sum.loc[(df_sum.cell=='DBR1-KO') & (df_sum.species==species), 'mean'].iloc[0] 
	c22_sem = df_sum.loc[(df_sum.cell=='DBR1-KO') & (df_sum.species==species), 'sem'].iloc[0] 
	hek_mean = df_sum.loc[(df_sum.cell=='WT') & (df_sum.species==species), 'mean'].iloc[0] 
	hek_sem = df_sum.loc[(df_sum.cell=='WT') & (df_sum.species==species), 'sem'].iloc[0] 

	ratio = c22_mean/hek_mean if hek_mean!=0 else pd.NA
	error = ratio * np.sqrt((0.05*c22_sem/c22_mean)**2 + (0.05*hek_sem/hek_mean)**2) if pd.notna(ratio) else pd.NA
	species_ratios[species] = (ratio, error)
species_ratios

In [None]:
species_ratios = pd.DataFrame(species_ratios)
species_ratios = species_ratios.transpose().reset_index()
species_ratios.columns = ['species', 'ratio', 'error']
species_ratios['label'] = species_ratios.apply(lambda row: f"{row['ratio']:.1%} ± {row['error']:.1%}" if pd.notna(row['ratio']) else 'N/A', axis=1)
species_ratios

In [None]:
plot = (p9.ggplot(df_sum, p9.aes(x="species")) +
		p9.geom_col(p9.aes(y="mean", fill='cell'), position='dodge') +
		p9.geom_errorbar(p9.aes(ymin="mean-sem", ymax="mean+sem", group='cell'), width=0.9, position='dodge') +
		p9.geom_text(data=species_ratios, mapping=p9.aes(x='species', y=35000, label='label'), ha='center', size=10) +
		p9.scale_y_continuous(limits=(0,38_000), breaks=np.arange(0,40_000,10_000), labels=comma_format()) +
		# p9.annotate('text', x=13.5, y=35000, label='Enrichment') +
		p9.scale_fill_brewer(type='qual', palette=2, direction=-1) +
		p9.labs(title="", x="Read species", y="Read count", fill='Cell line') +
		p9.coord_flip() +
		p9.guides(fill=p9.guide_legend(reverse=True, position='right'), color=None) +
		p9.theme(axis_title=p9.element_text(size=16), axis_text=p9.element_text(size=10), legend_text=p9.element_text(size=10))
)
plot.save('/home/tmooney/Lariat_mapping/visuals/plots/species_100k.png', dpi=500, height=6, width=10)
plot.show()

In [None]:
# ##### Snippets #####
# plot = (p9.ggplot(, p9.aes(x="", y="")) +
# 		# p9.geom_() +
# 		# p9.geom_() +
# 		# p9.scale_x_() +
# 		# p9.scale_y_() +
# 		# p9.scale_color_brewer(type='', palette=1, direction=1) +
# 		# p9.scale_fill_brewer(type='', palette=1, direction=1) +
# 		p9.labs(title="") +
# 		p9.theme()
# )
# print(plot)
# # plot.save(PLOTS_DIR + '/.png', dpi=500)

##### Discarded code

In [None]:
%%script false --no-raise-error
# def process_features(features, ):
# 	if features == ('.',) or features == ('.', '.'):
# 		return IntervalTree([])
	
# 	out = []
# 	for feat in features:
# 		# feat, tid, num, start, end, strand, gid, name, type_ = feat.split(';')
# 		if type_ in ('artifact', 'TEC'):
# 			continue
# 		elif type_ in NCRNA_TYPES:
# 			type_ = 'ncRNA'
# 		elif type_ in IG_TcR_TYPES:
# 			type_ = 'Ig or TcR'
# 		start = int(start)
# 		end = int(end)
# 		num = int(num)
# 		out.append(Interval(start, end, (feat, tid, num, strand, gid, name, type_)))
	
# 	return IntervalTree(out)

	
		# rows = lariat_failed.loc[lariat_failed.read_id==rid]
		# introns_from_rows = set()
		# for i, row in rows.iterrows():
		# 	matches = introns.loc[(introns.chrom==row['chrom']) & (introns.strand==row['strand'])]
		# 	matches = matches.loc[((matches.strand=='+') & (matches.start==row['fivep_pos'])) | ((matches.strand=='-') & (matches.end-1==row['fivep_pos']))]
		# 	matches = matches.loc[((matches.strand=='+') & (matches.end-1-row['bp_pos']<=2)) | ((matches.strand=='-') & (row['bp_pos']-matches.start<=2))]
		# 	introns_from_rows.update(matches.coords)
		# matched_introns.append(tuple(introns_from_rows))

		# rows = lariat_failed.loc[lariat_failed.read_id==rid]
		# introns_from_rows = set()
		# for i, row in rows.iterrows():
		# 	matches = introns.loc[(introns.chrom==row['chrom']) & (introns.strand==row['strand'])]
		# 	matches = matches.loc[((matches.strand=='+') & (matches.start==row['fivep_pos'])) | ((matches.strand=='-') & (matches.end-1==row['fivep_pos']))]
		# 	matches = matches.loc[(matches.start<row['bp_pos']) & (matches.end>row['bp_pos'])]
		# 	introns_from_rows.update(matches.coords)
		# matched_introns.append(tuple(introns_from_rows))
			# rows = spliced_aligns.loc[spliced_aligns.read_id==rid]
			# assert len(rows['chrom'].unique())==1
			# chrom = rows['chrom'].unique()[0]
			# matches = [feat for feats in rows['filtered_seg_features'] for feat in feats]
			# matches = [(chrom, feat.begin, feat.end, feat.data[3]) for feat in matches if feat.data[0]=='intron']

		# row = lariat_passed.loc[rid]
		# matches = introns.loc[(introns.chrom==row['chrom']) & (introns.strand==row['strand'])]
		# matches = matches.loc[((matches.strand=='+') & (matches.start==row['fivep_pos'])) | ((matches.strand=='-') & (matches.end-1==row['fivep_pos']))]
		# matches = matches.loc[(matches.start<row['bp_pos']) & (matches.end>row['bp_pos'])]

				# row = unspliced_aligns.loc[rid]
		# exons = [feat for feat in row['filtered_features'] if feat.data[0]=='exon']
		# matches = set()
		# for exon in exons:
		# 	feat, tid, num, strand, gid, name, type_ = exon.data
		# 	m = introns.loc[(introns.transcript_id==tid) & ((introns.intron_num-num==0) | (introns.intron_num-num==-1))]
		# 	matches.update(m['intron'])
		# matched_introns.append(tuple(matches))
		
		# if rid in unspliced_aligns.index:
		# 	row = unspliced_aligns.loc[rid]
		# 	matches = [feat for feat in row['filtered_features'] if feat.data[0]=='intron']
		# 	matched_introns.append(tuple(matches))
		# else:
		# 	rows = spliced_aligns.loc[spliced_aligns.read_id==rid]
		# 	matches = [feat for feats in rows['filtered_seg_features'] for feat in feats]
		# 	matches = [feat for feat in matches if feat.data[0]=='intron']
		# 	matched_introns.append(tuple(matches))

		# Get the intron between each adjacent exon pair
		# for exon_1, _ in adj_exons:
			# feat, tid, num, strand, gid, name, type_ = exon_1.data
			# m = introns.loc[(introns.transcript_id==tid) & (introns.intron_num==num)]
			# assert len(m) == 1, m
			# matches.update(m.intron)

			
# BAD_TRIM_FILTERS = set(('mismatch_number', 'mismatch_percent', 'indel'))
# trim_mapped_rids = set()
# for align in pysam.AlignmentFile(f'{out_dir}/trimmed_reads_to_genome.bam', 'rb'):
# 	trim_mapped_rids.add(align.query_name[:-4])

# # Get rid of those that mapped but got filtered out due to low quality alignment
# rids_to_remove = set()
# for rid in trim_failed.read_id.unique():
# 	fail_reasons = set(trim_failed.loc[trim_failed.read_id==rid, 'filter_failed'])
# 	if fail_reasons.issubset(BAD_ALIGNMENT_FILTERS):
# 		rids_to_remove.add(rid)

# trim_mapped_rids = trim_mapped_rids.difference(rids_to_remove)
# trim_mapped_rids

plot = (p9.ggplot(df, p9.aes(x="species")) +
		p9.geom_col(p9.aes(y="reads")) +
		p9.scale_fill_brewer(type='qual', palette=2, direction=-1) +
		p9.labs(title=source) +
		p9.coord_flip() +
		p9.theme()
)
print(plot)

# mapped_species_counts = []
# for key, val in unspliced_aligns['species'].value_counts().to_dict().items():
# 	mapped_species_counts.append([source, 'unspliced', key, val])
# for key, val in spliced_aligns[['read_id', 'align_id', 'species']].drop_duplicates()['species'].value_counts().to_dict().items():
# 	mapped_species_counts.append([source, 'spliced', key, val])
# mapped_species_counts


# run_data = pd.read_csv(f'{out_dir}/run_data.tsv', sep='\t', header=None)
# total_reads = run_data.loc[run_data[0]=='ref_mapped_reads', 1].iloc[0] + run_data.loc[run_data[0]=='ref_unmapped_reads', 1].iloc[0]
# run_data.loc[len(run_data)] = ['total_reads', total_reads] 

# run_data = run_data.rename(columns={0:'cat', 1:'reads'}).reset_index(drop=True)
# run_data.cat = run_data.cat.str.replace('_reads', '')
# run_data
# READ_SPECIES =  ('Lariat', 'Repeat region', 'Circular intron', 'Template switching', "Unknown, has at least one 5'ss alignment", "Unknown, no 5'ss alignments")
# read_species_counts = []
# unmapped = run_data.loc[run_data.cat=='ref_unmapped', 'reads'].iloc[0]
# # no_fivep_maps = unmapped - run_data.loc[run_data.cat=='fivep_mapped', 'reads'].iloc[0]

# for species in READ_SPECIES:
# 	# if species == "Unknown, no 5'ss alignments":
# 		# species_counts.append([source, unmapped, species, no_fivep_maps])
# 		# species_counts.append([source, unmapped, species])
# 		# continue
		
# 	count = len(read_species.loc[read_species_counts.species==species])
# 	read_species_counts.append([source, unmapped, species, count])

# read_species_counts = pd.DataFrame(read_species_counts, columns=['source', 'unmapped', 'species', 'reads'])
# read_species_counts['species'] = pd.Categorical(read_species_counts.species, READ_SPECIES, ordered=True)
# read_species_counts['prop'] = read_species_counts.reads / read_species_counts.unmapped
# read_species_counts
# m = pd.merge(read_species, unspliced_aligns[['read_id', 'species', 'filtered_gene_types']], 'inner', on='read_id')
# print(m[['species_x', 'species_y']].value_counts().sort_index())
# m
# species_counts_avg = species_counts.groupby(['samp', 'species'])[['reads', 'prop']].agg('mean').reset_index()
# species_counts_avg['label'] = species_counts_avg.reads.transform(lambda val: f'{val:,.0f}')
# species_counts_avg
# plot = (p9.ggplot(species_counts, p9.aes(x="species", fill='samp')) +
# 		p9.geom_col(p9.aes(y="prop"), position='dodge') +
# 		# p9.geom_text(p9.aes(y="prop+0.05", label="label", color='samp'), ha='left', size=8, position=p9.position_dodge(width=1)) +
# 		# p9.annotate('text', x=6.5, y=1.05, label='Avg. reads', fontweight='bold') +
# 		p9.scale_y_continuous(limits=(0,1), breaks=np.arange(0,1.2,0.2)) +
# 		p9.scale_fill_brewer(type='qual', palette=2, direction=-1) +
# 		# p9.scale_color_brewer(type='qual', palette=2, direction=-1) +
# 		p9.labs(title="", x="Read species", y="Proportion of unmapped reads", fill='Sample') +
# 		p9.coord_flip() +
# 		p9.guides(fill=p9.guide_legend(reverse=True), color=None) +
# 		p9.theme()
# )
# print(plot)
# plot.save(PLOTS_DIR + '/unmapped.png', dpi=500)

OVERLAPS_COLUMNS = ('chrom', 'align_start', 'align_end', 'read_id', '_', 'align_orient', '_', '_', '_', '_', '_', '_', '_', 'f_start', 'f_end', 'f_name', '_', 'f_strand')
def load_feature_overlaps(overlap_bed:str, include_overlap_info:bool=False):
	'''
	Returns a Series 
	start and end positions are 0-based inclusive
	Indices are alignment ids ( [read id];[reference name];[alignment start position];[alignment end position];[alignment orientation] )
	Values are tuples of overlap lists ( ([[transcript id], [feature number], [feature strand], [feature start position], [feature end position]]...) )
									or ( ([[transcript id], [feature number], [feature strand], [feature start position], [feature end position], [overlap start position], [overlap end position], [overlap length]]...) )
	'''
	overlaps = pd.read_csv(overlap_bed, sep='\t', header=None)
	overlaps.columns = OVERLAPS_COLUMNS
	overlaps.align_end += -1
	overlaps.f_end += -1

	overlaps['tid'] = overlaps.f_name.transform(lambda name: name.split('_')[0])
	overlaps['f_num'] = overlaps.f_name.transform(lambda name: name.split('_')[2])
	overlaps['f_num'] = overlaps.f_num.transform(lambda num: int(num)+1)	# exon/intron number is 0-based too, apparently
	overlaps['align_id'] = overlaps.apply(lambda row: f"{row['read_id']};{row['chrom']};{row['align_start']};{row['align_end']};{row['align_orient']}", axis=1)

	if include_overlap_info is True:
		overlaps['overlap_start'] = overlaps[['align_start', 'f_start']].max(axis=1)
		overlaps['overlap_end'] = overlaps[['align_end', 'f_end']].min(axis=1)
		overlaps['overlap_len'] = overlaps.overlap_end - overlaps.overlap_start + 1
		overlaps['combined'] = overlaps[['tid', 'f_num', 'f_strand', 'f_start', 'f_end', 'overlap_start', 'overlap_end', 'overlap_len']].values.tolist()
	else:
		overlaps['combined'] = overlaps[['tid', 'f_num', 'f_strand', 'f_start', 'f_end']].values.tolist()

	overlaps = overlaps.groupby('align_id')['combined'].agg(tuple)
	return overlaps

	

lariat_passed_rids = {}
for sample, r, source, out_dir in sources:
	rids = pd.read_csv(f'{out_dir}/lariat_reads.tsv', sep='\t', usecols=[3])
	lariat_passed_rids[source] = set(rids.read_id)

lariat_passed_rids
failed_lariats = pd.DataFrame()
for sample, r, source, out_dir in sources:
	data = pd.read_csv(f'{out_dir}/failed_lariat_mappings.tsv', sep='\t')
	data = data.assign(samp=sample, r=r, source=source)
	failed_lariats = pd.concat([failed_lariats, data], ignore_index=True)

print(len(failed_lariats.read_id.unique()))
print(len(failed_lariats[['read_id', 'source']].drop_duplicates()))
failed_lariats
trim_passed_rids = {}
for sample, r, source, out_dir in sources:
	rids = pd.read_csv(f'{out_dir}/final_info_table.tsv', sep='\t', usecols=[0])
	trim_passed_rids[source] = set(rids.read_id.str.slice(0,-4))
trim_passed_rids
trim_failed = pd.DataFrame()
for sample, r, source, out_dir in sources:
	data = pd.read_csv(f'{out_dir}/failed_trimmed_alignments.tsv', sep='\t', nrows=6000)
	data = data.assign(samp=sample, r=r, source=source)
	trim_failed = pd.concat([trim_failed, data], ignore_index=True)

print(trim_failed.filter_failed.value_counts())
trim_failed['read_id_orient'] = trim_failed.read_id
trim_failed.read_id = trim_failed.read_id.str.slice(0, -4)
trim_failed
fivep_passed_rids = {}
for sample, r, source, out_dir in sources:
	rids = pd.read_csv(f'{out_dir}/fivep_info_table.tsv', sep='\t', usecols=[0])
	fivep_passed_rids[source] = set(rids.read_id.str.slice(0, -4))
fivep_passed_rids
run_data = pd.DataFrame()
for sample, r, source, out_dir in sources:
	data = pd.read_csv(f'{test_dir}/{sample}_{r}_lariat_mapping/run_data.tsv', sep='\t', header=None)
	total_reads = data.loc[data[0]=='ref_mapped_reads', 1].iloc[0] + data.loc[data[0]=='ref_unmapped_reads', 1].iloc[0]
	data.loc[len(data)] = ['total_reads', total_reads] 
	data = data.assign(samp=sample, r=r, source=source)
	run_data = pd.concat([run_data, data])

run_data = run_data.rename(columns={0:'cat', 1:'reads'}).reset_index(drop=True)
run_data.cat = run_data.cat.str.replace('_reads', '')
run_data.samp = pd.Categorical(run_data.samp, categories=['HEK293T', 'C22'], ordered=True)
samp_means = run_data.groupby(['samp', 'cat'])['reads'].aggregate('mean')
for samp, cat in samp_means.index:
	run_data.loc[(run_data.samp==samp) & (run_data.cat==cat), 'samp_mean'] = samp_means[samp, cat]
run_data
reads_with_fivep = {}
for sample, r, source, out_dir in sources:
	rids = set()
	with open(f'{out_dir}/fivep_to_reads.sam') as r:
		for line in r:
			rids.add(line.split('\t')[2])
	reads_with_fivep[source] = rids	
reads_with_fivep
BAD_ALIGNMENT_FILTERS = set(('mismatch_num', 'mismatch_percent', 'indel', 'map_quality'))
reads_with_trim = {}
for sample, r, source, out_dir in sources:
	rids = set()
	for align in pysam.AlignmentFile(f'{test_dir}/{sample}_{r}_lariat_mapping/trimmed_reads_to_genome.bam', 'rb'):
		rids.add(align.query_name[:-4])

	# Get rid of those that mapped but got filtered out due to low quality alignment
	for rid in rids:
		rids_to_remove = set()
		if rid in trim_passed_rids:
			continue

		fail_reasons = set(trim_failed.loc[(trim_failed.source==source) & (trim_failed.read_id==rid), 'filter_failed'])
		if len(fail_reasons)==0:
			continue
		if fail_reasons.issubset(BAD_ALIGNMENT_FILTERS):
			rids_to_remove.add(rid)

	rids = rids.difference(rids_to_remove)
	reads_with_trim[source] = rids	

reads_with_trim
fates = []
for sample, r, source, out_dir in sources:
	rids = reads_with_fivep[source]
	for rid in rids:

		if rid not in fivep_passed_rids[source]:
			# if rid in start_at_fivep.loc[start_at_fivep.source==source, 'read_id'].values:
				# fates.append((sample, r, source, rid, "Starts at a 5'ss"))
				# continue
			fates.append((sample, r, source, rid, "Filtered after 5'ss mapping"))
			continue
		
		if rid not in reads_with_trim[source]:
			fates.append((sample, r, source, rid, "Unknown, has at least one 5'ss alignment"))
			continue

		if rid not in trim_passed_rids[source]:
			fates.append((sample, r, source, rid, 'Template switching'))
			continue
		
		if rid not in lariat_passed_rids[source]:
			fails = set(failed_lariats.loc[(failed_lariats.source==source) & (failed_lariats.read_id==rid), 'filter_failed'])
			assert len(fails)==1, f'{source},{rid},{fails}'
			
			if len(fails)==1 and fails.pop()=='near_ss':
				fates.append((sample, r, source, rid, 'Circular intron'))
				continue
			if len(fails)==1 and fails.pop() in ('ubiquitin_gene', 'in_repeat'):
				fates.append((sample, r, source, rid, 'Repeat region'))
				continue

			raise RuntimeError(rid)
		
		fates.append((sample, r, source, rid, 'Lariat'))

fates = pd.DataFrame(fates, columns=['samp', 'r', 'source', 'read_id', 'fate'])
assert fates[['source', 'read_id']].duplicated().sum()==0, fates[['source', 'read_id']].duplicated().sum()
fates
READ_SPECIES =  ('Lariat', 'Repeat region', 'Circular intron', 'Template switching', "Unknown, has at least one 5'ss alignment", "Unknown, no 5'ss alignments")
species_counts = []
for sample, r, source, out_dir in sources:
	unmapped = run_data.loc[(run_data.cat=='ref_unmapped') & (run_data.source==source), 'reads'].iloc[0]
	no_fivep_maps = unmapped - run_data.loc[(run_data.cat=='fivep_mapped') & (run_data.source==source), 'reads'].iloc[0]

	for species in READ_SPECIES:
		if species == "Unknown, no 5'ss alignments":
			species_counts.append([sample, r, source, unmapped, species, no_fivep_maps])
			continue
			
		count = len(fates.loc[(fates.source==source) & (fates.fate==species)])
		species_counts.append([sample, r, source, unmapped, species, count])

species_counts = pd.DataFrame(species_counts, columns=['samp', 'r', 'source',  'unmapped', 'species', 'reads'])
species_counts['species'] = pd.Categorical(species_counts.species, READ_SPECIES, ordered=True)
species_counts['prop'] = species_counts.reads / species_counts.unmapped
species_counts
species_counts_avg = species_counts.groupby(['samp', 'species'])[['reads', 'prop']].agg('mean').reset_index()
species_counts_avg['label'] = species_counts_avg.reads.transform(lambda val: f'{val:,.0f}')
species_counts_avg
plot = (p9.ggplot(species_counts_avg, p9.aes(x="species", fill='samp')) +
		p9.geom_col(p9.aes(y="prop"), position='dodge') +
		# p9.geom_text(p9.aes(y="prop+0.05", label="label", color='samp'), ha='left', size=8, position=p9.position_dodge(width=1)) +
		# p9.annotate('text', x=6.5, y=1.05, label='Avg. reads', fontweight='bold') +
		p9.scale_y_continuous(limits=(0,1), breaks=np.arange(0,1.2,0.2)) +
		p9.scale_fill_brewer(type='qual', palette=2, direction=-1) +
		# p9.scale_color_brewer(type='qual', palette=2, direction=-1) +
		p9.labs(title="", x="Read species", y="Proportion of unmapped reads", fill='Sample') +
		p9.coord_flip() +
		p9.guides(fill=p9.guide_legend(reverse=True), color=None) +
		p9.theme()
)
print(plot)
plot.save(PLOTS_DIR + '/unmapped.png', dpi=500)



# mapped_reads = functions.load_bam( f'{out_dir}/mapped_reads.bam')
# mapped_reads = mapped_reads.rename(columns={'q_id': 'read_id', 'r_id': 'chrom', 'orient': 'align_orient', 'r_start': 'align_start', 'r_end': 'align_end'})
# mapped_reads
# mapped_reads = functions.load_bam( f'{out_dir}/mapped_reads.bam')
# mapped_reads = mapped_reads.rename(columns={'q_id': 'read_id', 'r_id': 'chrom', 'orient': 'align_orient', 'r_start': 'align_start', 'r_end': 'align_end'})
# mapped_reads
# mapped_reads = functions.load_bam( f'{out_dir}/mapped_reads.bam')
# mapped_reads = mapped_reads.rename(columns={'q_id': 'read_id', 'r_id': 'chrom', 'orient': 'align_orient', 'r_start': 'align_start', 'r_end': 'align_end'})
# mapped_reads
# mapped_reads = functions.load_bam( f'{out_dir}/mapped_reads.bam')
# mapped_reads = mapped_reads.rename(columns={'q_id': 'read_id', 'r_id': 'chrom', 'orient': 'align_orient', 'r_start': 'align_start', 'r_end': 'align_end'})
# mapped_reads
# mapped_reads = mapped_reads[mapped_reads.mapped].reset_index(drop=True).drop(columns=['mapped', 'q_start', 'q_end', 'quality', 'seq'])
# mapped_reads['align_id'] = mapped_reads.apply(lambda row: f"{row['read_id']};{row['chrom']};{row['align_start']};{row['align_end']};{row['align_orient']}", axis=1)
# assert mapped_reads.align_id.is_unique
# mapped_reads
# intron_overlaps = load_feature_overlaps(f'{out_dir}/intron_overlaps.bed')
# mapped_reads['introns'] = mapped_reads.align_id.map(intron_overlaps.to_dict())
# print(mapped_reads.introns.notna().sum())
# exon_overlaps = load_feature_overlaps(f'{out_dir}/exon_overlaps.bed')
# mapped_reads['exons'] = mapped_reads.align_id.map(exon_overlaps.to_dict())
# print(mapped_reads.exons.notna().sum())
# tx2gene = pd.read_csv('/Users/trumanmooney/Library/CloudStorage/OneDrive-BrownUniversity/Documents/RNA-seq data mining/reference_data/annotations/tx2gene.gencode.v44.csv')
# tx2gene = tx2gene.set_index('TXNAME').GENEID.to_dict()
# tx2gene
# def append_gene_id(features):
# 	if pd.isna(features):
# 		return pd.NA
	
# 	appended_features = []
# 	for tid, f_num, f_strand, f_start, f_end in features:
# 		gene_id = tx2gene[tid] 
# 		appended_features.append([tid, f_num, f_strand, f_start, f_end, gene_id])

# 	return tuple(appended_features) 

# mapped_reads.introns = mapped_reads.introns.transform(append_gene_id)
# mapped_reads.exons = mapped_reads.exons.transform(append_gene_id)
# print(mapped_reads.at[1,'introns'])
# print(mapped_reads.at[1,'exons'])
# def pair_features(row):
# 	features = []
# 	if pd.notna(row['exons']):
# 		features += [['exon', *exon] for exon in row['exons']]
# 	if pd.notna(row['introns']):
# 		features += [['intron', *intron] for intron in row['introns']]
# 	if features == []:
# 		return (), ()
	
# 	same_tid = []
# 	diff_tid_same_gid = []
# 	for feature_1, feature_2 in it.permutations(features, 2):
# 		if feature_1[1] == feature_2[1]:
# 			same_tid.append((feature_1, feature_2))
# 			continue
# 		if feature_1[-1] == feature_2[-1]:
# 			diff_tid_same_gid.append((feature_1, feature_2))
		
# 	return tuple(same_tid), tuple(diff_tid_same_gid)

# mapped_reads[['same_tid', 'same_gid']] = mapped_reads.apply(pair_features, axis=1, result_type='expand')
# mapped_reads
# def cat_pairs(same_id):
# 	# cats = {'pre-mRNA': [], 'mRNA': [], 'in_then_in': [], 'ex_ex_far': [], 'in_in_far': [], 'in_ex_far': []}
# 	cats = []
# 	for feature_1, feature_2 in same_id:
# 		num_within_1 = abs(feature_1[2] - feature_2[2]) <= 1

# 		if feature_1[0] == 'exon' and feature_2[0] == 'exon':
# 			if num_within_1 is True:
# 				# cats['mRNA'].append(feature_1[1])
# 				cats.append('mRNA')
# 			else:
# 				# cats['ex_ex_far'].append(feature_1[1])
# 				cats.append('ex_ex_far')

# 		if feature_1[0] == 'intron' and feature_2[0] == 'intron':
# 			if num_within_1 is True:
# 				# cats['in_then_in'].append(feature_1[1])
# 				cats.append('in_then_in')
# 			else:
# 				# cats['in_in_far'].append(feature_1[1])
# 				cats.append('in_in_far')
		
# 		if num_within_1 is True:
# 			# cats['pre-mRNA'].append(feature_1[1])
# 			cats.append('pre-mRNA')
# 		else:
# 			# cats['in_ex_far'].append(feature_1[1])
# 			cats.append('in_ex_far')

# 	# return sorted(cats)
# 	return set(cats)

		

# mapped_reads['same_tid_cat'] = mapped_reads.same_tid.transform(cat_pairs)
# mapped_reads['same_gid_cat'] = mapped_reads.same_gid.transform(cat_pairs)
# print(mapped_reads.same_tid_cat[63])
# print(mapped_reads.same_gid_cat[1])
# mapped_reads
# print(mapped_reads.loc[mapped_reads.same_tid_cat.transform(len)>0, 'same_tid_cat'].value_counts())
# print(mapped_reads.loc[mapped_reads.same_gid_cat.transform(len)>0, 'same_gid_cat'].value_counts())
# mapped_reads['type'] = mapped_reads.apply(categorize_mapped_species, axis=1)
# print(mapped_reads.type.value_counts())
# mapped_reads
# out = [(source, sample, r, type_, count) for type_, count in mapped_reads.type.value_counts().to_dict().items()]
# print(out)
# mapped_reads['gaps'] = mapped_reads.cigar.transform(gaps_from_cigartuples)
# print(mapped_reads.gaps.value_counts())
# mapped_reads['skips'] = mapped_reads.cigar.transform(skips_from_cigartuples)
# print(mapped_reads.skips.value_counts())
# mapped_avg = [('C22|R2', 'C22', 'R2', 'Exonic', 15510), ('C22|R2', 'C22', 'R2', 'Genic, ambiguous', 15289), ('C22|R2', 'C22', 'R2', 'Intronic', 12314), ('C22|R2', 'C22', 'R2', 'Inter-genic', 3197), ('C22|R2', 'C22', 'R2', 'pre-mRNA', 1359), ('C22|R2', 'C22', 'R2', "Ends at a 5'ss", 9)] + \
# 	[('C22|R1', 'C22', 'R1', 'Exonic', 15507), ('C22|R1', 'C22', 'R1', 'Genic, ambiguous', 15483), ('C22|R1', 'C22', 'R1', 'Intronic', 12332), ('C22|R1', 'C22', 'R1', 'Inter-genic', 3204), ('C22|R1', 'C22', 'R1', 'pre-mRNA', 1384), ('C22|R1', 'C22', 'R1', "Ends at a 5'ss", 4), ('C22|R1', 'C22', 'R1', 'mRNA', 1)] + \
# 	[('HEK293T|R1', 'HEK293T', 'R1', 'Exonic', 17228), ('HEK293T|R1', 'HEK293T', 'R1', 'Genic, ambiguous', 14489), ('HEK293T|R1', 'HEK293T', 'R1', 'Intronic', 9101), ('HEK293T|R1', 'HEK293T', 'R1', 'Inter-genic', 3485), ('HEK293T|R1', 'HEK293T', 'R1', 'pre-mRNA', 1339), ('HEK293T|R1', 'HEK293T', 'R1', "Ends at a 5'ss", 4)] + \
# 	[('HEK293T|R2', 'HEK293T', 'R2', 'Exonic', 18974), ('HEK293T|R2', 'HEK293T', 'R2', 'Genic, ambiguous', 11244), ('HEK293T|R2', 'HEK293T', 'R2', 'Intronic', 8803), ('HEK293T|R2', 'HEK293T', 'R2', 'Inter-genic', 3461), ('HEK293T|R2', 'HEK293T', 'R2', 'pre-mRNA', 1253), ('HEK293T|R2', 'HEK293T', 'R2', "Ends at a 5'ss", 6)]

# mapped_avg = pd.DataFrame(mapped_avg, columns = ['source', 'samp', 'r', 'type', 'reads'])
# mapped_avg
# mapped_avg.type = pd.Categorical(mapped_avg.type, MAPPED_SPECIES, ordered=True)
# mapped_avg['mapped'] = mapped_avg.source.map(mapped_avg.groupby('source').reads.agg('sum'))
# mapped_avg['prop'] = mapped_avg.reads / mapped_avg.mapped
# mapped_avg
# mapped_avg = mapped_avg.groupby(['samp', 'type'])[['reads', 'prop']].agg('mean').reset_index().fillna(0)
# mapped_avg['label'] = mapped_avg.reads.transform(lambda val: f'{val:,.0f}')
# mapped_avg
# plot = (p9.ggplot(mapped_avg, p9.aes(x="type", fill='samp')) +
# 		p9.geom_col(p9.aes(y="prop"), position='dodge') +
# 		p9.geom_text(p9.aes(y="prop+0.05", label="label", color='samp'), ha='left', size=8, position=p9.position_dodge(width=1)) +
# 		# p9.annotate('text', x=6.5, y=1.05, label='Avg. reads', fontweight='bold') +
# 		p9.scale_y_continuous(limits=(0,1), breaks=np.arange(0,1.2,0.2)) +
# 		p9.scale_fill_brewer(type='qual', palette=2, direction=-1) +
# 		# p9.scale_color_brewer(type='qual', palette=2, direction=-1) +
# 		p9.labs(title="", x="Read species", y="Proportion of mapped reads", fill='Sample') +
# 		p9.coord_flip() +
# 		p9.guides(fill=p9.guide_legend(reverse=True), color=None) +
# 		p9.theme()
# )
# print(plot)
# plot.save(PLOTS_DIR + '/mapped.png', dpi=500)

MAPPED_SPECIES = (
				  'pre-mRNA', 				# At least 1 exon overlap AND at least 1 intron overlap AND at least 1 shared transcript ID between an exon and an intron
				  'mRNA', 					# At least 2 exon overlaps AND at least 1 shared transcript ID between two exons AND not "pre-mRNA"
				  "Ends at a 5'ss", 		# No exon overlaps AND read starts at the same position as a (+)-strand intron's start OR reads ends at the same position as a (-)-strand intron's end
				  'Intronic', 				# No exon overlaps AND not "Ends at a 5'ss"
				  'Exonic', 				# At least 1 exon overlap AND no intron overlaps AND not "mRNA"
				  'Genic, ambiguous', 		# At least 1 exon overlap AND at least 1 intron overlap AND not "pre-mRNA"
				  'Inter-genic',			# No exon overlaps AND no intron overlaps
				  )

# def categorize_mapped_species(row: pd.Series):
# 	if pd.isna(row['exons']) and pd.isna(row['introns']):
# 		return 'Inter-genic'
	
# 	if pd.isna(row['exons']) and pd.notna(row['introns']):
# 		for tid, intron_num, intron_strand, intron_start, intron_end, gene_id in row['introns']:
# 			if intron_strand == '+' and row['align_start'] == intron_start:
# 				return "Ends at a 5'ss"
# 			if intron_strand == '-' and row['align_end'] == intron_end:
# 				return "Ends at a 5'ss"
	
# 		return 'Intronic'
	
# 	# At this point we know we have at least 1 exon
# 	if pd.notna(row['exons']) and pd.isna(row['introns']):
		
	
# 	exon_tids = set([exon[0] for exon in row['exons']])
# 	intron_tids = set([intron[0] for intron in row['introns']]) if pd.notna(row['introns']) else set()
# 	for exon, intron in it.product(row['exons'], row['introns']):
		
# 	# At least 1 exon and intron share a tid
# 	if len(exon_tids.intersection(intron_tids)) > 0:
# 		return 'pre-mRNA'
# 	# At least 2 exons share a tid
# 	if len(row['exons']) > len(exon_tids):
# 		return 'mRNA'
# 	# Only exons that don't share a tid
# 	if pd.isna(row['introns']):
# 		return 'Exonic'
	
# 	# At this point we know we have at least 1 exon
# 	return 'Genic, ambiguous'


# fivep_failed = pd.DataFrame()
# for sample, r, source, out_dir in sources:
# 	data = pd.read_csv(f'{out_dir}/failed_fivep_alignments.tsv', sep='\t')
# 	data = data.assign(samp=sample, r=r, source=source)
# 	fivep_failed = pd.concat([fivep_failed, data], ignore_index=True)

# print(fivep_failed.fail_reason.value_counts())
# fivep_failed
# fivep_failed_rids = {}
# for sample, r, source, out_dir in sources:
# 	rids = pd.read_csv(f'{out_dir}/failed_fivep_alignments.tsv', sep='\t', usecols=[0])
# 	fivep_failed_rids[source] = set(rids.read_id)
# fivep_failed_rids



# df = fate_counts.copy()
# df.loc[df.fate.isin(["Filtered after 5'ss mapping", "Starts at a 5'ss", 'No trimmed alignments']), 'fate'] = "Unknown, has a 5'ss alignment"
# df = df.groupby(['source', 'samp', 'r', 'fate', 'unmapped'], as_index=False, observed=True)['reads'].agg(sum)

# zerorows = []
# for i, row in df[['source', 'samp', 'r', 'unmapped']].drop_duplicates().iterrows():
# 	for fate in READ_SPECIES:
# 		if len(df.loc[(df.source==row['source']) & (df.fate==fate)]) == 0:
# 			zerorows.append([row['source'], row['samp'], row['r'], fate, row['unmapped'], 0])
# df = pd.concat([df, pd.DataFrame(zerorows, columns=df.columns)])

# df.fate = pd.Categorical(df.fate, categories=READ_SPECIES, ordered=True) 
# df['prop'] = df.reads / df.unmapped
# df = df.sort_values(['source', 'fate'])
# df

# fate_counts = fates[['source', 'fate']].value_counts().sort_index()
# fate_counts = fate_counts.reset_index().set_axis(['source', 'fate', 'reads'], axis=1)
# fate_counts[['samp', 'r']] = fate_counts.source.str.split('|', expand=True)
# for sample, r, source, out_dir in sources:
# 	unmapped = run_data.loc[(run_data.cat=='ref_unmapped') & (run_data.source==source), 'reads'].iloc[0]
# 	no_fivep_maps = unmapped - run_data.loc[(run_data.cat=='fivep_mapped') & (run_data.source==source), 'reads'].iloc[0]
	
# 	fate_counts.loc[fate_counts.source==source, 'unmapped'] = unmapped
# 	fate_counts.loc[len(fate_counts)] = [source, "Unknown, no 5'ss alignments", no_fivep_maps, sample, r, unmapped]
# 	covered = len(fates.loc[fates.source==source]) + no_fivep_maps
	
# 	print(source, covered, 'of', unmapped)
# fate_counts['prop'] = fate_counts.reads / fate_counts.unmapped
# fate_counts = fate_counts.sort_values(['source', 'fate'])
# fate_counts


start_at_fivep = fivep_failed.loc[(~fivep_failed.read_is_reverse) & (fivep_failed.read_fivep_start==0)]
start_at_fivep

plot = (p9.ggplot(read_outcomes, p9.aes(x='category')) +
		p9.geom_boxplot(p9.aes(y='rpm')) +
		p9.geom_text(p9.aes(y='mean_rpm', label='label'), size=10, nudge_y=2000, family='arial') +
		p9.scale_y_continuous(labels=comma_format()) +
		p9.labs(x='Read classification',
		  		y="Reads per million mapped reads") +
		p9.facet_wrap('samp')
	)
print(plot)
plot.save('/Users/trumanmooney/Library/CloudStorage/OneDrive-BrownUniversity/Documents/Lariat_mapping/visuals/plots/read_classes.png', dpi=500, height=6, width=10)

read_means = read_outcomes.groupby(['samp', 'category'])['rpm'].aggregate('mean')
for samp, cat in read_means.index:
	read_outcomes.loc[(read_outcomes.samp==samp) & (read_outcomes.category==cat), 'mean_rpm'] = read_means[samp, cat]

read_outcomes['label'] = read_outcomes.mean_rpm.transform(lambda x: comma_format()([int(round(x))])[0])
read_outcomes
for sample, r, source, out_dir in sources:
	x = read_outcomes.loc[read_outcomes.source=='C22|R1', 'mapped_to_genome'].iloc[0] + read_outcomes.loc[read_outcomes.source=='C22|R1', 'reads'].sum()
	print(source, x)
rows = []
for sample, r, source, out_dir in sources:
	
	og_mapped_to_genome = run_data.loc[(run_data.cat=='ref_mapped') & (run_data.source==source), 'reads'].iloc[0]
	premRNA = run_data.loc[(run_data.cat=='premRNA') & (run_data.source==source), 'reads'].iloc[0]
	unmapped = run_data.loc[(run_data.cat=='ref_unmapped') & (run_data.source==source), 'reads'].iloc[0]
	no_fivep_maps = unmapped - run_data.loc[(run_data.cat=='fivep_mapped') & (run_data.source==source), 'reads'].iloc[0]
	# fivep_filtered_out = 
	no_trim_align = run_data.loc[(run_data.cat=='fivep_filtered') & (run_data.source==source), 'reads'].iloc[0] - run_data.loc[(run_data.cat=='trimmed_mapped') & (run_data.source==source), 'reads'].iloc[0]
	
	start_at_fivep_rids = set(start_at_fivep.loc[start_at_fivep.source==source, 'read_id'])
	lariats_rids = set(lariats.loc[lariats.source==source, 'read_id'])
	# trimmed_mapped_nonlariat_rids = fivep_passed.loc[(fivep_passed.source==source) & (fivep_passed.trimmed_mapped) & (~fivep_passed.passed_trimmed_filters)]
	# trimmed_mapped_nonlariat_rids = set(trimmed_mapped_nonlariat_rids['read_id'])
	repetitive_rids = set(failed_lariats.loc[(failed_lariats.filter_failed.isin(['ubiquitin_gene', 'in_repeat'])) & (~failed_lariats.read_id.isin(lariats.read_id.values)), 'read_id'])
	circular_rids = set(failed_lariats.loc[(failed_lariats.filter_failed=='near_ss') & (~failed_lariats.read_id.isin(lariats.read_id)), 'read_id'])

	print('start at fivep and lariat', len(start_at_fivep.intersection(lariats_rids)), start_at_fivep_rids.intersection(lariats_rids))
	# print('end at fivep and trimmed mapped nonlariat', len(end_at_fivep_rids.intersection(trimmed_mapped_nonlariat_rids)), end_at_fivep_rids.intersection(trimmed_mapped_nonlariat_rids))
	# print('lariat and trimmed mapped nonlariat', len(lariats_rids.intersection(trimmed_mapped_nonlariat_rids)), lariats_rids.intersection(trimmed_mapped_nonlariat_rids))

	row = [sample, r, source, og_mapped_to_genome, premRNA, unmapped, no_fivep_maps, len(start_at_fivep), no_trim_align, len(repetitive_rids), len(circular_rids), len(lariats_rids)]
	rows.append(row)

read_outcomes = pd.DataFrame(rows, columns = ['samp', 'r', 'source', 'mapped_to_genome', 'pre-mRNA', 'unmapped', 'no fivep', "Ends at a 5'ss", 'no trim', 'repetitive', 'circular', 'Lariat'])
read_outcomes = read_outcomes.melt(id_vars=['samp', 'r', 'source', 'mapped_to_genome', 'unmapped'], var_name='category', value_name='reads')
read_outcomes['rpm'] = (read_outcomes.reads * 1_000_000)/ read_outcomes.mapped_to_genome
read_outcomes.category = pd.Categorical(read_outcomes.category, categories=read_outcomes.category.unique(), ordered=True)
read_outcomes.samp = pd.Categorical(read_outcomes.samp, categories=['HEK293T', 'C22'], ordered=True)
read_outcomes = read_outcomes.sort_values('source')
read_outcomes



df.fate = df.fate.map({'filt fivep': 'Unknown', 'no trim': 'Unknown', 'start fivep': "Starts at a 5'ss", 'circular': 'Circular intron', 'repeat': 'Repetitive region', 'lariat':'Lariat', 'temp switch': 'Template switching', 'no fivep': 'Unknown'})


for sample, r, source, out_dir in sources:
	trimmed_mapped_rids = set()
	for align in pysam.AlignmentFile(f'{out_dir}/trimmed_reads_to_genome.bam', 'rb'):	
		trimmed_mapped_rids.add(align.query_name)
		
	fivep_passed.loc[fivep_passed.source==source, 'trimmed_mapped'] = fivep_passed.loc[fivep_passed.source==source, 'read_id'].isin(trimmed_mapped_rids)
	fivep_passed.loc[fivep_passed.source==source, 'passed_trimmed_filters'] = fivep_passed.loc[fivep_passed.source==source, 'read_id'].isin(threep_passed_rids[source]) 

	reads_total = len(fivep_passed.loc[(fivep_passed.source==source), 'read_id'].unique())
	reads_mapped = len(fivep_passed.loc[(fivep_passed.source==source) & (fivep_passed.passed_trimmed_filters), 'read_id'].unique())
	print(f'{reads_mapped:,} of {reads_total:,}')

fivep_passed

end_at_fivep_counts = {}
for sample, r, source, out_dir in sources:
	unique_reads = end_at_fivep.loc[end_at_fivep.source==source, 'read_id'].unique()
	end_at_fivep_counts[source] = len(unique_reads)
end_at_fivep_counts



funnel = copy.deepcopy(run_data)

for sample, r, source, out_dir in sources:
	unmapped = funnel.loc[(funnel.source==source) & (funnel.cat=='ref_unmapped'), 'reads'].iloc[0]
	funnel.loc[funnel.source==source, 'unmapped'] = unmapped

funnel['unmap_p'] = funnel.reads / funnel.unmapped
funnel
funnel.cat = funnel.cat.map({'fivep_mapped': "At least one 5'ss mapped to the read", 'trimmed_mapped':"Trimmed section mapped to the genome", 'filtered_lariats':'Lariats'})
funnel = funnel.dropna()
funnel.cat = pd.Categorical(funnel.cat, categories=['Lariats', "Trimmed section mapped to the genome", "Mapped to a 5'ss"], ordered=True)
funnel
funnel.loc[funnel.samp=='C22', 'unmap_p']
samp_means = funnel.groupby(['samp', 'cat'])['unmap_p'].aggregate('mean')
for samp, cat in samp_means.index:
	funnel.loc[(funnel.samp==samp) & (funnel.cat==cat), 'unmap_p_mean'] = samp_means[samp, cat]
	funnel.loc[(funnel.samp==samp) & (funnel.cat==cat), 'unmap_p_sem'] = funnel.loc[(funnel.samp==samp) & (funnel.cat==cat), 'unmap_p'].sem()
	
funnel['ebar_top'] = funnel.unmap_p_mean + funnel.unmap_p_sem
funnel['ebar_bottom'] = funnel.unmap_p_mean - funnel.unmap_p_sem
funnel
samp_means = funnel.groupby(['samp', 'cat'])['reads'].aggregate('mean')
for samp, cat in samp_means.index:
	funnel.loc[(funnel.samp==samp) & (funnel.cat==cat), 'reads_mean'] = samp_means[samp, cat]

funnel.reads_mean = funnel.reads_mean.astype(int)
funnel['label_y'] = funnel.unmap_p_mean + 0.01
funnel
plot = (p9.ggplot(funnel, p9.aes(x='cat', fill='samp', group='samp')) +
		p9.geom_col(p9.aes(y='unmap_p_mean'), position='dodge') +
		p9.geom_errorbar(p9.aes(ymax='ebar_top', ymin='ebar_bottom'), position='dodge', width=0.9) +
		# p9.geom_boxplot(p9.aes(y='unmap_p')) +
		p9.geom_text(p9.aes(y='label_y', label='reads_mean'), position=p9.position_dodge(width=0.9), size=10, family='arial') +
		p9.scale_y_continuous(labels=percent_format()) +
		p9.scale_fill_brewer(type='qual', palette=2) +
		p9.labs(x='Lariat-mapping step', 
		  		y='% of unmapped reads',
				fill='Cell line') +
		p9.coord_flip() +
		p9.theme()
		# p9.facet_wrap('samp')
	)
plot
plot.save('/Users/trumanmooney/Library/CloudStorage/OneDrive-BrownUniversity/Documents/Lariat_mapping/visuals/plots/funnel.png', dpi=500)