In [1]:
import gzip

import pysam
import numpy as np
import pandas as pd
from intervaltree import Interval, IntervalTree

In [2]:
LINEAR_COLUMNS = ('read_id', 
				  'chrom', 
				#   'start', 
				#   'end', 
				  'blocks', 
				  'cigar',
				  'read',
				  )
CIGARTUPLE_CODES = {0: 'M',
					1: 'I',
					2: 'D',
					3: 'N',
					4: 'S',
					5: 'H',
					6: 'P',
					7: '=',
					8: 'X',
					9: 'B',
					}

In [3]:
def infer_segments(row):
	if len(row['blocks']) == 1:
		return row['blocks']
	
	segs = []
	current_block = row['blocks'][0]
	for blocks_index in range(1, len(row['blocks'])):
		next_block = row['blocks'][blocks_index]
		cigar_index = 2*blocks_index - 1
		gap_type = row['cigar'][cigar_index][0]

		if gap_type == 'N':
			segs.append(current_block)
			current_block = next_block
		elif gap_type in ('I', 'D'):
			current_block = (current_block[0], next_block[1])
		else:
			raise RuntimeError(f"{row['blocks']}\n{row['cigar']}\n{blocks_index}")
	
	segs.append(current_block)
		
	return tuple(segs)

In [4]:
ref_exons = '/Users/trumanmooney/Library/CloudStorage/OneDrive-BrownUniversity/Documents/Projects/Lariat_mapping/reference_data/hg38_ref/exons.tsv.gz'
ref_introns = '/Users/trumanmooney/Library/CloudStorage/OneDrive-BrownUniversity/Documents/Projects/Lariat_mapping/reference_data/hg38_ref/introns.tsv.gz'
single_end = False
output_base = '/Users/trumanmooney/Library/CloudStorage/OneDrive-BrownUniversity/Documents/Projects/Lariat_mapping/output/pipeline/C22-1_100k/'

In [5]:
introns = pd.read_csv(ref_introns, sep='\t')
introns['gene_id'] = introns.gene_id.transform(lambda gid: set(gid.split(',')))
introns['interval'] = introns.apply(lambda row: Interval(row['start'], row['end'], {'gene_id': row['gene_id']}), axis=1)
introns = introns.groupby('chrom').interval.agg(IntervalTree).to_dict()

In [6]:
exons = pd.read_csv(ref_exons, sep='\t')
exons['gene_id'] = exons.gene_id.transform(lambda gid: set(gid.split(',')))
exons['interval'] = exons.apply(lambda row: Interval(row['start'], row['end'], {'gene_id': row['gene_id']}), axis=1)
exons = exons.groupby('chrom').interval.agg(IntervalTree).to_dict()

In [7]:
linear_reads = []
for align in pysam.AlignmentFile(f'{output_base}/mapped_reads.bam', 'rb'):
	linear_reads.append([
					align.query_name, 
					align.reference_name, 
					align.get_blocks(),
					align.cigartuples,
					align.is_read1,
					])

linear_reads = pd.DataFrame(linear_reads, columns=LINEAR_COLUMNS)

linear_reads.cigar = linear_reads.cigar.transform(lambda cigar: tuple((CIGARTUPLE_CODES[op],length) for op, length in cigar))
linear_reads.read = linear_reads.read.map({True: '1', False: '2'})
linear_reads = linear_reads.sort_values(['read_id'])

linear_reads

Unnamed: 0,read_id,chrom,blocks,cigar,read
41609,NGSNJ-086:229:GW200110425th:1:1101:10004:11303,chr22,"[(43161160, 43161190), (43162802, 43162922)]","((M, 30), (N, 1612), (M, 120))",2
41608,NGSNJ-086:229:GW200110425th:1:1101:10004:11303,chr22,"[(43161160, 43161190), (43162802, 43162922)]","((M, 30), (N, 1612), (M, 120))",1
44423,NGSNJ-086:229:GW200110425th:1:1101:10004:11992,chr14,"[(49586616, 49586766)]","((M, 150),)",2
44422,NGSNJ-086:229:GW200110425th:1:1101:10004:11992,chr14,"[(49586616, 49586766)]","((M, 150),)",1
44761,NGSNJ-086:229:GW200110425th:1:1101:10004:12085,chr14,"[(49586699, 49586849)]","((M, 150),)",2
...,...,...,...,...,...
105752,NGSNJ-086:229:GW200110425th:1:1101:9995:27352,chr14,"[(32163959, 32164109)]","((M, 150),)",1
15918,NGSNJ-086:229:GW200110425th:1:1101:9995:4961,chr13,"[(48233331, 48233481)]","((M, 150),)",1
15919,NGSNJ-086:229:GW200110425th:1:1101:9995:4961,chr13,"[(48233331, 48233481)]","((M, 150),)",2
34304,NGSNJ-086:229:GW200110425th:1:1101:9995:9502,chr14,"[(49586704, 49586854)]","((M, 150),)",1


In [8]:
linear_reads['segs'] = linear_reads.apply(infer_segments, axis=1, result_type='reduce')
linear_reads = linear_reads.explode('segs')
linear_reads['seg'] = linear_reads.segs.transform(lambda segs: Interval(*segs))
linear_reads = linear_reads.drop(columns=['blocks', 'segs', 'cigar'])
linear_reads

Unnamed: 0,read_id,chrom,read,seg
41609,NGSNJ-086:229:GW200110425th:1:1101:10004:11303,chr22,2,"(43161160, 43161190, None)"
41609,NGSNJ-086:229:GW200110425th:1:1101:10004:11303,chr22,2,"(43162802, 43162922, None)"
41608,NGSNJ-086:229:GW200110425th:1:1101:10004:11303,chr22,1,"(43161160, 43161190, None)"
41608,NGSNJ-086:229:GW200110425th:1:1101:10004:11303,chr22,1,"(43162802, 43162922, None)"
44423,NGSNJ-086:229:GW200110425th:1:1101:10004:11992,chr14,2,"(49586616, 49586766, None)"
...,...,...,...,...
105752,NGSNJ-086:229:GW200110425th:1:1101:9995:27352,chr14,1,"(32163959, 32164109, None)"
15918,NGSNJ-086:229:GW200110425th:1:1101:9995:4961,chr13,1,"(48233331, 48233481, None)"
15919,NGSNJ-086:229:GW200110425th:1:1101:9995:4961,chr13,2,"(48233331, 48233481, None)"
34304,NGSNJ-086:229:GW200110425th:1:1101:9995:9502,chr14,1,"(49586704, 49586854, None)"


In [9]:
for chrom in linear_reads.chrom.unique():
	if chrom not in exons.keys():
		print(f'No exons in {chrom}')
		continue
	chrom_exons = exons[chrom]
	linear_reads.loc[linear_reads.chrom==chrom, 'exons'] = linear_reads.loc[linear_reads.chrom==chrom, 'seg'].transform(chrom_exons.overlap)
	
	if chrom not in introns.keys():
		print(f'No introns in {chrom}')
		continue
	chrom_introns = introns[chrom]
	linear_reads.loc[linear_reads.chrom==chrom, 'introns'] = linear_reads.loc[linear_reads.chrom==chrom, 'seg'].transform(chrom_introns.overlap)

linear_reads.exons = linear_reads.exons.fillna('').transform(set)
linear_reads.introns = linear_reads.introns.fillna('').transform(set)
linear_reads

No introns in chrM


Unnamed: 0,read_id,chrom,read,seg,exons,introns
41609,NGSNJ-086:229:GW200110425th:1:1101:10004:11303,chr22,2,"(43161160, 43161190, None)","{(43161051, 43161190, {'gene_id': {'ENSG000001...",{}
41609,NGSNJ-086:229:GW200110425th:1:1101:10004:11303,chr22,2,"(43162802, 43162922, None)","{(43162802, 43163210, {'gene_id': {'ENSG000001...",{}
41608,NGSNJ-086:229:GW200110425th:1:1101:10004:11303,chr22,1,"(43161160, 43161190, None)","{(43161051, 43161190, {'gene_id': {'ENSG000001...",{}
41608,NGSNJ-086:229:GW200110425th:1:1101:10004:11303,chr22,1,"(43162802, 43162922, None)","{(43162802, 43163210, {'gene_id': {'ENSG000001...",{}
44423,NGSNJ-086:229:GW200110425th:1:1101:10004:11992,chr14,2,"(49586616, 49586766, None)","{(49586579, 49586878, {'gene_id': {'ENSG000002...","{(49586049, 49598399, {'gene_id': {'ENSG000002..."
...,...,...,...,...,...,...
105752,NGSNJ-086:229:GW200110425th:1:1101:9995:27352,chr14,1,"(32163959, 32164109, None)",{},{}
15918,NGSNJ-086:229:GW200110425th:1:1101:9995:4961,chr13,1,"(48233331, 48233481, None)","{(48233202, 48233477, {'gene_id': {'ENSG000001...","{(48232698, 48253806, {'gene_id': {'ENSG000001..."
15919,NGSNJ-086:229:GW200110425th:1:1101:9995:4961,chr13,2,"(48233331, 48233481, None)","{(48233202, 48233477, {'gene_id': {'ENSG000001...","{(48232698, 48253806, {'gene_id': {'ENSG000001..."
34304,NGSNJ-086:229:GW200110425th:1:1101:9995:9502,chr14,1,"(49586704, 49586854, None)","{(49586579, 49586878, {'gene_id': {'ENSG000002...","{(49586049, 49598399, {'gene_id': {'ENSG000002..."


In [10]:
def infer_common_genes(df):
	gene_ids = []
	for i, seg_row in df.iterrows():
		features = seg_row['exons'].union(seg_row['introns'])
		seg_genes = set()
		for feat in features:
			seg_genes.update(feat.data['gene_id'])
		gene_ids.append(seg_genes)

	common_genes = set(gid for gid in gene_ids[0] if all(gid in set_genes for set_genes in gene_ids))
	return common_genes

linear_reads['common_genes'] = linear_reads.read_id.map(linear_reads.groupby('read_id').apply(infer_common_genes))
print(linear_reads.common_genes.transform(len).value_counts())
linear_reads

common_genes
1    89714
2    36712
0    14798
3     1706
4      114
5       12
Name: count, dtype: int64


  linear_reads['common_genes'] = linear_reads.read_id.map(linear_reads.groupby('read_id').apply(infer_common_genes))


Unnamed: 0,read_id,chrom,read,seg,exons,introns,common_genes
41609,NGSNJ-086:229:GW200110425th:1:1101:10004:11303,chr22,2,"(43161160, 43161190, None)","{(43161051, 43161190, {'gene_id': {'ENSG000001...",{},{ENSG00000100300.18}
41609,NGSNJ-086:229:GW200110425th:1:1101:10004:11303,chr22,2,"(43162802, 43162922, None)","{(43162802, 43163210, {'gene_id': {'ENSG000001...",{},{ENSG00000100300.18}
41608,NGSNJ-086:229:GW200110425th:1:1101:10004:11303,chr22,1,"(43161160, 43161190, None)","{(43161051, 43161190, {'gene_id': {'ENSG000001...",{},{ENSG00000100300.18}
41608,NGSNJ-086:229:GW200110425th:1:1101:10004:11303,chr22,1,"(43162802, 43162922, None)","{(43162802, 43163210, {'gene_id': {'ENSG000001...",{},{ENSG00000100300.18}
44423,NGSNJ-086:229:GW200110425th:1:1101:10004:11992,chr14,2,"(49586616, 49586766, None)","{(49586579, 49586878, {'gene_id': {'ENSG000002...","{(49586049, 49598399, {'gene_id': {'ENSG000002...","{ENSG00000213741.11, ENSG00000276168.1}"
...,...,...,...,...,...,...,...
105752,NGSNJ-086:229:GW200110425th:1:1101:9995:27352,chr14,1,"(32163959, 32164109, None)",{},{},{}
15918,NGSNJ-086:229:GW200110425th:1:1101:9995:4961,chr13,1,"(48233331, 48233481, None)","{(48233202, 48233477, {'gene_id': {'ENSG000001...","{(48232698, 48253806, {'gene_id': {'ENSG000001...",{ENSG00000136156.15}
15919,NGSNJ-086:229:GW200110425th:1:1101:9995:4961,chr13,2,"(48233331, 48233481, None)","{(48233202, 48233477, {'gene_id': {'ENSG000001...","{(48232698, 48253806, {'gene_id': {'ENSG000001...",{ENSG00000136156.15}
34304,NGSNJ-086:229:GW200110425th:1:1101:9995:9502,chr14,1,"(49586704, 49586854, None)","{(49586579, 49586878, {'gene_id': {'ENSG000002...","{(49586049, 49598399, {'gene_id': {'ENSG000002...","{ENSG00000213741.11, ENSG00000276168.1}"


In [12]:
linear_reads['exons_f'] = linear_reads.apply(lambda row: set(exon for exon in row['exons'] if len(exon.data['gene_id'].intersection(row['common_genes']))>0), axis=1)
linear_reads['introns_f'] = linear_reads.apply(lambda row: set(intron for intron in row['introns'] if len(intron.data['gene_id'].intersection(row['common_genes']))>0), axis=1)