In [None]:
"""

Date : August 1, 2019

Author : Heather Landry Drexler

This script will develop datasets necessary to make splicing order
plots from nano-COP data. These files serve as input for the scripts for
Figures 4, 5 and S6.
                                            
"""


In [1]:
import numpy as np
import pandas as pd
import pysam
from collections import Counter

import matplotlib.pyplot as plt
import re
% matplotlib inline

import math

import pybedtools
from pybedtools import BedTool


In [2]:
# function to get a bedtool file with intron info from hg38 intron coordinate bed file
def hg38_introns(intronFile):
    """Parse an intron BED file into a BedTool of intron records.

    Parameters
    ----------
    intronFile : iterable of str with a ``close()`` method (e.g. an open file)
        Tab-separated lines. Column 1 is the chromosome, columns 2-3 the
        intron start/end, column 4 a name of the form
        ``<gene>_intron_<intron count>_...`` and column 6 the strand.

    Returns
    -------
    BedTool
        Built from one ``[chrom, start, end, gene, intron_count, strand]``
        record (all strings) per input line.

    Notes
    -----
    The handle passed in is always closed, even if a line fails to parse.
    Blank lines and BED header lines (``#``, ``track``, ``browser``) are
    skipped instead of raising an IndexError.
    """
    introns = []

    try:
        for line in intronFile:
            # skip blank lines and BED header/comment lines
            if not line.strip() or line.startswith(('#', 'track', 'browser')):
                continue

            # split the record once instead of once per field
            fields = line.rstrip('\r\n').split('\t')

            chrom = fields[0]                          # chromosome
            start = fields[1]                          # start coordinate of intron
            end = fields[2]                            # end coordinate of intron
            name_split = fields[3].split('_intron')    # split name field at '_intron'
            gene = name_split[0]                       # gene name
            feature = name_split[1].split('_')[1]      # intron count
            strand = fields[5][0]                      # strand (first character only)

            introns.append([str(chrom), str(start), str(end), str(gene), str(feature), str(strand)])
    finally:
        # the original contract is that this function closes the caller's handle
        intronFile.close()

    return BedTool(introns)


# function to get a bedtool file with intron info from a Drosophila (dm6) intron coordinate bed file
def dmel_introns(intronFile):
    """Parse an intron BED file into a BedTool, prefixing chromosomes with 'chr'.

    Parameters
    ----------
    intronFile : iterable of str with a ``close()`` method (e.g. an open file)
        Tab-separated lines. Column 1 is the chromosome (written without a
        'chr' prefix, which is added here), columns 2-3 the intron start/end,
        column 4 a name of the form ``<gene>_intron_<intron count>_...`` and
        column 6 the strand.

    Returns
    -------
    BedTool
        Built from one ``[chrom, start, end, gene, intron_count, strand]``
        record (all strings) per input line.

    Notes
    -----
    The handle passed in is always closed, even if a line fails to parse.
    Blank lines and BED header lines (``#``, ``track``, ``browser``) are
    skipped instead of raising an IndexError.
    """
    introns = []

    try:
        for line in intronFile:
            # skip blank lines and BED header/comment lines
            if not line.strip() or line.startswith(('#', 'track', 'browser')):
                continue

            # split the record once instead of once per field
            fields = line.rstrip('\r\n').split('\t')

            chrom = 'chr' + fields[0]                  # chromosome, with 'chr' prefix added
            start = fields[1]                          # start coordinate of intron
            end = fields[2]                            # end coordinate of intron
            name_split = fields[3].split('_intron')    # split name field at '_intron'
            gene = name_split[0]                       # gene name
            feature = name_split[1].split('_')[1]      # intron count
            strand = fields[5][0]                      # strand (first character only)

            introns.append([str(chrom), str(start), str(end), str(gene), str(feature), str(strand)])
    finally:
        # the original contract is that this function closes the caller's handle
        intronFile.close()

    return BedTool(introns)


# function to create a dataframe with reads that overlap annotated introns
def get_intron_intersect(introns_df, bam_file):
    """Intersect the reads of a BAM-backed BedTool with intron intervals.

    Parameters
    ----------
    introns_df : object accepted by ``BedTool.intersect``
        The intron intervals (the callers in this notebook pass a BedTool,
        despite the parameter name).
    bam_file : object providing ``bam_to_bed``
        The alignments to convert and intersect.

    Returns
    -------
    Whatever ``to_dataframe`` returns for the intersection, requested with
    seven read columns, six intron columns and a final 'count' column.
    """
    # column layout: read fields, then intron fields, then the overlap size
    read_columns = ['chr_aln', 'start_aln', 'end_aln', 'name_aln',
                    'qual_aln', 'strand_aln', 'cigar_aln']
    intron_columns = ['chr_intron', 'start_intron', 'end_intron',
                      'name_gene', 'intron_count', 'strand_gene']
    column_names = read_columns + intron_columns + ['count']

    # every column is either text or integer
    text_columns = {'chr_aln', 'name_aln', 'strand_aln', 'cigar_aln',
                    'chr_intron', 'name_gene', 'strand_gene'}
    column_types = {name: (str if name in text_columns else int) for name in column_names}

    # convert bam to bed, keeping the cigar string and requesting the NM tag
    reads_bed = bam_file.bam_to_bed(cigar=True, tag='NM')

    # same-strand overlaps only (s=True), reporting the overlap size (wo=True)
    overlaps = reads_bed.intersect(introns_df, wo=True, s=True)

    return overlaps.to_dataframe(names=column_names, dtype=column_types)


# function to create a dataframe with splicing information for
# every read that spans an intron in the dataset
def get_splicing_info(intersect_df, min_overlap):
    """Call the splicing status of every read-intron overlap from the read's CIGAR string.

    Parameters
    ----------
    intersect_df : pandas.DataFrame
        One row per read-intron overlap. The columns read here are 'count',
        'start_aln', 'end_aln', 'name_aln', 'cigar_aln', 'chr_intron',
        'start_intron', 'end_intron', 'name_gene', 'intron_count' and
        'strand_gene'.
    min_overlap : int
        Rows whose 'count' is below this value are skipped. It is also the
        half-width of the window examined around each intron boundary.

    Returns
    -------
    pandas.DataFrame
        Columns read_name, chrom, intron_start, intron_end, strand,
        gene_name, intron_count, read_overlap, splice_status, where
        splice_status is 'YES', 'NO' or 'UNDETERMINED'. When no row is kept
        (including an empty or column-less input) an empty frame with these
        columns is returned instead of raising.

    Notes
    -----
    Rows whose 'strand_gene' is neither '+' nor '-' are skipped: the 5'/3'
    splice-site windows cannot be assigned for them.
    """
    output_columns = ["read_name","chrom","intron_start","intron_end","strand",
                      "gene_name","intron_count","read_overlap","splice_status"]

    df = intersect_df

    # nothing to classify; also covers a frame that has no columns at all
    if df.shape[0] == 0:
        return pd.DataFrame(columns=output_columns)

    # pull each column out once; per-element .iloc lookups inside the loop are slow
    overlaps = df['count'].values
    aln_starts = df['start_aln'].values
    aln_ends = df['end_aln'].values
    read_names = df['name_aln'].values
    cigars = df['cigar_aln'].values
    chroms = df['chr_intron'].values
    intron_starts = df['start_intron'].values
    intron_ends = df['end_intron'].values
    gene_names = df['name_gene'].values
    intron_numbers = df['intron_count'].values
    strands = df['strand_gene'].values

    # prepare a list for splice calls
    spliceCalls = []

    # set variables for parsing the cigar string
    pattern = re.compile('([MIDNSHPX=])')
    Consumes_Reference = ["M", "D", "N", "=", "X"]

    # loop through all read-intron intersects
    for i in range(df.shape[0]):

        # ignore reads that do not overlap intron by minimum threshold
        if (overlaps[i] < min_overlap):
            continue

        # without a +/- strand the 5'SS/3'SS sides are undefined; skip the row
        # rather than fail or reuse the previous read's counts
        strand = strands[i]
        if strand not in ('+', '-'):
            continue

        intron_start = intron_starts[i] # start of the intron
        intron_end = intron_ends[i] # end of the intron

        # parse cigar string into a list of [length, operator] pairs
        Sep_Values = pattern.split(cigars[i])[:-1]
        CigarPairs = [Sep_Values[n:n+2] for n in range(0, len(Sep_Values), 2)]

        # per-operator base counts: whole read, window around the intron start,
        # window around the intron end, and inside the intron
        CigarOp_counts = dict.fromkeys('MIDNSHPX=', 0)
        start_counts = dict.fromkeys('MIDNSHPX=', 0)
        end_counts = dict.fromkeys('MIDNSHPX=', 0)
        intron_counts = dict.fromkeys('MIDNSHPX=', 0)
        currentloc = int(aln_starts[i])

        # go through list of cigar strings and grab splicing information
        for cigar_Entry in CigarPairs:

            op_Length = int(cigar_Entry[0]) # get length of cigar operator
            cigarOp = cigar_Entry[1] # get type of cigar operator
            CigarOp_counts[cigarOp] += op_Length # add the cigar operator length to the counts dictionary
            cigarOp_start=currentloc # get the starting coordinate of the cigar operator

            # only reference-consuming operators advance the reference position
            if (cigarOp in Consumes_Reference):
                currentloc=currentloc+op_Length

            cigarOp_end=currentloc # get the ending coordinate of the cigar operator

            # gather information if the portion of the cigar string spans the designated intron start
            if (cigarOp_start<intron_start-min_overlap and cigarOp_end>=intron_start-min_overlap):
                if (cigarOp_end>=intron_start+min_overlap):
                    count=min_overlap*2
                else:
                    count=cigarOp_end-(intron_start-min_overlap)+1
                start_counts[cigarOp] += count

            elif (cigarOp_start>=intron_start-min_overlap and cigarOp_end<intron_start+min_overlap):
                count=op_Length
                start_counts[cigarOp] += count

            elif (cigarOp_start<intron_start+min_overlap and cigarOp_end>=intron_start+min_overlap):
                if (cigarOp_start<=intron_start-min_overlap):
                    count=min_overlap*2
                else:
                    count=(intron_start+min_overlap)-cigarOp_start-1
                start_counts[cigarOp] += count

            # gather information if the portion of the cigar string is within the intron
            if (cigarOp_start<intron_start and cigarOp_end>=intron_start):
                if (cigarOp_end>=intron_end):
                    count=intron_end-intron_start
                else:
                    count=cigarOp_end-intron_start
                intron_counts[cigarOp] += count

            elif (cigarOp_start>=intron_start and cigarOp_end<intron_end):
                count=op_Length
                intron_counts[cigarOp] += count

            elif (cigarOp_start<intron_end and cigarOp_end>=intron_end):
                if (cigarOp_start<=intron_start):
                    count=intron_end-intron_start
                else:
                    count=intron_end-cigarOp_start
                intron_counts[cigarOp] += count

            # gather information if the portion of the cigar string spans the designated intron end
            if (cigarOp_start<intron_end-min_overlap and cigarOp_end>=intron_end-min_overlap):
                if (cigarOp_end>=intron_end+min_overlap):
                    count=min_overlap*2
                else:
                    count=cigarOp_end-(intron_end-min_overlap)
                end_counts[cigarOp] += count

            elif (cigarOp_start>=intron_end-min_overlap and cigarOp_end<intron_end+min_overlap):
                count=op_Length
                end_counts[cigarOp] += count

            elif (cigarOp_start<intron_end+min_overlap and cigarOp_end>=intron_end+min_overlap):
                if (cigarOp_start<=intron_end-min_overlap):
                    count=min_overlap*2
                else:
                    count=(intron_end+min_overlap)-cigarOp_start
                end_counts[cigarOp] += count

        # get length of the aligned portion of this read from cigar string
        aligned_read_length = CigarOp_counts['M']+CigarOp_counts['D']

        # get 5'SS and 3'SS counts as determined by gene strand
        if (strand == '+'):
            intron_5SS_counts = start_counts # cigar counts over the 5'SS
            intron_3SS_counts = end_counts # cigar counts over the 3'SS
            read_overlap = intron_end - (aln_ends[i] - aligned_read_length + min_overlap)
        else:
            intron_5SS_counts = end_counts # cigar counts over the 5'SS
            intron_3SS_counts = start_counts # cigar counts over the 3'SS
            read_overlap = (aln_starts[i] + aligned_read_length - min_overlap) - intron_start

        # annotate splicing status based on CIGAR string information around splice sites
        splice='UNDETERMINED'

        # unspliced: no skipped bases (N) at either splice site and the 3'SS
        # window fully covered by M/D with more than min_overlap matched bases
        if (intron_5SS_counts['N']==0 and intron_3SS_counts['N']==0):
            if (intron_3SS_counts['M']+intron_3SS_counts['D']==min_overlap*2):
                if (intron_3SS_counts['M']>min_overlap):
                    splice = 'NO'

        # spliced: both splice-site windows are partly, but not wholly, skipped
        if (intron_5SS_counts['N']>0 and intron_5SS_counts['N']<min_overlap*2):
            if (intron_3SS_counts['N']>0 and intron_3SS_counts['N']<min_overlap*2):
                splice = 'YES'

        # annotate splicing status based on CIGAR string information within the intron
        intron_length = intron_end - intron_start

        if (splice == 'YES'):
            if (intron_length > 0):
                ratio = float(intron_counts['N'])/float(intron_length)
                difference = abs(intron_counts['N']-intron_length)

                # a spliced call needs the skipped length to be 90-110% of the
                # intron length and within 100 nt of it
                if (ratio < 0.9 or ratio > 1.1 or difference > 100):
                    splice='UNDETERMINED'
            if (intron_length == 0):
                splice='UNDETERMINED'

        if (splice == 'NO'):
            if (intron_length > 0):
                ratio = float(intron_counts['M'])/(float(intron_counts['M'])+float(intron_counts['N'])+float(intron_counts['D'])+1)

                # an unspliced call needs at least min_overlap/2 matched bases
                # inside the intron and M/(M+N+D+1) of at least 0.75 there
                if (intron_counts['M'] < min_overlap/2 or ratio < 0.75):
                    splice='UNDETERMINED'

            if (intron_length == 0):
                splice='UNDETERMINED'

        # save read, intron, and splicing information
        spliceCalls.append([read_names[i],chroms[i],intron_start,intron_end,strand,gene_names[i],intron_numbers[i],read_overlap,splice])

    # passing columns= keeps the schema even when no row was kept
    return pd.DataFrame(spliceCalls, columns=output_columns)


# function to group, per read and per gene, every intron record
# found in a splicing-information dataframe
def get_read_junctions_dictionary(splice_df):
    """Nest the rows of a splicing dataframe by read name, then gene name.

    Parameters
    ----------
    splice_df : pandas.DataFrame
        Needs the columns 'read_name', 'gene_name', 'chrom', 'intron_start',
        'intron_end', 'intron_count', 'strand', 'read_overlap' and
        'splice_status'. Rows are read positionally, in order.

    Returns
    -------
    dict
        ``{read_name: {gene_name: [record, ...]}}`` where each record is
        ``[chrom, intron_start, intron_end, intron_count, strand,
        read_overlap, splice_status]``, kept in row order.
    """
    # order of the fields stored for each intron record
    record_fields = ('chrom', 'intron_start', 'intron_end', 'intron_count',
                     'strand', 'read_overlap', 'splice_status')

    junctions_by_read = {}

    for row in range(splice_df.shape[0]):
        read_id = splice_df['read_name'].iloc[row]
        gene_id = splice_df['gene_name'].iloc[row]
        record = [splice_df[field].iloc[row] for field in record_fields]

        # create the read and gene levels on first sight, then append
        junctions_by_read.setdefault(read_id, {}).setdefault(gene_id, []).append(record)

    return junctions_by_read


def get_intron_pairs_df(read_junctions):
    """List the neighbouring intron pairs covered by each read.

    Parameters
    ----------
    read_junctions : dict
        ``{read: {gene: [record, ...]}}`` where each record is
        ``[chrom, intron_start, intron_end, intron_count, strand,
        read_overlap, splice_status]``.

    Returns
    -------
    pandas.DataFrame
        One row per intron pair with columns read, chrom, int1_start,
        int1_end, int2_start, int2_end, strand, int1_splice, int2_splice.
        int1_start/int1_end are strings and int2_start/int2_end are ints, as
        before. An empty frame with these columns is returned when no pair is
        found, instead of raising.

    Notes
    -----
    For each read, a gene is skipped when it has fewer than two records or
    when its joined splice-status pattern was already seen for another gene
    of the same read. Records with read_overlap <= 0 are dropped and the rest
    ordered by intron_count; two introns form a pair when their counts differ
    by exactly one. The same pair is reported once per read.
    """
    pair_columns = ['read','chrom','int1_start','int1_end','int2_start','int2_end',
                    'strand','int1_splice','int2_splice']
    intron_pairs = []

    # loop through all reads in the dictionary
    for read, genes in read_junctions.items():

        # per-read sets: avoid reporting duplicate pairs due to alternative splicing
        uniq_pairs = set()
        uniq_splice_pattern = set()

        # loop through all genes whose introns this read maps to
        for gene, introns in genes.items():

            # only go through genes that have 2 or more introns
            if len(introns) < 2:
                continue

            # only process a splice pattern once per read
            splice_status_join = '_'.join(row[6] for row in introns)
            if splice_status_join in uniq_splice_pattern:
                continue
            uniq_splice_pattern.add(splice_status_join)

            # keep introns with a positive read_overlap, ordered by intron count
            candidates = sorted((row for row in introns if row[5] > 0),
                                key=lambda row: row[3])

            # (intron_count, (chrom, start, end), splice_status) of the previous intron
            previous = None

            for chrom, start, end, intron_count, strand, _overlap, splice in candidates:
                coord = (str(chrom), str(start), str(end))

                # sequential intron counts mark a true (neighbouring) intron pair
                if previous is not None and intron_count - previous[0] == 1:
                    prev_coord = previous[1]
                    pair_key = (prev_coord, coord)

                    if pair_key not in uniq_pairs:
                        uniq_pairs.add(pair_key)

                        # the previous start/end are kept as separate fields;
                        # re-splitting a joined 'chrom_start_end' string on '_'
                        # breaks for chromosome names that contain underscores
                        intron_pairs.append([read, coord[0], prev_coord[1], prev_coord[2],
                                             int(coord[1]), int(coord[2]), strand,
                                             previous[2], splice])

                # save information about this intron for the next pair
                previous = (intron_count, coord, splice)

    # passing columns= keeps the schema even when no pair was found
    return pd.DataFrame(intron_pairs, columns=pair_columns)


In [3]:
# set all variables for analysis
# min_overlap is passed to get_splicing_info: read-intron overlaps smaller than
# this are skipped, and it is the half-width of the window examined around
# each intron boundary when calling splicing status
min_overlap = 25

In [4]:
# Get human data for figure 3B (Splicing pattern proportions)

# load intron coordinates from the hg38 RefSeq intron annotation
K562_intronFile = open('/path/to/annotation_files/NCBI_RefSeq_hg38_introns_parsed.bed')

# make a BedTool of intron coordinates (hg38_introns closes the file handle)
K562_introns_bedtool = hg38_introns(K562_intronFile)

# import alignment files
# named *_bam_file because that is what the cell calling get_intron_intersect
# reads; the previous *_bamFile names left those lookups undefined
K562_1_bam_file = pybedtools.BedTool('/path/to/K562_4sUchr_ONT_1_hg38_minimap2_uniq_sort.bam')
K562_2_bam_file = pybedtools.BedTool('/path/to/K562_4sUchr_ONT_2_hg38_minimap2_uniq_sort.bam')
K562_3_bam_file = pybedtools.BedTool('/path/to/K562_4sUchr_ONT_3_hg38_minimap2_uniq_sort.bam')
K562_4_bam_file = pybedtools.BedTool('/path/to/K562_4sUchr_ONT_4_hg38_minimap2_uniq_sort.bam')
K562_5a_bam_file = pybedtools.BedTool('/path/to/K562_4sUchr_ONT_5a_hg38_minimap2_uniq_sort.bam')
K562_5b_bam_file = pybedtools.BedTool('/path/to/K562_4sUchr_ONT_5b_hg38_minimap2_uniq_sort.bam')



In [5]:
# get reads that intersect introns (one dataframe per K562 replicate)
# NOTE(review): these calls read K562_*_bam_file; confirm the alignment-loading
# cell defines the handles under exactly these names
K562_1_intersect = get_intron_intersect(K562_introns_bedtool, K562_1_bam_file)
K562_2_intersect = get_intron_intersect(K562_introns_bedtool, K562_2_bam_file)
K562_3_intersect = get_intron_intersect(K562_introns_bedtool, K562_3_bam_file)
K562_4_intersect = get_intron_intersect(K562_introns_bedtool, K562_4_bam_file)
K562_5a_intersect = get_intron_intersect(K562_introns_bedtool, K562_5a_bam_file)
K562_5b_intersect = get_intron_intersect(K562_introns_bedtool, K562_5b_bam_file)



In [6]:
# get splicing information for every read that spans an intron
K562_1_splice_info = get_splicing_info(K562_1_intersect,min_overlap)
K562_2_splice_info = get_splicing_info(K562_2_intersect,min_overlap)
K562_3_splice_info = get_splicing_info(K562_3_intersect,min_overlap)
K562_4_splice_info = get_splicing_info(K562_4_intersect,min_overlap)
K562_5a_splice_info = get_splicing_info(K562_5a_intersect,min_overlap)
K562_5b_splice_info = get_splicing_info(K562_5b_intersect,min_overlap)


# remove unwanted intron information, in two steps per replicate:
#   1. drop read-intron rows whose splice_status is "UNDETERMINED"
#      (this overwrites the *_splice_info variable)
#   2. *_wiOverlap keeps only rows whose read_overlap exceeds the intron length
#      (intron_end - intron_start)
K562_1_splice_info = K562_1_splice_info[K562_1_splice_info['splice_status'] != "UNDETERMINED"].reset_index(drop=True)
K562_1_splice_info_wiOverlap = K562_1_splice_info[K562_1_splice_info['read_overlap']>(K562_1_splice_info['intron_end']-K562_1_splice_info['intron_start'])].reset_index(drop=True)

K562_2_splice_info = K562_2_splice_info[K562_2_splice_info['splice_status'] != "UNDETERMINED"].reset_index(drop=True)
K562_2_splice_info_wiOverlap = K562_2_splice_info[K562_2_splice_info['read_overlap']>(K562_2_splice_info['intron_end']-K562_2_splice_info['intron_start'])].reset_index(drop=True)

K562_3_splice_info = K562_3_splice_info[K562_3_splice_info['splice_status'] != "UNDETERMINED"].reset_index(drop=True)
K562_3_splice_info_wiOverlap = K562_3_splice_info[K562_3_splice_info['read_overlap']>(K562_3_splice_info['intron_end']-K562_3_splice_info['intron_start'])].reset_index(drop=True)

K562_4_splice_info = K562_4_splice_info[K562_4_splice_info['splice_status'] != "UNDETERMINED"].reset_index(drop=True)
K562_4_splice_info_wiOverlap = K562_4_splice_info[K562_4_splice_info['read_overlap']>(K562_4_splice_info['intron_end']-K562_4_splice_info['intron_start'])].reset_index(drop=True)

K562_5a_splice_info = K562_5a_splice_info[K562_5a_splice_info['splice_status'] != "UNDETERMINED"].reset_index(drop=True)
K562_5a_splice_info_wiOverlap = K562_5a_splice_info[K562_5a_splice_info['read_overlap']>(K562_5a_splice_info['intron_end']-K562_5a_splice_info['intron_start'])].reset_index(drop=True)

K562_5b_splice_info = K562_5b_splice_info[K562_5b_splice_info['splice_status'] != "UNDETERMINED"].reset_index(drop=True)
K562_5b_splice_info_wiOverlap = K562_5b_splice_info[K562_5b_splice_info['read_overlap']>(K562_5b_splice_info['intron_end']-K562_5b_splice_info['intron_start'])].reset_index(drop=True)


In [7]:
# get dictionary with all intron junctions that a read spans
# (built from the overlap-filtered *_wiOverlap dataframes; nested as
# read name -> gene name -> list of intron records)
K562_1_splice_dictionary = get_read_junctions_dictionary(K562_1_splice_info_wiOverlap)
K562_2_splice_dictionary = get_read_junctions_dictionary(K562_2_splice_info_wiOverlap)
K562_3_splice_dictionary = get_read_junctions_dictionary(K562_3_splice_info_wiOverlap)
K562_4_splice_dictionary = get_read_junctions_dictionary(K562_4_splice_info_wiOverlap)
K562_5a_splice_dictionary = get_read_junctions_dictionary(K562_5a_splice_info_wiOverlap)
K562_5b_splice_dictionary = get_read_junctions_dictionary(K562_5b_splice_info_wiOverlap)


In [8]:
# save splicing dictionaries to file (to use again later for plotting) (K562 cells)
# each file is written from the dictionary of the same replicate; the previous
# arguments (K562_6_..., Pr1_..., ...) are not defined anywhere in this notebook
# np.save pickles a dict, so reading it back needs np.load(..., allow_pickle=True)
np.save('/path/to/K562_1_splicing_dictionary.npy', K562_1_splice_dictionary)
np.save('/path/to/K562_2_splicing_dictionary.npy', K562_2_splice_dictionary)
np.save('/path/to/K562_3_splicing_dictionary.npy', K562_3_splice_dictionary)
np.save('/path/to/K562_4_splicing_dictionary.npy', K562_4_splice_dictionary)
np.save('/path/to/K562_5a_splicing_dictionary.npy', K562_5a_splice_dictionary)
np.save('/path/to/K562_5b_splicing_dictionary.npy', K562_5b_splice_dictionary)


In [9]:
# get information about intron pairs from read junctions dictionary
# (one dataframe per replicate, one row per neighbouring intron pair in a read)
K562_1_intron_pairs_df = get_intron_pairs_df(K562_1_splice_dictionary)
K562_2_intron_pairs_df = get_intron_pairs_df(K562_2_splice_dictionary)
K562_3_intron_pairs_df = get_intron_pairs_df(K562_3_splice_dictionary)
K562_4_intron_pairs_df = get_intron_pairs_df(K562_4_splice_dictionary)
K562_5a_intron_pairs_df = get_intron_pairs_df(K562_5a_splice_dictionary)
K562_5b_intron_pairs_df = get_intron_pairs_df(K562_5b_splice_dictionary)


In [10]:
# save intron pair dataframes to file (to use again later for plotting) (K562 cells)
# written as tab-separated text with a header row and without the index
K562_1_intron_pairs_df.to_csv('/path/to/K562_1_hg38_intron_pairs_df.txt', sep='\t', index=False, header=True)
K562_2_intron_pairs_df.to_csv('/path/to/K562_2_hg38_intron_pairs_df.txt', sep='\t', index=False, header=True)
K562_3_intron_pairs_df.to_csv('/path/to/K562_3_hg38_intron_pairs_df.txt', sep='\t', index=False, header=True)
K562_4_intron_pairs_df.to_csv('/path/to/K562_4_hg38_intron_pairs_df.txt', sep='\t', index=False, header=True)
K562_5a_intron_pairs_df.to_csv('/path/to/K562_5a_hg38_intron_pairs_df.txt', sep='\t', index=False, header=True)
K562_5b_intron_pairs_df.to_csv('/path/to/K562_5b_hg38_intron_pairs_df.txt', sep='\t', index=False, header=True)


In [16]:
### intron pairs for B lymphoblast (BL1184) sample

# load bam files
BL1184_1_bam_file = pybedtools.BedTool('/path/to/BL1184_4sUchr_ONT_1_hg38_minimap2_uniq_sort.bam')
BL1184_2_bam_file = pybedtools.BedTool('/path/to/BL1184_4sUchr_ONT_2_hg38_minimap2_uniq_sort.bam')

# get reads that intersect introns
BL1184_1_intersect = get_intron_intersect(K562_introns_bedtool, BL1184_1_bam_file)
BL1184_2_intersect = get_intron_intersect(K562_introns_bedtool, BL1184_2_bam_file)

# get splicing information for every read that spans an intron
BL1184_1_splice_info = get_splicing_info(BL1184_1_intersect,min_overlap)
BL1184_2_splice_info = get_splicing_info(BL1184_2_intersect,min_overlap)

# remove unwanted intron information
BL1184_1_splice_info = BL1184_1_splice_info[BL1184_1_splice_info['splice_status'] != "UNDETERMINED"].reset_index(drop=True)
BL1184_1_splice_info_wiOverlap = BL1184_1_splice_info[BL1184_1_splice_info['read_overlap']>(BL1184_1_splice_info['intron_end']-BL1184_1_splice_info['intron_start'])].reset_index(drop=True)
BL1184_2_splice_info = BL1184_2_splice_info[BL1184_2_splice_info['splice_status'] != "UNDETERMINED"].reset_index(drop=True)
BL1184_2_splice_info_wiOverlap = BL1184_2_splice_info[BL1184_2_splice_info['read_overlap']>(BL1184_2_splice_info['intron_end']-BL1184_2_splice_info['intron_start'])].reset_index(drop=True)

# get dictionary with all intron junctions that a read spans
BL1184_1_splice_dictionary = get_read_junctions_dictionary(BL1184_1_splice_info_wiOverlap)
BL1184_2_splice_dictionary = get_read_junctions_dictionary(BL1184_2_splice_info_wiOverlap)

# save splicing dictionary datasets
np.save('/path/to/BL1184_1_splicing_dictionary.npy', BL1184_1_splice_dictionary) 
np.save('/path/to/BL1184_2_splicing_dictionary.npy', BL1184_2_splice_dictionary) 

# get information about intron pairs from read junctions dictionary
BL1184_1_intron_pairs_df = get_intron_pairs_df(BL1184_1_splice_dictionary)
BL1184_2_intron_pairs_df = get_intron_pairs_df(BL1184_2_splice_dictionary)

# save intron_pairs datasets
BL1184_1_intron_pairs_df.to_csv('/path/to/BL1184_1_hg38_intron_pairs_df.txt', sep='\t', index=False, header=True)
BL1184_2_intron_pairs_df.to_csv('/path/to/BL1184_2_hg38_intron_pairs_df.txt', sep='\t', index=False, header=True)


In [None]:
## intron pairs for S2 Drosophila samples

# load intron coordinates from the dm6 RefSeq intron annotation
S2_intronFile = open('/path/to/annotation_files/dm6_RefSeq_introns_parsed.bed')

# make a BedTool of intron coordinates
# (dmel_introns prefixes chromosome names with 'chr' and closes the file handle)
S2_introns_bedtool = dmel_introns(S2_intronFile) 

# import alignment files
S2_1a_bam_file = pybedtools.BedTool('/path/to/S2_4sUchr_ONT_1a_dm6_minimap2_uniq_sort.bam')
S2_1b_bam_file = pybedtools.BedTool('/path/to/S2_4sUchr_ONT_1b_dm6_minimap2_uniq_sort.bam')
S2_2_bam_file = pybedtools.BedTool('/path/to/S2_4sUchr_ONT_2_dm6_minimap2_uniq_sort.bam')
S2_3_bam_file = pybedtools.BedTool('/path/to/S2_4sUchr_ONT_3_dm6_minimap2_uniq_sort.bam')

In [None]:
# get reads that intersect introns
S2_1a_intersect = get_intron_intersect(S2_introns_bedtool, S2_1a_bam_file)
S2_1b_intersect = get_intron_intersect(S2_introns_bedtool, S2_1b_bam_file)
S2_2_intersect = get_intron_intersect(S2_introns_bedtool, S2_2_bam_file)
S2_3_intersect = get_intron_intersect(S2_introns_bedtool, S2_3_bam_file)

# get splicing information for every read that spans an intron
S2_1a_splice_info = get_splicing_info(S2_1a_intersect,min_overlap)
S2_1b_splice_info = get_splicing_info(S2_1b_intersect,min_overlap)
S2_2_splice_info = get_splicing_info(S2_2_intersect,min_overlap)
S2_3_splice_info = get_splicing_info(S2_3_intersect,min_overlap)

# rerun with only introns that overlap the read without any splicing to control for read length bias
# step 1 drops "UNDETERMINED" rows (overwriting *_splice_info); step 2 keeps in
# *_wiOverlap only rows whose read_overlap exceeds the intron length
S2_1a_splice_info = S2_1a_splice_info[S2_1a_splice_info['splice_status'] != "UNDETERMINED"].reset_index(drop=True)
S2_1a_splice_info_wiOverlap = S2_1a_splice_info[S2_1a_splice_info['read_overlap']>(S2_1a_splice_info['intron_end']-S2_1a_splice_info['intron_start'])].reset_index(drop=True)

S2_1b_splice_info = S2_1b_splice_info[S2_1b_splice_info['splice_status'] != "UNDETERMINED"].reset_index(drop=True)
S2_1b_splice_info_wiOverlap = S2_1b_splice_info[S2_1b_splice_info['read_overlap']>(S2_1b_splice_info['intron_end']-S2_1b_splice_info['intron_start'])].reset_index(drop=True)

S2_2_splice_info = S2_2_splice_info[S2_2_splice_info['splice_status'] != "UNDETERMINED"].reset_index(drop=True)
S2_2_splice_info_wiOverlap = S2_2_splice_info[S2_2_splice_info['read_overlap']>(S2_2_splice_info['intron_end']-S2_2_splice_info['intron_start'])].reset_index(drop=True)

S2_3_splice_info = S2_3_splice_info[S2_3_splice_info['splice_status'] != "UNDETERMINED"].reset_index(drop=True)
S2_3_splice_info_wiOverlap = S2_3_splice_info[S2_3_splice_info['read_overlap']>(S2_3_splice_info['intron_end']-S2_3_splice_info['intron_start'])].reset_index(drop=True)


# get dictionary with all intron junctions that a read spans
S2_1a_splice_dictionary = get_read_junctions_dictionary(S2_1a_splice_info_wiOverlap)
S2_1b_splice_dictionary = get_read_junctions_dictionary(S2_1b_splice_info_wiOverlap)
S2_2_splice_dictionary = get_read_junctions_dictionary(S2_2_splice_info_wiOverlap)
S2_3_splice_dictionary = get_read_junctions_dictionary(S2_3_splice_info_wiOverlap)

# save dictionary with all intron junctions that a read spans
# (np.save pickles a dict, so reading it back needs np.load(..., allow_pickle=True))
np.save('/path/to/S2_1a_splicing_dictionary.npy', S2_1a_splice_dictionary) 
np.save('/path/to/S2_1b_splicing_dictionary.npy', S2_1b_splice_dictionary) 
np.save('/path/to/S2_2_splicing_dictionary.npy', S2_2_splice_dictionary) 
np.save('/path/to/S2_3_splicing_dictionary.npy', S2_3_splice_dictionary) 

In [None]:
# get information about intron pairs from read junctions dictionary
# (one dataframe per replicate, one row per neighbouring intron pair in a read)
S2_1a_intron_pairs_df = get_intron_pairs_df(S2_1a_splice_dictionary)
S2_1b_intron_pairs_df = get_intron_pairs_df(S2_1b_splice_dictionary)
S2_2_intron_pairs_df = get_intron_pairs_df(S2_2_splice_dictionary)
S2_3_intron_pairs_df = get_intron_pairs_df(S2_3_splice_dictionary)

# save intron pair dataframes to file (to use again later for plotting) (S2 cells)
# written as tab-separated text with a header row and without the index
S2_1a_intron_pairs_df.to_csv('/path/to/S2_1a_dm6_intron_pairs_df.txt', sep='\t', index=False, header=True)
S2_1b_intron_pairs_df.to_csv('/path/to/S2_1b_dm6_intron_pairs_df.txt', sep='\t', index=False, header=True)
S2_2_intron_pairs_df.to_csv('/path/to/S2_2_dm6_intron_pairs_df.txt', sep='\t', index=False, header=True)
S2_3_intron_pairs_df.to_csv('/path/to/S2_3_dm6_intron_pairs_df.txt', sep='\t', index=False, header=True)

In [None]:
# Splicing dictionaries for DMSO and PlaB datasets
# Open the alignment file of every DMSO / PlaB sample as a pybedtools BedTool.
# Each variable must point at its own replicate's BAM file.

# K562 samples (file names indicate hg38 alignments)
K562_DMSO_1_bamFile = pybedtools.BedTool('/path/to/K562_ONT_DMSO_1_hg38_minimap2_uniq_sort.bam')
K562_DMSO_2_bamFile = pybedtools.BedTool('/path/to/K562_ONT_DMSO_2_hg38_minimap2_uniq_sort.bam')
K562_PlaB_1_bamFile = pybedtools.BedTool('/path/to/K562_ONT_PlaB_1_hg38_minimap2_uniq_sort.bam')
K562_PlaB_2_bamFile = pybedtools.BedTool('/path/to/K562_ONT_PlaB_2_hg38_minimap2_uniq_sort.bam')

# S2 samples (file names indicate dm6 alignments)
S2_DMSO_1_bamFile = pybedtools.BedTool('/path/to/S2_ONT_DMSO_1_dm6_minimap2_uniq_sort.bam')
# fixed: this previously re-opened the DMSO_1 file, so replicate 2 duplicated replicate 1
S2_DMSO_2_bamFile = pybedtools.BedTool('/path/to/S2_ONT_DMSO_2_dm6_minimap2_uniq_sort.bam')
S2_PlaB_1a_bamFile = pybedtools.BedTool('/path/to/S2_ONT_PlaB_1a_dm6_minimap2_uniq_sort.bam')
S2_PlaB_1b_bamFile = pybedtools.BedTool('/path/to/S2_ONT_PlaB_1b_dm6_minimap2_uniq_sort.bam')
S2_PlaB_2_bamFile = pybedtools.BedTool('/path/to/S2_ONT_PlaB_2_dm6_minimap2_uniq_sort.bam')

In [None]:
# K562
# get reads that intersect introns
# fixed: the BAM handles are named *_bamFile where they are created (not *_bam_file),
# so the previous names raised NameError on a fresh kernel.
# FIXME(review): K562 reads come from hg38 BAM files but are intersected with
# `S2_introns_bedtool` here. This looks like a copy-paste from the S2 cell; the
# human intron BedTool should almost certainly be used instead - confirm its
# variable name (it is defined outside this cell) and swap it in.
K562_DMSO_1_intersect = get_intron_intersect(S2_introns_bedtool, K562_DMSO_1_bamFile)
K562_DMSO_2_intersect = get_intron_intersect(S2_introns_bedtool, K562_DMSO_2_bamFile)
K562_PlaB_1_intersect = get_intron_intersect(S2_introns_bedtool, K562_PlaB_1_bamFile)
K562_PlaB_2_intersect = get_intron_intersect(S2_introns_bedtool, K562_PlaB_2_bamFile)

# get splicing information for every read that spans an intron
K562_DMSO_1_splice_info = get_splicing_info(K562_DMSO_1_intersect,min_overlap)
K562_DMSO_2_splice_info = get_splicing_info(K562_DMSO_2_intersect,min_overlap)
K562_PlaB_1_splice_info = get_splicing_info(K562_PlaB_1_intersect,min_overlap)
K562_PlaB_2_splice_info = get_splicing_info(K562_PlaB_2_intersect,min_overlap)

# rerun with only introns that overlap the read without any splicing to control for read length bias
# step 1: drop rows whose splice_status is "UNDETERMINED"
# step 2: keep rows whose read_overlap is larger than (intron_end - intron_start)
K562_DMSO_1_splice_info = K562_DMSO_1_splice_info[K562_DMSO_1_splice_info['splice_status'] != "UNDETERMINED"].reset_index(drop=True)
K562_DMSO_1_splice_info_wiOverlap = K562_DMSO_1_splice_info[K562_DMSO_1_splice_info['read_overlap']>(K562_DMSO_1_splice_info['intron_end']-K562_DMSO_1_splice_info['intron_start'])].reset_index(drop=True)

K562_DMSO_2_splice_info = K562_DMSO_2_splice_info[K562_DMSO_2_splice_info['splice_status'] != "UNDETERMINED"].reset_index(drop=True)
K562_DMSO_2_splice_info_wiOverlap = K562_DMSO_2_splice_info[K562_DMSO_2_splice_info['read_overlap']>(K562_DMSO_2_splice_info['intron_end']-K562_DMSO_2_splice_info['intron_start'])].reset_index(drop=True)

K562_PlaB_1_splice_info = K562_PlaB_1_splice_info[K562_PlaB_1_splice_info['splice_status'] != "UNDETERMINED"].reset_index(drop=True)
K562_PlaB_1_splice_info_wiOverlap = K562_PlaB_1_splice_info[K562_PlaB_1_splice_info['read_overlap']>(K562_PlaB_1_splice_info['intron_end']-K562_PlaB_1_splice_info['intron_start'])].reset_index(drop=True)

K562_PlaB_2_splice_info = K562_PlaB_2_splice_info[K562_PlaB_2_splice_info['splice_status'] != "UNDETERMINED"].reset_index(drop=True)
K562_PlaB_2_splice_info_wiOverlap = K562_PlaB_2_splice_info[K562_PlaB_2_splice_info['read_overlap']>(K562_PlaB_2_splice_info['intron_end']-K562_PlaB_2_splice_info['intron_start'])].reset_index(drop=True)


# get dictionary with all intron junctions that a read spans
K562_DMSO_1_splice_dictionary = get_read_junctions_dictionary(K562_DMSO_1_splice_info_wiOverlap)
K562_DMSO_2_splice_dictionary = get_read_junctions_dictionary(K562_DMSO_2_splice_info_wiOverlap)
K562_PlaB_1_splice_dictionary = get_read_junctions_dictionary(K562_PlaB_1_splice_info_wiOverlap)
K562_PlaB_2_splice_dictionary = get_read_junctions_dictionary(K562_PlaB_2_splice_info_wiOverlap)

# save dictionary with all intron junctions that a read spans
np.save('/path/to/K562_DMSO_1_splicing_dictionary.npy', K562_DMSO_1_splice_dictionary)
np.save('/path/to/K562_DMSO_2_splicing_dictionary.npy', K562_DMSO_2_splice_dictionary)
np.save('/path/to/K562_PlaB_1_splicing_dictionary.npy', K562_PlaB_1_splice_dictionary)
np.save('/path/to/K562_PlaB_2_splicing_dictionary.npy', K562_PlaB_2_splice_dictionary)

In [None]:
# S2
# get reads that intersect introns
# fixed: the BAM handles are named *_bamFile where they are created (not *_bam_file),
# so the previous names raised NameError on a fresh kernel.
S2_DMSO_1_intersect = get_intron_intersect(S2_introns_bedtool, S2_DMSO_1_bamFile)
S2_DMSO_2_intersect = get_intron_intersect(S2_introns_bedtool, S2_DMSO_2_bamFile)
S2_PlaB_1a_intersect = get_intron_intersect(S2_introns_bedtool, S2_PlaB_1a_bamFile)
S2_PlaB_1b_intersect = get_intron_intersect(S2_introns_bedtool, S2_PlaB_1b_bamFile)
S2_PlaB_2_intersect = get_intron_intersect(S2_introns_bedtool, S2_PlaB_2_bamFile)

# get splicing information for every read that spans an intron
S2_DMSO_1_splice_info = get_splicing_info(S2_DMSO_1_intersect,min_overlap)
S2_DMSO_2_splice_info = get_splicing_info(S2_DMSO_2_intersect,min_overlap)
S2_PlaB_1a_splice_info = get_splicing_info(S2_PlaB_1a_intersect,min_overlap)
S2_PlaB_1b_splice_info = get_splicing_info(S2_PlaB_1b_intersect,min_overlap)
S2_PlaB_2_splice_info = get_splicing_info(S2_PlaB_2_intersect,min_overlap)

# rerun with only introns that overlap the read without any splicing to control for read length bias
# step 1: drop rows whose splice_status is "UNDETERMINED"
# step 2: keep rows whose read_overlap is larger than (intron_end - intron_start)
S2_DMSO_1_splice_info = S2_DMSO_1_splice_info[S2_DMSO_1_splice_info['splice_status'] != "UNDETERMINED"].reset_index(drop=True)
S2_DMSO_1_splice_info_wiOverlap = S2_DMSO_1_splice_info[S2_DMSO_1_splice_info['read_overlap']>(S2_DMSO_1_splice_info['intron_end']-S2_DMSO_1_splice_info['intron_start'])].reset_index(drop=True)

S2_DMSO_2_splice_info = S2_DMSO_2_splice_info[S2_DMSO_2_splice_info['splice_status'] != "UNDETERMINED"].reset_index(drop=True)
S2_DMSO_2_splice_info_wiOverlap = S2_DMSO_2_splice_info[S2_DMSO_2_splice_info['read_overlap']>(S2_DMSO_2_splice_info['intron_end']-S2_DMSO_2_splice_info['intron_start'])].reset_index(drop=True)

S2_PlaB_1a_splice_info = S2_PlaB_1a_splice_info[S2_PlaB_1a_splice_info['splice_status'] != "UNDETERMINED"].reset_index(drop=True)
S2_PlaB_1a_splice_info_wiOverlap = S2_PlaB_1a_splice_info[S2_PlaB_1a_splice_info['read_overlap']>(S2_PlaB_1a_splice_info['intron_end']-S2_PlaB_1a_splice_info['intron_start'])].reset_index(drop=True)

S2_PlaB_1b_splice_info = S2_PlaB_1b_splice_info[S2_PlaB_1b_splice_info['splice_status'] != "UNDETERMINED"].reset_index(drop=True)
S2_PlaB_1b_splice_info_wiOverlap = S2_PlaB_1b_splice_info[S2_PlaB_1b_splice_info['read_overlap']>(S2_PlaB_1b_splice_info['intron_end']-S2_PlaB_1b_splice_info['intron_start'])].reset_index(drop=True)

S2_PlaB_2_splice_info = S2_PlaB_2_splice_info[S2_PlaB_2_splice_info['splice_status'] != "UNDETERMINED"].reset_index(drop=True)
S2_PlaB_2_splice_info_wiOverlap = S2_PlaB_2_splice_info[S2_PlaB_2_splice_info['read_overlap']>(S2_PlaB_2_splice_info['intron_end']-S2_PlaB_2_splice_info['intron_start'])].reset_index(drop=True)


# get dictionary with all intron junctions that a read spans
S2_DMSO_1_splice_dictionary = get_read_junctions_dictionary(S2_DMSO_1_splice_info_wiOverlap)
S2_DMSO_2_splice_dictionary = get_read_junctions_dictionary(S2_DMSO_2_splice_info_wiOverlap)
S2_PlaB_1a_splice_dictionary = get_read_junctions_dictionary(S2_PlaB_1a_splice_info_wiOverlap)
S2_PlaB_1b_splice_dictionary = get_read_junctions_dictionary(S2_PlaB_1b_splice_info_wiOverlap)
S2_PlaB_2_splice_dictionary = get_read_junctions_dictionary(S2_PlaB_2_splice_info_wiOverlap)

# save dictionary with all intron junctions that a read spans
np.save('/path/to/S2_DMSO_1_splicing_dictionary.npy', S2_DMSO_1_splice_dictionary)
np.save('/path/to/S2_DMSO_2_splicing_dictionary.npy', S2_DMSO_2_splice_dictionary)
np.save('/path/to/S2_PlaB_1a_splicing_dictionary.npy', S2_PlaB_1a_splice_dictionary)
np.save('/path/to/S2_PlaB_1b_splicing_dictionary.npy', S2_PlaB_1b_splice_dictionary)
np.save('/path/to/S2_PlaB_2_splicing_dictionary.npy', S2_PlaB_2_splice_dictionary)