In [None]:
"""

Date : August 1, 2019

Author : Heather Landry Drexler

This script will develop datasets for splicing order plots
for alternative introns. These files serve as input for the scripts for
Figure 5I.
                          
                                            
"""

In [1]:
import numpy as np
import pandas as pd
import pysam
from collections import Counter

import matplotlib.pyplot as plt
import re
% matplotlib inline

import math

import pybedtools
from pybedtools import BedTool

import seaborn as sns
sns.set_style("white")

In [2]:
def changeChrom(row):
    """Return the row's 'chrom' value with its first three characters removed.

    For UCSC-style names this drops the 'chr' prefix ('chr1' -> '1').
    The characters are removed unconditionally; nothing checks that they
    actually spell 'chr'.
    """
    return row['chrom'][3:]

# function to classify one intron row as constitutive / alternative
# from its MISO event flags and PSI values
def hg38_introns_byRow(row):
    """Classify an intron annotation row by its alternative-splicing evidence.

    Only the MISO flag and PSI columns are read. Earlier versions also
    extracted chrom/start/end/name/strand fields that were never used, which
    made the function fail on rows missing those columns or on names without
    an '_intron' marker.

    Parameters
    ----------
    row : mapping (e.g. pandas Series or dict)
        Must provide the flag columns 'SE_MISO', 'A5SS_MISO', 'A3SS_MISO'
        and 'MXE_MISO' (compared against the strings 'YES' / 'NO') and the
        PSI columns 'PSI_SE', 'PSI_A5SS', 'PSI_A3SS' and 'PSI_MXE'
        (compared numerically).

    Returns
    -------
    str
        'ALT'     - at least one flag is 'YES' and at least one PSI is > 0.8
        'PARTIAL' - at least one flag is 'YES', no PSI is > 0.8, but at least
                    one PSI is > 0.2
        'CONST'   - all four flags are 'NO'
        'OTHER'   - every remaining case (flagged but all PSI <= 0.2, or
                    flags that are neither all 'NO' nor contain a 'YES')
    """
    event_types = ('SE', 'A5SS', 'A3SS', 'MXE')
    flags = [row[event + '_MISO'] for event in event_types]
    psi_values = [row['PSI_' + event] for event in event_types]

    if any(flag == 'YES' for flag in flags):
        # PSI thresholds are evaluated over all four event types, not only
        # over the event(s) whose flag is 'YES'
        if any(psi > 0.8 for psi in psi_values):
            return 'ALT'
        if any(psi > 0.2 for psi in psi_values):
            return 'PARTIAL'
        return 'OTHER'

    if all(flag == 'NO' for flag in flags):
        return 'CONST'

    return 'OTHER'


# function to create a dataframe of read alignments that overlap the supplied intron features
def get_intron_intersect(introns_df, bam_file):
    """Intersect aligned reads with intron features and return the hits as a DataFrame.

    Parameters
    ----------
    introns_df :
        Feature set handed to ``intersect`` as the second operand.
        NOTE(review): presumably a pybedtools BedTool of intron records
        despite the ``_df`` suffix - confirm against the caller.
    bam_file :
        Object exposing ``bam_to_bed`` (presumably a pybedtools BedTool
        wrapping a BAM file - confirm against the caller).

    Returns
    -------
    pandas.DataFrame
        One row per read/feature overlap with the sixteen columns named in
        ``column_types`` below, cast to the listed dtypes.
    """
    # column name -> dtype, in the order the intersect output is laid out
    column_types = {
        "chr_aln": str, "start_aln": int, "end_aln": int,
        "name_aln": str, "qual_aln": int, "strand_aln": str,
        "cigar_aln": str, "chr_intron": str, "start_intron": int,
        "end_intron": int, "name_gene": str,
        "intron_count": int, "strand_gene": str,
        "alt_refseq": str, "alt_events": str,
        "count": int,
    }

    # convert alignments to BED, keeping the CIGAR string and the NM (edit distance) tag
    reads_bed = bam_file.bam_to_bed(cigar=True, tag='NM')

    # same-strand overlaps only (s=True), reporting both records plus the overlap size (wo=True)
    overlaps = reads_bed.intersect(introns_df, wo=True, s=True)

    return overlaps.to_dataframe(names=list(column_types), dtype=column_types)


# function to create a dataframe with splicing information for
# every read that spans an intron in the dataset
def get_splicing_info(intersect_df, min_overlap):
    """Call the splicing status of every read/intron overlap from the read's CIGAR string.

    Parameters
    ----------
    intersect_df : pandas.DataFrame
        One row per read/intron overlap. The columns read here are 'count',
        'start_aln', 'end_aln', 'start_intron', 'end_intron', 'cigar_aln',
        'strand_gene', 'name_aln', 'chr_intron', 'name_gene', 'intron_count',
        'alt_refseq' and 'alt_events'.
    min_overlap : int
        Rows whose 'count' is below this value are skipped. It is also the
        half-width of the window inspected around each intron boundary.

    Returns
    -------
    pandas.DataFrame
        Columns 'read_name', 'chrom', 'intron_start', 'intron_end', 'strand',
        'gene_name', 'intron_count', 'read_overlap', 'splice_status',
        'alt_refseq', 'alt_events'. 'splice_status' is 'YES', 'NO' or
        'UNDETERMINED'. When no row passes the filters the frame is empty but
        still carries these columns.

    Notes
    -----
    Rows whose 'strand_gene' is neither '+' nor '-' are skipped: the 5'SS/3'SS
    assignment is undefined for them.
    """
    df = intersect_df

    output_columns = ["read_name", "chrom", "intron_start", "intron_end", "strand",
                      "gene_name", "intron_count", "read_overlap", "splice_status",
                      "alt_refseq", "alt_events"]

    # prepare a list for splice calls
    spliceCalls = []

    # set variables for parsing the cigar string
    pattern = re.compile('([MIDNSHPX=])')
    consumes_reference = {"M", "D", "N", "=", "X"}
    cigar_ops = ['M', 'I', 'D', 'N', 'S', 'H', 'P', '=', 'X']

    # loop through all read-intron intersects
    for i in range(0, df.shape[0]):

        # ignore reads that do not overlap intron by minimum threshold
        if (df['count'].iloc[i] < min_overlap):
            continue

        # splice-site orientation is only defined for stranded genes
        strand = df['strand_gene'].iloc[i]
        if strand not in ('+', '-'):
            continue

        read_start = df['start_aln'].iloc[i]      # leftmost coordinate of the alignment
        read_end = df['end_aln'].iloc[i]          # rightmost coordinate of the alignment
        intron_start = df['start_intron'].iloc[i] # leftmost coordinate of the intron
        intron_end = df['end_intron'].iloc[i]     # rightmost coordinate of the intron

        # boundaries of the +/- min_overlap windows around each intron edge
        start_lo = intron_start - min_overlap
        start_hi = intron_start + min_overlap
        end_lo = intron_end - min_overlap
        end_hi = intron_end + min_overlap

        # parse cigar string into [length, operator] pairs
        Sep_Values = pattern.split(df['cigar_aln'].iloc[i])[:-1]
        CigarPairs = [Sep_Values[n:n+2] for n in range(0, len(Sep_Values), 2)]

        # per-operator length tallies: whole read, window around the intron
        # start, window around the intron end, and inside the intron
        CigarOp_counts = dict.fromkeys(cigar_ops, 0)
        start_counts = dict.fromkeys(cigar_ops, 0)
        end_counts = dict.fromkeys(cigar_ops, 0)
        intron_counts = dict.fromkeys(cigar_ops, 0)
        currentloc = int(read_start)

        # walk along the reference one cigar operator at a time
        for cigar_Entry in CigarPairs:

            op_Length = int(cigar_Entry[0]) # length of cigar operator
            cigarOp = cigar_Entry[1]        # type of cigar operator
            CigarOp_counts[cigarOp] += op_Length
            op_start = currentloc           # reference coordinate where the operator begins

            # only reference-consuming operators advance the position
            if (cigarOp in consumes_reference):
                currentloc = currentloc + op_Length

            op_end = currentloc             # reference coordinate where the operator ends

            # --- window around the intron start ---
            # NOTE(review): this window applies +1 / -1 adjustments that the
            # intron-end window below does not; kept exactly as written -
            # presumably a coordinate-convention correction, confirm.
            if (op_start < start_lo and op_end >= start_lo):
                if (op_end >= start_hi):
                    count = min_overlap*2
                else:
                    count = op_end - start_lo + 1
                start_counts[cigarOp] += count

            elif (op_start >= start_lo and op_end < start_hi):
                start_counts[cigarOp] += op_Length

            elif (op_start < start_hi and op_end >= start_hi):
                if (op_start <= start_lo):
                    count = min_overlap*2
                else:
                    count = start_hi - op_start - 1
                start_counts[cigarOp] += count

            # --- inside the intron ---
            if (op_start < intron_start and op_end >= intron_start):
                if (op_end >= intron_end):
                    count = intron_end - intron_start
                else:
                    count = op_end - intron_start
                intron_counts[cigarOp] += count

            elif (op_start >= intron_start and op_end < intron_end):
                intron_counts[cigarOp] += op_Length

            elif (op_start < intron_end and op_end >= intron_end):
                if (op_start <= intron_start):
                    count = intron_end - intron_start
                else:
                    count = intron_end - op_start
                intron_counts[cigarOp] += count

            # --- window around the intron end ---
            if (op_start < end_lo and op_end >= end_lo):
                if (op_end >= end_hi):
                    count = min_overlap*2
                else:
                    count = op_end - end_lo
                end_counts[cigarOp] += count

            elif (op_start >= end_lo and op_end < end_hi):
                end_counts[cigarOp] += op_Length

            elif (op_start < end_hi and op_end >= end_hi):
                if (op_start <= end_lo):
                    count = min_overlap*2
                else:
                    count = end_hi - op_start
                end_counts[cigarOp] += count

        # length of the aligned portion of this read (matches + deletions)
        aligned_read_length = CigarOp_counts['M'] + CigarOp_counts['D']

        # assign the two windows to 5'SS / 3'SS according to gene strand.
        # read_overlap measures how far a contiguous stretch of
        # aligned_read_length, anchored at one end of the alignment (the right
        # end for '+', the left end for '-'), would extend past the far edge
        # of the intron, less min_overlap.
        if (strand == '+'):
            intron_5SS_counts = start_counts
            intron_3SS_counts = end_counts
            read_overlap = intron_end - (read_end - aligned_read_length + min_overlap)
        else:
            intron_5SS_counts = end_counts
            intron_3SS_counts = start_counts
            read_overlap = (read_start + aligned_read_length - min_overlap) - intron_start

        # annotate splicing status based on CIGAR string information around splice sites
        splice = 'UNDETERMINED'

        # unspliced: no skipped bases at either site, and the 3'SS window is
        # fully covered by M/D with more than half of it matching
        if (intron_5SS_counts['N'] == 0 and intron_3SS_counts['N'] == 0):
            if (intron_3SS_counts['M'] + intron_3SS_counts['D'] == min_overlap*2):
                if (intron_3SS_counts['M'] > min_overlap):
                    splice = 'NO'

        # spliced: both windows contain some, but not only, skipped bases
        if (intron_5SS_counts['N'] > 0 and intron_5SS_counts['N'] < min_overlap*2):
            if (intron_3SS_counts['N'] > 0 and intron_3SS_counts['N'] < min_overlap*2):
                splice = 'YES'

        intron_length = intron_end - intron_start

        # refine the call using CIGAR information within the intron
        if (splice == 'YES'):
            if (float(intron_length) > 0.0):
                ratio = float(intron_counts['N'])/float(intron_length)
                difference = abs(intron_counts['N'] - intron_length)

                # the skipped region must be 90-110% of the intron length
                # and within 10 nucleotides of it
                if (ratio < 0.9 or ratio > 1.1 or difference > 10):
                    splice = 'UNDETERMINED'
            if (float(intron_length) == 0.0):
                splice = 'UNDETERMINED'

        if (splice == 'NO'):
            if (float(intron_length) > 0.0):
                ratio = float(intron_counts['M'])/(float(intron_counts['M'])+float(intron_counts['N'])+float(intron_counts['D'])+1)

                # at least min_overlap/2 matching bases inside the intron, and
                # matches must make up at least 75% of M+N+D(+1) there
                if (intron_counts['M'] < min_overlap/2 or ratio < 0.75):
                    splice = 'UNDETERMINED'

            if (float(intron_length) == 0.0):
                splice = 'UNDETERMINED'

        # save read, intron, and splicing information
        spliceCalls.append([df['name_aln'].iloc[i], df['chr_intron'].iloc[i], intron_start, intron_end,
                            strand, df['name_gene'].iloc[i], df['intron_count'].iloc[i],
                            read_overlap, splice,
                            df['alt_refseq'].iloc[i], df['alt_events'].iloc[i]])

    # passing columns= keeps the header even when no call was recorded
    spliceCalls_df = pd.DataFrame(spliceCalls, columns=output_columns)

    return spliceCalls_df



# function to group the intron records of every read by read name and gene name
def get_read_junctions_dictionary(splice_df):
    """Build a nested lookup of intron records: read name -> gene name -> list of records.

    Parameters
    ----------
    splice_df : pandas.DataFrame
        Must contain the columns 'read_name', 'gene_name', 'chrom',
        'intron_start', 'intron_end', 'intron_count', 'strand',
        'read_overlap', 'splice_status', 'alt_refseq' and 'alt_events'.

    Returns
    -------
    dict
        ``{read_name: {gene_name: [record, ...]}}`` where each record is the
        list ``[chrom, intron_start, intron_end, intron_count, strand,
        read_overlap, splice_status, alt_refseq, alt_events]``. Records keep
        the row order of ``splice_df``.
    """
    # fields stored for each intron, in record order
    record_fields = ['chrom', 'intron_start', 'intron_end', 'intron_count', 'strand',
                     'read_overlap', 'splice_status', 'alt_refseq', 'alt_events']

    read_junctions = {}

    for row_idx in range(splice_df.shape[0]):
        read_name = splice_df['read_name'].iloc[row_idx]
        gene_name = splice_df['gene_name'].iloc[row_idx]
        record = [splice_df[field].iloc[row_idx] for field in record_fields]

        # create the per-read and per-gene containers on first sight, then append
        genes_for_read = read_junctions.setdefault(read_name, {})
        genes_for_read.setdefault(gene_name, []).append(record)

    return read_junctions


def get_intron_distribution(read_junctions):
    """Label each multi-intron read as all spliced, all unspliced or intermediate.

    Parameters
    ----------
    read_junctions : dict
        ``{read_name: {gene_name: [record, ...]}}``; element 6 of every
        record is the splice status ('YES', 'NO' or anything else).

    Returns
    -------
    pandas.DataFrame
        Columns 'read' and 'distribution'. A read is reported only when every
        gene it maps to could be labelled and all of those labels agree.
        The frame is empty (with both columns present) when no read qualifies.

    Per-gene labels (genes with fewer than two records are never labelled):
        'all_spliced'   - no 'NO' and more than one 'YES'
        'all_unspliced' - more than one 'NO' and no 'YES'
        'intermediate'  - at least one 'YES' and at least one 'NO'
    """
    intron_distribution = []

    for read, genes in read_junctions.items():

        # one label per gene that could be classified
        gene_labels = []

        for introns in genes.values():
            if len(introns) < 2:
                continue

            status_count = Counter(record[6] for record in introns)
            spliced_count = status_count['YES']
            unspliced_count = status_count['NO']

            if (unspliced_count == 0 and spliced_count > 1):
                gene_labels.append('all_spliced')
            elif (unspliced_count > 1 and spliced_count == 0):
                gene_labels.append('all_unspliced')
            elif (unspliced_count > 0 and spliced_count > 0):
                gene_labels.append('intermediate')

        # keep the read only if every gene was labelled and the labels are unanimous
        if len(gene_labels) == len(genes) and len(set(gene_labels)) == 1:
            intron_distribution.append([read, gene_labels[0]])

    # passing columns= keeps the header even when nothing was recorded
    return pd.DataFrame(intron_distribution, columns=['read', 'distribution'])


def get_pie_chart_df(intron_distribution_df):
    """Tally reads per distribution class for a pie chart.

    Parameters
    ----------
    intron_distribution_df : pandas.DataFrame
        Must contain a 'distribution' column.

    Returns
    -------
    pandas.DataFrame
        Indexed by the labels 'all_unspliced', 'all_spliced' and
        'intermediate' (in that order) with a single 'count' column holding
        the number of rows carrying each label. Other labels are ignored.
    """
    categories = ['all_unspliced', 'all_spliced', 'intermediate']
    observed = intron_distribution_df['distribution']

    # one [label, number of matching rows] entry per category
    tallies = [[label, int((observed == label).sum())] for label in categories]

    pie_chart_df = pd.DataFrame(tallies).set_index([0])
    pie_chart_df.columns = ['count']

    return pie_chart_df


def get_intron_pairs_df(read_junctions, good_events):
    """List every pair of neighbouring introns covered by the same read.

    Parameters
    ----------
    read_junctions : dict
        ``{read_name: {gene_name: [record, ...]}}`` where each record is
        ``[chrom, start, end, intron_count, strand, read_overlap,
        splice_status, alt_refseq, alt_events]``.
    good_events : container of str
        'alt_events' values that should be labelled 'alternative'.

    Returns
    -------
    pandas.DataFrame
        One row per unique intron pair per read with columns 'read', 'chrom',
        'int1_start', 'int1_end', 'int2_start', 'int2_end', 'strand',
        'int1_splice', 'int2_splice', 'int1_alt', 'int2_alt'. int1 is the
        intron with the lower 'intron_count'. The frame is empty (with all
        columns present) when no pair is found.

    Notes
    -----
    * Only introns with ``read_overlap > 0`` are considered, and two introns
      form a pair only when their 'intron_count' values differ by exactly 1.
    * Within one read, a gene is skipped if an earlier gene of that read
      showed the same sequence of splice statuses.
    * 'int1_start' / 'int1_end' are strings while 'int2_start' / 'int2_end'
      are ints. NOTE(review): this mixed typing is inherited from the
      original output and kept for backward compatibility - confirm whether
      downstream code relies on it before normalising.
    """
    pair_columns = ['read', 'chrom', 'int1_start', 'int1_end', 'int2_start', 'int2_end',
                    'strand', 'int1_splice', 'int2_splice', 'int1_alt', 'int2_alt']
    intron_columns = ['chrom', 'start', 'end', 'intron_count', 'strand', 'read_overlap',
                      'splice_status', 'alt_refseq', 'alt_events']

    intron_pairs = []

    # loop through all reads in the dictionary
    for read in read_junctions.keys():

        # per-read sets that prevent the same pair / splice pattern from being
        # reported twice when a read overlaps several annotated genes
        uniq_pairs = set()
        uniq_splice_pattern = set()

        for gene in read_junctions[read].keys():
            gene_introns = read_junctions[read][gene]

            # only go through genes that have 2 or more introns on this read
            if len(gene_introns) < 2:
                continue

            # only process a splice pattern the first time it is seen for this read
            splice_status_join = '_'.join([record[6] for record in gene_introns])
            if splice_status_join in uniq_splice_pattern:
                continue
            uniq_splice_pattern.add(splice_status_join)

            # introns of this gene with positive read_overlap, ordered by position in the gene
            read_introns_df = pd.DataFrame(gene_introns, columns=intron_columns)
            read_introns_df = read_introns_df[read_introns_df['read_overlap'] > 0].sort_values('intron_count').reset_index(drop=True)

            # information about the previously visited intron (None before the first one)
            prev_intron = None

            for i in range(len(read_introns_df)):
                intron_row = read_introns_df.iloc[i]
                intron_count = intron_row['intron_count']
                intron_chrom = str(intron_row['chrom'])
                intron_start = str(intron_row['start'])
                intron_end = str(intron_row['end'])
                intron_strand = intron_row['strand']
                intron_splice = intron_row['splice_status']
                intron_alt_events = intron_row['alt_events']
                intron_coord = intron_chrom + '_' + intron_start + '_' + intron_end

                # classify the intron from its alt_events label
                if intron_alt_events == 'CONST':
                    intron_alt = 'constitutive'
                elif intron_alt_events in good_events:
                    intron_alt = 'alternative'
                else:
                    intron_alt = 'other'

                # sequential intron counts mean the two introns are true neighbours
                if prev_intron is not None and intron_count - prev_intron['count'] == 1:
                    intron_pair_coord = prev_intron['coord'] + '_' + intron_coord

                    # record the pair only if its coordinates have not been seen for this read
                    if intron_pair_coord not in uniq_pairs:
                        uniq_pairs.add(intron_pair_coord)

                        # the upstream coordinates are carried over directly
                        # rather than re-parsed from the joined string, which
                        # breaks for chromosome names that contain '_'
                        intron_pairs.append([read, intron_chrom,
                                             prev_intron['start'], prev_intron['end'],
                                             int(intron_start), int(intron_end), intron_strand,
                                             prev_intron['splice'], intron_splice,
                                             prev_intron['alt'], intron_alt])

                # save information about this intron for the next pair
                prev_intron = {'count': intron_count, 'coord': intron_coord,
                               'start': intron_start, 'end': intron_end,
                               'splice': intron_splice, 'alt': intron_alt}

    # passing columns= keeps the header even when no pair was recorded
    return pd.DataFrame(intron_pairs, columns=pair_columns)




def get_splicing_order_df(intron_pairs_df):
    """Summarise, over all intron pairs, which intron of the pair was spliced first.

    Parameters
    ----------
    intron_pairs_df : pandas.DataFrame
        Must contain the columns 'strand' ('+' / '-'), 'int1_splice' and
        'int2_splice' ('YES' / 'NO' / other).

    Returns
    -------
    pandas.DataFrame
        A single row with the columns
        'yes_yes'       - pairs where both introns are spliced
        'yes_no'        - pairs where only one intron is spliced and it is
                          int1 on '+' or int2 on '-'
        'no_yes'        - pairs where only one intron is spliced and it is
                          int2 on '+' or int1 on '-'
        'no_no'         - pairs where neither intron is spliced
        'percent_first' - 100 * yes_no / (yes_no + no_yes), or NaN when there
                          are no such pairs
    """
    on_plus = intron_pairs_df['strand'] == "+"
    on_minus = intron_pairs_df['strand'] == "-"
    int1_yes = intron_pairs_df['int1_splice'] == "YES"
    int1_no = intron_pairs_df['int1_splice'] == "NO"
    int2_yes = intron_pairs_df['int2_splice'] == "YES"
    int2_no = intron_pairs_df['int2_splice'] == "NO"

    # on '-' the roles of int1 and int2 are swapped relative to '+'
    pos_1st = int((on_plus & int1_yes & int2_no).sum())
    pos_2nd = int((on_plus & int1_no & int2_yes).sum())
    neg_1st = int((on_minus & int1_no & int2_yes).sum())
    neg_2nd = int((on_minus & int1_yes & int2_no).sum())

    all_yes = int((int1_yes & int2_yes).sum())
    all_no = int((int1_no & int2_no).sum())

    all_1st = pos_1st + neg_1st
    all_2nd = pos_2nd + neg_2nd

    # guard against datasets without any partially spliced pair
    informative_pairs = all_1st + all_2nd
    if informative_pairs > 0:
        percent_1st = float(all_1st) / float(informative_pairs) * 100.0
    else:
        percent_1st = float('nan')

    splicing_order_df = pd.DataFrame(
        [[all_yes, all_1st, all_2nd, all_no, percent_1st]],
        columns=['yes_yes', 'yes_no', 'no_yes', 'no_no', 'percent_first'])

    return splicing_order_df


# Same function as above, but applied to pairs of adjacent constitutive/alternative intron pairs
# Determines which intron gets spliced first and which gets transcribed first

def get_splicing_order_df_alt(intron_pairs_df):
    """Count splicing order for constitutive/alternative intron pairs.

    Only pairs with exactly one spliced intron are counted (one of
    'int1_splice'/'int2_splice' is "YES" and the other is "NO"). On the "+"
    strand int1 is treated as the intron transcribed first; on the "-" strand
    int2 is.

    Parameters
    ----------
    intron_pairs_df : pandas.DataFrame
        One row per intron pair with the columns 'strand' ("+"/"-"),
        'int1_splice', 'int2_splice' ("YES"/"NO"), 'int1_alt' and 'int2_alt'
        ("constitutive"/"alternative").

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        Two single-row frames sharing the same eight columns
        ('<category>_txn_1st_spliced_1st' / '<category>_txn_1st_spliced_2nd'
        for the categories const, alt, two_const and two_alt). The first frame
        holds raw counts. The second holds each count divided by the total of
        its category, i.e. fractions between 0 and 1 (not multiplied by 100);
        a category without any pair yields NaN instead of raising
        ZeroDivisionError.
    """
    column_names = ['const_txn_1st_spliced_1st', 'const_txn_1st_spliced_2nd',
                    'alt_txn_1st_spliced_1st', 'alt_txn_1st_spliced_2nd',
                    'two_const_txn_1st_spliced_1st', 'two_const_txn_1st_spliced_2nd',
                    'two_alt_txn_1st_spliced_1st', 'two_alt_txn_1st_spliced_2nd']

    on_plus = intron_pairs_df['strand'] == "+"
    on_minus = intron_pairs_df['strand'] == "-"
    only_int1 = (intron_pairs_df['int1_splice'] == "YES") & (intron_pairs_df['int2_splice'] == "NO")
    only_int2 = (intron_pairs_df['int1_splice'] == "NO") & (intron_pairs_df['int2_splice'] == "YES")
    int1_type = intron_pairs_df['int1_alt']
    int2_type = intron_pairs_df['int2_alt']

    def count_pairs(txn_1st_type, txn_2nd_type, txn_1st_is_spliced):
        # Count pairs (both strands) whose first-transcribed intron has type
        # txn_1st_type, whose second has txn_2nd_type, and in which the
        # first-transcribed intron is (True) or is not (False) the spliced one.
        if txn_1st_is_spliced:
            plus_state, minus_state = only_int1, only_int2
        else:
            plus_state, minus_state = only_int2, only_int1
        plus_hits = on_plus & plus_state & (int1_type == txn_1st_type) & (int2_type == txn_2nd_type)
        minus_hits = on_minus & minus_state & (int2_type == txn_1st_type) & (int1_type == txn_2nd_type)
        return int(plus_hits.sum()) + int(minus_hits.sum())

    def fraction(count, category_total):
        # Explicit float division (safe under Python 2 as well); NaN for an
        # empty category rather than a ZeroDivisionError.
        if category_total == 0:
            return float('nan')
        return float(count) / float(category_total)

    # (first-transcribed type, second-transcribed type), in output column order
    categories = [("constitutive", "alternative"),
                  ("alternative", "constitutive"),
                  ("constitutive", "constitutive"),
                  ("alternative", "alternative")]

    counts = []
    fractions = []
    for txn_1st_type, txn_2nd_type in categories:
        spliced_1st = count_pairs(txn_1st_type, txn_2nd_type, True)
        spliced_2nd = count_pairs(txn_1st_type, txn_2nd_type, False)
        category_total = spliced_1st + spliced_2nd
        counts.extend([spliced_1st, spliced_2nd])
        fractions.extend([fraction(spliced_1st, category_total),
                          fraction(spliced_2nd, category_total)])

    splicing_order_df = pd.DataFrame([counts], columns=column_names)
    splicing_order_percent_df = pd.DataFrame([fractions], columns=column_names)

    return splicing_order_df, splicing_order_percent_df


In [3]:
# ---- Human (hg38) intron annotation ----

hg38_intronFile = pd.read_table('/path/to/annotation_files/hg38_RefSeq_introns_parsed_wiAltRNAseqMISO.v2.txt')
# keep only entries whose 'gene' value starts with 'NM'
hg38_intronFile = hg38_intronFile.loc[hg38_intronFile['gene'].str.startswith('NM')].reset_index()

# classify every intron (ALT / PARTIAL / CONST / OTHER) from its MISO annotations
hg38_intronFile['alt_events'] = hg38_intronFile.apply(hg38_introns_byRow, axis=1)

# strip the first three characters of the chromosome name (see changeChrom)
hg38_intronFile['chrom'] = hg38_intronFile.apply(changeChrom, axis=1)

# keep the columns needed downstream, in BED-like order, and wrap them in a BedTool
hg38_bed_columns = ['chrom','start','end','gene','intron_count','strand','alternative_gtf','alt_events']
hg38_intronFile = hg38_intronFile[hg38_bed_columns]
introns_bedtool = BedTool.from_dataframe(hg38_intronFile)


# ---- Drosophila (dm6) intron annotation with alternative splicing calls ----

dm6_intronFile = pd.read_table('/path/to/dm6_RefSeq_introns_parsed_wiAltRNAseqMISO.txt')
# same 'NM' prefix filter as for the human table
dm6_intronFile = dm6_intronFile.loc[dm6_intronFile['gene'].str.startswith('NM')]

# build the dm6 intron BedTool with the notebook's dmel_introns helper
dm6_introns_bedtool = dmel_introns(dm6_intronFile)


In [5]:
# Open the alignment (BAM) files as pybedtools BedTool handles (K562 cells).
# Each '/path/to/...' literal is a placeholder location for one sequencing sample;
# the handles are consumed by get_intron_intersect in the next step.
K562_1_bamFile = pybedtools.BedTool('/path/to/K562_4sUchr_ONT_1_hg38_minimap2_uniq_sort.bam')
K562_2_bamFile = pybedtools.BedTool('/path/to/K562_4sUchr_ONT_2_hg38_minimap2_uniq_sort.bam')
K562_3_bamFile = pybedtools.BedTool('/path/to/K562_4sUchr_ONT_3_hg38_minimap2_uniq_sort.bam')
K562_4_bamFile = pybedtools.BedTool('/path/to/K562_4sUchr_ONT_4_hg38_minimap2_uniq_sort.bam')
K562_5a_bamFile = pybedtools.BedTool('/path/to/K562_4sUchr_ONT_5a_hg38_minimap2_uniq_sort.bam')
K562_5b_bamFile = pybedtools.BedTool('/path/to/K562_4sUchr_ONT_5b_hg38_minimap2_uniq_sort.bam')

# Open the alignment (BAM) files as pybedtools BedTool handles (S2 cells, dm6 alignments).
S2_1a_bamFile = pybedtools.BedTool('/path/to/S2_4sUchr_ONT_1a_dm6_minimap2_uniq_sort.bam')
S2_1b_bamFile = pybedtools.BedTool('/path/to/S2_4sUchr_ONT_1b_dm6_minimap2_uniq_sort.bam')
S2_2_bamFile = pybedtools.BedTool('/path/to/S2_4sUchr_ONT_2_dm6_minimap2_uniq_sort.bam')
S2_3_bamFile = pybedtools.BedTool('/path/to/S2_4sUchr_ONT_3_dm6_minimap2_uniq_sort.bam')

In [6]:
# Get the reads that intersect introns, one table per sample.
# The K562 alignment handles are named K562_*_bamFile; the former
# K562_*_bam_file spelling was never defined and raised a NameError.
K562_1_intersect = get_intron_intersect(introns_bedtool, K562_1_bamFile)
K562_2_intersect = get_intron_intersect(introns_bedtool, K562_2_bamFile)
K562_3_intersect = get_intron_intersect(introns_bedtool, K562_3_bamFile)
K562_4_intersect = get_intron_intersect(introns_bedtool, K562_4_bamFile)
K562_5a_intersect = get_intron_intersect(introns_bedtool, K562_5a_bamFile)
K562_5b_intersect = get_intron_intersect(introns_bedtool, K562_5b_bamFile)

# S2 samples are intersected with the dm6 intron annotation
S2_1a_intersect = get_intron_intersect(dm6_introns_bedtool, S2_1a_bamFile)
S2_1b_intersect = get_intron_intersect(dm6_introns_bedtool, S2_1b_bamFile)
S2_2_intersect = get_intron_intersect(dm6_introns_bedtool, S2_2_bamFile)
S2_3_intersect = get_intron_intersect(dm6_introns_bedtool, S2_3_bamFile)


In [7]:
# get splicing information for every read that spans an intron
# min_overlap is handed to get_splicing_info as its second argument for every sample
# (presumably the minimum read/intron overlap needed to call a splice status -- confirm in get_splicing_info)
min_overlap = 25

# K562 samples
K562_1_splice_info = get_splicing_info(K562_1_intersect,min_overlap)
K562_2_splice_info = get_splicing_info(K562_2_intersect,min_overlap)
K562_3_splice_info = get_splicing_info(K562_3_intersect,min_overlap)
K562_4_splice_info = get_splicing_info(K562_4_intersect,min_overlap)
K562_5a_splice_info = get_splicing_info(K562_5a_intersect,min_overlap)
K562_5b_splice_info = get_splicing_info(K562_5b_intersect,min_overlap)

# S2 samples
S2_1a_splice_info = get_splicing_info(S2_1a_intersect,min_overlap)
S2_1b_splice_info = get_splicing_info(S2_1b_intersect,min_overlap)
S2_2_splice_info = get_splicing_info(S2_2_intersect,min_overlap)
S2_3_splice_info = get_splicing_info(S2_3_intersect,min_overlap)


In [8]:
# Control for read length bias: first drop rows with an undetermined splice
# status, then keep only rows whose read_overlap exceeds the intron length.

def _drop_undetermined(splice_info):
    """Return splice_info without rows whose splice_status is "UNDETERMINED"."""
    determined = splice_info['splice_status'] != "UNDETERMINED"
    return splice_info[determined].reset_index(drop=True)


def _keep_overlap_longer_than_intron(splice_info):
    """Return the rows whose read_overlap is greater than intron_end - intron_start."""
    intron_length = splice_info['intron_end'] - splice_info['intron_start']
    return splice_info[splice_info['read_overlap'] > intron_length].reset_index(drop=True)


# K562 samples
K562_1_splice_info = _drop_undetermined(K562_1_splice_info)
K562_1_splice_info_wiOverlap = _keep_overlap_longer_than_intron(K562_1_splice_info)

K562_2_splice_info = _drop_undetermined(K562_2_splice_info)
K562_2_splice_info_wiOverlap = _keep_overlap_longer_than_intron(K562_2_splice_info)

K562_3_splice_info = _drop_undetermined(K562_3_splice_info)
K562_3_splice_info_wiOverlap = _keep_overlap_longer_than_intron(K562_3_splice_info)

K562_4_splice_info = _drop_undetermined(K562_4_splice_info)
K562_4_splice_info_wiOverlap = _keep_overlap_longer_than_intron(K562_4_splice_info)

K562_5a_splice_info = _drop_undetermined(K562_5a_splice_info)
K562_5a_splice_info_wiOverlap = _keep_overlap_longer_than_intron(K562_5a_splice_info)

K562_5b_splice_info = _drop_undetermined(K562_5b_splice_info)
K562_5b_splice_info_wiOverlap = _keep_overlap_longer_than_intron(K562_5b_splice_info)


# S2 samples
S2_1a_splice_info = _drop_undetermined(S2_1a_splice_info)
S2_1a_splice_info_wiOverlap = _keep_overlap_longer_than_intron(S2_1a_splice_info)

S2_1b_splice_info = _drop_undetermined(S2_1b_splice_info)
S2_1b_splice_info_wiOverlap = _keep_overlap_longer_than_intron(S2_1b_splice_info)

S2_2_splice_info = _drop_undetermined(S2_2_splice_info)
S2_2_splice_info_wiOverlap = _keep_overlap_longer_than_intron(S2_2_splice_info)

S2_3_splice_info = _drop_undetermined(S2_3_splice_info)
S2_3_splice_info_wiOverlap = _keep_overlap_longer_than_intron(S2_3_splice_info)

In [9]:
# get dictionary with all intron junctions that a read spans
K562_1_splice_dictionary = get_read_junctions_dictionary(K562_1_splice_info_wiOverlap)
K562_2_splice_dictionary = get_read_junctions_dictionary(K562_2_splice_info_wiOverlap)
K562_3_splice_dictionary = get_read_junctions_dictionary(K562_3_splice_info_wiOverlap)
K562_4_splice_dictionary = get_read_junctions_dictionary(K562_4_splice_info_wiOverlap)
K562_5a_splice_dictionary = get_read_junctions_dictionary(K562_5a_splice_info_wiOverlap)
K562_5b_splice_dictionary = get_read_junctions_dictionary(K562_5b_splice_info_wiOverlap)

# S2 samples. These were previously assigned to HL8-HL11 names built from
# undefined HL*_splice_info_wiOverlap inputs, which raised a NameError and left
# the S2_*_splice_dictionary names used below unassigned.
S2_1a_splice_dictionary = get_read_junctions_dictionary(S2_1a_splice_info_wiOverlap)
S2_1b_splice_dictionary = get_read_junctions_dictionary(S2_1b_splice_info_wiOverlap)
S2_2_splice_dictionary = get_read_junctions_dictionary(S2_2_splice_info_wiOverlap)
S2_3_splice_dictionary = get_read_junctions_dictionary(S2_3_splice_info_wiOverlap)

# get distribution of intron splicing across reads
K562_1_intron_distribution = get_intron_distribution(K562_1_splice_dictionary)
K562_2_intron_distribution = get_intron_distribution(K562_2_splice_dictionary)
K562_3_intron_distribution = get_intron_distribution(K562_3_splice_dictionary)
K562_4_intron_distribution = get_intron_distribution(K562_4_splice_dictionary)
K562_5a_intron_distribution = get_intron_distribution(K562_5a_splice_dictionary)
K562_5b_intron_distribution = get_intron_distribution(K562_5b_splice_dictionary)

S2_1a_intron_distribution = get_intron_distribution(S2_1a_splice_dictionary)
S2_1b_intron_distribution = get_intron_distribution(S2_1b_splice_dictionary)
S2_2_intron_distribution = get_intron_distribution(S2_2_splice_dictionary)
S2_3_intron_distribution = get_intron_distribution(S2_3_splice_dictionary)

# get pie chart counts for the intron splicing distribution (K562)
K562_1_intron_distribution_counts = get_pie_chart_df(K562_1_intron_distribution)
K562_2_intron_distribution_counts = get_pie_chart_df(K562_2_intron_distribution)
K562_3_intron_distribution_counts = get_pie_chart_df(K562_3_intron_distribution)
K562_4_intron_distribution_counts = get_pie_chart_df(K562_4_intron_distribution)
K562_5a_intron_distribution_counts = get_pie_chart_df(K562_5a_intron_distribution)
K562_5b_intron_distribution_counts = get_pie_chart_df(K562_5b_intron_distribution)

# one column per K562 sample
all_hg38 = pd.concat([K562_1_intron_distribution_counts,K562_2_intron_distribution_counts,K562_3_intron_distribution_counts,
                    K562_4_intron_distribution_counts,K562_5a_intron_distribution_counts,K562_5b_intron_distribution_counts],axis=1)
all_hg38.columns = ['K562_1','K562_2','K562_3','K562_4','K562_5a','K562_5b']


# get pie chart counts for the intron splicing distribution (S2)
S2_1a_intron_distribution_counts = get_pie_chart_df(S2_1a_intron_distribution)
S2_1b_intron_distribution_counts = get_pie_chart_df(S2_1b_intron_distribution)
S2_2_intron_distribution_counts = get_pie_chart_df(S2_2_intron_distribution)
S2_3_intron_distribution_counts = get_pie_chart_df(S2_3_intron_distribution)

# one column per S2 sample
all_dm6 = pd.concat([S2_1a_intron_distribution_counts,S2_1b_intron_distribution_counts,S2_2_intron_distribution_counts,S2_3_intron_distribution_counts],axis=1)
all_dm6.columns = ['S2_1a','S2_1b','S2_2','S2_3']


In [12]:
# Build the intron-pair table for every sample from its read junctions dictionary.

# passed unchanged to get_intron_pairs_df for every sample
# (presumably the annotation classes to treat as alternative -- confirm in get_intron_pairs_df)
alt_events = ['ALT']

# K562 samples
K562_1_intron_pairs_df = get_intron_pairs_df(K562_1_splice_dictionary, alt_events)
K562_2_intron_pairs_df = get_intron_pairs_df(K562_2_splice_dictionary, alt_events)
K562_3_intron_pairs_df = get_intron_pairs_df(K562_3_splice_dictionary, alt_events)
K562_4_intron_pairs_df = get_intron_pairs_df(K562_4_splice_dictionary, alt_events)
K562_5a_intron_pairs_df = get_intron_pairs_df(K562_5a_splice_dictionary, alt_events)
K562_5b_intron_pairs_df = get_intron_pairs_df(K562_5b_splice_dictionary, alt_events)

# S2 samples
S2_1a_intron_pairs_df = get_intron_pairs_df(S2_1a_splice_dictionary, alt_events)
S2_1b_intron_pairs_df = get_intron_pairs_df(S2_1b_splice_dictionary, alt_events)
S2_2_intron_pairs_df = get_intron_pairs_df(S2_2_splice_dictionary, alt_events)
S2_3_intron_pairs_df = get_intron_pairs_df(S2_3_splice_dictionary, alt_events)


def _save_intron_pairs(pairs_df, out_path):
    """Write an intron-pair table as a tab-separated file with a header row and no index column."""
    pairs_df.to_csv(out_path, sep='\t', index=False, header=True)


# save the intron-pair tables (to use again later for plotting) (K562 cells)
_save_intron_pairs(K562_1_intron_pairs_df, '/path/to/K562_1_hg38_intron_pairs_df_wiAlt.txt')
_save_intron_pairs(K562_2_intron_pairs_df, '/path/to/K562_2_hg38_intron_pairs_df_wiAlt.txt')
_save_intron_pairs(K562_3_intron_pairs_df, '/path/to/K562_3_hg38_intron_pairs_df_wiAlt.txt')
_save_intron_pairs(K562_4_intron_pairs_df, '/path/to/K562_4_hg38_intron_pairs_df_wiAlt.txt')
_save_intron_pairs(K562_5a_intron_pairs_df, '/path/to/K562_5a_hg38_intron_pairs_df_wiAlt.txt')
_save_intron_pairs(K562_5b_intron_pairs_df, '/path/to/K562_5b_hg38_intron_pairs_df_wiAlt.txt')

# same for the S2 samples
_save_intron_pairs(S2_1a_intron_pairs_df, '/path/to/S2_1a_dm6_intron_pairs_df_wiAlt.txt')
_save_intron_pairs(S2_1b_intron_pairs_df, '/path/to/S2_1b_dm6_intron_pairs_df_wiAlt.txt')
_save_intron_pairs(S2_2_intron_pairs_df, '/path/to/S2_2_dm6_intron_pairs_df_wiAlt.txt')
_save_intron_pairs(S2_3_intron_pairs_df, '/path/to/S2_3_dm6_intron_pairs_df_wiAlt.txt')

In [13]:
# Summarise splicing order for constitutive/alternative intron pairs (K562).
# get_splicing_order_df_alt returns (counts, fractions); unpack both results
# from a single call per sample instead of calling the function twice.
K562_1_splicing_order_counts, K562_1_splicing_order_df = get_splicing_order_df_alt(K562_1_intron_pairs_df)
K562_2_splicing_order_counts, K562_2_splicing_order_df = get_splicing_order_df_alt(K562_2_intron_pairs_df)
K562_3_splicing_order_counts, K562_3_splicing_order_df = get_splicing_order_df_alt(K562_3_intron_pairs_df)
K562_4_splicing_order_counts, K562_4_splicing_order_df = get_splicing_order_df_alt(K562_4_intron_pairs_df)
K562_5a_splicing_order_counts, K562_5a_splicing_order_df = get_splicing_order_df_alt(K562_5a_intron_pairs_df)
K562_5b_splicing_order_counts, K562_5b_splicing_order_df = get_splicing_order_df_alt(K562_5b_intron_pairs_df)

# concatenate the per-sample fractions (one row per sample) and save them
concat_splicing_order_df = pd.concat([K562_1_splicing_order_df, K562_2_splicing_order_df, K562_3_splicing_order_df,
                                     K562_4_splicing_order_df, K562_5a_splicing_order_df, K562_5b_splicing_order_df], axis=0)
concat_splicing_order_df.index = ['K562_1', 'K562_2', 'K562_3','K562_4','K562_5a','K562_5b']
concat_splicing_order_df.to_csv('/path/to/K562_alt_splicing_order_df.txt', sep='\t', index=True, header=True)

# also output the raw counts, one row per sample
concat_splicing_order_counts = pd.concat([K562_1_splicing_order_counts, K562_2_splicing_order_counts,K562_3_splicing_order_counts,
                                         K562_4_splicing_order_counts,K562_5a_splicing_order_counts,K562_5b_splicing_order_counts], axis=0)
concat_splicing_order_counts.index = ['K562_1', 'K562_2', 'K562_3','K562_4','K562_5a','K562_5b']
concat_splicing_order_counts.to_csv('/path/to/K562_alt_splicing_order_counts.txt', sep='\t', index=True, header=True)

In [14]:
# Summarise splicing order for constitutive/alternative intron pairs (S2).
# get_splicing_order_df_alt returns (counts, fractions); unpack both results
# from a single call per sample instead of calling the function twice.
# The first replicate's pair table is S2_1a_intron_pairs_df; the former
# S2_1_intron_pairs_df name was never defined and raised a NameError.
# The S2_1 result names and the 'S2_1' row label are kept as they were so
# that files and code reading this output keep working.
S2_1_splicing_order_counts, S2_1_splicing_order_df = get_splicing_order_df_alt(S2_1a_intron_pairs_df)
S2_1b_splicing_order_counts, S2_1b_splicing_order_df = get_splicing_order_df_alt(S2_1b_intron_pairs_df)
S2_2_splicing_order_counts, S2_2_splicing_order_df = get_splicing_order_df_alt(S2_2_intron_pairs_df)
S2_3_splicing_order_counts, S2_3_splicing_order_df = get_splicing_order_df_alt(S2_3_intron_pairs_df)

# concatenate the per-sample fractions (one row per sample) and save them
concat_splicing_order_df = pd.concat([S2_1_splicing_order_df, S2_1b_splicing_order_df, S2_2_splicing_order_df,
                                     S2_3_splicing_order_df], axis=0)
concat_splicing_order_df.index = ['S2_1', 'S2_1b', 'S2_2','S2_3']
concat_splicing_order_df.to_csv('/path/to/S2_alt_splicing_order_df.txt', sep='\t', index=True, header=True)

# also output the raw counts, one row per sample
concat_splicing_order_counts = pd.concat([S2_1_splicing_order_counts, S2_1b_splicing_order_counts,S2_2_splicing_order_counts,
                                         S2_3_splicing_order_counts], axis=0)
concat_splicing_order_counts.index = ['S2_1', 'S2_1b', 'S2_2','S2_3']
concat_splicing_order_counts.to_csv('/path/to/S2_alt_splicing_order_counts.txt', sep='\t', index=True, header=True)