In [None]:
"""

Date : August 1, 2019

Author : Heather Landry Drexler

This script will develop datasets for making plots for percent spliced
terminal introns at termination sites from nanoCOP data.

These files serve as input for the scripts for Figures 2, 3 and S5.
                                       
                                            
"""

In [1]:
import numpy as np
import pandas as pd
import re
import math
import pybedtools
from pybedtools import BedTool


In [2]:
def get_last_polyA_site(last_intron_df):
    """Keep only the most downstream polyA site of every terminal intron.

    Rows are grouped by (chrom, start, end, strand).  Within a group the
    largest polyA coordinate is kept for '+' strand introns and the smallest
    for '-' strand introns; for any other strand value the first polyA seen
    is kept.

    Parameters
    ----------
    last_intron_df : pandas.DataFrame
        Must provide the columns ``chrom``, ``start``, ``end``, ``strand`` and
        ``gene_polyA``.  ``gene_polyA`` is split on ``'_'`` and its third
        field is parsed as the integer polyA coordinate.

    Returns
    -------
    pandas.DataFrame
        Columns ``chrom``, ``start``, ``end``, ``strand``, ``polyA``; one row
        per unique intron, in order of first appearance.  ``start`` and
        ``end`` are returned as strings and ``polyA`` as int.  An empty input
        gives an empty frame with the same columns.
    """
    columns = ['chrom', 'start', 'end', 'strand', 'polyA']

    # Tuple keys (rather than a '_'-joined string that is split again later)
    # keep chromosome names that themselves contain '_' intact, e.g.
    # 'chr1_KI270706v1_random'.
    polyA_sites = {}

    records = zip(
        last_intron_df['chrom'],
        last_intron_df['start'],
        last_intron_df['end'],
        last_intron_df['strand'],
        last_intron_df['gene_polyA'],
    )

    for chrom, start, end, strand, gene_polyA in records:
        polyA = int(gene_polyA.split('_')[2])
        coord = (str(chrom), str(start), str(end), str(strand))

        if coord not in polyA_sites:
            polyA_sites[coord] = polyA
        elif strand == "+" and polyA > polyA_sites[coord]:
            polyA_sites[coord] = polyA
        elif strand == "-" and polyA < polyA_sites[coord]:
            polyA_sites[coord] = polyA

    last_polyA_site = [
        [chrom, start, end, strand, polyA]
        for (chrom, start, end, strand), polyA in polyA_sites.items()
    ]

    # passing columns= (instead of assigning .columns afterwards) also works
    # when there are no rows at all
    return pd.DataFrame(last_polyA_site, columns=columns)


# function to get a bedtool file with splice site info from hg38 intron coordinate bed file
def human_spliceSites(last_intron_df):
    """Turn a table of terminal introns into a BedTool of 3' splice sites.

    Each input row (columns ``chrom``, ``start``, ``end``, ``polyA``,
    ``strand``) yields one six-field record, all fields as strings:

        chrom, 3'SS start, 3'SS start + 1, polyA, 5'SS position, strand

    For '+' strand introns the 3'SS start is ``end`` and the 5'SS position is
    ``start + 1``; for '-' strand introns the 3'SS start is ``start`` and the
    5'SS position is ``end``.
    """
    bed_records = []

    for _, intron in last_intron_df.iterrows():
        chromosome = str(intron['chrom'])
        intron_start = int(intron['start'])
        intron_end = int(intron['end'])
        polyA_site = int(intron['polyA'])
        gene_strand = intron['strand']

        # NOTE(review): a strand other than '+'/'-' leaves the positions from
        # the previous row in place (or undefined on the very first row).
        if gene_strand == '+':
            ss3_start, ss5_pos = intron_end, intron_start + 1
        elif gene_strand == '-':
            ss3_start, ss5_pos = intron_start, intron_end

        bed_records.append([
            chromosome,
            str(ss3_start),
            str(ss3_start + 1),   # one-base interval covering the 3'SS
            str(polyA_site),
            str(ss5_pos),
            str(gene_strand),
        ])

    return BedTool(bed_records)


# function to get a bedtool file with splice site info from dmel6 intron coordinate bed file
def drosophila_spliceSites(last_intron_df):
    """Turn a table of terminal introns into a BedTool of 3' splice sites.

    Same record layout as the human version, except that ``'chr'`` is
    prepended to every chromosome name.  Each input row (columns ``chrom``,
    ``start``, ``end``, ``polyA``, ``strand``) yields one six-field record,
    all fields as strings:

        'chr' + chrom, 3'SS start, 3'SS start + 1, polyA, 5'SS position, strand

    For '+' strand introns the 3'SS start is ``end`` and the 5'SS position is
    ``start + 1``; for '-' strand introns the 3'SS start is ``start`` and the
    5'SS position is ``end``.
    """
    bed_records = []

    for _, intron in last_intron_df.iterrows():
        chromosome = 'chr' + str(intron['chrom'])
        intron_start = int(intron['start'])
        intron_end = int(intron['end'])
        polyA_site = int(intron['polyA'])
        gene_strand = intron['strand']

        # NOTE(review): a strand other than '+'/'-' leaves the positions from
        # the previous row in place (or undefined on the very first row).
        if gene_strand == '+':
            ss3_start, ss5_pos = intron_end, intron_start + 1
        elif gene_strand == '-':
            ss3_start, ss5_pos = intron_start, intron_end

        bed_records.append([
            chromosome,
            str(ss3_start),
            str(ss3_start + 1),   # one-base interval covering the 3'SS
            str(polyA_site),
            str(ss5_pos),
            str(gene_strand),
        ])

    return BedTool(bed_records)


# function to create a dataframe with reads that span 3'SS positions
def get_spliceSite_df(spliceSites, bamFile):
    """Return a dataframe of reads that overlap 3' splice sites on the same strand.

    ``bamFile`` is converted to BED (keeping the CIGAR string and using the
    NM tag), intersected with ``spliceSites`` using ``wo=True`` and
    ``s=True``, and the result is loaded into a pandas DataFrame with one row
    per read/splice-site overlap.
    """
    # (column name, dtype) for every field of the intersect output, in order.
    # NOTE(review): with tag='NM', `qual_aln` presumably carries the NM edit
    # distance rather than a mapping quality — confirm against bedtools docs.
    column_types = [
        ('chrom', str),
        ('start_aln', int),
        ('end_aln', int),
        ('name_aln', str),
        ('qual_aln', int),
        ('strand_aln', str),
        ('cigar_aln', str),
        ('chr_3SS', str),
        ('start_3SS', int),
        ('end_3SS', int),
        ('polyA', int),
        ('pos_5SS', int),
        ('strand_gene', str),
        ('count', int),
    ]

    reads_bed = bamFile.bam_to_bed(cigar=True, tag='NM')
    overlaps = reads_bed.intersect(spliceSites, wo=True, s=True)

    return overlaps.to_dataframe(
        names=[name for name, _ in column_types],
        dtype=dict(column_types),
    )


# function to get splicing calls from the intersect dataframe
def get_MinION_spliceCalls(df, min_overlap):
    """Classify every read/intron overlap as spliced, unspliced or undetermined.

    For each row the CIGAR string is walked along the reference and the
    number of bases of each CIGAR operation is counted (a) within
    +/- ``min_overlap`` of the intron start, (b) inside the intron and
    (c) within +/- ``min_overlap`` of the intron end.  Those counts drive the
    call:

    * ``'NO'``  - no ``N`` around the 3'SS and more than ``min_overlap/2``
      ``M`` bases there, and inside the intron at least 10 ``M`` bases with
      M / (M + N + D + 1) >= 0.7.
    * ``'YES'`` - ``N`` bases (more than 0, fewer than ``2*min_overlap``)
      around both the 3'SS and the 5'SS, and inside the intron the ``N``
      count is within 0.9-1.1x of the intron length and differs from it by
      at most 100 bases.
    * ``'UNDETERMINED'`` - everything else, including zero-length introns.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per read/splice-site overlap with the columns ``chrom``,
        ``start_aln``, ``end_aln``, ``name_aln``, ``qual_aln``,
        ``strand_aln``, ``cigar_aln``, ``start_3SS``, ``end_3SS``,
        ``pos_5SS``, ``strand_gene`` and ``polyA``.  Any index is accepted.
    min_overlap : int
        Half-width, in bases, of the window inspected around each splice site.

    Returns
    -------
    pandas.DataFrame
        One row per input row with the columns ``read_name``, ``chrom``,
        ``intron_start``, ``intron_end``, ``read_start``, ``read_end``,
        ``read_length``, ``strand``, ``error_rate``, ``end_clippling``,
        ``dist_from_3SS``, ``splice_status`` and ``polyA``.  The coordinate,
        length, rate, clip and distance fields are stored as strings.  An
        empty input gives an empty frame with these columns.
    """
    columns = ["read_name","chrom","intron_start","intron_end","read_start","read_end","read_length","strand","error_rate","end_clippling","dist_from_3SS","splice_status","polyA"]

    # rows are addressed below as df.column[i], which is a label lookup;
    # renumbering makes that valid for filtered / re-indexed input as well
    df = df.reset_index(drop=True)

    # prepare a list for splice calls
    spliceCalls = []

    # set variables for parsing the cigar string
    pattern = re.compile('([MIDNSHPX=])')
    Consumes_Reference = ["M", "D", "N", "=", "X"]

    for i in range(0,df.shape[0]):
        if df.strand_gene[i] == "-":
            align_3p_end = df.start_aln[i] # record 3' end of read for - strand genes
            align_5p_end = df.end_aln[i] # record 5' end of read for - strand genes
            pos_3SS = df.end_3SS[i] # record 3'SS position for - strand genes
            pos_5SS = df.pos_5SS[i] # record 5'SS position for - strand genes
            intron_start = pos_3SS # get position for the start of intron coordinate on negative strand
            intron_end = pos_5SS # get position for the end of intron coordinate on negative strand

        if df.strand_gene[i] == "+":
            align_3p_end = df.end_aln[i] # record 3' end of read for + strand genes
            align_5p_end = df.start_aln[i] # record 5' end of read for + strand genes
            pos_3SS = df.start_3SS[i] # record 3'SS position for + strand genes
            pos_5SS = df.pos_5SS[i] # record 5'SS position for + strand genes
            intron_start = pos_5SS # get position for the start of intron coordinate on positive strand
            intron_end = pos_3SS # get position for the end of intron coordinate on positive strand

        # calculate distance between 3'SS and 3'end of read
        dist = abs(align_3p_end - pos_3SS) #*** double check this!!!

        # parse cigar string into a list of [length, operator] pairs
        Sep_Values = pattern.split(df.cigar_aln[i])[:-1]
        CigarPairs = list((Sep_Values[n:n+2] for n in range(0, len(Sep_Values), 2)))

        # get the 3' clip length: the last CIGAR operation for reads aligned
        # to the + strand, the first one for reads aligned to the - strand
        # (soft and hard clips are treated alike, anything else counts as 0)
        if df.strand_aln[i]=="+":
            end_op = CigarPairs[-1]
            clip = end_op[0] if end_op[1] in ('S', 'H') else 0

        if df.strand_aln[i]=="-":
            end_op = CigarPairs[0]
            clip = end_op[0] if end_op[1] in ('S', 'H') else 0

        # set up variables for measuring the length of cigar string operators
        CigarOp_counts = {'M': 0, 'I': 0, 'D': 0, 'N': 0, 'S': 0, 'H': 0, 'P': 0, '=': 0, 'X': 0}
        start_counts = {'M': 0, 'I': 0, 'D': 0, 'N': 0, 'S': 0, 'H': 0, 'P': 0, '=': 0, 'X': 0}
        end_counts = {'M': 0, 'I': 0, 'D': 0, 'N': 0, 'S': 0, 'H': 0, 'P': 0, '=': 0, 'X': 0}
        intron_counts = {'M': 0, 'I': 0, 'D': 0, 'N': 0, 'S': 0, 'H': 0, 'P': 0, '=': 0, 'X': 0}
        currentloc = int(df.start_aln[i])

        # go through list of cigar strings and grab splicing information
        for cigar_Entry in CigarPairs:

            op_Length = int(cigar_Entry[0]) # get length of cigar operator
            cigarOp = cigar_Entry[1] # get type of cigar operator
            CigarOp_counts[cigarOp] += op_Length # add the cigar operator length to the counts dictionary
            cigarOp_start=currentloc # get the starting coordinate of the cigar operator

            if (cigarOp in Consumes_Reference):
                currentloc=currentloc+op_Length # add the cigar operator length to the current location coordinate

            cigarOp_end=currentloc # get the ending coordinate of the cigar operator

            # gather information if the portion of the cigar string spans the window around the intron start
            # NOTE(review): the partial-overlap counts here carry a +1 / -1
            # that the matching window around the intron end does not —
            # left untouched, confirm which is intended
            if (cigarOp_start<intron_start-min_overlap and cigarOp_end>=intron_start-min_overlap):
                if (cigarOp_end>=intron_start+min_overlap):
                    count=min_overlap*2
                else:
                    count=cigarOp_end-(intron_start-min_overlap)+1
                start_counts[cigarOp] += count # add the cigar operator length to the counts dictionary

            elif (cigarOp_start>=intron_start-min_overlap and cigarOp_end<intron_start+min_overlap):
                count=op_Length
                start_counts[cigarOp] += count # add the cigar operator length to the counts dictionary

            elif (cigarOp_start<intron_start+min_overlap and cigarOp_end>=intron_start+min_overlap):
                if (cigarOp_start<=intron_start-min_overlap):
                    count=min_overlap*2
                else:
                    count=(intron_start+min_overlap)-cigarOp_start-1
                start_counts[cigarOp] += count # add the cigar operator length to the counts dictionary

            # gather information if the portion of the cigar string is within the intron
            if (cigarOp_start<intron_start and cigarOp_end>=intron_start):
                if (cigarOp_end>=intron_end):
                    count=intron_end-intron_start
                else:
                    count=cigarOp_end-intron_start
                intron_counts[cigarOp] += count # add the cigar operator length to the counts dictionary

            elif (cigarOp_start>=intron_start and cigarOp_end<intron_end):
                count=op_Length
                intron_counts[cigarOp] += count # add the cigar operator length to the counts dictionary

            elif (cigarOp_start<intron_end and cigarOp_end>=intron_end):
                if (cigarOp_start<=intron_start):
                    count=intron_end-intron_start
                else:
                    count=intron_end-cigarOp_start
                intron_counts[cigarOp] += count # add the cigar operator length to the counts dictionary

            # gather information if the portion of the cigar string spans the window around the intron end
            if (cigarOp_start<intron_end-min_overlap and cigarOp_end>=intron_end-min_overlap):
                if (cigarOp_end>=intron_end+min_overlap):
                    count=min_overlap*2
                else:
                    count=cigarOp_end-(intron_end-min_overlap)
                end_counts[cigarOp] += count # add the cigar operator length to the counts dictionary

            elif (cigarOp_start>=intron_end-min_overlap and cigarOp_end<intron_end+min_overlap):
                count=op_Length
                end_counts[cigarOp] += count # add the cigar operator length to the counts dictionary

            elif (cigarOp_start<intron_end+min_overlap and cigarOp_end>=intron_end+min_overlap):
                if (cigarOp_start<=intron_end-min_overlap):
                    count=min_overlap*2
                else:
                    count=(intron_end+min_overlap)-cigarOp_start
                end_counts[cigarOp] += count # add the cigar operator length to the counts dictionary

        # assign strandedness to determine counts around 5'SS and 3'SS
        if(df.strand_gene[i]=='+'):
            around5SS_counts = start_counts
            around3SS_counts = end_counts

        elif(df.strand_gene[i]=="-"):
            around5SS_counts = end_counts
            around3SS_counts = start_counts

        # annotate splicing status based on CIGAR string information around splice sites
        if(around3SS_counts['N']==0 and around3SS_counts['M']>min_overlap/2):
            splice='NO'
        elif(around3SS_counts['N']>0 and around3SS_counts['N']<min_overlap*2):
            if(around5SS_counts['N']>0 and around5SS_counts['N']<min_overlap*2):
                splice='YES'
            else:
                splice='UNDETERMINED'
        else:
            splice='UNDETERMINED'

        # annotate splicing status based on CIGAR string information within the intron
        if (splice == 'YES'):
            if (float(intron_end-intron_start) > 0.0):
                ratio = float(intron_counts['N'])/float(intron_end-intron_start)
                difference = abs(intron_counts['N']-(intron_end-intron_start))
                # if read is spliced, the skipped (N) length has to be within
                # 0.9-1.1x of the intron length and within 100 nucleotides of it
                if( ratio < 0.9 or ratio > 1.1 or difference > 100):
                    splice='UNDETERMINED'
            if (float(intron_end-intron_start) == 0.0):
                splice='UNDETERMINED'

        if (splice == 'NO'):
            if (float(intron_end-intron_start) > 0.0):
                ratio = float(intron_counts['M'])/(float(intron_counts['M'])+float(intron_counts['N'])+float(intron_counts['D'])+1)
                # if read is unspliced, at least 70% of the intronic M+N+D bases have to match (CIGAR=M)
                # and at least 10 nucleotides must match (CIGAR=M) within the intron sequence
                if(intron_counts['M'] < 10 or ratio < 0.7):
                    splice='UNDETERMINED'
            if (float(intron_end-intron_start) == 0.0):
                splice='UNDETERMINED'

        # get information on match percentage / error length
        # this will be used as a quality control cutoff if necessary
        # (assumes qual_aln carries the NM edit distance — confirm against the caller)
        read_length = CigarOp_counts['M']+CigarOp_counts['I']+CigarOp_counts['S']+CigarOp_counts['=']+CigarOp_counts['X']
        error_rate = float(df.qual_aln[i])/float(read_length)

        spliceCalls.append([df.name_aln[i],df.chrom[i],str(intron_start),str(intron_end),str(align_5p_end),str(align_3p_end),str(read_length),df.strand_gene[i],str(error_rate),str(clip),str(dist),splice,df.polyA[i]])

    # passing columns= (instead of assigning .columns afterwards) also works
    # when no rows were produced
    spliceCalls_df = pd.DataFrame(spliceCalls, columns=columns)

    return spliceCalls_df


# function to line up splicing calls with distance from polyA site information
def get_termination_splicingCalls(splicingCalls, overlap):
    """Pair each confident splicing call with the read's distance to the polyA site.

    A call is kept when its ``splice_status`` is not ``'UNDETERMINED'``, its
    ``read_length`` is at least ``dist_from_3SS + overlap`` and its strand is
    '+' or '-'.  The polyA distance is ``read_end - polyA`` on the '+' strand
    and ``polyA - read_end`` on the '-' strand.

    Parameters
    ----------
    splicingCalls : pandas.DataFrame
        Must provide the columns ``read_name``, ``intron_start``,
        ``intron_end``, ``strand``, ``read_length``, ``dist_from_3SS``,
        ``splice_status``, ``read_end`` and ``polyA``; numeric fields may be
        stored as strings.
    overlap : int
        Number of bases by which the read length must exceed its distance
        from the 3' splice site.

    Returns
    -------
    pandas.DataFrame
        Columns ``read_name``, ``intron_length``, ``polyA_dist`` and
        ``splice_status``, de-duplicated on (``read_name``,
        ``splice_status``) keeping the first occurrence.  If no call passes
        the filters, an empty frame with these columns is returned.
    """
    columns = ["read_name", "intron_length", "polyA_dist", "splice_status"]

    fields = zip(
        splicingCalls['read_name'],
        splicingCalls['intron_start'],
        splicingCalls['intron_end'],
        splicingCalls['strand'],
        splicingCalls['read_length'],
        splicingCalls['dist_from_3SS'],
        splicingCalls['splice_status'],
        splicingCalls['read_end'],
        splicingCalls['polyA'],
    )

    termination = []
    for (read_name, intron_start, intron_end, strand, read_length,
         dist_from_3SS, splice_status, read_end, polyA) in fields:

        splice_status = str(splice_status)
        if splice_status == 'UNDETERMINED':
            continue
        if int(read_length) < int(dist_from_3SS) + overlap:
            continue

        # distance is signed so that it is measured along the direction of
        # transcription on either strand
        if strand == '+':
            dist_from_polyA = int(read_end) - int(polyA)
        elif strand == '-':
            dist_from_polyA = int(polyA) - int(read_end)
        else:
            continue

        intron_length = int(intron_end) - int(intron_start)
        termination.append([read_name, intron_length, dist_from_polyA, splice_status])

    # passing columns= (instead of assigning .columns afterwards) also works
    # when no call passed the filters
    termination_df = pd.DataFrame(termination, columns=columns)
    termination_df = termination_df.drop_duplicates(subset=['read_name', 'splice_status']).reset_index(drop=True)

    return termination_df


In [3]:
### get splicing information at termination sites for all K562 samples

# analysis parameters
min_overlap = 25            # passed to get_MinION_spliceCalls
termination_overlap = 100   # passed to get_termination_splicingCalls

# annotation: last intron of each gene together with its polyA site(s)
hg38_last_intron_df = pd.read_csv(
    "/path/to/annotation_files/hg38_last_introns_polyA.txt",
    sep="\t",
)

# collapse to a single (last) polyA site per intron
hg38_last_intron_last_polyA_df = get_last_polyA_site(hg38_last_intron_df)

# 3' splice site intervals of the last introns, as a bedtool
hg38_spliceSites = human_spliceSites(hg38_last_intron_last_polyA_df)


In [7]:
# prepare dataframes for terminal intron splicing plots

def _termination_df_from_bam(bam_path, spliceSites, min_overlap, termination_overlap):
    """Run the BAM -> splice-site overlap -> splice call -> termination pipeline.

    Returns the dataframe produced by get_termination_splicingCalls for the
    alignments in ``bam_path`` intersected with ``spliceSites``.
    """
    bamFile = pybedtools.BedTool(bam_path)
    spliceSites_df = get_spliceSite_df(spliceSites, bamFile)
    splicingCalls = get_MinION_spliceCalls(spliceSites_df, min_overlap)
    return get_termination_splicingCalls(splicingCalls, termination_overlap)


# K562_1
K562_1_termination_df = _termination_df_from_bam(
    '/path/to/K562_4sUchr_ONT_1_hg38_minimap2_uniq_sort.bam',
    hg38_spliceSites, min_overlap, termination_overlap)

# K562_2
K562_2_termination_df = _termination_df_from_bam(
    '/path/to/K562_4sUchr_ONT_2_hg38_minimap2_uniq_sort.bam',
    hg38_spliceSites, min_overlap, termination_overlap)

# K562_3
K562_3_termination_df = _termination_df_from_bam(
    '/path/to/K562_4sUchr_ONT_3_hg38_minimap2_uniq_sort.bam',
    hg38_spliceSites, min_overlap, termination_overlap)

# K562_4
K562_4_termination_df = _termination_df_from_bam(
    '/path/to/K562_4sUchr_ONT_4_hg38_minimap2_uniq_sort.bam',
    hg38_spliceSites, min_overlap, termination_overlap)

# K562_5a
K562_5a_termination_df = _termination_df_from_bam(
    '/path/to/K562_4sUchr_ONT_5a_hg38_minimap2_uniq_sort.bam',
    hg38_spliceSites, min_overlap, termination_overlap)

# K562_5b
K562_5b_termination_df = _termination_df_from_bam(
    '/path/to/K562_4sUchr_ONT_5b_hg38_minimap2_uniq_sort.bam',
    hg38_spliceSites, min_overlap, termination_overlap)

# K562_DMSO_1
K562_DMSO_1_termination_df = _termination_df_from_bam(
    '/path/to/K562_ONT_DMSO_1_hg38_minimap2_uniq_sort.bam',
    hg38_spliceSites, min_overlap, termination_overlap)

# K562_DMSO_2
K562_DMSO_2_termination_df = _termination_df_from_bam(
    '/path/to/K562_ONT_DMSO_2_hg38_minimap2_uniq_sort.bam',
    hg38_spliceSites, min_overlap, termination_overlap)

# K562_PlaB_1
K562_PlaB_1_termination_df = _termination_df_from_bam(
    '/path/to/K562_ONT_PlaB_1_hg38_minimap2_uniq_sort.bam',
    hg38_spliceSites, min_overlap, termination_overlap)

# K562_PlaB_2
K562_PlaB_2_termination_df = _termination_df_from_bam(
    '/path/to/K562_ONT_PlaB_2_hg38_minimap2_uniq_sort.bam',
    hg38_spliceSites, min_overlap, termination_overlap)



In [None]:
# write each K562 termination dataframe to a tab-separated file with a
# header row and no index column (re-used later for plotting)
for sample_df, out_path in [
    (K562_1_termination_df, '/path/to/K562_1_hg38_termination_df.txt'),
    (K562_2_termination_df, '/path/to/K562_2_hg38_termination_df.txt'),
    (K562_3_termination_df, '/path/to/K562_3_hg38_termination_df.txt'),
    (K562_4_termination_df, '/path/to/K562_4_hg38_termination_df.txt'),
    (K562_5a_termination_df, '/path/to/K562_5a_hg38_termination_df.txt'),
    (K562_5b_termination_df, '/path/to/K562_5b_hg38_termination_df.txt'),
    (K562_DMSO_1_termination_df, '/path/to/K562_DMSO_1_hg38_termination_df.txt'),
    (K562_DMSO_2_termination_df, '/path/to/K562_DMSO_2_hg38_termination_df.txt'),
    (K562_PlaB_1_termination_df, '/path/to/K562_PlaB_1_hg38_termination_df.txt'),
    (K562_PlaB_2_termination_df, '/path/to/K562_PlaB_2_hg38_termination_df.txt'),
]:
    sample_df.to_csv(out_path, sep='\t', index=False, header=True)


In [None]:
### get splicing information at termination sites for all S2 samples

# read file with last intron coordinates for polyA analysis
dmel6_last_intron_df = pd.read_csv("/path/to/annotation_files/dmel6_last_introns_polyA.txt",sep="\t")

# get coordinates with only the last polyA site
dmel6_last_intron_last_polyA_df = get_last_polyA_site(dmel6_last_intron_df)

# make a bedtool with 3' splice site coordinates from the bedfile with last introns
dmel6_spliceSites = drosophila_spliceSites(dmel6_last_intron_last_polyA_df)


# set variables
min_overlap = 25
termination_overlap = 100

# S2_1a
bamFile = pybedtools.BedTool('/path/to/S2_4sUchr_ONT_1a_dm6_minimap2_uniq_sort.bam')
spliceSites_df = get_spliceSite_df(dmel6_spliceSites, bamFile)
splicingCalls = get_MinION_spliceCalls(spliceSites_df,min_overlap)
S2_1a_termination_df = get_termination_splicingCalls(splicingCalls, termination_overlap)

# S2_1b
bamFile = pybedtools.BedTool('/path/to/S2_4sUchr_ONT_1b_dm6_minimap2_uniq_sort.bam')
spliceSites_df = get_spliceSite_df(dmel6_spliceSites, bamFile)
splicingCalls = get_MinION_spliceCalls(spliceSites_df,min_overlap)
S2_1b_termination_df = get_termination_splicingCalls(splicingCalls, termination_overlap)

# S2_2
# (previously stored in S2_3_termination_df and then overwritten by
# replicate 3, so replicate 2 was never written to disk)
bamFile = pybedtools.BedTool('/path/to/S2_4sUchr_ONT_2_dm6_minimap2_uniq_sort.bam')
spliceSites_df = get_spliceSite_df(dmel6_spliceSites, bamFile)
splicingCalls = get_MinION_spliceCalls(spliceSites_df,min_overlap)
S2_2_termination_df = get_termination_splicingCalls(splicingCalls, termination_overlap)

# S2_3
bamFile = pybedtools.BedTool('/path/to/S2_4sUchr_ONT_3_dm6_minimap2_uniq_sort.bam')
spliceSites_df = get_spliceSite_df(dmel6_spliceSites, bamFile)
splicingCalls = get_MinION_spliceCalls(spliceSites_df,min_overlap)
S2_3_termination_df = get_termination_splicingCalls(splicingCalls, termination_overlap)

# merge technical replicates S2_1a + S2_1b
S2_1_termination_df = pd.concat([S2_1a_termination_df,S2_1b_termination_df])


# save splice dataframes to file (to use again later for plotting) (S2 cells)
S2_1a_termination_df.to_csv('/path/to/S2_1a_dm6_termination_df.txt', sep='\t', index=False, header=True)
S2_1b_termination_df.to_csv('/path/to/S2_1b_dm6_termination_df.txt', sep='\t', index=False, header=True)
S2_2_termination_df.to_csv('/path/to/S2_2_dm6_termination_df.txt', sep='\t', index=False, header=True)
S2_3_termination_df.to_csv('/path/to/S2_3_dm6_termination_df.txt', sep='\t', index=False, header=True)
S2_1_termination_df.to_csv('/path/to/S2_1_dm6_termination_df.txt', sep='\t', index=False, header=True)