In [None]:
"""

Author: Karine Choquet

Date: August 7, 2021

This script will analyze the mitochondrial transcriptome using direct RNA-seq data


"""

In [1]:
import numpy as np
import pandas as pd
import pysam

import matplotlib.pyplot as plt
import re
%matplotlib inline

import math

import pybedtools
from pybedtools import BedTool

import seaborn as sns
sns.set_style("white")
sns.set_style("ticks")

from collections import Counter

## Transcript abundance

### Read counts for the whole gene and for 100 or 200 nt windows at the 3' end of the gene

In [2]:
def get_mito_transcripts_bedtool(annotation_df):
    """Build a BedTool of full-length mitochondrial transcripts.

    Rows of `annotation_df` are kept when their feature is 'transcript', their
    chromosome is 'h_MT' and their biotype is not "origin". Every kept row
    becomes one interval with the fields:
    chrom, start, end, gene name, gene biotype, strand (all as strings).

    The gene name and biotype are read positionally from the ';'-separated
    `info` column (5th and 7th entries), taking the first double-quoted value
    of each entry.

    Parameters
    ----------
    annotation_df : pandas.DataFrame
        Annotation table with the columns 'feature', 'chrom', 'start', 'end',
        'strand' and 'info'.

    Returns
    -------
    BedTool
        One interval per retained transcript.
    """
    records = []

    for row in annotation_df.itertuples(index=False):
        # only transcript-level rows on the mitochondrial chromosome are of interest
        if row.feature != 'transcript' or row.chrom != 'h_MT':
            continue

        start = int(row.start)   # transcript start coordinate, as given in the annotation
        end = int(row.end)       # transcript end coordinate, as given in the annotation

        attributes = row.info.split(";")
        gene = attributes[4].split('"')[1]          # gene name
        gene_biotype = attributes[6].split('"')[1]  # gene biotype

        # replication origins are annotated as transcripts but are not genes
        if gene_biotype == "origin":
            continue

        records.append([str(row.chrom), str(start), str(end), str(gene), str(gene_biotype), str(row.strand)])

    return BedTool(records)


def get_mito_transcripts_bedtool_within_window(annotation_df, window):
    """Build a BedTool covering the last `window` nt at the 3' end of each mitochondrial transcript.

    Rows of `annotation_df` are kept when their feature is 'transcript', their
    chromosome is 'h_MT' and their biotype is not "origin". For '+' strand
    transcripts the interval is [max(start, end - window), end]; for '-' strand
    transcripts it is [start, min(end, start + window)]. Transcripts shorter
    than (or exactly as long as) `window` are therefore reported in full.

    Parameters
    ----------
    annotation_df : pandas.DataFrame
        Annotation table with the columns 'feature', 'chrom', 'start', 'end',
        'strand' and 'info'.
    window : int
        Maximum length of the 3'-terminal interval.

    Returns
    -------
    BedTool
        One interval per retained transcript with the fields
        chrom, start, end, gene name, gene biotype, strand (all as strings).
    """
    features = []

    for i in range(len(annotation_df)):
        feature = annotation_df['feature'].iloc[i]
        chrom = annotation_df['chrom'].iloc[i]

        if feature != 'transcript' or chrom != 'h_MT':
            continue

        strand = annotation_df['strand'].iloc[i]
        start = int(annotation_df['start'].iloc[i])
        end = int(annotation_df['end'].iloc[i])

        # gene name and biotype are read positionally from the ';'-separated attributes
        attributes = annotation_df['info'].iloc[i].split(";")
        gene = attributes[4].split('"')[1]
        gene_biotype = attributes[6].split('"')[1]

        # replication origins are annotated as transcripts but are not genes
        if gene_biotype == "origin":
            continue

        # max/min also cover the case where the transcript length equals `window`,
        # which previously left the new coordinate unassigned
        if strand == "+":
            new_start = max(start, end - window)
            new_end = end
        elif strand == "-":
            new_start = start
            new_end = min(end, start + window)
        else:
            # without a strand the 3' end is undefined; skip instead of reusing
            # the coordinates computed for the previous row
            continue

        features.append([str(chrom), str(new_start), str(new_end), str(gene), str(gene_biotype), str(strand)])

    features_bedtool = BedTool(features)
    return features_bedtool

# function to create a dataframe with reads that span transcripts
def get_transcript_intersect(transcript_df, bam_file):
    """Intersect the alignments of a BAM BedTool with transcript intervals.

    The BAM is converted to BED (`bam_to_bed(cigar=True, tag='NM')`) and
    intersected with `transcript_df` on the same strand (`s=True`), writing
    both records plus a trailing overlap column (`wo=True`).

    Parameters
    ----------
    transcript_df : BedTool
        Transcript intervals with 6 fields (chrom, start, end, gene name,
        gene biotype, strand), as built by the get_mito_transcripts_bedtool*
        helpers. Despite the parameter name this is passed to `intersect`,
        not used as a DataFrame.
    bam_file : BedTool
        BedTool wrapping a BAM file.

    Returns
    -------
    pandas.DataFrame
        One row per (alignment, transcript) overlap with the 14 columns
        listed in `column_types` below.
    """
    # column name -> dtype, in the order the fields appear in the intersect output
    column_types = [
        ('chr_aln', str),
        ('start_aln', int),
        ('end_aln', int),
        ('name_aln', str),
        ('qual_aln', int),
        ('strand_aln', str),
        ('cigar_aln', str),
        ('chr_transcript', str),
        ('start_transcript', int),
        ('end_transcript', int),
        ('name_gene', str),
        ('biotype_gene', str),
        ('strand_gene', str),
        ('count', int),
    ]

    reads_bed = bam_file.bam_to_bed(cigar=True, tag='NM')
    overlaps = reads_bed.intersect(transcript_df, wo=True, s=True)

    return overlaps.to_dataframe(names=[name for name, _ in column_types],
                                 dtype=dict(column_types))


# parse every read that spans a transcript in the dataset to ensure that there are no large "splicing" events
# that are likely artifacts
# this is modelled after our splicing scripts for nuclear transcripts, which is why that nomenclature is used
def _terminal_clip_length(cigar_pair):
    """Return the length of a terminal CIGAR operation if it is a soft (S) or hard (H) clip, else 0."""
    length, op = cigar_pair
    return int(length) if op in ('S', 'H') else 0


def parse_CIGAR(intersect_df, min_overlap):
    """Filter read/transcript overlaps and report terminal clip lengths.

    For every row of `intersect_df` the CIGAR string is walked along the
    reference, starting at `start_aln`. A row is kept when

    * no "N" (skipped region) operation falls within `min_overlap` nt of the
      transcript start or of the transcript end, and
    * the overlap reported in the `count` column is greater than `min_overlap`.

    Parameters
    ----------
    intersect_df : pandas.DataFrame
        Output of `get_transcript_intersect`; the columns used are 'name_aln',
        'name_gene', 'chr_transcript', 'start_transcript', 'end_transcript',
        'biotype_gene', 'strand_gene', 'count', 'cigar_aln', 'strand_aln' and
        'start_aln'. Rows are accessed by position, so any index is accepted.
    min_overlap : int
        Half-width of the windows around the transcript boundaries and minimum
        (exclusive) read/transcript overlap.

    Returns
    -------
    pandas.DataFrame
        Columns: name_read, chrom, transcript_start, transcript_end, name_gene,
        biotype, strand, read_overlap, clip_5prime, clip_3prime. The clip
        columns hold integer lengths of the soft/hard clip at the 5' and 3'
        end of the read (0 when the read is not clipped at that end). An empty
        input yields an empty frame that still carries these columns.
    """
    output_columns = ['name_read', 'chrom', 'transcript_start', 'transcript_end', 'name_gene',
                      'biotype', 'strand', 'read_overlap', 'clip_5prime', 'clip_3prime']

    # compiled once instead of once per row
    cigar_pattern = re.compile('([MIDNSHPX=])')
    consumes_reference = ("M", "D", "N", "=", "X")

    read_list = []

    for i in range(intersect_df.shape[0]):

        read_name = intersect_df['name_aln'].iloc[i]
        gene_name = intersect_df['name_gene'].iloc[i]
        chrom = intersect_df['chr_transcript'].iloc[i]
        transcript_start = intersect_df['start_transcript'].iloc[i]
        transcript_end = intersect_df['end_transcript'].iloc[i]
        biotype = intersect_df['biotype_gene'].iloc[i]
        strand = intersect_df['strand_gene'].iloc[i]
        read_overlap = intersect_df['count'].iloc[i]
        # positional access, consistent with every other column
        cigar = intersect_df['cigar_aln'].iloc[i]
        strand_aln = intersect_df['strand_aln'].iloc[i]

        # split the CIGAR string into [length, operation] pairs
        sep_values = cigar_pattern.split(cigar)[:-1]
        cigar_pairs = [sep_values[n:n + 2] for n in range(0, len(sep_values), 2)]

        # the CIGAR is written in reference orientation: for '+' alignments the
        # first operation is the 5' end of the read, for '-' alignments the last
        if strand_aln == "+":
            clip_5prime = _terminal_clip_length(cigar_pairs[0])
            clip_3prime = _terminal_clip_length(cigar_pairs[-1])
        elif strand_aln == "-":
            clip_5prime = _terminal_clip_length(cigar_pairs[-1])
            clip_3prime = _terminal_clip_length(cigar_pairs[0])
        else:
            # no strand: do not carry over the values of the previous row
            clip_5prime = 0
            clip_3prime = 0

        # bases of "N" operations found within min_overlap of the transcript start / end
        start_N = 0
        end_N = 0
        currentloc = int(intersect_df['start_aln'].iloc[i])

        for length, cigar_op in cigar_pairs:
            op_length = int(length)
            op_start = currentloc
            if cigar_op in consumes_reference:
                currentloc = currentloc + op_length
            op_end = currentloc

            # only skipped regions decide whether the read is kept
            if cigar_op != 'N':
                continue

            # window around the transcript start
            # NOTE(review): the +1 / -1 in the partial-overlap branches are kept from
            # the original implementation and are not mirrored in the end window
            lower = transcript_start - min_overlap
            upper = transcript_start + min_overlap
            if op_start < lower and op_end >= lower:
                start_N += min_overlap * 2 if op_end >= upper else op_end - lower + 1
            elif op_start >= lower and op_end < upper:
                start_N += op_length
            elif op_start < upper and op_end >= upper:
                start_N += min_overlap * 2 if op_start <= lower else upper - op_start - 1

            # window around the transcript end
            lower = transcript_end - min_overlap
            upper = transcript_end + min_overlap
            if op_start < lower and op_end >= lower:
                end_N += min_overlap * 2 if op_end >= upper else op_end - lower
            elif op_start >= lower and op_end < upper:
                end_N += op_length
            elif op_start < upper and op_end >= upper:
                end_N += min_overlap * 2 if op_start <= lower else upper - op_start

        # drop reads that are "spliced" across a transcript boundary and reads
        # that barely overlap the transcript
        if end_N == 0 and start_N == 0 and read_overlap > min_overlap:
            read_list.append([read_name, chrom, transcript_start, transcript_end, gene_name,
                              biotype, strand, read_overlap, clip_5prime, clip_3prime])

    # passing the columns here keeps the header even when no read is retained
    return pd.DataFrame(read_list, columns=output_columns)


def get_gene_counts(bamFile):
    """Count unique reads per mitochondrial gene for three transcript windows.

    Uses the notebook globals `hela_transcripts_whole_bedtool`,
    `hela_transcripts_100nt_window_bedtool`,
    `hela_transcripts_200nt_window_bedtool` and `min_overlap`.

    Parameters
    ----------
    bamFile : BedTool
        BedTool wrapping a BAM file.

    Returns
    -------
    pandas.DataFrame
        Columns: name_gene, count_whole, count_100nt_3prime, count_200nt_3prime.
        The three tables are combined with the default merge (inner join), so
        only genes present in all three appear.
    """
    # output column label -> transcript intervals used for the intersection
    windows = (
        ('count_whole', hela_transcripts_whole_bedtool),
        ('count_100nt_3prime', hela_transcripts_100nt_window_bedtool),
        ('count_200nt_3prime', hela_transcripts_200nt_window_bedtool),
    )

    # intersect reads with each set of intervals, then filter on the CIGAR string
    raw_intersects = [get_transcript_intersect(bedtool, bamFile) for _, bedtool in windows]
    filtered_reads = [parse_CIGAR(raw, min_overlap) for raw in raw_intersects]

    # count each read once per gene
    per_window_counts = []
    for (label, _), reads in zip(windows, filtered_reads):
        unique_pairs = reads.drop_duplicates(subset=['name_read','name_gene'])
        gene_counts = unique_pairs.groupby(['name_gene'])['name_read'].count()
        per_window_counts.append(gene_counts.reset_index(name=label))

    merged_df = per_window_counts[0]
    for counts in per_window_counts[1:]:
        merged_df = merged_df.merge(counts, on='name_gene')

    return merged_df


def get_read_to_transcript_mapping_100nt(bamFile):
    """Map reads to the 100 nt 3'-terminal window of each mitochondrial transcript.

    Intersects `bamFile` with the global `hela_transcripts_100nt_window_bedtool`
    and filters the result with `parse_CIGAR` using the global `min_overlap`.

    Returns the filtered read-to-transcript DataFrame produced by `parse_CIGAR`.
    """
    return parse_CIGAR(
        get_transcript_intersect(hela_transcripts_100nt_window_bedtool, bamFile),
        min_overlap,
    )

def get_read_to_transcript_mapping_whole(bamFile):
    """Map reads to full-length mitochondrial transcripts.

    Intersects `bamFile` with the global `hela_transcripts_whole_bedtool` and
    filters the result with `parse_CIGAR` using the global `min_overlap`.

    Returns the filtered read-to-transcript DataFrame produced by `parse_CIGAR`.
    """
    return parse_CIGAR(
        get_transcript_intersect(hela_transcripts_whole_bedtool, bamFile),
        min_overlap,
    )
    


In [3]:
# Load the header-less, tab-separated annotation table.
# The column names are passed through `names=` so that the keys of `dtype`
# refer to columns that exist while the file is parsed; when the names were
# only assigned after reading, the dtype mapping had nothing to match and
# 'chrom' could end up with mixed types. The two score columns are left to
# type inference, which is what effectively happened before.
hela_gtf = pd.read_table('/path/to/annotations/Hela_ensGRCh38_h_MT_ncRNAs_allERCC_merge_MTmod.gtf', header=None, sep="\t",
                         names=['chrom','build','feature','start','end','score1','strand','score2','info'],
                         dtype={'chrom':str, 'build':str, 'feature':str, 'start':int, 'end':int, 'strand':str,
                                'info':str})

# Keep only the rows of the mitochondrial chromosome ('h_MT'), re-indexed from 0.
hela_gtf_mito = hela_gtf[hela_gtf['chrom']=='h_MT'].reset_index(drop=True)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [4]:
# Wrap each sorted BAM file in a BedTool. These objects are passed to
# get_gene_counts / get_read_to_transcript_mapping_* / get_all_read_subcounts,
# which call bam_to_bed() on them.
mito_bamFile = pybedtools.BedTool('/path/to/HeLa_EnrichedMito_directRNAseq_minimap2_sort.bam')
tot1_bamFile = pybedtools.BedTool('/path/to/HeLa_WholeCell_directRNAseq_minimap2_sort.bam')
tot2_bamFile = pybedtools.BedTool('/path/to/HeLa_WholeCell_rep2_directRNAseq_minimap2_sort.bam')
tot3_bamFile = pybedtools.BedTool('/path/to/HeLa_totalRNA_3lig_minimap2_sort.bam')
tot4_bamFile = pybedtools.BedTool('/path/to/HeLa_totalRNA_3lig_rep2_minimap2_sort.bam')


In [5]:
# Analysis parameters and interval sets, read as globals by the functions above.
# min_overlap: parse_CIGAR keeps a read only if its overlap with the transcript
# is greater than this value, and uses it as the half-width of the windows
# around the transcript start/end in which an "N" CIGAR operation discards the read.
min_overlap = 25

# Build BedTools of mitochondrial transcript intervals: full-length transcripts
# and the last 100 / 200 nt at the 3' end of each transcript.
hela_transcripts_whole_bedtool = get_mito_transcripts_bedtool(hela_gtf_mito)
hela_transcripts_100nt_window_bedtool = get_mito_transcripts_bedtool_within_window(hela_gtf_mito, 100)
hela_transcripts_200nt_window_bedtool = get_mito_transcripts_bedtool_within_window(hela_gtf_mito, 200)

In [6]:
# Per-gene read counts for each sample (whole transcript, 100 nt and 200 nt
# 3'-terminal windows); one DataFrame per BAM file.
mito_gene_counts = get_gene_counts(mito_bamFile)
tot1_gene_counts = get_gene_counts(tot1_bamFile)
tot2_gene_counts = get_gene_counts(tot2_bamFile)
tot3_gene_counts = get_gene_counts(tot3_bamFile)
tot4_gene_counts = get_gene_counts(tot4_bamFile)


In [7]:
# Tag every count table with its sample name (adds a 'sample_name' column in place).
mito_gene_counts['sample_name'] = 'mito_enriched_pA'
tot1_gene_counts['sample_name'] = 'total_pA_rep1'
tot2_gene_counts['sample_name'] = 'total_pA_rep2'
tot3_gene_counts['sample_name'] = 'total_lig_rep1'
tot4_gene_counts['sample_name'] = 'total_lig_rep2'

# Write each table to a tab-separated file with a header row and no index column.
mito_gene_counts.to_csv("/path/to/mito_enriched_polyA_tailing_gene_counts.txt", sep="\t", header=True, index=False)
tot1_gene_counts.to_csv("/path/to/total_RNA_polyA_tailing_rep1_gene_counts.txt", sep="\t", header=True, index=False)
tot2_gene_counts.to_csv("/path/to/total_RNA_polyA_tailing_rep2_gene_counts.txt", sep="\t", header=True, index=False)
tot3_gene_counts.to_csv("/path/to/total_RNA_ligation_rep1_gene_counts.txt", sep="\t", header=True, index=False)
tot4_gene_counts.to_csv("/path/to/total_RNA_ligation_rep2_gene_counts.txt", sep="\t", header=True, index=False)

In [8]:
# Read-to-transcript mapping for the 100 nt window at the 3' end of each
# transcript; reused below by get_all_read_subcounts.
mito_transcript_map_100nt = get_read_to_transcript_mapping_100nt(mito_bamFile)
tot1_transcript_map_100nt = get_read_to_transcript_mapping_100nt(tot1_bamFile)
tot2_transcript_map_100nt = get_read_to_transcript_mapping_100nt(tot2_bamFile)
tot3_transcript_map_100nt = get_read_to_transcript_mapping_100nt(tot3_bamFile)
tot4_transcript_map_100nt = get_read_to_transcript_mapping_100nt(tot4_bamFile)


In [9]:
# Read-to-transcript mapping for the full-length transcripts (not the 100 nt
# window); reused below by get_all_read_subcounts.
mito_transcript_map_whole = get_read_to_transcript_mapping_whole(mito_bamFile)
tot1_transcript_map_whole = get_read_to_transcript_mapping_whole(tot1_bamFile)
tot2_transcript_map_whole = get_read_to_transcript_mapping_whole(tot2_bamFile)
tot3_transcript_map_whole = get_read_to_transcript_mapping_whole(tot3_bamFile)
tot4_transcript_map_whole = get_read_to_transcript_mapping_whole(tot4_bamFile)


In [None]:
# K562 polyA+ samples
# NOTE: get_read_to_transcript_mapping_100nt intersects with the global
# hela_transcripts_100nt_window_bedtool, i.e. the intervals built from hela_gtf_mito.
K562_rep1_bamFile = pybedtools.BedTool("/path/to/K562_rep1_totalRNA_minimap2_sort.bam")
K562_rep2_bamFile = pybedtools.BedTool("/path/to/K562_rep2_totalRNA_minimap2_sort.bam")

# Read-to-transcript mapping for the 100 nt 3'-terminal window
K562_rep1_transcript_map_100nt = get_read_to_transcript_mapping_100nt(K562_rep1_bamFile)
K562_rep2_transcript_map_100nt = get_read_to_transcript_mapping_100nt(K562_rep2_bamFile)

# Write the mappings to tab-separated files (header row, no index column)
K562_rep1_transcript_map_100nt.to_csv("/path/to/K562_rep1_transcript_map_100nt_3prime.txt", sep="\t", header=True, index=False)
K562_rep2_transcript_map_100nt.to_csv("/path/to/K562_rep2_transcript_map_100nt_3prime.txt", sep="\t", header=True, index=False)



In [None]:
# Myoblasts polyA+ samples
# NOTE: get_read_to_transcript_mapping_100nt intersects with the global
# hela_transcripts_100nt_window_bedtool, i.e. the intervals built from hela_gtf_mito.
myo_D0_bamFile = pybedtools.BedTool("/path/to/myoblast_D0_polyA_directRNA_minimap2_sort.bam")
myo_D7_bamFile = pybedtools.BedTool("/path/to/myoblasts_D7_polyA_directRNA_minimap2_sort.bam")

# Read-to-transcript mapping for the 100 nt 3'-terminal window
myo_D0_transcript_map_100nt = get_read_to_transcript_mapping_100nt(myo_D0_bamFile)
myo_D7_transcript_map_100nt = get_read_to_transcript_mapping_100nt(myo_D7_bamFile)

# Write the mappings to tab-separated files (header row, no index column)
myo_D0_transcript_map_100nt.to_csv("/path/to/myoblasts_D0_transcript_map_100nt_3prime.txt", sep="\t", header=True, index=False)
myo_D7_transcript_map_100nt.to_csv("/path/to/myoblasts_D7_transcript_map_100nt_3prime.txt", sep="\t", header=True, index=False)



In [14]:
# HeLa polyA+ sample: read-to-transcript mapping for the 100 nt 3'-terminal
# window, written to a tab-separated file (header row, no index column).
HeLa_pA_bamFile = pybedtools.BedTool("/path/to/HeLa_total_polyA+_minimap2_sort.bam")
HeLa_transcript_map_100nt = get_read_to_transcript_mapping_100nt(HeLa_pA_bamFile)

HeLa_transcript_map_100nt.to_csv("/path/to/HeLa_polyA+_transcript_map_100nt_3prime.txt", sep="\t", header=True, index=False)



### Correct abundance for polycistronic transcripts

In [17]:
def get_mito_features_bedtool_for_read_ends_starts(annotation_df, polyA_window, TSS_window_pre, TSS_window_post, leader_window):
    """Build strand-aware sub-gene regions used to classify read starts and ends.

    For every 'transcript' row on 'h_MT' whose biotype is protein_coding,
    Mt_rRNA or Mt_tRNA, four intervals are emitted (5' and 3' are relative to
    the gene's strand):

    * 'TES'           : `polyA_window` nt on either side of the gene's 3' end
    * 'TSS'           : from `TSS_window_pre` nt upstream to `TSS_window_post`
                        nt downstream of the gene's 5' end
    * 'gene_body'     : the region between the TSS and TES windows
    * 'leader_5prime' : the `leader_window` nt directly upstream of the TSS window

    Interval starts are clamped at 0 so that genes lying within a window of
    the beginning of the chromosome do not produce negative coordinates.
    The clamped part is dropped, not wrapped around the circular genome.

    Parameters
    ----------
    annotation_df : pandas.DataFrame
        Annotation table with the columns 'feature', 'chrom', 'start', 'end',
        'strand' and 'info'.
    polyA_window, TSS_window_pre, TSS_window_post, leader_window : int
        Window sizes in nt, see above.

    Returns
    -------
    BedTool
        Intervals with the fields chrom, start, end, gene name, region label,
        strand, gene biotype.
    """
    features = []

    for i in range(len(annotation_df)):
        feature = annotation_df['feature'].iloc[i]
        chrom = annotation_df['chrom'].iloc[i]

        if feature != 'transcript' or chrom != 'h_MT':
            continue

        start = int(annotation_df['start'].iloc[i])
        end = int(annotation_df['end'].iloc[i])
        strand = annotation_df['strand'].iloc[i]

        # gene name and biotype are read positionally from the ';'-separated attributes
        attributes = annotation_df['info'].iloc[i].split(";")
        gene = attributes[4].split('"')[1]
        gene_biotype = attributes[6].split('"')[1]

        if gene_biotype not in ['protein_coding','Mt_rRNA','Mt_tRNA']:
            continue

        if strand == '+':
            TSS_start = start - TSS_window_pre
            TSS_end = start + TSS_window_post

            polyA_start = end - polyA_window
            polyA_end = end + polyA_window

            gene_body_start = TSS_end
            gene_body_end = polyA_start

            leader_5prime_start = TSS_start - leader_window
            leader_5prime_end = TSS_start

        elif strand == '-':
            # on the minus strand the 5' end of the gene is its largest coordinate
            TSS_start = end - TSS_window_post
            TSS_end = end + TSS_window_pre

            polyA_start = start - polyA_window
            polyA_end = start + polyA_window

            gene_body_start = polyA_end
            gene_body_end = TSS_start

            leader_5prime_start = TSS_end
            leader_5prime_end = TSS_end + leader_window

        else:
            # without a strand the 5'/3' ends are undefined; skip instead of
            # reusing the windows computed for the previous row
            continue

        regions = [
            ('TES', polyA_start, polyA_end),
            ('TSS', TSS_start, TSS_end),
            ('gene_body', gene_body_start, gene_body_end),
            ('leader_5prime', leader_5prime_start, leader_5prime_end),
        ]
        for label, region_start, region_end in regions:
            # BED starts cannot be negative; coordinates are written as strings throughout
            features.append([chrom, str(max(region_start, 0)), str(region_end), gene, label, strand, gene_biotype])

    # Origin and whole-chromosome 'intergenic' features are not emitted by this function.
    features_bedtool = BedTool(features)
    return features_bedtool



def get_read_end_bedtool(bamFile):
    """Reduce every alignment of a BAM BedTool to a 1-nt interval at one of its ends.

    For '+' alignments the interval is [end - 1, end]; for '-' alignments it is
    [start, start + 1] (presumably the 3' end of the sequenced molecule in
    both cases; verify against the library orientation).

    Parameters
    ----------
    bamFile : BedTool
        BedTool wrapping a BAM file.

    Returns
    -------
    BedTool
        Intervals with the fields chrom, start, end, read name, score, strand.
    """
    alignments = bamFile.bam_to_bed().to_dataframe(dtype={'chrom':'str'})

    end_positions = []

    for aln in alignments.itertuples(index=False):
        if aln.strand == "-":
            pos_1, pos_2 = aln.start, aln.start + 1
        elif aln.strand == "+":
            pos_1, pos_2 = aln.end - 1, aln.end

        end_positions.append([str(aln.chrom), str(pos_1), str(pos_2), aln.name, str(aln.score), aln.strand])

    return BedTool(end_positions)



def get_read_start_bedtool(bamFile):
    """Reduce every alignment of a BAM BedTool to a 1-nt interval at its other end.

    Mirror image of `get_read_end_bedtool`: for '+' alignments the interval is
    [start, start + 1]; for '-' alignments it is [end - 1, end].

    Parameters
    ----------
    bamFile : BedTool
        BedTool wrapping a BAM file.

    Returns
    -------
    BedTool
        Intervals with the fields chrom, start, end, read name, score, strand.
    """
    alignments = bamFile.bam_to_bed().to_dataframe(dtype={'chrom':'str'})

    start_positions = []

    for aln in alignments.itertuples(index=False):
        if aln.strand == "-":
            pos_1, pos_2 = aln.end - 1, aln.end
        elif aln.strand == "+":
            pos_1, pos_2 = aln.start, aln.start + 1

        start_positions.append([str(aln.chrom), str(pos_1), str(pos_2), aln.name, str(aln.score), aln.strand])

    return BedTool(start_positions)




def get_intersect_read_ends_starts(read_ends, intron_info):
    """Intersect 1-nt read positions with annotated regions on the same strand.

    Parameters
    ----------
    read_ends : BedTool
        6-field intervals (chrom, start, end, read name, score, strand), as
        built by `get_read_end_bedtool` / `get_read_start_bedtool`.
    intron_info : BedTool
        7-field region intervals (chrom, start, end, gene name, region label,
        strand, biotype). The parameter name is historical; any region set
        with this layout can be passed.

    Returns
    -------
    pandas.DataFrame
        One row per (read position, region) overlap with the 14 columns listed
        in `column_types` below.
    """
    # column name -> dtype, in the order the fields appear in the intersect output
    column_types = [
        ('chrom_read', str),
        ('start_read', int),
        ('end_read', int),
        ('name_read', str),
        ('qual_read', int),
        ('strand_read', str),
        ('chr_feature', str),
        ('start_feature', int),
        ('end_feature', int),
        ('name_gene', str),
        ('name_feature', str),
        ('strand_feature', str),
        ('biotype_feature', str),
        ('count', int),
    ]

    overlaps = read_ends.intersect(intron_info, wo=True, s=True)

    return overlaps.to_dataframe(names=[name for name, _ in column_types],
                                 dtype=dict(column_types))



def get_read_subcounts(bamFile, transcript_map, gene):
    """Count the reads of one gene by the regions in which they start and end.

    Uses the notebook globals `hela_gtf_mito`, `polyA_window`,
    `TSS_window_pre`, `TSS_window_post` and `leader_window`.

    Parameters
    ----------
    bamFile : BedTool
        BedTool wrapping a BAM file.
    transcript_map : pandas.DataFrame
        Read-to-transcript mapping as returned by `parse_CIGAR`.
    gene : str
        Gene name. Annotation rows are selected with `str.contains(gene)` on
        the `info` column, reads with an exact match on 'name_gene'.

    Returns
    -------
    pandas.DataFrame
        Columns: 3prime_feature, 5prime_feature, name_read (number of rows per
        combination). Missing values after the left merges are filled with
        "other".
    """
    # regions (TES, TSS, gene_body, leader_5prime) for the annotation rows mentioning the gene
    gene_annotation = hela_gtf_mito[(hela_gtf_mito['info'].str.contains(gene))]
    gene_regions = get_mito_features_bedtool_for_read_ends_starts(gene_annotation, polyA_window, TSS_window_pre, TSS_window_post, leader_window)

    # 1-nt positions of both ends of every read
    end_positions = get_read_end_bedtool(bamFile)
    start_positions = get_read_start_bedtool(bamFile)

    # region in which each read position falls
    ends_in_regions = get_intersect_read_ends_starts(end_positions, gene_regions)
    starts_in_regions = get_intersect_read_ends_starts(start_positions, gene_regions)

    # reads assigned to the gene of interest
    read_fields = ['name_read','name_gene','chrom','transcript_start','transcript_end']
    region_fields = ['name_read','name_feature','name_gene']
    merge_keys = ['name_read','name_gene']
    gene_reads = transcript_map.loc[transcript_map['name_gene']==gene, read_fields]

    # attach the 3' region first, then the 5' region
    with_3prime = gene_reads.merge(ends_in_regions[region_fields], on=merge_keys, how='left')
    with_3prime = with_3prime.rename(columns={'name_feature':'3prime_feature'})
    with_both = with_3prime.merge(starts_in_regions[region_fields], on=merge_keys, how='left')
    with_both = with_both.rename(columns={'name_feature':'5prime_feature'})
    annotated = with_both.fillna("other")

    # number of rows per (3' region, 5' region) combination
    return annotated.groupby(['3prime_feature','5prime_feature'])['name_read'].count().reset_index()



def get_all_read_subcounts(bamFile, transcript_map_100nt, transcript_map_whole):
    """Tabulate read start/end region combinations for MT-CO3 and MT-CYB.

    `get_read_subcounts` is run for both genes on the 100 nt window mapping
    and on the whole-transcript mapping; the whole-transcript table is then
    left-merged with the 100 nt table.

    Parameters
    ----------
    bamFile : BedTool
        BedTool wrapping a BAM file.
    transcript_map_100nt, transcript_map_whole : pandas.DataFrame
        Read-to-transcript mappings as returned by `parse_CIGAR`.

    Returns
    -------
    pandas.DataFrame
        Columns: name_gene, 3prime_feature, 5prime_feature, count_whole,
        count_100nt_3prime. Combinations absent from the 100 nt table get 0.
    """
    genes = ('MT-CO3', 'MT-CYB')
    keys = ['name_gene','3prime_feature','5prime_feature']

    per_gene_100nt = [get_read_subcounts(bamFile, transcript_map_100nt, gene) for gene in genes]
    per_gene_whole = [get_read_subcounts(bamFile, transcript_map_whole, gene) for gene in genes]

    # label every per-gene table with its gene before stacking them
    for gene, table_100nt, table_whole in zip(genes, per_gene_100nt, per_gene_whole):
        table_100nt['name_gene'] = gene
        table_whole['name_gene'] = gene

    stacked_100nt = pd.concat(per_gene_100nt).reset_index(drop=True)
    stacked_whole = pd.concat(per_gene_whole).reset_index(drop=True)

    merged = pd.merge(stacked_whole, stacked_100nt, on=keys, how='left')
    subcounts = merged[keys + ['name_read_x','name_read_y']].fillna(0)

    # _x comes from the whole-transcript table, _y from the 100 nt table
    return subcounts.rename(columns={'name_read_x': 'count_whole', 'name_read_y': 'count_100nt_3prime'})


In [18]:
# Window sizes (nt) read as globals by get_read_subcounts and passed on to
# get_mito_features_bedtool_for_read_ends_starts.
polyA_window = 25      # 'TES' region: this many nt on either side of the gene's 3' end
TSS_window_post = 25   # 'TSS' region: extends this many nt into the gene from its 5' end
TSS_window_pre = 0     # 'TSS' region: extends this many nt upstream of the gene's 5' end
leader_window = 1500   # 'leader_5prime' region: this many nt directly upstream of the TSS region

In [19]:
# Read start/end region counts for MT-CO3 and MT-CYB in each sample, using the
# 100 nt window and whole-transcript read mappings computed above.
mito_subcounts = get_all_read_subcounts(mito_bamFile, mito_transcript_map_100nt, mito_transcript_map_whole)
tot1_subcounts = get_all_read_subcounts(tot1_bamFile, tot1_transcript_map_100nt, tot1_transcript_map_whole)
tot2_subcounts = get_all_read_subcounts(tot2_bamFile, tot2_transcript_map_100nt, tot2_transcript_map_whole)
tot3_subcounts = get_all_read_subcounts(tot3_bamFile, tot3_transcript_map_100nt, tot3_transcript_map_whole)
tot4_subcounts = get_all_read_subcounts(tot4_bamFile, tot4_transcript_map_100nt, tot4_transcript_map_whole)

In [20]:
# Write each sub-count table to a tab-separated file (header row, no index column).
mito_subcounts.to_csv("/path/to/mito_enriched_polyA_tailing_subcounts_MT-CO3_MT-CYB.txt", sep="\t", header=True, index=False)
tot1_subcounts.to_csv("/path/to/total_RNA_polyA_tailing_rep1_subcounts_MT-CO3_MT-CYB.txt", sep="\t", header=True, index=False)
tot2_subcounts.to_csv("/path/to/total_RNA_polyA_tailing_rep2_subcounts_MT-CO3_MT-CYB.txt", sep="\t", header=True, index=False)
tot3_subcounts.to_csv("/path/to/total_RNA_ligation_rep1_subcounts_MT-CO3_MT-CYB.txt", sep="\t", header=True, index=False)
tot4_subcounts.to_csv("/path/to/total_RNA_ligation_rep2_subcounts_MT-CO3_MT-CYB.txt", sep="\t", header=True, index=False)


## Read ends for modelling transcription

In [2]:
def get_mito_features_bedtool(annotation_df, polyA_window):
    """Build the regions used to classify read ends on the mitochondrial genome.

    For every 'transcript' row on 'h_MT':

    * biotype protein_coding / Mt_rRNA / Mt_tRNA: a '3prime_end' interval of
      `polyA_window` nt on either side of the gene's 3' end (its `end` on the
      '+' strand, its `start` on the '-' strand) and a 'gene_body' interval
      spanning the whole transcript;
    * biotype "origin": one 'origin' interval spanning the annotated range.

    Two 'intergenic' intervals covering positions 1-16569 of 'h_MT', one per
    strand, are always appended at the end.

    Parameters
    ----------
    annotation_df : pandas.DataFrame
        Annotation table with the columns 'feature', 'chrom', 'start', 'end',
        'strand' and 'info'.
    polyA_window : int
        Half-width of the '3prime_end' interval.

    Returns
    -------
    BedTool
        Intervals with the fields chrom, start, end, gene name, region label,
        strand, biotype.
    """
    gene_biotypes = ['protein_coding','Mt_rRNA','Mt_tRNA']
    records = []

    for row in annotation_df.itertuples(index=False):
        if row.feature != 'transcript' or row.chrom != 'h_MT':
            continue

        chrom = row.chrom
        start = int(row.start)
        end = int(row.end)
        strand = row.strand

        # gene name and biotype are read positionally from the ';'-separated attributes
        attributes = row.info.split(";")
        gene = attributes[4].split('"')[1]
        gene_biotype = attributes[6].split('"')[1]

        if gene_biotype in gene_biotypes:
            # the 3' end is `end` on the plus strand and `start` on the minus strand
            if strand == '+':
                polyA_start, polyA_end = end - polyA_window, end + polyA_window
            elif strand == '-':
                polyA_start, polyA_end = start - polyA_window, start + polyA_window

            records.append([chrom, str(polyA_start), str(polyA_end), gene, '3prime_end', strand, gene_biotype])
            records.append([chrom, start, end, gene, 'gene_body', strand, gene_biotype])

        elif gene_biotype == "origin":
            records.append([chrom, start, end, gene, 'origin', strand, gene_biotype])

    # catch-all regions for read ends outside any annotated feature, one per strand
    for strand in ('+', '-'):
        records.append(['h_MT', 1, 16569, 'h_MT_' + strand, 'intergenic', strand, 'intergenic'])

    return BedTool(records)



def get_read_end_bedtool(bamFile):
    """Reduce every alignment of a BAM BedTool to a 1-nt interval at one of its ends.

    For '+' alignments the interval is [end - 1, end]; for '-' alignments it is
    [start, start + 1] (presumably the 3' end of the sequenced molecule in
    both cases; verify against the library orientation).

    Parameters
    ----------
    bamFile : BedTool
        BedTool wrapping a BAM file.

    Returns
    -------
    BedTool
        Intervals with the fields chrom, start, end, read name, score, strand.
    """
    alignments = bamFile.bam_to_bed().to_dataframe()

    end_positions = []

    for aln in alignments.itertuples(index=False):
        if aln.strand == "-":
            pos_1, pos_2 = aln.start, aln.start + 1
        elif aln.strand == "+":
            pos_1, pos_2 = aln.end - 1, aln.end

        end_positions.append([str(aln.chrom), str(pos_1), str(pos_2), aln.name, str(aln.score), aln.strand])

    return BedTool(end_positions)



def get_intersect(read_ends, intron_info):
    """Intersect read-end intervals with annotated features on the same strand.

    Parameters
    ----------
    read_ends : BedTool
        Six-field read-end intervals (chrom, start, end, name, score, strand).
    intron_info : BedTool
        Seven-field feature intervals (chrom, start, end, gene, feature type,
        strand, biotype).

    Returns
    -------
    pandas.DataFrame
        One row per (read end, feature) overlap, with the read columns first,
        then the feature columns, then the trailing column added by ``wo=True``
        (named 'count' here).
    """
    # (column name, dtype) pairs in the order the intersect output lists them
    schema = [
        ('chrom_read', str), ('start_read', int), ('end_read', int),
        ('name_read', str), ('qual_read', int), ('strand_read', str),
        ('chr_feature', str), ('start_feature', int), ('end_feature', int),
        ('name_gene', str), ('name_feature', str), ('strand_feature', str),
        ('biotype_feature', str), ('count', int),
    ]
    column_names = [name for name, _ in schema]
    column_types = dict(schema)

    # s=True restricts hits to features on the same strand as the read end
    overlaps = read_ends.intersect(intron_info, wo=True, s=True)

    return overlaps.to_dataframe(names=column_names, dtype=column_types)



def get_read_end_mapping(intersect_df):
    """Group the overlapped feature labels by read name.

    Parameters
    ----------
    intersect_df : pandas.DataFrame
        Needs the columns 'name_read', 'name_feature', 'biotype_feature'
        and 'strand_read'.

    Returns
    -------
    dict
        read name -> list of distinct labels formatted as
        "<name_feature>__<biotype_feature>,<strand_read>", in order of first
        appearance.
    """
    labels_by_read = {}

    for row in intersect_df.itertuples(index=False):

        # label combining feature type, biotype and read strand
        label = "".join((row.name_feature, "__", row.biotype_feature, ",", row.strand_read))

        # start an empty list the first time a read is seen, then record each label once
        known_labels = labels_by_read.setdefault(row.name_read, [])
        if label not in known_labels:
            known_labels.append(label)

    return labels_by_read



def get_read_end_stats(read_ends):
    """Assign a single end-feature label to every read.

    Parameters
    ----------
    read_ends : dict
        read name -> list of "<feature>__<biotype>,<strand>" labels.

    Returns
    -------
    pandas.DataFrame
        Columns 'read' and 'end_feature', one row per read that has at least
        one label. An empty mapping yields an empty DataFrame that still has
        both columns.

    Notes
    -----
    A read with one label keeps it. A read with several labels gets the first
    match in this priority order: '3prime_end' before 'gene_body'; within each,
    '+' before '-'; within each strand, Mt_rRNA, then Mt_tRNA, then
    protein_coding. If none of those labels is present the read is reported as
    "undetermined".
    """
    # Nesting order of the loops reproduces the priority described above
    priority = [
        "{}__{},{}".format(feature, biotype, strand)
        for feature in ("3prime_end", "gene_body")
        for strand in ("+", "-")
        for biotype in ("Mt_rRNA", "Mt_tRNA", "protein_coding")
    ]

    read_features = []

    for read, labels in read_ends.items():

        if len(labels) == 1:
            read_features.append([read, labels[0]])

        elif len(labels) > 1:
            chosen = next((label for label in priority if label in labels), "undetermined")
            read_features.append([read, chosen])

    # Naming the columns in the constructor also works when no rows were collected
    read_features_df = pd.DataFrame(read_features, columns=['read', 'end_feature'])

    return read_features_df





In [3]:
# Window size passed to get_mito_features_bedtool: each '3prime_end' interval spans
# polyA_window on either side of the strand-appropriate gene boundary
# (annotated end for '+' genes, annotated start for '-' genes).
polyA_window = 25

In [6]:
# Keep only annotation rows on the mitochondrial chromosome ('h_MT') and turn them into
# the feature intervals used for the read-end intersections below.
# NOTE(review): hela_gtf is not created in the cells shown here; it must already exist in the kernel.
hela_gtf_mito = hela_gtf[hela_gtf['chrom']=='h_MT'].reset_index(drop=True)
hela_mito_bedtool = get_mito_features_bedtool(hela_gtf_mito, polyA_window)

In [347]:
# Sample whose results are later written to the "mito_enriched_polyA_tailing" file.
# NOTE(review): mito_bamFile is not created in the cells shown here; it must already exist in the kernel.

# Step 1: reduce every alignment to the single position at its end (strand-aware), as a BedTool
mito_read_ends = get_read_end_bedtool(mito_bamFile)

# Step 2: same-strand intersection of those positions with the mitochondrial feature intervals
mito_intersect = get_intersect(mito_read_ends, hela_mito_bedtool)

# Step 3: collect, per read name, the distinct "feature__biotype,strand" labels it overlaps
mito_read_end_mapping = get_read_end_mapping(mito_intersect)

# Step 4: resolve each read to one label; result is a DataFrame with columns read / end_feature
mito_read_end_stats = get_read_end_stats(mito_read_end_mapping)


  exec(code_obj, self.user_global_ns, self.user_ns)


In [348]:
# Sample whose results are later written to the "total_RNA_polyA_tailing_rep1" file.
# NOTE(review): tot1_bamFile is not created in the cells shown here; it must already exist in the kernel.

# Step 1: reduce every alignment to the single position at its end (strand-aware), as a BedTool
tot1_read_ends = get_read_end_bedtool(tot1_bamFile)

# Step 2: same-strand intersection of those positions with the mitochondrial feature intervals
tot1_intersect = get_intersect(tot1_read_ends, hela_mito_bedtool)

# Step 3: collect, per read name, the distinct "feature__biotype,strand" labels it overlaps
tot1_read_end_mapping = get_read_end_mapping(tot1_intersect)

# Step 4: resolve each read to one label; result is a DataFrame with columns read / end_feature
tot1_read_end_stats = get_read_end_stats(tot1_read_end_mapping)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [349]:
# Sample whose results are later written to the "total_RNA_polyA_tailing_rep2" file.
# NOTE(review): tot2_bamFile is not created in the cells shown here; it must already exist in the kernel.

# Step 1: reduce every alignment to the single position at its end (strand-aware), as a BedTool
tot2_read_ends = get_read_end_bedtool(tot2_bamFile)

# Step 2: same-strand intersection of those positions with the mitochondrial feature intervals
tot2_intersect = get_intersect(tot2_read_ends, hela_mito_bedtool)

# Step 3: collect, per read name, the distinct "feature__biotype,strand" labels it overlaps
tot2_read_end_mapping = get_read_end_mapping(tot2_intersect)

# Step 4: resolve each read to one label; result is a DataFrame with columns read / end_feature
tot2_read_end_stats = get_read_end_stats(tot2_read_end_mapping)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [350]:
# Sample whose results are later written to the "total_RNA_ligation_rep1" file.
# NOTE(review): tot3_bamFile is not created in the cells shown here; it must already exist in the kernel.

# Step 1: reduce every alignment to the single position at its end (strand-aware), as a BedTool
tot3_read_ends = get_read_end_bedtool(tot3_bamFile)

# Step 2: same-strand intersection of those positions with the mitochondrial feature intervals
tot3_intersect = get_intersect(tot3_read_ends, hela_mito_bedtool)

# Step 3: collect, per read name, the distinct "feature__biotype,strand" labels it overlaps
tot3_read_end_mapping = get_read_end_mapping(tot3_intersect)

# Step 4: resolve each read to one label; result is a DataFrame with columns read / end_feature
tot3_read_end_stats = get_read_end_stats(tot3_read_end_mapping)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [351]:
# Sample whose results are later written to the "total_RNA_ligation_rep2" file.
# NOTE(review): tot4_bamFile is not created in the cells shown here; it must already exist in the kernel.

# Step 1: reduce every alignment to the single position at its end (strand-aware), as a BedTool
tot4_read_ends = get_read_end_bedtool(tot4_bamFile)

# Step 2: same-strand intersection of those positions with the mitochondrial feature intervals
tot4_intersect = get_intersect(tot4_read_ends, hela_mito_bedtool)

# Step 3: collect, per read name, the distinct "feature__biotype,strand" labels it overlaps
tot4_read_end_mapping = get_read_end_mapping(tot4_intersect)

# Step 4: resolve each read to one label; result is a DataFrame with columns read / end_feature
tot4_read_end_stats = get_read_end_stats(tot4_read_end_mapping)

In [353]:
# For each sample: convert the read-end BedTool to a DataFrame, keep only the records on
# the mitochondrial chromosome ('h_MT'), and write them as a tab-separated table with a
# header row and no index column.
mito_read_ends_df = mito_read_ends.to_dataframe()
mito_read_ends_df_MT = mito_read_ends_df[mito_read_ends_df['chrom']=='h_MT'].reset_index(drop=True)
mito_read_ends_df_MT.to_csv('/path/to/mito_read_ends_coordinates.txt', sep='\t', index=False, header=True)

tot1_read_ends_df = tot1_read_ends.to_dataframe()
tot1_read_ends_df_MT = tot1_read_ends_df[tot1_read_ends_df['chrom']=='h_MT'].reset_index(drop=True)
tot1_read_ends_df_MT.to_csv('/path/to/tot1_read_ends_coordinates.txt', sep='\t', index=False, header=True)

tot2_read_ends_df = tot2_read_ends.to_dataframe()
tot2_read_ends_df_MT = tot2_read_ends_df[tot2_read_ends_df['chrom']=='h_MT'].reset_index(drop=True)
tot2_read_ends_df_MT.to_csv('/path/to/tot2_read_ends_coordinates.txt', sep='\t', index=False, header=True)

tot3_read_ends_df = tot3_read_ends.to_dataframe()
tot3_read_ends_df_MT = tot3_read_ends_df[tot3_read_ends_df['chrom']=='h_MT'].reset_index(drop=True)
tot3_read_ends_df_MT.to_csv('/path/to/tot3_read_ends_coordinates.txt', sep='\t', index=False, header=True)

tot4_read_ends_df = tot4_read_ends.to_dataframe()
tot4_read_ends_df_MT = tot4_read_ends_df[tot4_read_ends_df['chrom']=='h_MT'].reset_index(drop=True)
tot4_read_ends_df_MT.to_csv('/path/to/tot4_read_ends_coordinates.txt', sep='\t', index=False, header=True)

In [354]:
# Write the per-read end-feature assignment (columns: read, end_feature) of every sample
# to its own tab-separated file, with a header row and no index column.
# The file names record which library each variable holds: mito = mito-enriched poly(A) tailing,
# tot1/tot2 = total RNA poly(A) tailing rep1/rep2, tot3/tot4 = total RNA ligation rep1/rep2.
mito_read_end_stats.to_csv("/path/to/mito_enriched_polyA_tailing_read_end_features.txt", sep="\t", header=True, index=False)
tot1_read_end_stats.to_csv("/path/to/total_RNA_polyA_tailing_rep1_read_end_features.txt", sep="\t", header=True, index=False)
tot2_read_end_stats.to_csv("/path/to/total_RNA_polyA_tailing_rep2_read_end_features.txt", sep="\t", header=True, index=False)
tot3_read_end_stats.to_csv("/path/to/total_RNA_ligation_rep1_read_end_features.txt", sep="\t", header=True, index=False)
tot4_read_end_stats.to_csv("/path/to/total_RNA_ligation_rep2_read_end_features.txt", sep="\t", header=True, index=False)

## Poly(A) tails

### 3'-end ligation samples

In [2]:
# Get processing status (computed on HPC cluster)
# One table per ligation replicate; the columns used further down are
# 'read', 'gene_name' and 'category_3prime'.
tot3_process_df = pd.read_table("/path/to/total_RNA_ligation_rep1_read_processing_status.txt")
tot4_process_df = pd.read_table("/path/to/total_RNA_ligation_rep2_read_processing_status.txt")

In [3]:
# Load the per-read poly(A) estimate tables of the two ligation replicates with explicit
# column dtypes (coordinates as int, read_rate and polya_length as float, the rest as str).
# NOTE(review): the column set matches the output of `nanopolish polya` — presumably that
# tool produced these files; confirm.
tot3_nano_df = pd.read_table("/path/to/total_RNA_ligation_rep1.polyA_estimates.tsv",
                            dtype={'readname':str, 'contig':str, 'position':int, 'leader_start':int, 'adapter_start':int, 'polya_start':int,
                               'transcript_start':int, 'read_rate':float, 'polya_length':float, 'qc_tag':str})
tot4_nano_df = pd.read_table("/path/to/total_RNA_ligation_rep2.polyA_estimates.tsv",
                            dtype={'readname':str, 'contig':str, 'position':int, 'leader_start':int, 'adapter_start':int, 'polya_start':int,
                               'transcript_start':int, 'read_rate':float, 'polya_length':float, 'qc_tag':str})

In [9]:
# Filter reads: keep only the qc_tag values PASS, NOREGION and ADAPTER, then attach the gene
# each read was assigned to. The merge uses the pandas default (inner join), so reads absent
# from the transcript map are dropped.
# NOTE(review): tot3_transcript_map_100nt is not created in the cells shown here.
tot3_pass_df = tot3_nano_df[tot3_nano_df['qc_tag'].isin(['PASS','NOREGION','ADAPTER'])].reset_index(drop=True)
tot3_pass_df = tot3_pass_df.merge(tot3_transcript_map_100nt[['name_read','name_gene']], left_on='readname', right_on='name_read')

# Add a category for poly(A) tail: PASS/ADAPTER -> 'pA', NOREGION -> 'no_pA'.
# 'other' is only the initial value; every qc_tag kept by the filter above is overwritten.
tot3_pass_df['polyA_tail'] = 'other'
tot3_pass_df.loc[(tot3_pass_df['qc_tag'].isin(['PASS','ADAPTER'])),'polyA_tail'] = 'pA'
tot3_pass_df.loc[(tot3_pass_df['qc_tag'] == 'NOREGION'),'polyA_tail'] = 'no_pA'

In [10]:
# Filter reads: keep only the qc_tag values PASS, NOREGION and ADAPTER, then attach the gene
# each read was assigned to. The merge uses the pandas default (inner join), so reads absent
# from the transcript map are dropped.
# NOTE(review): tot4_transcript_map_100nt is not created in the cells shown here.
tot4_pass_df = tot4_nano_df[tot4_nano_df['qc_tag'].isin(['PASS','NOREGION','ADAPTER'])].reset_index(drop=True)
tot4_pass_df = tot4_pass_df.merge(tot4_transcript_map_100nt[['name_read','name_gene']], left_on='readname', right_on='name_read')

# Add a category for poly(A) tail: PASS/ADAPTER -> 'pA', NOREGION -> 'no_pA'.
# 'other' is only the initial value; every qc_tag kept by the filter above is overwritten.
tot4_pass_df['polyA_tail'] = 'other'
tot4_pass_df.loc[(tot4_pass_df['qc_tag'].isin(['PASS','ADAPTER'])),'polyA_tail'] = 'pA'
tot4_pass_df.loc[(tot4_pass_df['qc_tag'] == 'NOREGION'),'polyA_tail'] = 'no_pA'

In [11]:
# Attach the 3'-end processing category to each (read, gene) pair; the default inner join
# keeps only pairs present in both tables.
tot3_pass_df_process = tot3_pass_df.merge(tot3_process_df[['read','gene_name','category_3prime']], left_on=['name_read','name_gene'], right_on=['read','gene_name'])
tot4_pass_df_process = tot4_pass_df.merge(tot4_process_df[['read','gene_name','category_3prime']], left_on=['name_read','name_gene'], right_on=['read','gene_name'])

# Combined label "<category_3prime>,<polyA_tail>", e.g. 'processed_3prime,pA'
tot3_pass_df_process['category2'] = tot3_pass_df_process['category_3prime'] + ',' + tot3_pass_df_process['polyA_tail']
tot4_pass_df_process['category2'] = tot4_pass_df_process['category_3prime'] + ',' + tot4_pass_df_process['polyA_tail']



In [12]:
# Count reads with/without pA tail
def _polyA_counts_by_gene(pass_df_process):
    """Return (long, wide) read counts per gene and category2 label.

    The long table has columns name_gene, category2, read_count. The wide table has one
    row per gene and one column per category2 label; combinations that do not occur for
    a gene are filled with 0.
    """
    counts = (
        pass_df_process.groupby(['name_gene','category2'])['readname']
        .count()
        .reset_index()
        .rename(columns={'readname':'read_count'})
    )
    # category2 labels always contain a comma ("<category_3prime>,<polyA_tail>"), so no
    # column is ever called 'yes' or 'no' and no renaming of such columns is needed.
    wide = counts.pivot(index='name_gene',columns='category2',values='read_count').reset_index().fillna(0)
    return counts, wide


def _keep_genes_with_min_reads(wide, categories, min_reads=5):
    """Keep genes with at least min_reads reads in any of the given category columns.

    A category column missing from the table (no read of that kind in the sample)
    is treated as all zeros instead of raising KeyError.
    """
    keep = pd.Series(False, index=wide.index, dtype=bool)
    for category in categories:
        if category in wide.columns:
            keep = keep | (wide[category] >= min_reads)
    return wide[keep].reset_index(drop=True)


tot3_nano_counts, tot3_nano_counts_piv = _polyA_counts_by_gene(tot3_pass_df_process)

tot4_nano_counts, tot4_nano_counts_piv = _polyA_counts_by_gene(tot4_pass_df_process)


# Keep genes with at least 5 processed reads either with or without a poly(A) tail
tot3_nano_counts_piv_filt = _keep_genes_with_min_reads(tot3_nano_counts_piv, ['processed_3prime,no_pA', 'processed_3prime,pA'])
tot4_nano_counts_piv_filt = _keep_genes_with_min_reads(tot4_nano_counts_piv, ['processed_3prime,no_pA', 'processed_3prime,pA'])


tot3_nano_counts_piv_filt.to_csv("/path/to/total_RNA_ligation_rep1_with_or_without_polyA_counts.txt", sep="\t", header=True, index=False)
tot4_nano_counts_piv_filt.to_csv("/path/to/total_RNA_ligation_rep2_with_or_without_polyA_counts.txt", sep="\t", header=True, index=False)



In [20]:
# Filter for reads with pA tail (polyA_tail == 'pA', i.e. qc_tag PASS or ADAPTER) and keep
# only read name, gene and estimated tail length. This starts from the *_pass_df tables,
# i.e. before the merge with the processing-status tables.
tot3_pA_df = tot3_pass_df[tot3_pass_df['polyA_tail']=='pA'][['name_read','name_gene','polya_length']].reset_index(drop=True)
tot4_pA_df = tot4_pass_df[tot4_pass_df['polyA_tail']=='pA'][['name_read','name_gene','polya_length']].reset_index(drop=True)

# Tab-separated output with a header row and no index column
tot3_pA_df.to_csv("/path/to/total_RNA_ligation_rep1_polyA_length.txt", sep="\t", header=True, index=False)
tot4_pA_df.to_csv("/path/to/total_RNA_ligation_rep2_polyA_length.txt", sep="\t", header=True, index=False)



## Reads for screenshot figures

In [32]:
# Window sizes for this section. polyA_window is re-assigned to the same value (25) it was
# given earlier in the notebook.
# NOTE(review): the code consuming TSS_window_post, TSS_window_pre and leader_window is not
# visible in this part of the notebook — presumably nucleotide windows around transcript
# starts / leader sequence; confirm meaning and units.
polyA_window = 25
TSS_window_post = 25
TSS_window_pre = 0
leader_window = 100

In [48]:
# Pick a random subset of reads per gene for the screenshot figures and save the read names.
# Sampling is seeded so the same reads are selected on every run, and the sample size is
# capped at the number of available reads (DataFrame.sample raises when asked for more
# rows than exist without replacement).
# NOTE(review): tot1_transcript_map_whole is not created in the cells shown here.
screenshot_seed = 0

tot1_transcript_map_whole_CO1 = tot1_transcript_map_whole[tot1_transcript_map_whole['name_gene']=='MT-CO1'].reset_index(drop=True)
tot1_transcript_map_whole_CO1_sub = tot1_transcript_map_whole_CO1.sample(n=min(100, len(tot1_transcript_map_whole_CO1)), random_state=screenshot_seed)


tot1_transcript_map_whole_ND1 = tot1_transcript_map_whole[tot1_transcript_map_whole['name_gene']=='MT-ND1'].reset_index(drop=True)
tot1_transcript_map_whole_ND1_sub = tot1_transcript_map_whole_ND1.sample(n=min(50, len(tot1_transcript_map_whole_ND1)), random_state=screenshot_seed)


# MT-CO3 and MT-ATP8-6 reads are pooled before sampling
tot1_transcript_map_whole_CO3 = tot1_transcript_map_whole[(tot1_transcript_map_whole['name_gene']=='MT-CO3') | (tot1_transcript_map_whole['name_gene']=='MT-ATP8-6')].reset_index(drop=True)
tot1_transcript_map_whole_CO3_sub = tot1_transcript_map_whole_CO3.sample(n=min(100, len(tot1_transcript_map_whole_CO3)), random_state=screenshot_seed)

# One read name per line, preceded by the 'name_read' header
tot1_transcript_map_whole_CO1_sub['name_read'].to_csv("/path/to/reads_for_screenshots_CO1_v1.txt", sep="\t", header=True, index=False)
tot1_transcript_map_whole_ND1_sub['name_read'].to_csv("/path/to/reads_for_screenshots_ND1_v1.txt", sep="\t", header=True, index=False)
tot1_transcript_map_whole_CO3_sub['name_read'].to_csv("/path/to/reads_for_screenshots_CO3_ATP8-6_v1.txt", sep="\t", header=True, index=False)

