In [None]:
"""

Date : August 1, 2019

Author : Heather Landry Drexler

This script builds the datasets used to make pie charts showing which genomic
features (exons, introns, splice sites, poly(A) sites, ...) nano-COP read ends align to.

These datasets are used for Figures 1 and 4 of the nano-COP manuscript.
                                            
"""

In [1]:
import numpy as np
import pandas as pd
import pysam

import matplotlib.pyplot as plt
import re
% matplotlib inline

import math

import pybedtools
from pybedtools import BedTool

import seaborn as sns
sns.set_style("white")

In [2]:
def get_all_features_bedtool(annotation_df, polyA_window, post_polyA_window, ss_window_upstream, ss_window_downstream, RNAPET_df, RNAPET_window):
    """
    Build a BedTool of every genomic feature that read ends are compared against.

    Rows appended, in BED6 order [chrom, start, end, name, feature, strand]:
      * 'polyA' and 'post_polyA' windows around the 3' end of every 'gene_*' row
      * every 'exon_*' and 'intron_*' row longer than 10 nt
      * an '<intron name>_SS' window around the 5' splice site of each such intron
      * an 'RNAPET' window for every row of RNAPET_df

    Parameters
    ----------
    annotation_df : DataFrame with columns 'feature', 'chrom', 'start', 'end',
        'gene' and 'strand'; 'feature' is read as '<type>_<count>'.
    polyA_window : half-width of the window centred on the gene 3' end.
    post_polyA_window : length of the window directly downstream of the polyA window.
    ss_window_upstream, ss_window_downstream : extent of the splice-site window
        on the upstream / downstream side of the intron 5' boundary.
    RNAPET_df : DataFrame with columns 'chrom', 'start', 'end', 'ID', 'strand'
        and 'size' (comma-separated block sizes); may be empty.
    RNAPET_window : padding added on both sides of the RNA-PET 3' block.

    Returns
    -------
    BedTool built from the collected feature rows.

    Rows whose strand is neither '+' nor '-' yield no strand-dependent window
    (previously they silently reused the coordinates of the preceding row).
    """

    features = []

    def add_gene_window(chrom, window_start, window_end, gene, label, strand):
        # a window lying entirely before the chromosome start cannot be written
        # as a BED interval; otherwise clamp a negative start to 0
        if window_end <= 0:
            return
        features.append([chrom, max(window_start, 0), window_end, gene, label, strand])

    # single pass over the annotation table (avoids repeated per-row .iloc lookups)
    for row in annotation_df.itertuples(index=False):
        name = row.feature                      # full feature name, e.g. 'intron_3'
        feature = name.split("_")[0]            # feature type

        if feature not in ('gene', 'exon', 'intron'):
            continue

        chrom = 'chr' + str(row.chrom)          # chromosome
        start = int(row.start)                  # start coordinate of feature
        end = int(row.end)                      # end coordinate of feature
        gene = row.gene                         # gene name
        strand = row.strand                     # strand of gene

        if feature == 'gene':
            # the gene 3' end is 'end' on the + strand and 'start' on the - strand
            if strand == '+':
                add_gene_window(chrom, end - polyA_window, end + polyA_window, gene, 'polyA', strand)
                add_gene_window(chrom, end + polyA_window, end + polyA_window + post_polyA_window, gene, 'post_polyA', strand)
            elif strand == '-':
                add_gene_window(chrom, start - polyA_window, start + polyA_window, gene, 'polyA', strand)
                add_gene_window(chrom, start - polyA_window - post_polyA_window, start - polyA_window, gene, 'post_polyA', strand)

        elif feature == 'exon':
            if end - start > 10:
                features.append([chrom, start, end, gene, name, strand])

        else:  # intron
            if end - start > 10:
                features.append([chrom, start, end, gene, name, strand])

                # get 5' splice site window for introns on each strand
                if strand == '+':
                    ss5_start = start - ss_window_upstream
                    ss5_end = start + ss_window_downstream
                elif strand == '-':
                    ss5_start = end - ss_window_downstream
                    ss5_end = end + ss_window_upstream
                else:
                    continue

                if ss5_start > 0 and ss5_end > ss5_start:
                    features.append([chrom, ss5_start, ss5_end, gene, name + "_SS", strand])

    for row in RNAPET_df.itertuples(index=False):

        # comma-separated block sizes of the RNA-PET entry
        block_sizes = row.size.split(',')

        # keep only the block at the 3' end of the pair, padded by RNAPET_window
        if row.strand == "+":
            polyA_start = row.end - int(block_sizes[1]) + 1 - RNAPET_window
            polyA_end = row.end + RNAPET_window
        elif row.strand == "-":
            polyA_start = row.start + 1 - RNAPET_window
            polyA_end = row.start + int(block_sizes[0]) + RNAPET_window
        else:
            continue

        if polyA_start > 0:
            features.append([row.chrom, polyA_start, polyA_end, row.ID, 'RNAPET', row.strand])

    features_bedtool = BedTool(features)
    return features_bedtool


def get_read_end_bedtool(bamFile):
    """
    Convert an alignment BedTool into a BedTool of 1-nt read-end intervals.

    bamFile must provide bam_to_bed(); the resulting BED table is read as a
    DataFrame with columns 'chrom', 'start', 'end', 'name', 'score', 'strand'.

    For every alignment one row [chrom, pos_1, pos_2, read, score, strand] is
    produced, where the interval is the last aligned base:
      * '+' strand: [end - 1, end]
      * '-' strand: [start, start + 1]
    Chromosome names get a 'chr' prefix only when they do not already have one.
    Alignments whose strand is neither '+' nor '-' are skipped.

    Returns a BedTool built from those rows.
    """

    bedFile = bamFile.bam_to_bed()
    bedFile_df = bedFile.to_dataframe(low_memory=False)

    read_end = []

    for row in bedFile_df.itertuples(index=False):

        chrom = str(row.chrom)
        # the previous check compared a 2-character slice with 'chr', which is
        # always unequal, so already-prefixed names became 'chrchr...'
        if not chrom.startswith('chr'):
            chrom = 'chr' + chrom

        strand = row.strand

        if strand == "-":
            pos_1 = row.start
            pos_2 = row.start + 1
        elif strand == "+":
            pos_1 = row.end - 1
            pos_2 = row.end
        else:
            # no defined read end without a strand; do not reuse stale positions
            continue

        read_end.append([chrom, pos_1, pos_2, row.name, row.score, strand])

    read_end_bedtool = BedTool(read_end)
    return read_end_bedtool



def get_intersect(read_ends, intron_info):
    """
    Intersect read-end intervals with genomic features on the same strand.

    read_ends.intersect is called with wo=True and s=True, and the result is
    converted to a DataFrame with the 13 typed columns listed below: six for
    the read interval, six for the feature interval, and a final 'count'.

    Returns the resulting DataFrame.
    """

    # (column name, dtype) in output order: read fields, feature fields, count
    column_types = [
        ('chrom_read', str),
        ('start_read', int),
        ('end_read', int),
        ('name_read', str),
        ('qual_read', int),
        ('strand_read', str),
        ('chr_feature', str),
        ('start_feature', int),
        ('end_feature', int),
        ('name_gene', str),
        ('name_feature', str),
        ('strand_feature', str),
        ('count', int),
    ]

    overlaps = read_ends.intersect(intron_info, wo=True, s=True)

    return overlaps.to_dataframe(
        names=[column for column, _ in column_types],
        dtype=dict(column_types),
    )



def get_read_end_mapping(intersect_df):
    """
    Collect, for every read, the distinct feature types its end overlaps.

    intersect_df must have the columns 'name_read' and 'name_feature'.
    Each feature name is reduced to a feature type:
      * names containing 'polyA' (including 'post_polyA') or 'RNAPET' are kept as is
      * '<type>_<count>' (exon or intron)      -> '<type>'
      * '<type>_<count>_<suffix>' (splice site) -> 'intron_<suffix>'
    Any other name is reported with an "ERROR with read" message and the row
    is skipped (previously the feature type of the preceding row was recorded
    for that read, or a NameError was raised on the first row).

    Returns a dict mapping read name -> list of unique feature types, in the
    order they were first seen.
    """

    read_ends = {}

    for row in intersect_df.itertuples(index=False):

        # get read name and feature name
        read = row.name_read
        feature = row.name_feature
        parts = feature.split('_')

        if ('polyA' in feature) or ('RNAPET' in feature):
            feature_type = feature

        # two fields: exon or intron, report the type without its count
        elif len(parts) == 2:
            feature_type = parts[0]

        # three fields: splice site of an intron
        elif len(parts) == 3:
            feature_type = 'intron_' + parts[2]

        # anything else means there was an error somewhere in the pipeline
        else:
            print("ERROR with read: "+str(read))
            continue

        # register the read on first sight, then store each feature type once
        seen_types = read_ends.setdefault(read, [])
        if feature_type not in seen_types:
            seen_types.append(feature_type)

    return read_ends


def get_read_end_stats(read_ends):
    """
    Assign a single end feature to every read.

    read_ends maps read name -> list of feature types.
      * one feature type: it is reported unchanged
      * several feature types: the first match in the priority order
        'polyA' > 'post_polyA' > 'intron_SS' (reported as 'splice_site') >
        'RNAPET' is reported, otherwise 'undetermined'
      * an empty list: the read is left out

    NOTE: a read whose only feature type is 'intron_SS' keeps that label,
    whereas the multi-feature case reports 'splice_site'.

    Returns a DataFrame with columns 'read' and 'end_feature'; the columns are
    present even when read_ends is empty.
    """

    # (feature type looked for, label reported), highest priority first
    priority = (
        ("polyA", "polyA"),
        ("post_polyA", "post_polyA"),
        ("intron_SS", "splice_site"),
        ("RNAPET", "RNAPET"),
    )

    read_features = []

    for read, feature_types in read_ends.items():

        if len(feature_types) == 1:
            read_features.append([read, feature_types[0]])

        elif len(feature_types) > 1:
            label = next((reported for wanted, reported in priority if wanted in feature_types),
                         "undetermined")
            read_features.append([read, label])

    # naming the columns in the constructor also works for an empty list,
    # where assigning .columns afterwards raised a length-mismatch ValueError
    read_features_df = pd.DataFrame(read_features, columns=['read', 'end_feature'])

    return read_features_df


In [3]:
# Feature tables for Supplemental Figure 4A-C (read 3' end positions)

# human (hg38) features: header-less table -> list of rows -> BedTool
hg38_all_features = BedTool(
    pd.read_table(
        '/path/to/annotation_files/hg38_all_features_polyA50_postpolyA500_ssUp50_ssDown10_RNAPET50.txt',
        header=None,
    ).values.tolist()
)

# Drosophila (dmel6) features: header-less table -> list of rows -> BedTool
dmel6_all_features = BedTool(
    pd.read_table(
        '/path/to/annotation_files/dmel6_all_features_polyA50_postpolyA500_ssUp50_ssDown10.txt',
        header=None,
    ).values.tolist()
)


In [None]:
# Supplemental Figure 4A - K562 w/ polyA addition

# load the K562 alignment file - this is the merged BAM file from all poly(A)-tailed K562 samples
K562_bamFile = pybedtools.BedTool('/path/to/all_K562_hg38_minimap2_uniq_sort.bam')

# reduce every alignment to a 1-nt read-end interval, returned as a BedTool
K562_read_ends = get_read_end_bedtool(K562_bamFile)

# strand-aware intersection of read ends with the hg38 feature table
K562_intersect = get_intersect(K562_read_ends, hg38_all_features)

# dictionary: read name -> list of feature types overlapping its end
K562_read_end_mapping = get_read_end_mapping(K562_intersect)

# dataframe with one assigned end feature per read
K562_read_end_stats = get_read_end_stats(K562_read_end_mapping)


# write the read ends and the per-read end features as tab-separated files for plotting
K562_read_ends_df = K562_read_ends.to_dataframe()
K562_read_ends_df.to_csv('/path/to/K562_read_ends.txt', sep='\t', index=False, header=True)
K562_read_end_stats.to_csv('/path/to/K562_read_end_stats.txt', sep='\t', index=False, header=True)


In [None]:
# Supplemental Figure 4B - K562 w/out polyA addition

# load the K562 (no tailing) alignment file
K562_no_tailing_bamFile = pybedtools.BedTool('/path/to/K562_4sUchr_ONT_no_tailing_hg38_minimap2_uniq_sort.bam')

# reduce every alignment to a 1-nt read-end interval, returned as a BedTool
K562_no_tailing_read_ends = get_read_end_bedtool(K562_no_tailing_bamFile)

# strand-aware intersection of read ends with the hg38 feature table
K562_no_tailing_intersect = get_intersect(K562_no_tailing_read_ends, hg38_all_features)

# dictionary: read name -> list of feature types overlapping its end
K562_no_tailing_read_end_mapping = get_read_end_mapping(K562_no_tailing_intersect)

# dataframe with one assigned end feature per read
K562_no_tailing_read_end_stats = get_read_end_stats(K562_no_tailing_read_end_mapping)

# NOTE(review): the pie chart dataframe step below is disabled; get_pie_chart_df
# is not defined in the visible part of this notebook
#K562_no_tailing_pie_chart_df = get_pie_chart_df(K562_no_tailing_read_end_stats,K562_no_tailing_read_ends)


# write the read ends and the per-read end features as tab-separated files for plotting
K562_no_tailing_read_ends_df = K562_no_tailing_read_ends.to_dataframe()
K562_no_tailing_read_ends_df.to_csv('/path/to/K562_no_tailing_read_ends.txt', sep='\t', index=False, header=True)
K562_no_tailing_read_end_stats.to_csv('/path/to/K562_no_tailing_read_end_stats.txt', sep='\t', index=False, header=True)


In [17]:
# Supplemental Figure 4C - S2 w/out polyA addition

# load the S2 alignment file - this is the merged BAM file from all S2 samples
S2_bamFile = pybedtools.BedTool('/path/to/all_S2_180420_dmel6_minimap2_uniq_sort.bam')

# reduce every alignment to a 1-nt read-end interval, returned as a BedTool
S2_read_ends = get_read_end_bedtool(S2_bamFile)

# strand-aware intersection of read ends with the dmel6 feature table
S2_intersect = get_intersect(S2_read_ends, dmel6_all_features)

# dictionary: read name -> list of feature types overlapping its end
S2_read_end_mapping = get_read_end_mapping(S2_intersect)

# dataframe with one assigned end feature per read
S2_read_end_stats = get_read_end_stats(S2_read_end_mapping)


# write the read ends and the per-read end features as tab-separated files for plotting
S2_read_ends_df = S2_read_ends.to_dataframe()
S2_read_ends_df.to_csv('/path/to/S2_read_ends.txt', sep='\t', index=False, header=True)
S2_read_end_stats.to_csv('/path/to/S2_read_end_stats.txt', sep='\t', index=False, header=True)


In [None]:
# K562 promethION run w/ polyI addition

# load the K562 (sample 4) alignment file
K562_4_bamFile = pybedtools.BedTool('/path/to/K562_4sUchr_ONT_4_hg38_minimap2_uniq_sort.bam')

# reduce every alignment to a 1-nt read-end interval, returned as a BedTool
K562_4_read_ends = get_read_end_bedtool(K562_4_bamFile)

# strand-aware intersection of read ends with the hg38 feature table
K562_4_intersect = get_intersect(K562_4_read_ends, hg38_all_features)

# dictionary: read name -> list of feature types overlapping its end
K562_4_read_end_mapping = get_read_end_mapping(K562_4_intersect)

# dataframe with one assigned end feature per read
K562_4_read_end_stats = get_read_end_stats(K562_4_read_end_mapping)

# write the read ends and the per-read end features as tab-separated files
K562_4_read_ends_df = K562_4_read_ends.to_dataframe()
K562_4_read_ends_df.to_csv('/path/to/K562_4_read_ends.txt', sep='\t', index=False, header=True)
K562_4_read_end_stats.to_csv('/path/to/K562_4_read_end_stats.txt', sep='\t', index=False, header=True)


In [4]:
# K562 minION run w/ polyI addition

# load the K562 (sample 5a) alignment file
K562_5a_bamFile = pybedtools.BedTool('/path/to/K562_4sUchr_ONT_5a_hg38_minimap2_uniq_sort.bam')

# reduce every alignment to a 1-nt read-end interval, returned as a BedTool
K562_5a_read_ends = get_read_end_bedtool(K562_5a_bamFile)

# strand-aware intersection of read ends with the hg38 feature table
K562_5a_intersect = get_intersect(K562_5a_read_ends, hg38_all_features)

# dictionary: read name -> list of feature types overlapping its end
K562_5a_read_end_mapping = get_read_end_mapping(K562_5a_intersect)

# dataframe with one assigned end feature per read
K562_5a_read_end_stats = get_read_end_stats(K562_5a_read_end_mapping)

# write the read ends and the per-read end features as tab-separated files
K562_5a_read_ends_df = K562_5a_read_ends.to_dataframe()
K562_5a_read_ends_df.to_csv('/path/to/K562_5a_read_ends.txt', sep='\t', index=False, header=True)
K562_5a_read_end_stats.to_csv('/path/to/K562_5a_read_end_stats.txt', sep='\t', index=False, header=True)


In [5]:
# K562 promethION run w/ polyI addition

# load the K562 (sample 5b) alignment file
K562_5b_bamFile = pybedtools.BedTool('/path/to/K562_4sUchr_ONT_5b_hg38_minimap2_uniq_sort.bam')

# reduce every alignment to a 1-nt read-end interval, returned as a BedTool
K562_5b_read_ends = get_read_end_bedtool(K562_5b_bamFile)

# strand-aware intersection of read ends with the hg38 feature table
K562_5b_intersect = get_intersect(K562_5b_read_ends, hg38_all_features)

# dictionary: read name -> list of feature types overlapping its end
K562_5b_read_end_mapping = get_read_end_mapping(K562_5b_intersect)

# dataframe with one assigned end feature per read
K562_5b_read_end_stats = get_read_end_stats(K562_5b_read_end_mapping)

# write the read ends and the per-read end features as tab-separated files
K562_5b_read_ends_df = K562_5b_read_ends.to_dataframe()
K562_5b_read_ends_df.to_csv('/path/to/K562_5b_read_ends.txt', sep='\t', index=False, header=True)
K562_5b_read_end_stats.to_csv('/path/to/K562_5b_read_end_stats.txt', sep='\t', index=False, header=True)
