In [None]:
"""

Date : October 21, 2018

Author : Heather Landry Drexler

This notebook generates the matched read/reference sequence tables used to build the confusion matrices in Supplementary Figure 2.
                                            
"""

In [2]:
import numpy as np
import pandas as pd
import pysam

import matplotlib.pyplot as plt
import re
% matplotlib inline

import math

import pybedtools
from pybedtools import BedTool

import seaborn as sns

import time


In [6]:

def get_sequence_bedFile(fasta, iBAM):
    """Pair every aligned ("M") read segment with the reference sequence it covers.

    Each read's CIGAR is walked once. For every "M" operation one row is
    recorded holding the reference interval of the segment and the slice of
    the read sequence that the operation consumed. The reference sequence of
    each interval is then fetched from ``fasta`` through ``BedTool.sequence``
    and attached as the ``ref_seq`` column.

    Parameters
    ----------
    fasta : str
        Path to the reference FASTA passed to ``BedTool.sequence(fi=...)``.
    iBAM : iterable
        Iterable of alignment records exposing ``cigar``, ``reference_name``,
        ``pos``, ``query_sequence`` and ``is_reverse`` (e.g. an open pysam
        alignment file).

    Returns
    -------
    pandas.DataFrame
        Columns ``chrom, start, end, cigar_seq, cigar, strand, ref_seq`` with
        one row per "M" segment. An empty frame with the same columns is
        returned when the input yields no "M" segments.

    Raises
    ------
    ValueError
        If the number of reference sequences read back differs from the
        number of intervals submitted.
    """
    columns = ["chrom", "start", "end", "cigar_seq", "cigar", "strand"]

    # numeric CIGAR operation codes -> operator characters
    CigarNumtoOp = {0 : 'M',
                    1 : 'I',
                    2 : 'D',
                    3 : 'N',
                    4 : 'S',
                    5 : 'H',
                    6 : 'P',
                    7 : '=',
                    8 : 'X'}

    # built once instead of once per read
    Consumes_Query = frozenset(["M", "I", "S", "=", "X"])
    Consumes_Reference = frozenset(["M", "D", "N", "=", "X"])

    new_bed = []

    # read in Bam file line by line
    for read in iBAM:
        cigar = read.cigar                 # list of (operation code, length) pairs
        sequence = read.query_sequence     # read sequence as stored in the alignment

        # records without a CIGAR or without a stored sequence cannot
        # contribute any aligned bases, so skip them instead of failing
        if not cigar or not sequence:
            continue

        chrom = read.reference_name
        strand = "-" if read.is_reverse else "+"
        ref_loc = int(read.pos)            # current position on the reference
        query_loc = 0                      # current position within the read sequence

        for op_code, op_Length in cigar:
            cigarOp = CigarNumtoOp[op_code]

            if cigarOp == "M":
                # "M" consumes both reference and query, so the segment spans
                # op_Length bases from the current position on each
                new_bed.append([chrom,
                                ref_loc,
                                ref_loc + op_Length,
                                sequence[query_loc:query_loc + op_Length],
                                cigarOp,
                                strand])

            if cigarOp in Consumes_Reference:
                ref_loc += op_Length
            if cigarOp in Consumes_Query:
                query_loc += op_Length

    # nothing aligned: return an empty, correctly-shaped table rather than
    # failing on the column assignment / external sequence lookup
    if not new_bed:
        return pd.DataFrame(columns=columns + ["ref_seq"])

    seq_bed_df = pd.DataFrame(new_bed, columns=columns)

    # add the reference sequence to the dataframe
    match_bedtool = BedTool(new_bed)
    ref_seq_bedtool = match_bedtool.sequence(fi=fasta)

    # every non-header line of the FASTA output is one interval's sequence;
    # strip only the line terminator so a missing final newline cannot eat a base
    with open(ref_seq_bedtool.seqfn) as ref_seq_open:
        ref_seq = [line.rstrip("\r\n") for line in ref_seq_open
                   if not line.startswith(">")]

    if len(ref_seq) != len(seq_bed_df):
        raise ValueError("expected " + str(len(seq_bed_df)) + " reference sequences but read "
                         + str(len(ref_seq)) + "; some intervals may be missing from the fasta")

    seq_bed_df['ref_seq'] = ref_seq

    return seq_bed_df


def get_score_df(seq_bed_df, sample_num):
    """Count read-base versus reference-base pairs over a random sample of segments.

    Parameters
    ----------
    seq_bed_df : pandas.DataFrame
        Table with ``cigar_seq`` (read bases) and ``ref_seq`` (reference bases)
        columns, one aligned segment per row.
    sample_num : int
        Number of rows drawn with ``DataFrame.sample(n=sample_num)``.

    Returns
    -------
    pandas.DataFrame
        4x4 integer table labelled ``A, T, C, G`` on both axes. The column is
        the read base and the row is the reference base.

    Notes
    -----
    Rows whose read and reference sequences differ in length are reported with
    an "ERROR: line <i>" message and skipped. Bases are compared
    case-insensitively, and any position where either base is not one of
    A/T/C/G (for example ``N``) is ignored.
    """
    bases = ['A','T','C','G']
    base_pos = {base: k for k, base in enumerate(bases)}

    # counts[reference base][read base]; accumulated in plain lists because
    # chained assignment on a DataFrame (score[col][row] = ...) is not
    # guaranteed to write through to the frame
    counts = [[0 for y in range(4)] for x in range(4)]

    sampled = seq_bed_df.sample(n=sample_num)

    for i, (cigar, reference) in enumerate(zip(sampled['cigar_seq'], sampled['ref_seq'])):

        if (len(cigar) != len(reference)):
            print("ERROR: line "+str(i))
            continue

        for read_base, ref_base in zip(cigar.upper(), reference.upper()):
            col = base_pos.get(read_base)
            row = base_pos.get(ref_base)
            # skip N and any other non-ACGT symbol on either side
            if col is None or row is None:
                continue
            counts[row][col] += 1

    return pd.DataFrame(counts, index=bases, columns=bases)



In [None]:
# get fasta sequence
# NOTE(review): example_filename is presumably meant for pybedtools' bundled example
# files; a plain path string would likely suffice here -- confirm before changing.
hg38_fasta = pybedtools.example_filename('/path/to/Homo_sapiens.GRCh38.dna.primary_assembly.fa')

# get alignment file
nascent_iBAM=pysam.Samfile("/path/to/K562_4sUchr_ONT_3_hg38_minimap2_uniq_sort.bam", 'rb')

# get sequence bed file; always release the BAM handle, even if parsing fails
try:
    nascent_seq_df = get_sequence_bedFile(hg38_fasta, nascent_iBAM)
finally:
    nascent_iBAM.close()

# write the matched read/reference table as a tab-separated file with a header row
nascent_seq_df.to_csv('/path/to/nanoCOP_K562_3_confusion_matrix_seq_df.txt', sep='\t', index=False, header=True)


In [16]:
# get alignment file
UCSCrun1_iBAM=pysam.Samfile("/path/to/RNA_consortium_data/UCSC_Run1_hg38_minimap2_uniq_sort.bam", 'rb')

# get sequence bed file (uses hg38_fasta defined in an earlier cell);
# always release the BAM handle, even if parsing fails
try:
    UCSC_Run1_seq_df = get_sequence_bedFile(hg38_fasta, UCSCrun1_iBAM)
finally:
    UCSCrun1_iBAM.close()

# write the matched read/reference table as a tab-separated file with a header row
UCSC_Run1_seq_df.to_csv('/path/to/UCSC_Run1_confusion_matrix_seq_df.txt', sep='\t', index=False, header=True)


In [20]:
# get alignment file
Hopkins_Run1_iBAM=pysam.Samfile("/path/to/RNA_consortium_data/Hopkins_Run1_hg38_minimap2_uniq_sort.bam", 'rb')

# get sequence bed file (uses hg38_fasta defined in an earlier cell);
# always release the BAM handle, even if parsing fails
try:
    Hopkins_Run1_seq_df = get_sequence_bedFile(hg38_fasta, Hopkins_Run1_iBAM)
finally:
    Hopkins_Run1_iBAM.close()

# write the matched read/reference table as a tab-separated file with a header row
Hopkins_Run1_seq_df.to_csv('/path/to/Hopkins_Run1_confusion_matrix_seq_df.txt', sep='\t', index=False, header=True)
