#### Test function to extract chromosome and start- & end-positions from strings containing genomic locations 

In [3]:
from typing import Tuple
def extract_positional_info(data_string: str) -> Tuple[str, int, int]:
    """
    Extracts the chromosome name, start value, and end value from a string in the format "chromosome_name:start-end".

    Parameters
    ----------
    data_string : str
        The input string in the format "chromosome_name:start-end".

    Returns
    -------
    tuple
        A tuple containing the chromosome name (str), start value (int), and end value (int).
    """
    chromosome, positions = data_string.split(':')
    start_str, end_str = positions.split('-')
    start = int(start_str.replace(',', ''))
    end = int(end_str.replace(',', ''))
    return chromosome, start, end

In [4]:
data_string = "chr1:10,000-20,000"
chromosome, start, end = extract_positional_info(data_string)
print(chromosome)  # Output: chr1
print(start)  # Output: 10000
print(end)  # Output: 20000

chr1
10000
20000


#### Test pyfiglet

In [23]:
from pyfiglet import Figlet

f = Figlet(font="slant")
o = f.renderText("Neet")
print(o)

    _   __          __ 
   / | / /__  ___  / /_
  /  |/ / _ \/ _ \/ __/
 / /|  /  __/  __/ /_  
/_/ |_/\___/\___/\__/  
                       



#### Get number of lines in a file EFFICIENTLY

from: https://stackoverflow.com/questions/845058/how-to-get-line-count-of-a-large-file-cheaply-in-python

In [15]:
from itertools import (takewhile,repeat)

def rawincount(filename):
    f = open(filename, 'rb')
    bufgen = takewhile(lambda x: x, (f.raw.read(1024*1024) for _ in repeat(None)))
    return sum( buf.count(b'\n') for buf in bufgen )

In [10]:
def buf_count_newlines_gen(fname):
    def _make_gen(reader):
        while True:
            b = reader(2 ** 16)
            if not b: break
            yield b

    with open(fname, "rb") as f:
        count = sum(buf.count(b"\n") for buf in _make_gen(f.raw.read))
    return count

In [13]:
def count_lines_naive(fname):
    count = 0
    with open(fname, "rb") as f:
        for line in f:
            count += 0
    return count

In [16]:
%timeit count_lines_naive("./curlcake_out.tsv")
%timeit buf_count_newlines_gen("./curlcake_out.tsv")
%timeit rawincount("./curlcake_out.tsv")

744 µs ± 8.57 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
731 µs ± 12 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
717 µs ± 8.45 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


rawincount is the fastest!

In [17]:
def get_num_lines(path: str) -> int:
    """
    Calculate the number of lines in a given file. Function taken from
    https://stackoverflow.com/questions/845058/how-to-get-line-count-of-a-large-file-cheaply-in-python
    
    Parameters
    ----------
    path : str
        Path to a file

    Returns
    -------
    int
        Number of lines in the given file
    """
    f = open(path, 'rb')
    bufgen = takewhile(lambda x: x, (f.raw.read(1024*1024) for _ in repeat(None)))
    return sum( buf.count(b'\n') for buf in bufgen )

#### Test sorting function for tsv file

In [18]:
import csv

with open('./curlcake_out.tsv', 'r') as file:
    reader = csv.reader(file, delimiter="\t", )
    data = list(reader)

sorted(data, key=lambda row: (row[0], row[1]))

[['cc6m_2244_t7_ecorv',
  '100',
  '970',
  'C',
  'C',
  '6',
  '930',
  '0',
  '30',
  '31',
  '17',
  '0.006185567010309278',
  '0.9587628865979382',
  '0.0',
  '0.030927835051546393',
  '0.031958762886597936',
  '0.01752577319587629',
  'CAGGCCTCGAC',
  '0.0865979381443299',
  '11.001029866117404',
  '9.489600735086984'],
 ['cc6m_2244_t7_ecorv',
  '1000',
  '2624',
  'C',
  'C',
  '4',
  '2617',
  '2',
  '0',
  '2',
  '80',
  '0.001524390243902439',
  '0.9973323170731707',
  '0.0007621951219512195',
  '0.0',
  '0.0007621951219512195',
  '0.03048780487804878',
  'TCGATCAAGGT',
  '0.03353658536585366',
  '10.800380952380952',
  '8.203074908234921'],
 ['cc6m_2244_t7_ecorv',
  '1001',
  '2624',
  'A',
  'C',
  '207',
  '2331',
  '5',
  '1',
  '55',
  '83',
  '0.07888719512195122',
  '0.8883384146341463',
  '0.0019054878048780487',
  '0.00038109756097560977',
  '0.020960365853658538',
  '0.03163109756097561',
  'CGATCAAGGTC',
  '0.9432164634146342',
  '11.295619047619047',
  '8.50045600

In [9]:
import csv
import heapq
import os
def external_sort(input_file, output_file, key_columns):
    chunk_size = 100000  # Adjust the chunk size as needed

    # Read the input file in chunks and sort each chunk
    with open(input_file, 'r') as file:
        reader = csv.reader(file)
        header = next(reader)  # Read the header row
        chunks = []
        while True:
            chunk = list(heapq.nsmallest(chunk_size, reader, key=key_columns))
            if not chunk:
                break
            chunks.append(chunk)

    # Merge the sorted chunks and write to the output file
    with open(output_file, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(header)  # Write the header row

        # Perform k-way merge of the sorted chunks
        merged = heapq.merge(*chunks, key=key_columns)
        writer.writerows(merged)


#### Testing function to read multiple lines in a sliding window

In [116]:
from typing import List

def process_neighbourhood(neighbourhood: List[str], k: int) -> str:
    ref_str = neighbourhood[k].strip("\n")
    nb = neighbourhood.copy()

    ref = nb[k].strip("\n").split("\t")
    ref_chr = ref[0]
    ref_site = int(ref[1])
    del nb[k]

    # for each neighbour check if they are 1.on the same chr and 2.neighbours
    has_nb = False
    nb_info = ""

    for pos in nb:
        pos = pos.strip("\n").split("\t")
        if pos[0] == ref_chr: # check if same chromosome
            site = int(pos[1])
            relative_pos = site - ref_site
            if abs(relative_pos) <= k: # check if pos are close to each other
                has_nb = True
                nb_info += str(relative_pos)+","

    ref_str += f"\t{has_nb}\t{nb_info}\n"
    return ref_str

def read_lines_sliding_window(file_path, out_path, k):
    window_size = 1 + 2*k
    with open(file_path, 'r') as file, open(out_path, "w") as o:
        lines = []
        for line in file:
            lines.append(line)
            if len(lines) > window_size:
                lines.pop(0)
            
            if len(lines) == window_size:
                outline = process_neighbourhood(lines, k)
                o.write(outline)

In [117]:
read_lines_sliding_window("./test_file.txt", "./test_file_out.txt", 2)

#### Test function to handle filenames/directory names for the output

In [35]:
import os
import warnings

def check_get_in_path(self, in_path: str) -> str:
    """
    Check if the given input path is valid and of the expected file type.

    Parameters
    ----------
    in_path : str
        Path to the input file given by the user.

    Returns
    -------
    str
        Valid input file path.

    Raises
    ------
    FileNotFoundError
        If the input file does not exist.

    Warns
    -----
    UserWarning
        If the input file is not of the expected file type (.msf, .pup, .pileup).
        Warns the user to ensure it is a pileup file as produced by Samtools' mpileup function.
    """
    if not os.path.exists(in_path): # does file exist?
        raise FileNotFoundError(f"Input file not found. File '{in_path}' does not exist.")
    file_type = os.path.splitext(in_path)[1]
    if not file_type in [".msf", ".pup", ".pileup"]: # is file likely in pileup format?
        warnings.warn(f"Input file of type {file_type}. Make sure that this is a pileup file as produced by Samtools' mpileup function.", Warning)
    
    return in_path


#### Test function to handle filenames/directory names for the output

In [51]:
import os
import warnings

def check_get_out_path(self, out_path: str, in_path: str) -> str:
    """
    Check if the given out_put path is valid. Can be either a filename or directory.
    If a directory is given, output path will be '[DIR-path]/[INPUT-FILE-BASENAME]_out.tsv'.

    Parameters
    ----------
    out_path : str
        output path given by the user. Either path to a (non-existing) file or a directory
    in_path : str
        path to input file given by the user

    Returns
    -------
    str
        valid path to output file   
    """
    # check if directory/file exists
    if os.path.isdir(out_path):
        if not os.path.exists(out_path):
            raise FileNotFoundError(f"Output directory not found. '{out_path}' does not exist.")

        in_basename = os.path.splitext(os.path.basename(in_path))[0]
        if not out_path.endswith("/"):
            out_path += "/"

        return os.path.join(out_path, in_basename + "_out.tsv")
    
    else:
        dirname = os.path.dirname(out_path)
        if not os.path.exists(dirname):
            raise FileNotFoundError(f"Path to output file not found. '{dirname}' does not exist.")

        file_extension = os.path.splitext(out_path)[1]
        if file_extension != ".tsv":
            warnings.warn(f"Given output file has extension '{file_extension}'. Note that the output file will be of type '.tsv'.")

        return out_path

In [50]:
in_path = '/home/vincent/masterthesis/data/Oligo_ref.fa'
out_path = "/home/vincent/masterthesis/masterthesis-project/curlcake_m6a.tsv"
check_get_out_path(_, out_path, in_path)

'/home/vincent/masterthesis/masterthesis-project/curlcake_m6a.tsv'

#### Test function to loop through two files simultaneously

In [50]:
import csv

def merge_tsv_files(file1, file2, output_file):
    # Open input files in read mode
    with open(file1, 'r') as file1_handle, open(file2, 'r') as file2_handle:
        # Open output file in write mode
        with open(output_file, 'w') as output_handle:
            # Create CSV reader and writer objects
            reader1 = csv.reader(file1_handle, delimiter='\t')
            reader2 = csv.reader(file2_handle, delimiter='\t')
            writer = csv.writer(output_handle, delimiter='\t')

            # Read the header row from both input files
            header1 = next(reader1)
            header2 = next(reader2)

            # Find the column indexes of "chr" and "site" columns in both input files
            chr_index1 = header1.index('chr')
            site_index1 = header1.index('site')
            chr_index2 = header2.index('chr')
            site_index2 = header2.index('site')

            # Write the merged header row to the output file
            output_header = header1 + [col + '_b' for col in header2 if col != 'chr' and col != 'site']
            writer.writerow(output_header)

            # Initialize current_row1 and current_row2 with the first rows from both input files
            current_row1 = next(reader1, None)
            current_row2 = next(reader2, None)

            # Merge the rows from both input files based on "chr" and "site" columns
            while current_row1 and current_row2:
                chr1 = current_row1[chr_index1]
                chr2 = current_row2[chr_index2]
                site1 = int(current_row1[site_index1])
                site2 = int(current_row2[site_index2])

                if chr1 == chr2 and site1 == site2:
                    # Rows have matching positions, write the merged row to the output file
                    output_row = current_row1 + [col for i, col in enumerate(current_row2) if i != chr_index2 and i != site_index2]
                    writer.writerow(output_row)

                    # Move to the next row in both input files
                    current_row1 = next(reader1, None)
                    current_row2 = next(reader2, None)
                elif chr1 < chr2 or (chr1 == chr2 and site1 < site2):
                    # Row 2 has a later position (i.e. position is missing in file 1)
                    writer.writerow(current_row1 + [''] * (len(header2) - 2))

                    # Move to the next row in the first input file
                    current_row1 = next(reader1, None)
                else:
                    # Row 1 has a later position (i.e. position is missing in file 2)
                    output_row = [chr2, site2] + [""] * (len(header1) - 2) + [col for i, col in enumerate(current_row2) if i != chr_index2 and i != site_index2]
                    writer.writerow(output_row)

                    # Move to the next row in the second input file
                    current_row2 = next(reader2, None)

            # Write remaining rows from file1, if any
            while current_row1:
                writer.writerow(current_row1 + [''] * (len(header2) - 2))
                current_row1 = next(reader1, None)

            # Write remaining rows from file2, if any
            while current_row2:
                chr2 = current_row2[chr_index2]
                site2 = int(current_row2[site_index2])

                output_row = [chr2, site2] + [''] * (len(header1) - 2) + [col for i, col in enumerate(current_row2) if i != chr_index2 and i != site_index2]
                writer.writerow(output_row)
                current_row2 = next(reader2, None)

merge_tsv_files('./curlcake_m6a_extracted_w_neighbour.tsv', './curlcake_unm_extracted_w_neighbour.tsv', 'merged_file_test.tsv')

In [41]:
k = 3
nb = [0,1,2,3,4,5,6]
start = False
for current_pos in range(k):
    x = nb[:current_pos+k+1]
    print(x)
    del(x[current_pos])
    print(x)

[0, 1, 2, 3]
[1, 2, 3]
[0, 1, 2, 3, 4]
[0, 2, 3, 4]
[0, 1, 2, 3, 4, 5]
[0, 1, 3, 4, 5]


In [47]:
k = 3
nb = [0,1,2,3,4,5,6]

for current_pos in range(k+1, k*2+1):
    x = nb.copy()
    del(x[current_pos])
    x = x[current_pos-k:]
    print(x, current_pos)

[1, 2, 3, 5, 6] 4
[2, 3, 4, 6] 5
[3, 4, 5] 6


----

#### Checking the pileup string parsing function

In [11]:
from typing import Tuple
import re
def remove_indels(pileup_string: str) -> str:
    """
    Takes a pileup string and removes all occurences of the following patterns:
    '\+[0-9]+[ACGTNacgtn]+' for insertions
    '\-[0-9]+[ACGTNacgtn]+' for deletions

    Parameters
    ----------
    pileup_string : str
        Pileup string extracted from the fifth column of a pileup file

    Returns
    -------
    str
        Pileup strings with all occurences of the patterns above removed
    """
    pattern = "(\+|\-)[0-9]+[ACGTNacgtn]+"
    
    # get the start and end indices of all found patterns 
    coords = []
    for m in re.finditer(pattern, pileup_string):
        str_len = int(pileup_string[m.start(0)+1]) + 1
        coords.append((m.start(0), m.start(0)+1+str_len))
        
    # remove the patterns by the indices
    for start, end in reversed(coords): # reverse list as to not shift the index downstream
        pileup_string = pileup_string[:start] + pileup_string[end:]

    return pileup_string

def parse_pileup_string(pileup_string: str, ref_base: str):
    """
    Extracts the number of each base called at a given position, as well as the number
    of insertions and deletions. Information is extracted from a pileup string (fifth
    column in a pileup file).

    Parameters
    ----------
    pileup_string : str
        Pileup string extracted from the fifth column of a pileup file
    ref_base : str
        reference base at the position corresponding to the pileup string

    Returns
    -------
    dict
        Dictionary containing the number of A, T, C, G, 
        insertions and deletions.
    """
    pileup_string = pileup_string.lower()
    # remove all occurences of a caret and the following letter (could contain a,c,g,t)
    pileup_string = re.sub(r'\^.', '', pileup_string)

    ref_base = ref_base.lower()
    count_dict = {"a": 0, "t": 0, "c": 0, "g": 0, "ins": 0, "del": 0}

    # get number of insertions
    count_dict["ins"] = len(re.findall(r'\+[0-9]+[ACGTNacgtn]+', pileup_string))

    # get number of deletions
    count_dict["del"] = len(re.findall(r'\-[0-9]+[ACGTNacgtn]*|\*', pileup_string))

    # remove indel patterns to count the number of mismatches correctly
    pileup_string = remove_indels(pileup_string)

    # get number of mismatches (i.e. [ACGT])
    count_dict["a"] = pileup_string.count("a")
    count_dict["t"] = pileup_string.count("t")
    count_dict["c"] = pileup_string.count("c")
    count_dict["g"] = pileup_string.count("g")

    # get number of matches (determine where to count matches bases on ref_base)
    n_matches = pileup_string.count('.') + pileup_string.count(',')
    count_dict[ref_base] = n_matches

    return count_dict

def get_relative_count(count_dict, n_reads: int):
    """
    Gets a dictionary containing the absolute counts for A, C, G and T 
    and calculates the relative proportions

    Parameters
    ----------
    count_dict : dict[int]
        Dictionary containing the absolute counts for A, C, G and T
    n_reads : int
        Number of reads at the given position

    Returns
    -------
    dict[float]
        Dictionary containing the relative counts for A, C, G and T
    """
    try:
        count_dict["a_rel"] = count_dict["a"] / n_reads
        count_dict["c_rel"] = count_dict["c"] / n_reads
        count_dict["g_rel"] = count_dict["g"] / n_reads
        count_dict["t_rel"] = count_dict["t"] / n_reads
        count_dict["ins_rel"] = count_dict["ins"] / n_reads
        count_dict["del_rel"] = count_dict["del"] / n_reads

    except ZeroDivisionError:
        count_dict["a_rel"] = 0
        count_dict["c_rel"] = 0
        count_dict["g_rel"] = 0
        count_dict["t_rel"] = 0
        count_dict["ins_rel"] = 0
        count_dict["del_rel"] = 0


    return count_dict

def get_majority_base(count_dict) -> str:
    """
    Gets a dictionary containing the absolute counts for A, C, G and T and returns the
    key of the one with the highest count.

    Parameters
    ----------
    count_dict : dict
        dictionary containing the absolute counts for A, C, G and T

    Returns
    -------
    str
        Key from the dictionary corresponding to the largest value
    """
    dict_subset = dict((k, count_dict[k]) for k in ("a", "c", "g", "t"))
    return max(dict_subset, key = dict_subset.get).upper()

def get_motif(chr: str, site: int, ref: str, k: int) -> str:
    """
    Extracts the motif of k bases up- and downstream from a given chromosomal site.
    Around the start and end of a refernce sequence the missing bases are filled with
    Ns.

    Parameters
    ----------
    chr : str
        name of the chromosome
    site : int
        position on the chromosome (1-indexed)
    ref : str
        reference sequence for the given chromosome 
    k : int
        number of bases to be regarded in both up- and downstream direction 
        
    Returns
    -------
    str
        sequence of k bases around the center site
    """ 
    idx = site-1
    n_ref = len(ref)

    if idx >= 0 and idx < n_ref:
        idx_l = idx-k
        idx_r = idx+k+1
        # left overhang
        if idx_l < 0:
            len_overhang = abs(idx_l)
            overhang = "N" * len_overhang
            motif = overhang + ref[:idx_r]
        # right overhang
        elif idx_r > n_ref:
            len_overhang = idx_r - n_ref
            overhang = "N" * len_overhang
            motif = ref[idx_l:] + overhang
        # no overhang
        else:
            motif = ref[idx_l:idx_r]

        return motif
    
def get_allele_fraction(count_dict, ref_base: str) -> int:
    """
    Calculates the number of reads containing a mismatch, insertion or deletion 
    at a given position.

    Parameters
    ----------
    count_dict : dict
        Dictionary containing the number of occurences of A,C,G,T,ins,del for a given position
    ref_base : str
        reference base at the given position

    Returns
    -------
    int
        Number of mismatched reads a the given position
    """
    mismatch_count_sum = 0
    for b in ["a", "c", "g", "t", "ins", "del"]:
        if b != ref_base.lower():
            mismatch_count_sum += count_dict[b+"_rel"]

    return mismatch_count_sum

import numpy as np
def get_read_quality(read_qualities: str) -> Tuple[float, float]:
    """
    Calculates the mean and std from the read qualities given in the sixth row
    of a pileup file.

    Parameters
    ----------
    read_qualities : str
        Read quality string from pileup file

    Returns
    -------
    tuple[float, float]
        Mean and standard deviation of read qualities
    """
    # transform string to list of corresponding phred numeric values
    vals = [code - 33 for code in read_qualities.encode("ascii")]

    mean = sum(vals)/len(vals)
    std = np.std(vals)

    return mean, std 



def process_position(line) -> str:
    """
    Takes a line from a pileup file and processes it.

    Parameters
    ----------
    line : list[str]
        list containing each element from the pileup line.

    Returns
    -------
    str
        New line derived from the initial one. Can be written to a new file in consequent
        steps.
    """
    # extract elements from list
    chr, site, ref_base, n_reads, read_bases, read_qualities = line[0], int(line[1]), line[2], int(line[3]), line[4], line[5]
    
    # filter by genomic region
    region = None
    if region is not None:
        if not(chr == region[0] and site >= region[1] and site <= region[2]): # both start and end inclusive
            return ""

    # filter by number of reads
    if n_reads < 1:
        return ""

    # get qualitiy measures
    quality_mean, quality_std = get_read_quality(read_qualities)

    # filter by mean read quality
    if quality_mean < 0:
        return ""

    # get reference sequence 
    ref = "GGGCGAATTGGGTACCGGGCCCCCCCTCGAGGTCATCGAATTCCGCCCCTCTCCCTCCCCCCCCCCTAACGTTACTGGCCGAAGCCGCTTGGAATAAGGCCGGTGTGCGTTTGTCTATATGTTATTTTCCACCATATTGCCGTCTTTTGGCAATGTGAGGGCCCGGAAACCTGGCCCTGTCTTCTTGACGAGCATTCCTAGGGGTCTTTCCCCTCTCGCCAAAGGAATGCAAGGTCTGTTGAATGTCGTGAAGGAAGCAGTTCCTCTGGAAGCTTCTTGAAGACAAACAACGTCTGTAGCGACCCTTTGCAGGCAGCGGAACCCCCCACCTGGCGACAGGTGCCTCTGCGGCCAAAAGCCACGTGTATAAGATACACCTGCAAAGGCGGCACAACCCCAGTGCCACGTTGTGAGTTGGATAGTTGTGGAAAGAGTCAAATGGCTCTCCTCAAGCGTATTCAACAAGGGGCTGAAGGATGCCCAGAAGGTACCCCATTGTATGGGATCTGATCTGGGGCCTCGGTGCACATGCTTTACATGTGTTTAGTCGAGGTTAAAAAAACGTCTAGGCCCCCCGAACCACGGGGACGTGGTTTTCCTTTGAAAAACACGATGATAATATGGCCACCACAACCATGGTGAGCAAGGGCGAGGAGCTGTTCACCGGGGTGGTGCCCATCCTGGTCGAGCTGGACGGCGACGTAAACGGCCACAAGTTCAGCGTGTCCGGCGAGGGCGAGGGCGATGCCACCTACGGCAAGCTGACCCTGAAGTTCATCTGCACCACCGGCAAGCTGCCCGTGCCCTGGCCCACCCTCGTGACCACCCTGACCTACGGCGTGCAGTGCTTCAGCCGCTACCCCGACCACATGAAGCAGCACGACTTCTTCAAGTCCGCCATGCCCGAAGGCTACGTCCAGGAGCGCACCATCTTCTTCAAGGACGACGGCAACTACAAGACCCGCGCCGAGGTGAAGTTCGAGGGCGACACCCTGGTGAACCGCATCGAGCTGAAGGGCATCGACTTCAAGGAGGACGGCAACATCCTGGGGCACAAGCTGGAGTACAACTACAACAGCCACAACGTCTATATCATGGCCGACAAGCAGAAGAACGGCATCAAGGTGAACTTCAAGATCCGCCACAACATCGAGGACGGCAGCGTGCAGCTCGCCGACCACTACCAGCAGAACACCCCCATCGGCGACGGCCCCGTGCTGCTGCCCGACAACCACTACCTGAGCACCCAGTCCGCCCTGAGCAAAGACCCCAACGAGAAGCGCGATCACATGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGTAAAGCGGCCGCCACCGCGG"
    # get absolute number of A, C, G, T, ins, del
    count = parse_pileup_string(read_bases, ref_base)

    # get relative number of A, C, G and T counts
    count = get_relative_count(count, n_reads)

    # get allele fraction
    allele_fraction = get_allele_fraction(count, ref_base)

    # filter by allele_fraction
    if allele_fraction < 0:
        return ""

    # get majority base
    majority_base = get_majority_base(count)

    # get 11b motif
    motif = get_motif(chr, site, ref, k=5)

    out = f'{chr}\t{site}\t{n_reads}\t{ref_base}\t{majority_base}\t{count["a"]}\t{count["c"]}\t{count["g"]}\t{count["t"]}\t{count["ins"]}\t{count["del"]}\t{count["a_rel"]}\t{count["c_rel"]}\t{count["g_rel"]}\t{count["t_rel"]}\t{count["ins_rel"]}\t{count["del_rel"]}\t{motif}\t{allele_fraction}\t{quality_mean}\t{quality_std}\n'
    return out


In [25]:
def parse_pileup_string(pileup_string: str, ref_base: str):
    """
    Extracts the number of each base called at a given position, as well as the number
    of insertions and deletions. Information is extracted from a pileup string (fifth
    column in a pileup file).

    Parameters
    ----------
    pileup_string : str
        Pileup string extracted from the fifth column of a pileup file
    ref_base : str
        reference base at the position corresponding to the pileup string

    Returns
    -------
    dict
        Dictionary containing the number of A, T, C, G, 
        insertions and deletions.
    """
    pileup_string = pileup_string.lower()
    # remove all occurences of a caret and the following letter (could contain a,c,g,t)
    pileup_string = re.sub(r'\^.', '', pileup_string)

    ref_base = ref_base.lower()
    count_dict = {"a": 0, "t": 0, "c": 0, "g": 0, "ins": 0, "del": 0}

    # get number of insertions
    count_dict["ins"] = len(re.findall(r'\+[0-9]+[ACGTNacgtn]+', pileup_string))

    # get number of deletions
    count_dict["del"] = len(re.findall(r'\-[0-9]+[ACGTNacgtn]*|\*', pileup_string))

    # remove indel patterns to count the number of mismatches correctly
    pileup_string = remove_indels(pileup_string)

    # get number of mismatches (i.e. [ACGT])
    count_dict["a"] = pileup_string.count("a")
    count_dict["t"] = pileup_string.count("t")
    count_dict["c"] = pileup_string.count("c")
    count_dict["g"] = pileup_string.count("g")

    # get number of matches (determine where to count matches bases on ref_base)
    n_matches = pileup_string.count('.') + pileup_string.count(',')
    count_dict[ref_base] = n_matches

    return count_dict


In [24]:
x = ".......A......-1G......................-2GT.....T.......-2GT....+1A.....*............................................*.....-2GT*................................................*................-2GT.................*...........*..*..............*...........-2GT....................................T*...T.................................*......*..................................................-2GT............-2GT...........T...............*...........................T......................................-2GT......*......................TA.......*.....T......................................+1T....+1G...-2GT*..............+1A.........*....................A.........................................................*..............................................***..................-2GT*....*.-2GT.....T........-2GT...........................-2GT..................................................*...................................*..-2GT.-5GTGCA.....*......................-2GT.-2GT..+1T...A.........T........T.......-1G.T...................T..............*............................-2GT.T..+1G.................................+1T...........................*......T..............................-5GTGCA..........-2GT.......................T*.-2GT..*.................*..-2GT.......-2GT....................-5GTGCA................G........................*................................+1A...T..............................................*.....................T..................T.............-2GT*..................-2GT..........-1G......-2GT..-2GT...T................*.-2GT......-2GT.....+5TGACA.......A..-1G............-2GT..-1G.................................-2GT...................................-2GT.T.................................................-2GT...*....-2GT............+1A..-2GT.......*..............-2GT..........T..............*T.....+2TG........................T....+1A.........................-2GT....T..............A........................................-1G........T......-2GT...........................-2GT.-2GT.......................-2GT......*...........................T*..........T.......-2GT......................-2GT..-2GT..............................-2GT.-5GTGCA....................................T.........................+1T..-2GT..............-2GT.....T.-2GT..........................T..*........-2GT........-1G...............................+1A....-2GT..*............-2GT..............................................-2GT*...T..............T.......*...........-2GT.....T....T..+2GT....*....................+1G.T..-2GT.....T.-2GT....-2GT............*.............-2GT.T.....*..-2GT..*.-2GT.....-2GT..T...-2GT......*...*.......G...-2GT..T................-2GT......**..............................T.....A...........................................-2GT...-1G..*......-1G...........-2GT........-2GT................T............-2GT.......T...-2GT.....**.-2GT...................+1A*........................-1G.....................-2GT......................*...-2GT........T..-2GT.......................**..............*...........A...........-2GT.....-2GT............*.+1A.......**......................*.................-5GTGCA.......*...........-2GT................-2GT...............................*..............T.................*.T..............-2GT.......*............................A..........................A........*...............-2GT.-2GT.......T........................-2GT....-2GT............*.A...........................*.........*.-1G.......T.............-2GT....................*.....T...........*.......A............A..........T..........T...............T..................*...................-2GTT......................*................-2GT.......-2GT...........TT.....-2GT..............T.................*........................*.....-2GT....*.........-2GT...........*..A...-2GT.........T..................-2GT.........T...-2GT......-2GT...................T.....-2GT...........-2GT........T..........-2GT..........T...*.....T...T.-2GT...............*...........T......................-3GTG..T.....T........-2GT........-2GT...-2GT......-2GT................T.....-2GT..........................*...-2GT.*T.-2GT.........*..*...............................T.......T..............+1A.........-2GT..+1A.-2GT.-1G....-2GT......................................*................*T.....*.......-1G.........................-2GT......A...............T......-2GT....*.........T...............T...T*..........T..............*.........*.............*...*...*..........-2GT*..........A.....T........G...........T............-2GT......*..................-2GT.-1G..G...TT....*................T....-2GT.T..........................................................-2GT........*................................*T...............-2GT.............*..............-2GT...*...................*.......-1G..................A...*.....................T...-2GT.-2GT..........+1T.*...T...................................................*...................................................-1G.......................+1T.....-2GT..*...-2GT....................-2GT..............*.....*..............*..*.....T......-2GT......................T*.....-2GT.........-1G...+2GT...........-3GTG......-3GTG..T..........................*.............*..................-2GT..+2TG.........-2GT.....................*..*.......*....T.....*...*.........-1G.........................*....-2GT............*.............-2GT.+1T..-2GT......................*T.......-2GT......................*.*.+1A..*................................A.........T............T......*..-2GT.*..............T................+2AT.....................*...+5TGTGT......*T.........-2GT.........................-2GT........................*.............T........-2GT.............T..-2GT.......-1G..............-3GTG*.....-5GTGCA..........................+1T...*....-2GT..........*.*.................................*..............*.........A....-2GT.T..........*.......-2GT...................T..............................**........T.T.....-1G......-3GTG...........A..T..........................*.............*...........-2GT...................*..*...........-1G........*..........-2GT.....-1G...............*.................T.......T..-2GT....T...T....-2GT...-2GT............+1T......................................-2GT.............-2GT..................................................................*...........-2GTT......-2GT..........-1G.........T...-2GT.......................................*.T....................T......-2GT....................-2GT.................................................-2GT.......-2GT........*...................................-2GT...........T.....*..........*..-2GT.........-2GT.....-2GT...T.....+1T......-2GT...*....-2GT............-2GTT.....*........................-2GT......................................-2GT........................A.T....*........-1G....-2GT......................................T.............-2GT....*T............*....T.........T...T..........-2GT......................................-2GT.-2GT.............T...............*...-2GT...-2GT.........................-2GT............T......-2GT..*..*...........................*........-2GT..............-2GT.....................................................*.....-2GT.-2GT...........*..............-2GT.....................-2GT..............-2GT.............*.....................T.T......................-2GT..*...........................................T.......T....-2GT........*......................*.............*..................*........-2GT............A.......*.......*.T.................+1A.......*................-2GT.-2GT*..*................-2GT.....-2GT....-2GT..A*....................T..*.T...............*...........................*................*.+3ACT...............................-2GT..................T...................T....*.....-2GT......T.....+4TGGT.......-2GT.........T..............T............................*...........................*..-1G.................T.......................-2GT...............A..................+1G..-2GT.-2GTT....G..........-2GT..-2GT........T........-2GT...-2GT..T........,,...........................*....**.................................T...-2GT.-2GT..T.....................*.........-2GT......**.TT.................................-2GT....-2GT..............T.......-2GT...................................*..............-2GT...........-2GT.......................T............................*.......................................-2GT..+2TT..-2GT............*.-2GT..................*.....-2GT.............T.....-2GT...............*...-2GT.............T.............T.........T...+1A.........-2GT........+1A...................*..........................-2GT..-2GT.+2TA....-2GT...-2GT.......T........................*...........................T...............-1G...................*........A.....*.-5GTGCA......+1A..*.+2GT...........*.-1G..................-2GT.........-2GT...A...T......................+2TG....T....-2GT.........*.........T...............+1T.-2GT.......*......................................-2GT...........................*....-2GT.-2GTA..........*.-2GT............................................*A..*.......-1G..+1T...........T.......*.....................*............................A...............-2GT......*...............-2GT...................................-2GT......-2GT.........-1GT....T...........*...............T.-2GT.....................*T..-2GT*T....-2GT............-2GT..............-2GT...............*.................................T...............................-1G*.............*.*..................-2GT....................*.........*..........-2GT...................-3GTG..T...-2GT.......*...................-2GT.......*...............-2GT..T....*.-2GT....T...........*.-2GT.*...........-2GT................-2GT....................-2GT...............-2GT*...........-2GT.*.........A................T.........................................."
count = parse_pileup_string(x, "c")
sum(count.values())

.......a.................................t................*............................................*.....*................................................*.................................*...........*..*..............*...............................................t*...t.................................*......*.........................................................................t...............*...........................t............................................*......................ta.......*.....t.............................................*.......................*....................a.........................................................*..............................................***..................*....*......t.....................................................................................*...................................*........*............................a.........t........t........t...................t..............*.............................t

8997

In [47]:
def remove_indels(pileup_string: str) -> str:
    """
    Takes a pileup string and removes all occurences of the following patterns:
    '\+[0-9]+' for insertions
    '\-[0-9]+' for deletions
    In addition to the pattern itself, remove the following n characters,
    where n is the number specified after + or -.

    Parameters
    ----------
    pileup_string : str
        Pileup string extracted from the fifth column of a pileup file

    Returns
    -------
    str
        Pileup strings with all occurences of the patterns above removed
    """
    pattern = "(\+|\-)[0-9]+"
    
    # get the start and end indices of all found patterns 
    coords = []
    for m in re.finditer(pattern, pileup_string):
        str_len_as_str = pileup_string[m.start()+1:m.end()]
        num_digits = len(str_len_as_str)
        str_len = int(str_len_as_str)
        coords.append((m.start(), m.start()+1+num_digits+str_len))

    # remove the patterns by the indices
    for start, end in reversed(coords): # reverse list as to not shift the index downstream
        pileup_string = pileup_string[:start] + pileup_string[end:]

    return pileup_string


In [51]:
x = '...+100aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaat...c'
remove_indels(x)

3 100 100
[(3, 107)]


'...t...c'

In [5]:
import re

def extract_numbers(string):
    numbers = []

    # Case: "x"
    match = re.match(r"(\d+)", string)
    if match:
        numbers.append(int(match.group(1)))

    # Case: "x,y,z"
    match = re.match(r"(\d+(?:,\d+)*)", string)
    if match:
        number_list = match.group(1).split(',')
        numbers.extend(map(int, number_list))

    # Case: "x-z"
    match = re.match(r"(\d+)-(\d+)", string)
    if match:
        start = int(match.group(1))
        end = int(match.group(2))
        numbers.extend(range(start, end + 1))

    return numbers

# Example usage
strings = ["5", "1,2,3,4,5", "10-15"]
for string in strings:
    numbers = extract_numbers(string)
    print(f"{string}: {numbers}")


5: [5, 5]
1,2,3,4,5: [1, 1, 2, 3, 4, 5]
10-15: [10, 10, 10, 11, 12, 13, 14, 15]


In [7]:
a = "+"
"x" in a

r"(\d+(?:,\d+)*)"
r"(\d+(?:,\d+)*)"

False

In [10]:
x = "1,3,4,5"
list(map(int, x.split(",")))

[1, 3, 4, 5]

In [11]:
list(range(5))

[0, 1, 2, 3, 4]

In [37]:
def get_sites(site: str) -> list[int]:
    try:
        site = [int(site)]
    except:
        if "," in site:
            print(x.split(","))
            site = list(map(int, site.split(",")))
        elif "-" in site:
            tmp = site.split("-")
            start = int(tmp[0])
            end = int(tmp[1])
            site = list(range(start, end+1))
        else:
            raise Exception(f"Could not extract sites from given string '{site}'")

    return site

In [65]:
get_sites("10,13")

AttributeError: 'list' object has no attribute 'split'

In [56]:
import re

string=">150"
try:
    print(int(string))
except:
    match = re.match(r'([<>]=?|==)(\d+)', string)
    if match:
        print(match.group(1))
        print(match.group(2))
    else:
        raise Exception(f"Could not extract information from given string '{string}'")

>
150


In [60]:
x = "A,C,G"
y = x.split(",")
y

['A', 'C', 'G']

In [61]:
map(lambda x: x in ["A", "C", "G", "T", "N"], y)

<map at 0x7f0059892e90>

In [64]:
x = ["A", "A", "C", "G"]


['C', 'A', 'G']

In [76]:
import re
from typing import Tuple, Callable

OPERATORS = {
    "<": lambda x, y: x < y,
    "<=": lambda x, y: x <= y,
    ">": lambda x, y: x > y,
    ">=": lambda x, y: x >= y,
    "==": lambda x, y: x == y
}
def get_n_reads(n_reads_str: str) -> Tuple[float, Callable[[float, float], bool]]:
    try:
        val = float(n_reads_str)
        fun = OPERATORS.get("==")
    except:
        match = re.match(r'([<>]=?|==)(\d+\.?\d*)', n_reads_str)
        if match:
            val = float(match.group(2))
            op = match.group(1)
            fun = OPERATORS.get(op)
        else:
            raise Exception(f"Could not extract information from given string '{n_reads_str}'")
    return val, fun

#### Testing Composition plotting

In [3]:
import plotly.graph_objects as go

file_mod = "/home/vincent/masterthesis/data/nanocompore_data/processed/Oligo_1_extracted.tsv"
file_unm = "/home/vincent/masterthesis/data/nanocompore_data/processed/Oligo_control_extracted.tsv"
bed = "/home/vincent/masterthesis/data/nanocompore_data/processed/composition_plot_test.bed"

['a',
 'c',
 'g',
 't',
 'a',
 'c',
 'g',
 't',
 'a',
 'c',
 'g',
 't',
 'a',
 'c',
 'g',
 't',
 'a',
 'c',
 'g',
 't']

In [29]:
from typing import Set, Tuple, Dict, List
import os
def get_bed_positions(bed_file: str) -> Set[Tuple[str, int]]:
    positions = set()
    with open(bed_file, 'r') as bed:
        for line in bed:
            fields = line.strip().split('\t')
            chromosome = fields[0]
            position = int(fields[1])
            positions.add((chromosome, position))
    return positions

def extract_rows(tsv_file: str, ref_position: str) -> Dict[Tuple[str, int], List[str]]:
    with open(tsv_file, 'r') as tsv:
        next(tsv)
        for line in tsv:
            fields = line.strip().split('\t')
            chromosome = fields[0]
            site = int(fields[1])
            position = (chromosome, site)
            if position in ref_position:
                extracted_info = [float(field) for field in fields[11:15]]
                return extracted_info
    
    return [0, 0, 0, 0]
    

def reformat_data(data):
    pass
     
def get_info(filenames: str):
    filenames = filenames.split(",")

    positions = get_bed_positions(bed)
    bases = ["a", "c", "g", "t"] * len(positions)
    for position in positions:
        samples = []
        rel_counts = []
        for sample in filenames:
        
        
    data = {}
    for filename in filenames:
        basename = os.path.splitext(os.path.basename(filename))[0]
        data[basename] = extract_rows(filename, bed)
    return data


In [30]:
y = get_info(file_mod+","+file_unm)
y.keys()

dict_keys(['Oligo_1_extracted', 'Oligo_control_extracted'])

In [31]:
y

{'Oligo_1_extracted': {('oligo', 15): [0.000540297163439892,
   0.07411076091850517,
   0.004232327780279154,
   0.9208464655560559],
  ('oligo', 25): [0.0015828532876641238,
   0.005345373397685402,
   0.00023353573096683791,
   0.9908402096631895],
  ('oligo', 50): [0.00467803097423539,
   0.0028706099160080802,
   0.9839104086189177,
   0.004766630045717121],
  ('oligo', 60): [0.05515646353856397,
   0.02340820688930161,
   0.7301748607071521,
   0.05452570347268459]},
 'Oligo_control_extracted': {('oligo', 15): [0.0009590946146837385,
   0.06963026902603942,
   0.0073370738023306,
   0.9214261736920347],
  ('oligo', 25): [0.003530131198576829,
   0.015718812541694462,
   0.0005003335557038026,
   0.975136201912386],
  ('oligo', 50): [0.004673955380030616,
   0.002672125653312077,
   0.9823730287412024,
   0.004483736265727045],
  ('oligo', 60): [0.006751767929512527,
   0.003199501205965839,
   0.9706876466438052,
   0.009106272663133541]}}