# utils

In [None]:
#| default_exp utils

In [None]:
#| export
import gzip
import itertools
import os
import csv
from Bio import pairwise2
from Bio.pairwise2 import format_alignment

In [None]:
#| export
def mut_to_str(mutations: list):
    """Serialize a list of mutations into a comma separated string.

    Every mutation is an iterable such as ``('G', 12, 'C')``. Its items are
    converted with ``str`` and concatenated without a separator, and the
    resulting tokens are joined with commas, e.g. ``'-4A,-7T,G12C'``.
    An empty list gives an empty string.
    """
    tokens = []
    for mutation in mutations:
        # e.g. ('G', 12, 'C') -> 'G12C'
        tokens.append(''.join(str(part) for part in mutation))
    return ','.join(tokens)

In [None]:
mut_to_str([('-', 4, 'A'), ('-', 7, 'T'), ('G', 12, 'C')])

'-4A,-7T,G12C'

In [None]:
#| export
def str_to_mut(gen: str):
    """Converts genotype string to a list of mutations.

    The string is split on commas. For each token the first character is
    the original base, the last character is the new base and everything
    in between is parsed as the integer position, so ``'G12C'`` becomes
    ``['G', 12, 'C']``.

    Whitespace around a token is ignored and empty tokens are skipped, so
    an empty string (the output of ``mut_to_str([])``) gives ``[]``.

    Raises:
        ValueError: if the characters between the first and last one of a
            token cannot be parsed as an integer.
    """
    mutations = []
    for token in gen.split(','):
        # Strip first: otherwise ' -7T' would be read as from=' ', ix=-7.
        token = token.strip()
        if not token:
            # Empty genotype or stray comma: nothing to parse.
            continue
        mutations.append([token[0], int(token[1:-1]), token[-1]])

    return mutations

In [None]:
str_to_mut('-4A,-7T,G12C')

[['-', 4, 'A'], ['-', 7, 'T'], ['G', 12, 'C']]

In [None]:
#| export
def parse_genotypes(genotypes_file):
    """Reads a tab separated genotypes file into a list of tuples.

    Each non-blank line must have at least two tab separated columns: the
    first is kept as a string and the second is converted to ``int``.
    Blank lines are skipped.

    Args:
        genotypes_file: Path to the tab separated file.

    Returns:
        list: ``(first_column, int(second_column))`` tuples in file order.

    Raises:
        IndexError: if a non-blank line has fewer than two columns.
        ValueError: if the second column is not an integer.
    """
    # newline="" lets the csv module handle line endings itself.
    with open(genotypes_file, "r", newline="") as handle:
        reader = csv.reader(handle, delimiter='\t')
        # csv.reader yields an empty list for a blank line; skip those
        # instead of failing on row[0].
        return [(row[0], int(row[1])) for row in reader if row]

In [None]:
from dgrec.example_data import get_example_data_dir

In [None]:
data_path=get_example_data_dir()
gen_list=parse_genotypes(os.path.join(data_path,"sacB_genotypes.csv"))
for g,n in itertools.islice(gen_list,30,40):
    print(n,"\t",g)

20 	 A72G,A79G
19 	 A72G,A79T,A91G
17 	 T67G,A91G
17 	 A76G,A79T
17 	 A68C,A72G
17 	 A111G
16 	 A68G,A91G
16 	 A86G,A91T
15 	 A72G,A91T
15 	 A79G,A86G


In [None]:
#| export

def downsample_fastq_gz(input_file, output_file, num_reads=10000):
    """Copies the leading reads of a compressed FASTQ file to a new file.

    No random sampling is done: the first ``num_reads * 4`` lines of the
    input are written unchanged to the output (four lines per read is
    assumed). If the input has fewer lines, all of them are copied.

    Args:
        input_file (str): Path to the input FASTQ.gz file.
        output_file (str): Path to the output FASTQ.gz file.
        num_reads (int, optional): Number of reads to keep. Defaults to 10000.
    """
    with gzip.open(input_file, 'rb') as src:
        with gzip.open(output_file, 'wb') as dst:
            # islice stops after the requested number of lines, so the
            # rest of the input is never read.
            dst.writelines(itertools.islice(src, num_reads * 4))

In [None]:

input_file=os.path.join(data_path,"sacB_example.fastq.gz")
output_file="sacB_example_downsampled.fastq.gz"
downsample_fastq_gz(input_file, output_file, num_reads=100)



In [None]:
#| hide
# Remove test files
# NOTE: this deletes every entry ending in ".gz" in the current working
# directory, not only the file written by the example above.
files = os.listdir()

for file in files:
    if not file.endswith(".gz"):
        continue
    try:
        os.remove(file)
    except PermissionError:
        print(f"Permission denied to delete file '{file}'.")
    except FileNotFoundError:
        # The file disappeared between listing and removal.
        print(f"File '{file}' not found.")

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()