# utils

In [None]:
#| default_exp utils

In [None]:
#| export
import csv
import gzip
import itertools
import os
import pickle

from Bio import SeqIO
from Bio.Align import PairwiseAligner
from Bio.Seq import Seq

from dgrec import pairwise2
from dgrec.pairwise2 import format_alignment



In [None]:
#| export
def align2mut(align):
    """Lists the columns of a pairwise alignment where the two aligned sequences differ.

    Expects an alignment exposing `seqA`, `seqB` and `end`, such as the ones
    returned by `pairwise2.align.globalms`. Each difference is reported as a
    tuple `(char_in_seqA, alignment_column, char_in_seqB)`.
    Columns are alignment coordinates, not positions in the ungapped sequences."""
    top, bottom = align.seqA, align.seqB
    return [
        (top[col], col, bottom[col])
        for col in range(align.end)
        if top[col] != bottom[col]
    ]

In [None]:
seqA = "ATCCCGGCAGC"
seqB = "ATCCACGGTCAGC"
align=pairwise2.align.globalms(seqA,seqB, 2, -1, -1, -.5, one_alignment_only=True)[0]
align2mut(align)

[('-', 4, 'A'), ('-', 8, 'T')]

In [None]:
#| export
def mut_rix(mutations):
    """Shifts mutation positions from alignment columns to positions in the original sequence.

    Every insertion (first element '-') occupies an alignment column that has no
    counterpart in the original sequence, so each mutation listed after it is
    moved back by one position per preceding insertion.
    The mutations are processed in the order given."""
    shift = 0
    reindexed = []
    for mut in mutations:
        ref_base, column, new_base = mut[0], mut[1], mut[2]
        reindexed.append((ref_base, column + shift, new_base))
        if ref_base == '-':
            # The offset only applies to the mutations that follow this insertion.
            shift -= 1
    return reindexed

In [None]:
seqA = "ATCCCGGCAGC"
seqB = "ATCCACGGTCAGC"
align=pairwise2.align.globalms(seqA,seqB, 2, -1, -1, -.5, one_alignment_only=True)[0]
print(format_alignment(*align))

mutations=align2mut(align) 
print("Output of align2mut:")
print(mutations)

print("Output of mut_rix:")
print(mut_rix(mutations))

ATCC-CGG-CAGC
|||| ||| ||||
ATCCACGGTCAGC
  Score=20

Output of align2mut:
[('-', 4, 'A'), ('-', 8, 'T')]
Output of mut_rix:
[('-', 4, 'A'), ('-', 7, 'T')]


In [None]:
#| export
def get_mutations(seqA,seqB, match=2, mismatch=-1, gap_open=-1, gap_extend=-.5):
    """Globally aligns seqB against seqA and returns the differences as a list of mutations.

    The first alignment returned by `pairwise2.align.globalms` (scored with
    `match`, `mismatch`, `gap_open` and `gap_extend`) is converted with
    `align2mut` and re-indexed with `mut_rix`, giving tuples of the form
    `(base_in_seqA, position, base_in_seqB)`.
    Use `mut_to_str` to turn the list into a comma separated genotype string.
    """
    first_alignment = pairwise2.align.globalms(
        seqA, seqB, match, mismatch, gap_open, gap_extend, one_alignment_only=True
    )[0]
    return mut_rix(align2mut(first_alignment))

In [None]:
seqA = "ATCCGGCAGCAGGTCGTGAGC"
seqB = "ATCCACGGTCAGCACGTCGTGGC"
align=pairwise2.align.globalms(seqA,seqB, 2, -1, -1, -.5, one_alignment_only=True)[0]
print(format_alignment(*align))

get_mutations(seqA,seqB)

ATC--CGG-CAGCAGGTCGTGAGC
|||  ||| |||||.|||||| ||
ATCCACGGTCAGCACGTCGTG-GC
  Score=33.5



[('-', 3, 'C'), ('-', 3, 'A'), ('-', 6, 'T'), ('G', 11, 'C'), ('A', 18, '-')]

In [None]:
#| export
def get_mutations_noalign(seqA,seqB):
    """Compares two equal-length sequences position by position, without aligning them.

    Returns a list of `(char_in_seqA, position, char_in_seqB)` tuples, one per
    mismatching position. Note that the position is stored as a string here.
    An AssertionError is raised when the two lengths differ.
    """
    assert(len(seqA)==len(seqB))
    return [
        (ref_char, str(pos), new_char)
        for pos, (ref_char, new_char) in enumerate(zip(seqA, seqB))
        if ref_char != new_char
    ]

In [None]:
get_mutations_noalign("AGCTATGG","AGCTCTGG")

[('A', '4', 'C')]

In [None]:
#| hide
def align_to_mut(alignment):
    """Converts a `Bio.Align` pairwise alignment into a list of `[target_base, position, query_base]`.

    Walks the columns of `alignment.indices`, where -1 marks a gap:
    a gap in the target is an insertion (`"-"` as target base), a gap in the
    query is a deletion (`"-"` as query base), and differing bases are a
    substitution. Deletions and substitutions carry their target index.
    Insertions carry the index of the last target base seen so far (0 if none).

    NOTE(review): `get_mutations` reports an insertion at the index of the
    following reference base instead (compare the two example outputs in this
    notebook) - confirm which convention is wanted before passing this output
    to `genstr_to_seq`.
    """
    indices = alignment.indices  # fetched once instead of on every access
    target, query = alignment.target, alignment.query
    muts = []
    last_target_pos = 0
    for col in range(indices.shape[1]):
        t_idx = indices[0][col]
        q_idx = indices[1][col]
        if t_idx == -1:
            muts.append(["-", last_target_pos, query[q_idx]])
            continue
        last_target_pos = t_idx
        if q_idx == -1:
            muts.append([target[t_idx], t_idx, "-"])
        elif target[t_idx] != query[q_idx]:
            muts.append([target[t_idx], t_idx, query[q_idx]])
    return muts

In [None]:
#| hide
aligner=PairwiseAligner()
aligner.mode = 'global'
aligner.match_score = 2
aligner.mismatch_score = -1
aligner.open_gap_score = -1
aligner.extend_gap_score = -0.5

alignments = aligner.align(seqA, seqB)
alignment = alignments[0]

print(alignment)
align_to_mut(alignment)

target            0 ATC--CGG-CAGCAGGTCGTGAGC 21
                  0 |||--|||-|||||.||||||-|| 24
query             0 ATCCACGGTCAGCACGTCGTG-GC 23



[['-', 2, 'C'], ['-', 2, 'A'], ['-', 5, 'T'], ['G', 11, 'C'], ['A', 18, '-']]

In [None]:
#| hide

aligner=PairwiseAligner()
aligner.mode = 'global'
aligner.match_score = 2
aligner.mismatch_score = -1
aligner.open_gap_score = -1
aligner.extend_gap_score = -0.5

def get_mutations_new(seqA,seqB):
    """Aligns seqB to seqA with the module-level `aligner` and returns the mutation list.

    The result is the output of `align_to_mut` for the first alignment found,
    i.e. a list of `[target_base, position, query_base]` entries.
    This implementation is much slower than the pairwise2 implementation.
    """
    first_alignment = aligner.align(seqA, seqB)[0]
    return align_to_mut(first_alignment)

In [None]:
#| hide
seqA = "ATCCGGCAGCAGGTCGTGAGC"
seqB = "ATCCACGGTCAGCACGTCGTGGC"
get_mutations(seqA,seqB)

[('-', 3, 'C'), ('-', 3, 'A'), ('-', 6, 'T'), ('G', 11, 'C'), ('A', 18, '-')]

In [None]:
#| export
def mut_to_str(mutations: list):
    """Serialises a list of mutations into a comma separated genotype string.

    The elements of each mutation are converted with `str` and concatenated
    (`('G', 11, 'C')` becomes 'G11C'). An empty list gives an empty string."""
    return ','.join(''.join(str(field) for field in mut) for mut in mutations)

In [None]:
mut_to_str([('-', 3, 'C'), ('-', 3, 'A'), ('-', 6, 'T'), ('G', 11, 'C'), ('A', 18, '-')])

'-3C,-3A,-6T,G11C,A18-'

In [None]:
#| export
def str_to_mut(gen: str):
    """Parses a genotype string such as '-4A,G12C' into a list of `[from, position, to]`.

    For each comma separated entry the first character is the original base,
    the last character the new base and everything in between the integer
    position. An empty string gives an empty list."""
    if gen == "":
        return []
    return [[token[0], int(token[1:-1]), token[-1]] for token in gen.split(',')]

In [None]:
assert(str_to_mut('')==[])
str_to_mut('-4A,-7T,G12C')


[['-', 4, 'A'], ['-', 7, 'T'], ['G', 12, 'C']]

In [None]:
#| export
def genstr_to_seq(genstr,refseq):
    """Applies the mutations of a genotype string to `refseq` and returns the mutated sequence.

    Positions are indices into `refseq`; the mutations are assumed to be listed
    in increasing position order. An entry whose original base is '-' inserts
    its new base in front of `refseq[position]`; an entry whose new base is '-'
    drops `refseq[position]`; any other entry replaces `refseq[position]`.
    """
    mutated = ''
    cursor = 0  # index of the first reference base not handled yet
    for ref_base, pos, new_base in str_to_mut(genstr):
        mutated += refseq[cursor:pos]
        is_insertion = ref_base == "-"
        if is_insertion or new_base != "-":
            mutated += new_base
        # An insertion consumes no reference base; the other cases consume one.
        cursor = pos if is_insertion else pos + 1
    return mutated + refseq[cursor:]

In [None]:
genstr='-3C,-3A,-6T,G11C,A18-'
genstr_to_seq(genstr,seqA)

'ATCCACGGTCAGCACGTCGTGGC'

In [None]:
#| hide
seqA = "ATCCGGCAGCAGGTCGTGAGC"
seqB = "ATCCACGGTCAGCACGTCGTGGC"
genstr=mut_to_str(get_mutations(seqA,seqB))
assert(genstr_to_seq(genstr,seqA)==seqB)

seqA = "AGCGCTATGCTGCGCGCGTACTGCCGCTAGCTATGCTCAGGCCGATATATGCGAGC"
seqB = "AGCGCTAGCATGGTGCGCGCGTACTGCAGCTAGCTATGCAGGCCGATATATGCAAGC"
genstr=mut_to_str(get_mutations(seqA,seqB))
assert(genstr_to_seq(genstr,seqA)==seqB)

In [None]:
s="AGCGAGC"
print(len(s)%3)
s[:-(len(s)%3)]

1


'AGCGAG'

In [None]:
# | export
def reverse_complement(dna):
    """Returns the reverse complement of a DNA string, in upper case.

    Handles A, C, G and T, plus '-' and 'N' which are kept as they are.
    Any other character raises a KeyError."""
    pairing = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C','-': '-','N': 'N'}
    return ''.join(pairing[base] for base in reversed(dna.upper()))



In [None]:
#| export
def get_prot_mut(genstr,refseq,frame=0,ori=1):
    """Translates a DNA genotype into the amino-acid changes it causes.

    The genotype is applied to `refseq` with `genstr_to_seq`. When `ori` is -1
    both the reference and the mutated sequence are reverse-complemented first.
    Each sequence is then read from offset `frame`, trimmed to whole codons and
    translated; the two proteins are cut to the length of the shorter one and
    compared position by position.
    Returns the differences as a genotype string (see `mut_to_str`), with
    positions counted in the translated protein.
    """
    def _in_frame(seq):
        # Skip the first `frame` bases and drop an incomplete codon at the end.
        leftover = (len(seq) - frame) % 3
        return seq[frame:-leftover] if leftover else seq[frame:]

    mut_seq = genstr_to_seq(genstr, refseq)
    if ori == -1:
        refseq = reverse_complement(refseq)
        mut_seq = reverse_complement(mut_seq)

    mut_seq_inframe = _in_frame(mut_seq)
    refseq_inframe = _in_frame(refseq)

    mut_prot = Seq(mut_seq_inframe).translate()
    ref_prot = Seq(refseq_inframe).translate()
    shared_len = min(len(mut_prot), len(ref_prot))
    return mut_to_str(
        get_mutations_noalign(ref_prot[:shared_len], mut_prot[:shared_len])
    )


In [None]:
seqA = "AGCGCTATGCTGCGCGCGTACTGCCGCTAGCTATGCTCAGGCCGATATATGCGAGCA"
seqB = "AGCGCTATGCTGCGCGCGAAAACCCGCTAGCTATGCTCAGGTCGATATATGCGAGCA"
genstr=mut_to_str(get_mutations(seqA,seqB,gap_open=-5))
print(genstr)
get_prot_mut(genstr,seqA,frame=1)

T18A,C20A,T21A,G22C,C41T


'T6K,A7P,A13V'

In [None]:
genstr='A61G,-63T,A79T'
refseq='CGCCTTGGTAGCCATCTTCAGTTCCAGTGTTTGCTTCAAATACTAAGTATTTGTGGCCTTTATCTTCTACGTAGTGAGGATCTCTCAGCGTATGGTTGTCGCCTGAGCTGTAGTTGCCTTCATCGATGAACTGCTGTAC'
ori=-1
frame=0
get_prot_mut(genstr,refseq,frame=frame,ori=ori)

'D19E,D25E,K26Q,G27R,H28P,K29Q,Y30I,V32S,F33I,E34*,A35S,N36K,T37H,G38W,T39N,E40*,D41R,G42W,Y43L,Q44P,G45R'

In [None]:
genstr='A61G,A79T'
refseq='CGCCTTGGTAGCCATCTTCAGTTCCAGTGTTTGCTTCAAATACTAAGTATTTGTGGCCTTTATCTTCTACGTAGTGAGGATCTCTCAGCGTATGGTTGTCGCCTGAGCTGTAGTTGCCTTCATCGATGAACTGCTGTAC'
ori=-1
frame=0
get_prot_mut(genstr,refseq,frame=frame,ori=ori)

'D19E'

In [None]:
#| export
def parse_genotypes(genotypes_file):
    """Reads a tab-separated genotype table into a list of `(genotype, count)` tuples.

    Each non-empty line holds the genotype string in the first column and an
    integer count in the second; any further column is ignored. Blank lines
    (for instance a trailing empty line) are skipped.

    Args:
        genotypes_file (str): Path to the tab-separated file.

    Returns:
        list: `(genotype, count)` tuples in file order.

    Raises:
        IndexError: If a non-empty line has fewer than two columns.
        ValueError: If the second column is not an integer.
    """
    # newline="" lets the csv module handle line endings itself, as its documentation requires.
    with open(genotypes_file, "r", newline="") as handle:
        reader = csv.reader(handle, delimiter='\t')
        # csv.reader yields an empty list for a blank line; indexing it would crash.
        return [(row[0], int(row[1])) for row in reader if row]

In [None]:
from dgrec.example_data import get_example_data_dir

In [None]:
data_path=get_example_data_dir()
gen_list=parse_genotypes(os.path.join(data_path,"sacB_genotypes.csv"))
for g,n in itertools.islice(gen_list,30,40):
    print(n,"\t",g)

20 	 A72G,A79G
19 	 A72G,A79T,A91G
17 	 T67G,A91G
17 	 A76G,A79T
17 	 A68C,A72G
17 	 A111G
16 	 A68G,A91G
16 	 A86G,A91T
15 	 A72G,A91T
15 	 A79G,A86G


In [None]:
#| export
def get_aa_mut_list(gen_list,refseq, frame=0, ori=1):
    """Groups genotype counts by the amino-acid changes they produce.

    `gen_list` holds `(genotype, count)` pairs. Genotypes containing '-'
    (insertions or deletions, which would shift the reading frame) or 'N' are
    ignored. Every other genotype is converted with `get_prot_mut` and the
    counts of genotypes giving the same protein change are added together.
    Returns a list of `(protein_mutations, count)` sorted by decreasing count.
    """
    counts_by_aa_mut = {}
    for gen, n in gen_list:
        # Skip indels and ambiguous bases.
        if "-" in gen or "N" in gen:
            continue
        aa_mut = get_prot_mut(gen, refseq, frame=frame, ori=ori)
        counts_by_aa_mut[aa_mut] = counts_by_aa_mut.get(aa_mut, 0) + n
    return sorted(counts_by_aa_mut.items(), key=lambda item: item[1], reverse=True)


In [None]:
read_ref_file="sacB_ref.fasta"
refseq=next(SeqIO.parse(os.path.join(data_path,read_ref_file),"fasta"))
refseq=str(refseq.seq)
aa_mut_list=get_aa_mut_list(gen_list,refseq,ori=-1)
aa_mut_list[:10]

[('', 43341),
 ('Y22H', 351),
 ('H15Q', 277),
 ('D19E', 246),
 ('L17P', 200),
 ('V23A', 162),
 ('S11P', 117),
 ('D25E', 113),
 ('D19E,Y22H', 75),
 ('T16P', 61)]

In [None]:
#| export

def downsample_fastq_gz(input_file, output_file, num_reads=10000):
    """Copies the first `num_reads` reads of a gzipped FASTQ file into a new gzipped file.

    Reads are taken from the top of the file (there is no random sampling),
    counting four lines per read.

    Args:
        input_file (str): Path to the input FASTQ.gz file.
        output_file (str): Path to the output FASTQ.gz file.
        num_reads (int, optional): Number of reads to keep. Defaults to 10000.
    """

    with gzip.open(input_file, 'rb') as source, gzip.open(output_file, 'wb') as sink:
        # Four lines per read: keep only the leading num_reads * 4 lines.
        sink.writelines(itertools.islice(source, num_reads * 4))

In [None]:

input_file=os.path.join(data_path,"sacB_example.fastq.gz")
output_file="sacB_example_downsampled.fastq.gz"
downsample_fastq_gz(input_file, output_file, num_reads=100)



In [None]:
#| export
def get_basename_without_extension(file_path):
    """
    Returns the file name of a path with its last extension removed.

    Only the part after the final dot is dropped, so "archive.tar.gz" gives
    "archive.tar". A name without any dot is returned unchanged.

    Args:
        file_path (str): The path to the file.

    Returns:
        str: The basename of the file without the extension.
    """

    basename = os.path.basename(file_path)
    stem, dot, _extension = basename.rpartition('.')
    # rpartition leaves `dot` empty when the name holds no dot at all.
    return stem if dot else basename

In [None]:
# Example usage
file_path = "C:/Users/John/Documents/my_file.txt"
basename_without_extension = get_basename_without_extension(file_path)
print(basename_without_extension)  # Output: my_file

my_file


In [None]:
#| hide
# Remove test files

# Only delete the file written by the downsampling example in this notebook.
# Removing every "*.gz" found in the working directory could destroy unrelated data.
generated_files = ["sacB_example_downsampled.fastq.gz"]

for file in generated_files:
    try:
        # Delete the file
        os.remove(file)
    except PermissionError:
        print(f"Permission denied to delete file '{file}'.")
    except FileNotFoundError:
        print(f"File '{file}' not found.")

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()

In [None]:
#| export
def pickle_save(data_in,file_name_out):
    """Serialises `data_in` with pickle and writes it to `file_name_out`.

    Args:
        data_in: Any picklable object.
        file_name_out (str): Path of the file to create or overwrite.
    """
    # The context manager closes the file even when pickling fails part-way.
    with open(file_name_out, "wb") as pickle_out:
        pickle.dump(data_in, pickle_out)
    


In [None]:
#| export
def pickle_load(file_name_in):
    """Loads and returns the object stored in the pickle file `file_name_in`.

    Args:
        file_name_in (str): Path to a file written with pickle.

    Returns:
        The unpickled object.
    """
    # NOTE: unpickling can run arbitrary code; only load files from a trusted source.
    # The context manager makes sure the file handle is closed again.
    with open(file_name_in, "rb") as pickle_in:
        return pickle.load(pickle_in)

In [None]:
# | export
def make_dgr_oligos(target:str #TR DNA
                    ,split_number:int #Number of desired splits
                    ):
    """Split the TR target into the input number and then generates the oligos to order.

    The target is upper-cased and cut at multiples of `len(target)//split_number`.
    The 4 bases around a split point (`target[k-2:k+2]`) form the junction
    overhang. If that overhang is rejected, the split point is moved by one
    base (+1 from an even position, -1 from an odd one, printing '+1' or '-1').

    Returns a list alternating forward and reverse oligos, one pair per
    fragment. The first forward oligo is prefixed with 'ATAA' and the last
    reverse oligo with 'CAGA'. Reverse oligos are built with `reverse_complement`.

    Raises:
        ValueError: If both the nominal and the shifted split point give a
            rejected overhang (the previous implementation looped forever).
        IndexError: If `split_number` is below 2, since no split point exists.
    """
    # NOTE(review): every entry is its own reverse complement; 'AGCT', the only
    # other 4-mer with that property, is absent - confirm this is intentional.
    bad_overhangs=['AATT', 'ATAT', 'TATA', 'TTAA', 'ACGT', 'CATG', 'CTAG', 'GATC', 'GTAC', 'TCGA', 'TGCA', 'CCGG', 'CGCG', 'GCGC', 'GGCC']
    # NOTE(review): 'ATAA' is the prefix of the first forward oligo. The last
    # reverse oligo starts with 'CAGA', whose reverse complement is 'TCTG',
    # not the 'TCAG' excluded here - confirm which one is meant.
    rejected_overhangs = set(bad_overhangs + ['ATAA', 'TCAG'])
    target=target.upper()
    split=len(target)//split_number

    split_k_list=[]
    for k in range(1,split_number):
        split_k=k*split
        if target[split_k-2:split_k+2] in rejected_overhangs:
            if split_k%2 == 0:
                split_k += 1
                print('+1')
            else:
                split_k -= 1
                print('-1')
            if target[split_k-2:split_k+2] in rejected_overhangs:
                # Shifting again would only return to the first position.
                raise ValueError(
                    f"No acceptable overhang found around split {k} "
                    f"(position {k*split}) of the target."
                )
        split_k_list.append(split_k)

    first_split, last_split = split_k_list[0], split_k_list[-1]

    forward_list=['ATAA'+target[:first_split-2]]
    reverse_list=[reverse_complement(target[:first_split+2])]

    # Inner fragments: forward and reverse oligos are offset by the 4-base overhang.
    for left, right in zip(split_k_list, split_k_list[1:]):
        forward_list.append(target[left-2:right-2])
        reverse_list.append(reverse_complement(target[left+2:right+2]))

    forward_list.append(target[last_split-2:])
    reverse_list.append('CAGA'+reverse_complement(target[last_split+2:]))

    full_list=[]
    for forward, reverse in zip(forward_list, reverse_list):
        full_list.extend((forward, reverse))

    return full_list

In [None]:
target='CCTCAGATACAAGCCGGCATAAATAATAACATATTCTATGACCATGATAATAGTGTAGGTGCAAACGCCAACGCTAAAAACACTGGAACCATGAACGGTAATACTGCAGGGACGAATATAGCCAAAACTTCT'
make_dgr_oligos(target,4)

['ATAACCTCAGATACAAGCCGGCATAAATAATAACA',
 'AATATGTTATTATTTATGCCGGCTTGTATCTGAGG',
 'TATTCTATGACCATGATAATAGTGTAGGTGCAA',
 'GCGTTTGCACCTACACTATTATCATGGTCATAG',
 'ACGCCAACGCTAAAAACACTGGAACCATGAACG',
 'TTACCGTTCATGGTTCCAGTGTTTTTAGCGTTG',
 'GTAATACTGCAGGGACGAATATAGCCAAAACTTCT',
 'CAGAAGAAGTTTTGGCTATATTCGTCCCTGCAGTA']

In [None]:
# | export
def reverse_comp_geno_list(geno_list:list # List of genotypes
                           ,ref_seq:str #string of the template sequence
                           ):
    """Rewrites a list of `(genotype, count)` pairs in reverse-complement coordinates.

    Each mutation has both bases complemented and its position mirrored on
    `ref_seq`; the mutation order inside a genotype is reversed so positions
    stay increasing. Counts of genotypes that end up identical are added
    together, including the empty (wild-type) genotype.

    A substitution or deletion at position `p` moves to `len(ref_seq)-p-1`.
    An insertion recorded at `p` sits in front of base `p` (the convention used
    by `genstr_to_seq`), so on the other strand it sits in front of
    `len(ref_seq)-p`.

    Returns a list of `(genotype, count)` tuples in order of first appearance.
    """
    l=len(ref_seq)
    gene_rev_dic={}
    for geno in geno_list:
        genotype, umi_count = geno[0], geno[1]
        if genotype=='':
            revgen=''
        else:
            rev_mut_list=[]
            for mut in genotype.split(','):
                old_base=mut[0]
                new_base=mut[-1]
                position=int(mut[1:-1])
                if old_base=='-':
                    # Insertion: lies between bases p-1 and p, i.e. before l-p once mirrored.
                    rev_position=l-position
                else:
                    rev_position=l-position-1
                rev_mut_list.append(reverse_complement(old_base)+str(rev_position)+reverse_complement(new_base))
            revgen=','.join(rev_mut_list[::-1])
        # Accumulate for every genotype; the empty one used to be overwritten.
        gene_rev_dic[revgen]=gene_rev_dic.get(revgen,0)+umi_count

    return list(gene_rev_dic.items())


In [None]:
# | export
def remove_position(geno,pos_list):
    """Drops from a genotype string every mutation whose position is in `pos_list`.

    Args:
        geno (str): Comma separated genotype such as 'T19C,T64A'.
        pos_list: Collection of integer positions to remove.

    Returns:
        str: The remaining mutations, comma separated ('' when none is left
        or when `geno` is empty).
    """
    # An empty genotype has no mutation to parse; int('') would raise otherwise.
    if geno=='':
        return ''
    return ','.join(mut for mut in geno.split(',') if int(mut[1:-1]) not in pos_list)



In [None]:
# | export
def remove_position_list(geno_list,pos_list):
    """Applies `remove_position` to every genotype of a `(genotype, count)` list.

    Empty genotypes are passed through untouched. Counts are kept as they are
    and entries are not merged, so several entries may end up with the same
    genotype string.
    """
    filtered = []
    for entry in geno_list:
        genotype, count = entry[0], entry[1]
        if genotype != '':
            genotype = remove_position(genotype, pos_list)
        filtered.append((genotype, count))
    return filtered

In [None]:
ref_genome='AACGTATACGGCGGAATATTTGCCGAATGCCGTGTGGACGTAAGCGTGAACGTCAGGATCACGTTTCCCCGACCCGCTGGCATGTCAACAATACGGGAGAACACCTGTACCGCCTCGTTCGCCGCGC'
geno_list_test=[('T19C', 176012),
 ('T19C,T64A,T65A', 169),
 ('T19C,G36T', 40),
 ('T19C,T58G,T63A,T64G', 4),
 ('T19C,A42C', 14),
 ('T19C,T63G', 13),
 ('T19C,T52A,T64A,T65A', 19),
 ('T19C,A41C,A57C,T58C', 1),
 ('T19C,T52A', 94),
 ('T19C,T64A,T65G', 214),
 ('T19C,T63A,T64C,T65A', 2),
 ('T19C,A49C,T64C,T91C', 2),
 ('T19C,T32C,T52C,T64A,T65G,T84C,T91A', 1),
 ('T19C,T82C,T84A', 8),
 ('T19C,T64C', 308),
 ('T19C,T40C,G43A,T58A,A71C,T91C', 1),
 ('T52A,T77C', 1),
 ('T19C,T52C,T58A', 9),
 ('T20C,-52C,T58C,A71C,T77C', 1),
 ('T19C,G70T', 110),
 ('T19C,T32C,T65C,T82C', 1),
 ('T19C,T32C,T46C', 4),
 ('T19C,T32C,A41C,A49C,T64A,T65G', 1),
 ('T19C,T64A', 115),
 ('T19C,T58C', 68)]

geno_list_test=remove_position_list(geno_list_test,[19])
reverse_comp_geno_list(geno_list_test,ref_genome)

[('', 176012),
 ('A61T,A62T', 169),
 ('C90A', 40),
 ('A62C,A63T,A68C', 4),
 ('T84G', 14),
 ('A63C', 13),
 ('A61T,A62T,A74T', 19),
 ('A68G,T69G,T85G', 1),
 ('A74T', 94),
 ('A61C,A62T', 214),
 ('A61T,A62G,A63T', 2),
 ('A35G,A62G,T77G', 2),
 ('A35T,A42G,A61C,A62T,A74G,A94G', 1),
 ('A42T,A44G', 8),
 ('A62G', 308),
 ('A35G,T55G,A68T,C83T,A86G', 1),
 ('A49G,A74T', 1),
 ('A68T,A74G', 9),
 ('A49G,T55G,A68G,-74G,A106G', 1),
 ('C56A', 110),
 ('A44G,A61G,A94G', 1),
 ('A80G,A94G', 4),
 ('A61C,A62T,T77G,T85G,A94G', 1),
 ('A62T', 115),
 ('A68G', 68)]