In [23]:
import csv

In [24]:
def hamming_distance(seq1, seq2): #TESTED
    """
    Counts the positions at which two equally long nucleotide sequences differ.

    Arguments:
    seq1 -- a string representing the first nucleotide sequence
    seq2 -- a string representing the second nucleotide sequence

    Returns:
    The Hamming distance, i.e. the number of positions whose nucleotides
    are not the same in seq1 and seq2.

    Raises:
    ValueError -- if the sequences differ in length, or if either of them
                  contains a character other than A, T, G or C (upper case).
    """
    if len(seq1) != len(seq2):
        raise ValueError("Sequence lengths must be equal")

    # Both inputs are validated before any comparison takes place
    allowed = frozenset("ATGC")
    for candidate in (seq1, seq2):
        if not allowed.issuperset(candidate):
            raise ValueError("Sequences must contain only A, T, G, or C nucleotides")

    # Walk both sequences in lockstep and count the mismatching pairs
    return sum(1 for left, right in zip(seq1, seq2) if left != right)


In [25]:
def read_fasta_file(filename): #TESTED
    """
    Loads the sequence stored in a FASTA file as one upper-case string.

    Every line that starts with ">" (a header line) is skipped; all other
    lines are stripped of surrounding whitespace and concatenated in file
    order, so several records in one file end up joined together.

    Arguments:
    filename -- the name of the FASTA file to read

    Returns:
    A string with the concatenated sequence, converted to upper case.
    """
    with open(filename, "r") as handle:
        fragments = [row.strip() for row in handle if not row.startswith(">")]
    # Upper case keeps the result compatible with the downstream code (e.g. genetic table search)
    return "".join(fragments).upper()

In [26]:
def reverse_complement(sequence): #TESTED
    """
    Builds the reverse complement of a nucleotide sequence.

    Arguments:
    sequence -- a string representing the nucleotide sequence (upper-case A, T, C, G)

    Returns:
    The reverse complement of the input sequence, as a string.

    Raises:
    ValueError -- on the first character (scanning from the end of the
                  sequence) that is not one of A, T, C, G.
    """
    pairing = {"A": "T", "T": "A", "C": "G", "G": "C"}
    pieces = []
    # Read the input back to front so the complement comes out already reversed
    for nucleotide in reversed(sequence):
        partner = pairing.get(nucleotide)
        if partner is None:
            raise ValueError("Invalid nucleotide: {}".format(nucleotide))
        pieces.append(partner)
    return "".join(pieces)


In [27]:
def search_sequence(sequence, position, target_sequence, n): #TESTED.  NOT USED
    """
    Scans backwards (towards index 0) from a given position and collects the
    first N places where a target sequence starts.

    Arguments:
    sequence -- a string representing the sequence to search
    position -- an integer representing the starting position (1-based)
    target_sequence -- a string representing the sequence to find
    n -- an integer representing the number of occurrences to find

    Returns:
    A list of 0-based start indices of the matches, in the order they were
    found (closest to the starting position first, NOT reversed), padded
    with -1 for every occurrence that could not be found.
    """
    width = len(target_sequence)
    hits = []
    remaining = n
    # position is 1-based, so the scan starts at 0-based index position - 1
    for start in range(position - 1, -1, -1):
        if remaining <= 0:
            break
        if sequence[start:start + width] == target_sequence:
            hits.append(start)
            remaining -= 1
    # Fill the slots of the occurrences that were never found
    while remaining > 0:
        hits.append(-1)
        remaining -= 1
    return hits

In [6]:
def translate(sequence): #TESTED
    """
    Converts a nucleotide sequence into the amino acid sequence it encodes.

    The sequence is read in consecutive, non-overlapping triplets starting at
    index 0. Stop codons are written as '_'. Any triplet that is not in the
    table (lower case, ambiguous bases, or a trailing fragment shorter than
    three nucleotides) is written as 'X'.

    Arguments:
    sequence -- a string representing the nucleotide sequence

    Returns:
    A string representing the corresponding amino acid sequence

    Side effects:
    Prints a warning when the sequence length is not a multiple of 3.
    """
    # Standard genetic code, grouped by first base (T, C, A, G)
    codon_to_aa = {
        'TTT':'F', 'TTC':'F', 'TTA':'L', 'TTG':'L',
        'TCT':'S', 'TCC':'S', 'TCA':'S', 'TCG':'S',
        'TAT':'Y', 'TAC':'Y', 'TAA':'_', 'TAG':'_',
        'TGT':'C', 'TGC':'C', 'TGA':'_', 'TGG':'W',
        'CTT':'L', 'CTC':'L', 'CTA':'L', 'CTG':'L',
        'CCT':'P', 'CCC':'P', 'CCA':'P', 'CCG':'P',
        'CAT':'H', 'CAC':'H', 'CAA':'Q', 'CAG':'Q',
        'CGT':'R', 'CGC':'R', 'CGA':'R', 'CGG':'R',
        'ATT':'I', 'ATC':'I', 'ATA':'I', 'ATG':'M',
        'ACT':'T', 'ACC':'T', 'ACA':'T', 'ACG':'T',
        'AAT':'N', 'AAC':'N', 'AAA':'K', 'AAG':'K',
        'AGT':'S', 'AGC':'S', 'AGA':'R', 'AGG':'R',
        'GTT':'V', 'GTC':'V', 'GTA':'V', 'GTG':'V',
        'GCT':'A', 'GCC':'A', 'GCA':'A', 'GCG':'A',
        'GAT':'D', 'GAC':'D', 'GAA':'E', 'GAG':'E',
        'GGT':'G', 'GGC':'G', 'GGA':'G', 'GGG':'G',
    }
    total = len(sequence)
    if total % 3:
        print("Warning: Sequence length is not a multiple of 3.")
    # 'X' marks an unknown amino acid
    residues = [codon_to_aa.get(sequence[start:start + 3], 'X') for start in range(0, total, 3)]
    return ''.join(residues)


In [28]:
def find_n_minima(array1, array2, n): #NOT TESTED, NOT USED
    """
    Picks the N smallest values found across two arrays and reports which
    array each of them came from.

    Arguments:
    array1 -- a list representing the first input array
    array2 -- a list representing the second input array
    n -- an integer representing the number of minima to find

    Returns:
    A list of at most n tuples, ordered by increasing value (ties are broken
    by position, array1 before array2). Each tuple contains:
    - the minimum value
    - the index of that value inside the array it came from
    - an integer indicating from which array the value came:
      - 0 for array1
      - 1 for array2
    """
    split = len(array1)
    merged = array1 + array2
    # Sort (value, position) pairs so equal values keep their positional order
    ranked = sorted((value, pos) for pos, value in enumerate(merged))
    picked = []
    for value, pos in ranked:
        if len(picked) == n:
            break
        origin = 0 if pos < split else 1
        # Positions past the end of array1 are shifted back to array2's own indexing
        local_pos = pos if origin == 0 else pos - split
        picked.append((value, local_pos, origin))
    return picked


In [9]:
def find_synonymous_coding_sequences(aa_sequence): #TESTED
    """
    Finds all nucleotide sequences that give the same amino acid sequence as the given
    amino acid sequence.

    The result grows multiplicatively with the sequence length (up to six
    codons per residue), so this is only meant for short peptides such as the
    one or two codons around a PAM.

    Arguments:
    aa_sequence -- a string representing the amino acid sequence
                   (one-letter codes, '_' for a stop codon)

    Returns:
    A list of nucleotide sequences that give the same amino acid sequence as the
    given amino acid sequence. The sequence actually present in the genome, if it
    codes for aa_sequence, is part of the list. An empty aa_sequence gives [''].

    Raises:
    KeyError -- if aa_sequence contains a symbol that is not in the table
                (for example the 'X' used for an unknown amino acid).
    """
    # Define the reverse genetic code table
    genetic_code = {
        'A': ['GCT', 'GCC', 'GCA', 'GCG'],
        'C': ['TGT', 'TGC'],
        'D': ['GAT', 'GAC'],
        'E': ['GAA', 'GAG'],
        'F': ['TTT', 'TTC'],
        'G': ['GGT', 'GGC', 'GGA', 'GGG'],
        'H': ['CAT', 'CAC'],
        'I': ['ATT', 'ATC', 'ATA'],
        'K': ['AAA', 'AAG'],
        'L': ['TTA', 'TTG', 'CTT', 'CTC', 'CTA', 'CTG'],
        'M': ['ATG'],
        'N': ['AAT', 'AAC'],
        'P': ['CCT', 'CCC', 'CCA', 'CCG'],
        'Q': ['CAA', 'CAG'],
        'R': ['CGT', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG'],
        'S': ['TCT', 'TCC', 'TCA', 'TCG', 'AGT', 'AGC'],
        'T': ['ACT', 'ACC', 'ACA', 'ACG'],
        'V': ['GTT', 'GTC', 'GTA', 'GTG'],
        'W': ['TGG'],
        'Y': ['TAT', 'TAC'],
        '_': ['TAA', 'TAG', 'TGA']
    }

    # Extend every sequence built so far with every codon of the next residue.
    # The table above is by construction the inverse of the standard codon table,
    # so each listed codon is already known to code for its key and does not
    # need to be translated back for verification.
    nucleotide_seqs = ['']
    for aa in aa_sequence:
        codons = genetic_code[aa]
        nucleotide_seqs = [prefix + codon for prefix in nucleotide_seqs for codon in codons]

    return nucleotide_seqs

In [29]:
def saturate_mutagenesis(codon): #TESTED
    """
    Performs saturation mutagenesis on the given codon: enumerates every
    nucleotide triplet over A, C, G, T other than the codon itself and
    translates each of them.

    Arguments:
    codon -- a string representing a codon of nucleotides

    Returns:
    A dictionary with keys as nucleotide triplets and values as the amino acids
    translated from those triplets, covering all triplets except the input codon
    (63 entries when the input is one of the 64 upper-case triplets, 64 otherwise).
    Keys are inserted in A, C, G, T order, first position varying slowest.
    """
    alphabet = 'ACGT'
    triplets = (first + second + third
                for first in alphabet
                for second in alphabet
                for third in alphabet)
    # The input codon is left out: only real substitutions are reported
    return {triplet: translate(triplet) for triplet in triplets if triplet != codon}


In [11]:
def add_row_to_csv(file_path, row): #TESTED
    """
    Appends one row to the end of a CSV file.

    The file is opened in append mode on every call, so earlier content is
    kept and the file is created if it does not exist yet.

    Args:
        file_path (str): The path to the CSV file.
        row (list): The row to add to the CSV file.

    Returns:
        None
    """
    # newline="" leaves line-ending handling to the csv writer
    with open(file_path, mode="a", newline="") as handle:
        csv.writer(handle).writerow(row)

In [56]:
def find_occurrences(sequence, search_string, start_position, max_distance): #TESTED
    """
    Finds occurrences of a string in a given sequence of characters, searching in the reverse direction
    from a given starting position, up to a maximum distance.

    A candidate match is the substring that ENDS just before index i, for i running
    from start_position down to start_position - max_distance + 1. Candidates that would
    start before index 0 are not considered: without that limit the negative slice bounds
    would wrap around and report matches located at the end of the sequence.

    Args:
        sequence (str): The sequence of characters to search in.
        search_string (str): The string to search for.
        start_position (int): The starting position of the search (exclusive end index of the first candidate).
        max_distance (int): The maximum distance to search from the starting position.

    Returns:
        A list of 0-based start indices where the search string was found, ordered from the
        closest to start_position to the farthest, or an empty list if it was not found.
    """
    search_length = len(search_string)

    # Exclusive lower bound for the end index: never look further back than max_distance,
    # and never let the start index (end - search_length) drop below 0
    stop = max(start_position - max_distance, search_length - 1)

    return [
        end - search_length
        for end in range(start_position, stop, -1)
        if sequence[end - search_length:end] == search_string
    ]

In [31]:
def find_string_differences(str1, str2): #TESTED
    """
    Finds the region between the first and last differences between two strings of equal length.

    Args:
        str1 (str): The first string (original sequence).
        str2 (str): The second string (mutated sequence).

    Returns:
        A string with the structure
        <up to 100 characters before the region>(<region of str1>/<region of str2>)<up to 100 characters after the region>
        where both flanks are taken from str1 and are shorter than 100 characters when the
        region lies closer than that to the start or the end of the string.
        If the strings are identical, the tuple ("", "") is returned instead (kept as is
        for backward compatibility with existing callers).

    Raises:
        ValueError: If the two strings do not have the same length.
    """
    # Check that the strings are the same length
    if len(str1) != len(str2):
        raise ValueError("Input strings must be the same length")

    # Positions at which the two strings disagree
    diff_positions = [i for i, (a, b) in enumerate(zip(str1, str2)) if a != b]

    # If there are no differences, return empty strings
    if not diff_positions:
        return "", ""

    first_diff_index = diff_positions[0]
    last_diff_index = diff_positions[-1]
    flank_length = 100

    # Get the regions between the first and last differences
    region1 = str1[first_diff_index:last_diff_index+1]
    region2 = str2[first_diff_index:last_diff_index+1]

    # Clamp the start of the upstream flank at 0: a negative start index would wrap
    # around to the end of the string and silently drop (or corrupt) the flank
    before_region = str1[max(0, first_diff_index-flank_length):first_diff_index]
    after_region = str1[last_diff_index+1:last_diff_index+1+flank_length]

    return before_region + '(' + region1 + '/' + region2 + ')' + after_region

In [134]:
def generate_pegRNA(seq, aa_pos, pam_pos, mut_aa_nts, first_nt, print_search = 1): #TESTED
    """Generates candidate edited sequences for one codon substitution and one sense-strand PAM ('GG' at pam_pos).

    The function works in up to three stages, each one only entered if the previous one did not disrupt the PAM:
      1. Apply the codon substitution. If the 'GG' at pam_pos is gone afterwards, the substitution alone is
         reported when it changes more than one nucleotide; when it changes exactly one nucleotide, synonymous
         codon changes in the 1-3 codons before the PAM codon are added and each combination is reported.
      2. Otherwise replace the codon(s) containing the 'GG' by synonymous codons that no longer have 'GG'
         at that position (skipped when both G's lie inside the codon being mutated).
      3. Otherwise replace the codon directly before the PAM codon by each of its synonymous codons.

    Args:
        seq(str) - the string sequence of the whole exon4 and the surrounding
        aa_pos (int) - position of the first nucleotide of the AA to be mutated (0-based indexing)
        pam_pos (int) - position of the first G nucleotide of the PAM sequence (0-based indexing), note that we neglect N in NGG
        mut_aa_nts (str) - nucleotide triplet coding for the desired AA mutation
        first_nt (int) - first nucleotide of the first amino-acid in the fasta file to be mutated (needed for defining reference ORF)
        print_search (bool) - whether to print the search process or not (printing happens only when equal to 1)

    Returns:
        output - a list of tuples (mutated sequence (str), pam_mutated, seed_mutated, additional_mutations), where the
                 three flags are 0/1 integers; additional_mutations marks synonymous changes added to ensure Hamming distance > 1.
                 The list is empty when no acceptable candidate was found.

    Raises:
        Exceptions from the helpers propagate: ValueError from hamming_distance (e.g. mut_aa_nts not 3 valid
        nucleotides) and KeyError from find_synonymous_coding_sequences when a codon translates to 'X'.
    """

    output = []
    pam_mutated = 0
    seed_mutated = 0
    additional_mutations = 0
    cannot_mutate_pam = 0
    #Apply the requested codon substitution; the PAM check below is done on this edited copy
    mut_seq = seq[0:aa_pos] + mut_aa_nts + seq[aa_pos+3:] #whole sequence

    if(print_search == 1):
        print('PAM position')
        print(pam_pos)
        print('Original AA:')
        print(seq[aa_pos:aa_pos+3])
        print('Mutated AA')
        print(mut_aa_nts)

    #Stage 1: check if PAM is already mutated by the AA mutation
    if(mut_seq[pam_pos:pam_pos+2] != 'GG'):
        pam_mutated = 1

        if(print_search == 1):
            print('PAM mutated by AA mutation')

        #In rare cases where PAM is disrupted by the AA mutation that has one nucleotide change,
        #we have to mutate one AA more to ensure Hamming > 1
        hamming = hamming_distance(seq, mut_seq)

        if(hamming > 1):
            output.append((mut_seq, pam_mutated, seed_mutated, additional_mutations))

        if(hamming == 1):
            max_aa_cnt = 3 #We go up to max_aa amino-acids upstream of PAM to search for synonymous mutations
            cnt = 0
            #Identify index of AA of the first G in PAM (start of its codon in the reading frame anchored at first_nt)
            aa1_index = (pam_pos-first_nt)//3*3+first_nt

            while (cnt < max_aa_cnt):
                additional_mutations = 0
                #Codon located cnt+1 codons before the PAM-containing codon
                potential_aa_nts = mut_seq[aa1_index-3-cnt*3:aa1_index-cnt*3]
                potential_syn_aas = find_synonymous_coding_sequences(translate(potential_aa_nts))
                if(print_search == 1):
                    print(f'Currently looking {cnt+1} AAs upstream of PAM')
                if(len(potential_syn_aas) > 0):
                    if(print_search == 1):
                        print('Accepted additional mutations')
                    for pot_syn_aa in potential_syn_aas:
                        if(pot_syn_aa != potential_aa_nts): #Function that searches for synonymous mutation also outputs the original mutation so we check here whether they are different
                            #NOTE(review): mut_seq is overwritten in place, so the last substitution made for a
                            #smaller cnt is still present when the next codon is processed - confirm this is intended
                            mut_seq = mut_seq[0:aa1_index-3-cnt*3] + pot_syn_aa + mut_seq[aa1_index-cnt*3:]
                            additional_mutations = 1
                            if(print_search == 1):
                                print(pot_syn_aa)
                            output.append((mut_seq, pam_mutated, seed_mutated, additional_mutations))
                cnt = cnt+1
            #NOTE(review): if none of the three codons has a synonymous alternative, output stays empty here
            #and the later stages are skipped because pam_mutated is already 1
    elif(pam_pos >= aa_pos and pam_pos+2 <= aa_pos+3): #In this case PAM is inside the AA that we want to mutate and we cannot mutate it
        cannot_mutate_pam = 1
        if(print_search == 1):
            print('Cannot mutate PAM because it is inside AA that we are mutating')

    additional_mutations = 0
    #Stage 2: if PAM is not disrupted by this AA change, mutate it by inserting a synonymous mutation
    if(pam_mutated == 0):
        aa_starting_pos = [] #starting nucleotides of GG-containing AAs. Could be one or two
        #We assume that all PAMS are inside coding sequences as it is generally the case

        #Identify index of AA of the first G in PAM
        aa1_index = (pam_pos-first_nt)//3*3+first_nt
        aa_starting_pos.append(aa1_index)

        #Identify index of AA of the second G in PAM. If not the same as in first G, store it for potential synonymous exchange
        aa2_index = (pam_pos+1-first_nt)//3*3+first_nt
        if(aa2_index != aa1_index):
            aa_starting_pos.append(aa2_index)

        #Collect the original (unmutated) nucleotides of the PAM-containing codon(s): 3 or 6 nucleotides
        original_pam = ""
        for ind in aa_starting_pos:
            original_pam += seq[ind:ind+3]

        if(cannot_mutate_pam == 0):
            #List all possible mutations that mutate PAM containing AAs into synonymous AAs
            potential_pam_mutations = find_synonymous_coding_sequences(translate(original_pam))

            #In rare cases, the second G is inside the AA we want to mutate, so that position should not be touched:
            #keep the synonymous variants of the first codon only and pin the second codon to the requested mutation
            #NOTE(review): the mirrored case (first G inside the mutated codon, i.e. aa1_index == aa_pos with two
            #codons) is not handled; the replacement below would then write a codon synonymous to the ORIGINAL
            #amino acid over the requested mutation - confirm callers never pass pam_pos == aa_pos + 2
            if(len(aa_starting_pos) == 2 and aa2_index == aa_pos):
                new_pams = []
                for pot_pam in potential_pam_mutations:
                    new_pams.append(pot_pam[0:3]+mut_aa_nts)
                potential_pam_mutations = list(set(new_pams)) #Keep only unique ones (set() makes the order arbitrary)

            if(print_search == 1):
                print('Original PAM containing codon')
                print(original_pam)
                print('Potential PAM mutating codons')
                print(potential_pam_mutations)

            #Among those, keep only mutations that do not contain 'GG' from PAM any more
            good_pam_mutations = []
            relative_pam_index = pam_pos - aa1_index #PAM position relative to the start of the first PAM-containing AA
            for pot_pam in potential_pam_mutations:
                if(pot_pam[relative_pam_index:relative_pam_index+2] != 'GG'): #One G is OK, GG is not
                    good_pam_mutations.append(pot_pam)

            if(len(good_pam_mutations) > 0):
                pam_mutated = 1
                for good_pam_mut in good_pam_mutations:
                    #Each candidate replaces the same span (all PAM-containing codons), so candidates do not accumulate
                    mut_seq = mut_seq[0:aa_starting_pos[0]] + good_pam_mut + mut_seq[aa_starting_pos[-1]+3:]
                    output.append((mut_seq, pam_mutated, seed_mutated, additional_mutations))

            elif(print_search == 1):
                print('Cannot find synonymous mutations that disrupt PAM')

    #Stage 3 (aa1_index is always defined here: pam_mutated == 0 implies the stage 2 block above was entered)
    if(pam_mutated == 0):
        #Not possible to mutate PAM; mutate a first possible amino-acid upstream of PAM into a synonymous one
        max_aa_cnt = 1 #We search the seed region only for this mutation, which is only one aa upstream of PAM
        cnt = 0
        while (cnt < max_aa_cnt):
            seed_mutated = 0
            potential_aa_nts = mut_seq[aa1_index-3-cnt*3:aa1_index-cnt*3]
            potential_syn_aas = find_synonymous_coding_sequences(translate(potential_aa_nts))
            if(print_search == 1):
                print(f'Currently looking {cnt+1} AAs upstream of PAM')
            if(len(potential_syn_aas) > 0):
                if(print_search == 1):
                    print('Accepted seed mutations')
                for pot_syn_aa in potential_syn_aas:
                    if(pot_syn_aa != potential_aa_nts): #Function that searches for synonymous mutation also outputs the original mutation so we check here whether they are different
                        mut_seq = mut_seq[0:aa1_index-3-cnt*3] + pot_syn_aa + mut_seq[aa1_index-cnt*3:]
                        seed_mutated = 1
                        if(print_search == 1):
                            print(pot_syn_aa)
                        output.append((mut_seq, pam_mutated, seed_mutated, additional_mutations))
            cnt = cnt+1

    if(pam_mutated == 0 and seed_mutated == 0 and print_search == 1):
        print('Impossible to find mutations that mutate the given PAM or seed into a synonymous mutation')

    if(print_search == 1):
        print('################################################') #End of search for a given mutation
    return output
    

In [138]:
def generate_pegRNA_reverse_strand(seq, aa_pos, pam_pos, mut_aa_nts, first_nt, print_search = 1): #TESTED
    """This is the similar function as generate_pegRNA except it searches for pegRNA in the antisense direction.
        Positions in the function are still given with respect to sense strand.
        There are essentially 2 differences compared to the generate_pegRNA function:
        1. It searches for CC as PAM instead of GG
        2. If it doesn't find a PAM mutating synonymous codon, it proceeds to downstream codons (in the sense strand), instead of upstream in the generate_pegRNA function

    The function works in up to three stages, each one only entered if the previous one did not disrupt the PAM:
      1. Apply the codon substitution. If the 'CC' at pam_pos is gone afterwards, the substitution alone is
         reported when it changes more than one nucleotide; when it changes exactly one nucleotide, synonymous
         codon changes in the 1-3 codons after the PAM codon are added and each combination is reported.
      2. Otherwise replace the codon(s) containing the 'CC' by synonymous codons that no longer have 'CC'
         at that position (skipped when both C's lie inside the codon being mutated).
      3. Otherwise replace the codon directly after the PAM codon(s) by each of its synonymous codons.

    Args:
        seq(str) - the string sequence of the whole exon4 and the surrounding
        aa_pos (int) - position of the first nucleotide of the AA to be mutated (0-based indexing)
        pam_pos (int) - position on the sense strand of the first C of the 'CC' that corresponds to the antisense PAM (0-based indexing), note that we neglect N in NGG
        mut_aa_nts (str) - nucleotide triplet coding for the desired AA mutation
        first_nt (int) - first nucleotide of the first amino-acid in the fasta file to be mutated (needed for defining reference ORF)
        print_search (bool) - whether to print the search process or not (printing happens only when equal to 1)

    Returns:
        output - a list of tuples (mutated sequence (str), pam_mutated, seed_mutated, additional_mutations), where the
                 three flags are 0/1 integers; additional_mutations marks synonymous changes added to ensure Hamming distance > 1.
                 The list is empty when no acceptable candidate was found.

    Raises:
        Exceptions from the helpers propagate: ValueError from hamming_distance (e.g. mut_aa_nts not 3 valid
        nucleotides) and KeyError from find_synonymous_coding_sequences when a codon translates to 'X'.
    """

    output = []
    pam_mutated = 0
    seed_mutated = 0
    additional_mutations = 0
    cannot_mutate_pam = 0
    #Apply the requested codon substitution; the PAM check below is done on this edited copy
    mut_seq = seq[0:aa_pos] + mut_aa_nts + seq[aa_pos+3:] #whole sequence

    if(print_search == 1):
        print('PAM position')
        print(pam_pos)
        print('Original AA:')
        print(seq[aa_pos:aa_pos+3])
        print('Mutated AA')
        print(mut_aa_nts)

    #Stage 1: check if PAM is already mutated by the AA mutation
    if(mut_seq[pam_pos:pam_pos+2] != 'CC'):
        pam_mutated = 1
        if(print_search == 1):
            print('PAM mutated by AA mutation')
        hamming = hamming_distance(seq, mut_seq)

        if(hamming > 1):
            output.append((mut_seq, pam_mutated, seed_mutated, additional_mutations))

        if(hamming == 1): #In rare cases where PAM is disrupted by the AA mutation that has one nucleotide change, we have to mutate one AA more to ensure Hamming > 1
            max_aa_cnt = 3 #We go up to max_aa amino-acids away from the PAM (downstream in the sense strand) to search for synonymous mutations
            cnt = 0
            #Identify index of AA of the second C in PAM (start of its codon in the reading frame anchored at first_nt)
            aa2_index = (pam_pos+1-first_nt)//3*3+first_nt

            while (cnt < max_aa_cnt):
                additional_mutations = 0
                #Codon located cnt+1 codons after the PAM-containing codon
                potential_aa_nts = mut_seq[aa2_index+3+cnt*3:aa2_index+6+cnt*3]
                potential_syn_aas = find_synonymous_coding_sequences(translate(potential_aa_nts))
                if(print_search == 1):
                    print(f'Currently looking {cnt+1} AAs upstream of PAM')
                if(len(potential_syn_aas) > 0):
                    if(print_search == 1):
                        print('Accepted seed mutations')
                    for pot_syn_aa in potential_syn_aas: #Search through all potential synonymous mutations
                        if(pot_syn_aa != potential_aa_nts): #Function that searches for synonymous mutation also outputs the original mutation so we check here if they are different
                            #NOTE(review): mut_seq is overwritten in place, so the last substitution made for a
                            #smaller cnt is still present when the next codon is processed - confirm this is intended
                            mut_seq = mut_seq[0:aa2_index+3+cnt*3] + pot_syn_aa + mut_seq[aa2_index+6+cnt*3:]
                            #Flag the extra synonymous change in the reported tuple. This used to assign a
                            #misspelled name (additional_mutated), so the flag in the output was always 0.
                            additional_mutations = 1
                            if(print_search == 1):
                                print(pot_syn_aa)
                            output.append((mut_seq, pam_mutated, seed_mutated, additional_mutations))
                cnt = cnt+1

    elif(pam_pos >= aa_pos and pam_pos+2 <= aa_pos+3): #In this case PAM is inside the AA that we want to mutate and we cannot mutate it
        cannot_mutate_pam = 1
        if(print_search == 1):
            print('Cannot mutate PAM because it is inside AA that we are mutating')

    additional_mutations = 0
    #Stage 2: if PAM is not disrupted by this AA change, mutate it by inserting a synonymous mutation
    if(pam_mutated == 0):
        aa_starting_pos = [] #starting nucleotides of CC-containing AAs. Could be one or two
        #We assume that all PAMS are inside coding sequences as it is generally the case

        #Identify index of AA of the first C in PAM
        aa1_index = (pam_pos-first_nt)//3*3+first_nt
        aa_starting_pos.append(aa1_index)

        #Identify index of AA of the second C in PAM. If not the same as in first C, store it for potential synonymous exchange
        aa2_index = (pam_pos+1-first_nt)//3*3+first_nt
        if(aa2_index != aa1_index):
            aa_starting_pos.append(aa2_index)

        #Collect the original (unmutated) nucleotides of the PAM-containing codon(s): 3 or 6 nucleotides
        original_pam = ""
        for ind in aa_starting_pos:
            original_pam += seq[ind:ind+3]

        if(cannot_mutate_pam == 0):
            #List all possible mutations that mutate PAM containing AAs into synonymous AAs
            potential_pam_mutations = find_synonymous_coding_sequences(translate(original_pam))

            #In rare cases, the first C is inside the AA we want to mutate, so that position should not be touched:
            #pin the first codon to the requested mutation and keep the synonymous variants of the second codon only
            if(len(aa_starting_pos) == 2 and aa1_index == aa_pos):
                new_pams = []
                for pot_pam in potential_pam_mutations:
                    new_pams.append(mut_aa_nts + pot_pam[3:6])
                potential_pam_mutations = list(set(new_pams)) #Keep only unique ones (set() makes the order arbitrary)

            if(print_search == 1):
                print('Original PAM containing codon')
                print(original_pam)
                print('Potential PAM mutating codons')
                print(potential_pam_mutations)

            #Among those, keep only mutations that do not contain 'CC' from PAM any more
            good_pam_mutations = []
            relative_pam_index = pam_pos - aa1_index #PAM position relative to the start of the first PAM-containing AA
            for pot_pam in potential_pam_mutations:
                if(pot_pam[relative_pam_index:relative_pam_index+2] != 'CC'): #One C is OK, CC is not
                    good_pam_mutations.append(pot_pam)

            if(len(good_pam_mutations) > 0):
                pam_mutated = 1
                for good_pam_mut in good_pam_mutations:
                    #Each candidate replaces the same span (all PAM-containing codons), so candidates do not accumulate
                    mut_seq = mut_seq[0:aa_starting_pos[0]] + good_pam_mut + mut_seq[aa_starting_pos[-1]+3:]
                    output.append((mut_seq, pam_mutated, seed_mutated, additional_mutations))
            elif(print_search == 1):
                print('Cannot find synonymous mutations that disrupt PAM')

    #Stage 3 (aa_starting_pos is always defined here: pam_mutated == 0 implies the stage 2 block above was entered)
    if(pam_mutated == 0):
        #Not possible to mutate PAM; mutate the first codon after the PAM codon(s) (sense direction) into a synonymous one
        max_aa_cnt = 1 #We search the seed region only for this mutation, which is only one aa away from the PAM
        cnt = 0
        while (cnt < max_aa_cnt):
            seed_mutated = 0
            potential_aa_nts = mut_seq[aa_starting_pos[-1]+3+cnt*3:aa_starting_pos[-1]+6+cnt*3]
            potential_syn_aas = find_synonymous_coding_sequences(translate(potential_aa_nts))
            if(print_search == 1):
                print(f'Currently looking {cnt+1} AAs upstream of PAM')
            if(len(potential_syn_aas) > 0):
                if(print_search == 1):
                    print('Accepted seed mutations')
                for pot_syn_aa in potential_syn_aas: #Search through all potential synonymous mutations
                    if(pot_syn_aa != potential_aa_nts): #Function that searches for synonymous mutation also outputs the original mutation so we check here if they are different
                        mut_seq = mut_seq[0:aa_starting_pos[-1]+3+cnt*3] + pot_syn_aa + mut_seq[aa_starting_pos[-1]+6+cnt*3:]
                        seed_mutated = 1
                        if(print_search == 1):
                            print(pot_syn_aa)
                        output.append((mut_seq, pam_mutated, seed_mutated, additional_mutations))
            cnt = cnt+1

    if(pam_mutated == 0 and seed_mutated == 0 and print_search == 1):
        print('Impossible to find mutations that mutate the given PAM or seed into a synonymous mutation')

    if(print_search == 1):
        print('################################################') #End of search for a given mutation
    return output

In [144]:
#Main
# Driver cell. For every codon starting at first_nt (step 3, stopping before
# last_nt) it enumerates the substitutions returned by saturate_mutagenesis(),
# looks for 'GG' PAM candidates near the codon on both strands, and appends one
# CSV row per pegRNA candidate (or a diagnostic row when nothing usable exists).
first_nt = 215 #First nucleotide of the first AA to be mutated in the given file
last_nt = 464 #First nucleotide of the first AA that should not be mutated
seq = read_fasta_file('LDLR ex4_extra seq.fa') #Reading the fasta file (has to be in .fa format and in the same folder as the code)
rev_seq = reverse_complement(seq)


#Write the header row of the output CSV file
csv_file = 'pegRNA_design_update.csv' #Output file
add_row_to_csv(csv_file, ['Position of the first nt in the aa', 'Original_aa', 'Original_nts', 'Mutated_aa', 'Mutated_nts',
                          'Position of the first G in the PAM (sense direction)', 'Strand orientation', 'PAM mutated', 'Seed mutated',
                          'Additional synonymous mutations to satisfy Hamming > 1', 'Total Hamming distance', 'Differrence in sequences'])
for i in range(first_nt, last_nt, 3):
    aa_pos = i
    aa_nts = seq[i:i+3]
    print(i) #Progress indicator: start index of the codon being processed
    for mut_nts, mut_aa in saturate_mutagenesis(aa_nts).items():
        #Leading columns shared by every row written for this mutation (positions are reported 1-based)
        row_prefix = [aa_pos+1, translate(aa_nts), aa_nts, mut_aa, mut_nts]

        found_pam = 0 #A flag that sets whether we found a nearby PAM (on either strand) or not

        #Searching for PAMs in the sense direction
        pam_positions = find_occurrences(seq, 'GG', aa_pos + 3, 18) #We search for all GG occurrences within 18 nucleotides from AA, including AA itself
        if pam_positions:
            found_pam = 1
        for pam_pos in pam_positions:
            output = generate_pegRNA(seq, aa_pos, pam_pos, mut_nts, first_nt, 0)
            for pegRNA in output:
                seq_diff = find_string_differences(seq, pegRNA[0])
                hamming = hamming_distance(seq, pegRNA[0])
                add_row_to_csv(csv_file, row_prefix + [pam_pos+1, '+', pegRNA[1], pegRNA[2], pegRNA[3], hamming, seq_diff])
            if not output:
                add_row_to_csv(csv_file, row_prefix + ['COULD NOT FIND SYNONYMOUS PAM OR SEED DISRUPTING MUTATIONS'])

        #Searching for PAMs in the antisense direction
        rev_ind = len(seq) - i - 3 #Codon start index mirrored onto the reverse-complement string

        pam_positions = find_occurrences(rev_seq, 'GG', rev_ind + 3, 18)
        if pam_positions:
            found_pam = 1
        for rev_pam_pos in pam_positions:
            #Map the index found in rev_seq back to a coordinate on the sense sequence
            pam_pos = len(seq) - rev_pam_pos - 3 + 1
            output = generate_pegRNA_reverse_strand(seq, aa_pos, pam_pos, mut_nts, first_nt, 0)
            for pegRNA in output:
                seq_diff = find_string_differences(seq, pegRNA[0])
                hamming = hamming_distance(seq, pegRNA[0])
                add_row_to_csv(csv_file, row_prefix + [pam_pos+1, '-', pegRNA[1], pegRNA[2], pegRNA[3], hamming, seq_diff])
            #This check must live inside the loop (as on the sense strand): outside it,
            #'output'/'pam_pos' may be stale values from another PAM, another mutation or
            #the sense-strand loop, or may not be defined at all.
            if not output:
                add_row_to_csv(csv_file, row_prefix + [pam_pos+1, 'COULD NOT FIND SYNONYMOUS PAM OR SEED DISRUPTING MUTATIONS'])

        if found_pam == 0:
            add_row_to_csv(csv_file, row_prefix + ['NO NEARBY PAM SEQUENCES'])


215
228
231
228
231
228
231
228
231
228
231
228
231
228
231
228
231
228
231
228
231
228
231
228
231
228
231
228
231
228
231
228
231
228
231
228
231
228
231
228
231
228
231
228
231
228
231
228
231
228
231
228
231
228
231
228
231
228
231
228
231
228
231
228
231
228
231
228
231
228
231
228
231
228
231
228
231
228
231
228
231
228
231
228
231
228
231
228
231
228
231
228
231
228
231
228
231
228
231
228
231
228
231
228
231
228
231
228
231
228
231
228
231
228
231
228
231
228
231
228
231
228
231
228
231
228
231
218
228
231
235
228
231
235
228
231
235
228
231
235
228
231
235
228
231
235
228
231
235
228
231
235
228
231
235
228
231
235
228
231
235
228
231
235
228
231
235
228
231
235
228
231
235
228
231
235
228
231
235
228
231
235
228
231
235
228
231
235
228
231
235
228
231
235
228
231
235
228
231
235
228
231
235
228
231
235
228
231
235
228
231
235
228
231
235
228
231
235
228
231
235
228
231
235
228
231
235
228
231
235
228
231
235
228
231
235
228
231
235
228
231
235
228
231
235
228
231
235
228
231


258
246
254
255
258
246
254
255
258
246
254
255
258
246
254
255
258
246
254
255
258
245
246
254
255
258
246
254
255
258
246
254
255
258
246
254
255
258
246
254
255
258
246
254
255
258
246
254
255
258
246
254
255
258
246
254
255
258
246
254
255
258
246
254
255
258
246
254
255
258
246
254
255
258
246
254
255
258
246
254
255
258
246
254
255
258
246
254
255
258
246
254
255
258
246
254
255
258
246
254
255
258
246
254
255
258
246
254
255
258
246
254
255
258
246
254
255
258
246
254
255
258
246
254
255
258
246
254
255
258
246
254
255
258
246
254
255
258
246
254
255
258
246
254
255
258
246
254
255
258
246
254
255
258
246
254
255
258
246
254
255
258
246
254
255
258
246
254
255
258
246
254
255
258
246
254
255
258
246
254
255
258
246
254
255
258
246
254
255
258
246
254
255
258
246
254
255
258
246
254
255
258
246
254
255
258
246
254
255
258
246
254
255
258
246
254
255
258
246
254
255
258
246
254
255
258
246
254
255
258
246
254
255
258
246
254
255
258
246
254
255
258
246
254
255
258
246
254
255
258


289
290
291
292
279
282
289
290
291
292
279
282
289
290
291
292
279
282
289
290
291
292
279
282
289
290
291
292
279
282
289
290
291
292
279
282
289
290
291
292
279
282
289
290
291
292
279
282
289
290
291
292
279
282
289
290
291
292
279
282
289
290
291
292
279
282
289
290
291
292
279
282
289
290
291
292
278
279
282
289
290
291
292
279
282
289
290
291
292
279
282
289
290
291
292
279
282
289
290
291
292
279
282
289
290
291
292
279
282
289
290
291
292
279
282
289
290
291
292
279
282
289
290
291
292
279
282
289
290
291
292
279
282
289
290
291
292
279
282
289
290
291
292
279
282
289
290
291
292
279
282
289
290
291
292
279
282
289
290
291
292
279
282
289
290
291
292
279
282
289
290
291
292
279
282
289
290
291
292
279
282
289
290
291
292
279
282
289
290
291
292
279
282
289
290
291
292
279
282
289
290
291
292
279
282
289
290
291
292
279
282
289
290
291
292
279
282
289
290
291
292
279
282
289
290
291
292
279
282
289
290
291
292
279
282
289
290
291
292
279
282
289
290
291
292
279
282
289
290
291


316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
308
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318
316
317
318


393
391
392
393
391
392
393
391
392
393
391
392
393
383
391
392
393
391
392
393
391
392
393
391
392
393
391
392
393
391
392
393
391
392
393
391
392
393
391
392
393
391
392
393
391
392
393
391
392
393
391
392
393
391
392
393
391
392
393
391
392
393
391
392
393
391
392
393
391
392
393
391
392
393
391
392
393
391
392
393
391
392
393
391
392
393
391
392
393
391
392
393
391
392
393
391
392
393
391
392
393
391
392
393
391
392
393
391
392
393
391
392
393
391
392
393
391
392
393
391
392
393
391
392
393
391
392
393
391
392
393
391
392
393
391
392
393
391
392
393
391
392
393
391
392
393
391
392
393
391
392
393
391
392
393
391
392
393
391
392
393
391
392
393
391
392
393
391
392
393
391
392
393
391
392
393
391
392
393
391
392
393
391
392
393
391
392
393
391
392
393
391
392
393
391
392
393
391
392
393
391
392
393
386
391
392
393
402
391
392
393
402
391
392
393
402
391
392
393
402
391
392
393
402
391
392
393
402
391
392
393
402
391
392
393
402
391
392
393
402
391
392
393
402
391
392
393
402
391
392


436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
431
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436
441
436


In [140]:
# Echo the loaded sequence in upper case as the cell output.
# NOTE(review): read_fasta_file() already returns upper-case text, so this is
# presumably only a display/sanity check - the result is not assigned anywhere.
seq.upper()

'CTCCCCAGCCCGCCTTTCCAGGGGCCTGGCCTCACTGCGGCAGCGTCCCCGGCTATAGAATGGGCTGGTGTTGGGAGACTTCACACGGTGATGGTGGTCTCGGCCCATCCATCCCTGCAGCCCCCAAGACGTGCTCCCAGGACGAGTTTCGCTGCCACGATGGGAAGTGCATCTCTCGGCAGTTCGTCTGTGACTCAGACCGGGACTGCTTGGACGGCTCAGACGAGGCCTCCTGCCCGGTGCTCACCTGTGGTCCCGCCAGCTTCCAGTGCAACAGCTCCACCTGCATCCCCCAGCTGTGGGCCTGCGACAACGACCCCGACTGCGAAGATGGCTCGGATGAGTGGCCGCAGCGCTGTAGGGGTCTTTACGTGTTCCAAGGGGACAGTAGCCCCTGCTCGGCCTTCGAGTTCCACTGCCTAAGTGGCGAGTGCATCCACTCCAGCTGGCGCTGTGATGGTGGCCCCGACTGCAAGGACAAATCTGACGAGGAAAACTGCGGTATGGGCGGGGCCAGGGTGGGGGCGGGGCGTCCTATCACCTGTCCCTGGGCTCCCCCAGGTGTGGGACATGCAGTGATTTAGGTGCCGAAGTGGATTTCCAACAACATGCCAAGAAAGTATTCCCATTCATGTTTGTTTCTTTTTTTTCTTTTCTTTCTTTATTTTGTTTTTGAGATGGAGTCTCACTCTGTGATTTTTTTCATCTCTAAATTTCCTACATCCATATGGCCACCATGAGGCCCCAGGCTGGCCGATGGTTGCTGTTAGCTTATTGGGAAATCACTGTTTGGAAGGTGCTGGTTGTTTTTTGTTGTTTGTTGTTTTTGTTTTTGTTTTTGTTTTGAGACGGAGTCTCGCTCTGTCGCCAGGGTGGAGTGCAGTGGCGCGATCAGCTCACTGCAACCTCCGCTTCCTGGGTTCAAGCCATTCTCCTGCCTCAGCCTCCCAAGTAGCGCGGATTACAGGCATGTGCCACCACCTCCGGCTT'

In [17]:
cd Desktop

/Users/gligs/Desktop
