In [37]:
from Bio.Seq import Seq
import pandas as pd
from itertools import product
import re

def circular_permuted(x):
    """
    Enumerate the rotations of a sequence once per 'N' substitution.

    For each base in the fixed order 'A', 'T', 'G', 'C', every 'N' in ``x``
    is replaced by that one base (all 'N's receive the same base in a given
    pass), and all ``len(x)`` rotations of the resulting string are emitted,
    starting with the unrotated string.

    Args:
        x (str): Sequence to rotate. It must be a string, since
            ``str.replace`` is called on it.
    Returns:
        list: ``4 * len(x)`` strings, grouped by substituted base. When ``x``
        contains no 'N' the four groups are identical, so every rotation
        appears four times. An empty string yields an empty list.
    """
    length = len(x)
    rotations = []

    for base in ('A', 'T', 'G', 'C'):
        filled = x.replace('N', base)
        # Rotation by `offset` moves the first `offset` characters to the end.
        for offset in range(length):
            rotations.append(filled[offset:] + filled[:offset])

    return rotations

def normalise_str(in_dna):
    """
    Reduce an STR motif to its canonical rotation(s).

    The canonical form of a motif is the alphabetically smallest of its
    circular rotations, as produced by ``circular_permuted``.

    * No 'N' in the motif: returns a one-element list holding that smallest
      rotation.
    * One or more 'N's: every combination of 'A', 'T', 'G', 'C' is substituted
      at the 'N' positions (each position independently) and the smallest
      rotation of each substituted motif is collected, in the order the
      combinations are generated. For example ``normalise_str("TNG")`` gives
      ``['AGT', 'GTT', 'GGT', 'CGT']``.
    * ``None`` or an empty motif: returns the empty string ``''`` (not a list).
    """
    if in_dna is None or len(in_dna) == 0:
        return ''

    # Indices of the ambiguous bases that need to be expanded.
    wildcard_positions = [pos for pos, base in enumerate(in_dna) if base == 'N']

    if not wildcard_positions:
        # Unambiguous motif: a single canonical rotation.
        return [min(list(circular_permuted(in_dna)))]

    canonical_forms = []
    for fill in product('ATGC', repeat=len(wildcard_positions)):
        bases = list(in_dna)
        for pos, base in zip(wildcard_positions, fill):
            bases[pos] = base
        rotations = circular_permuted("".join(bases))
        if rotations:
            canonical_forms.append(min(rotations))
    return canonical_forms


def check_match(list1, list2):
    """
    Return the first element of ``list1`` that also occurs in ``list2``.

    Args:
        list1: Candidate values, checked in order.
        list2: Collection the candidates are looked up in.
    Returns:
        The first candidate found in ``list2``, or the sentinel string
        "MAYDAY MAYDAY!" when nothing matches (including when ``list1`` is
        empty).
    """
    # Compare against the `list2` argument rather than a notebook global, so the
    # function works regardless of which cells have been executed.
    for result in list1:
        if result in list2:
            return result
    # If NONE of the results match
    return "MAYDAY MAYDAY!"

In [2]:
# Demo: both 'N's receive the same base in each pass (A, then T, G, C), and each
# substituted string contributes all four of its rotations.
circular_permuted("ANNG")

['AAAG',
 'AAGA',
 'AGAA',
 'GAAA',
 'ATTG',
 'TTGA',
 'TGAT',
 'GATT',
 'AGGG',
 'GGGA',
 'GGAG',
 'GAGG',
 'ACCG',
 'CCGA',
 'CGAC',
 'GACC']

In [3]:
# Demo: one canonical (alphabetically smallest) rotation per substitution of the 'N'.
normalise_str("TNG")

['AGT', 'GTT', 'GGT', 'CGT']

In [None]:
import pandas as pd
from Bio.Seq import Seq
import re

# Derive each locus's pathogenic motif in gene orientation and save it as a new column.
#
# Per row:
#   * single motif, "+" strand: the reference-orientation motif is used unchanged;
#   * single motif, other strand: the reverse complement of the reference motif and the
#     gene-orientation repeat unit are both normalised, and the first canonical form of
#     the repeat unit that also appears among the reference forms is reported;
#   * comma-separated motifs: every reference motif is compared with every repeat-unit
#     motif and all matches are joined with ", ".
# Rows with no match receive the sentinel "MAYDAY MAYDAY!" so they can be checked by hand.

# Read the CSV file into a DataFrame
df = pd.read_csv("STR-disease-loci.csv")

# One entry per row, in row order
results = []

# Iterate through the rows of the DataFrame
for index, row in df.iterrows():
    gene_strand = row['gene_strand']
    reference_orientation = row['pathogenic_motif_reference_orientation']
    repeatunit_gene_orientation = row['repeatunit_pathogenic_geneorientation']

    # NOTE(review): the ',' tests assume both motif columns hold strings; a missing
    # value (NaN) would raise here -- confirm the input has no blanks.
    if ',' not in reference_orientation and ',' not in repeatunit_gene_orientation:
        if gene_strand == "+":
            pathogenic_motif_gene_orientation = reference_orientation
        else:
            # Reverse complement for gene_strand == "-"
            reference_orientation_rc = str(Seq(reference_orientation).reverse_complement())

            normalized_reference_rc = normalise_str(reference_orientation_rc)
            normalized_repeatunit = normalise_str(repeatunit_gene_orientation)

            # First canonical form of the repeat unit that the reference also produces
            matching_result = next(
                (form for form in normalized_repeatunit if form in normalized_reference_rc),
                None,
            )
            if matching_result is None:
                # If NONE of the results match
                matching_result = "MAYDAY MAYDAY!"

            pathogenic_motif_gene_orientation = matching_result

        # Append the result to the results list
        results.append(pathogenic_motif_gene_orientation)

    else:
        # Handle rows with commas as a special case
        reference_motifs = [motif.strip() for motif in re.split(r',', reference_orientation)]
        repeatunit_motifs = [motif.strip() for motif in re.split(r',', repeatunit_gene_orientation)]

        matching_results = []

        for reference_motif in reference_motifs:
            for repeatunit_motif in repeatunit_motifs:
                if gene_strand == "+":
                    if reference_motif == repeatunit_motif:
                        matching_results.append(reference_motif)
                else:
                    reference_orientation_rc = str(Seq(reference_motif).reverse_complement())
                    normalized_reference_rc = normalise_str(reference_orientation_rc)
                    normalized_repeatunit = normalise_str(repeatunit_motif)
                    # Use the same any-overlap rule as the single-motif branch and keep the
                    # matching motif itself (a string). Appending the whole list of forms
                    # would make the ', '.join below fail, and comparing the lists for
                    # equality depends on the order in which 'N's are expanded.
                    shared_form = next(
                        (form for form in normalized_repeatunit if form in normalized_reference_rc),
                        None,
                    )
                    if shared_form is not None:
                        matching_results.append(shared_form)

        if not matching_results:
            # If NONE of the results match
            matching_result = "MAYDAY MAYDAY!"
        else:
            # Join matching results with a comma
            matching_result = ', '.join(matching_results)

        # Append the result to the results list
        results.append(matching_result)

# Add the results as a new column in the DataFrame
df['pathogenic_motif_gene_orientation'] = results

# Save the updated DataFrame to a CSV file
df.to_csv("updated_STR-disease-loci.csv", index=False)

In [39]:
# Derive each locus's pathogenic motif in gene orientation and save it as a new column.
#
# Per row:
#   * "+" strand: the reference-orientation value is used unchanged;
#   * any other strand: each comma-separated reference motif is reverse-complemented and
#     normalised, each comma-separated repeat-unit motif is normalised, and the first
#     canonical form of a repeat unit that also appears among a reference motif's forms
#     is reported. A value without commas is simply treated as a one-motif list.
# Rows with no match receive the sentinel "MAYDAY MAYDAY!" so they can be checked by hand.

# Read the CSV file into a DataFrame
df = pd.read_csv("STR-disease-loci.csv")

# One entry per row, in row order
results = []

# Iterate through the rows of the DataFrame
for index, row in df.iterrows():
    gene_strand = row['gene_strand']
    reference_orientation = row['pathogenic_motif_reference_orientation']
    repeatunit_gene_orientation = row['repeatunit_pathogenic_geneorientation']

    # Check if gene_strand is "+"
    if gene_strand == "+":
        pathogenic_motif_gene_orientation = reference_orientation
    else:
        # NOTE(review): assumes both motif columns hold strings on these rows; a missing
        # value (NaN) would raise in re.split -- confirm the input has no blanks.

        # Canonical forms of the reverse complement of every reference motif.
        # Each entry is the list returned by normalise_str, kept as a list so that
        # motifs are compared by content rather than by their printed representation.
        normalized_reference_reverse_complements = [
            normalise_str(str(Seq(motif.strip()).reverse_complement()))
            for motif in re.split(r',', reference_orientation)
        ]

        # Canonical forms of every repeat-unit motif (already in gene orientation).
        normalized_repeatunit_motifs = [
            normalise_str(motif.strip())
            for motif in re.split(r',', repeatunit_gene_orientation)
        ]

        # Repeat-unit motifs are tried in order; the first canonical form shared with
        # any reference motif wins and is reported as a plain motif string.
        matching_result = next(
            (
                form
                for repeatunit_forms in normalized_repeatunit_motifs
                for reference_forms in normalized_reference_reverse_complements
                for form in repeatunit_forms
                if form in reference_forms
            ),
            None,
        )

        if matching_result is None:
            # If NONE of the results match
            matching_result = "MAYDAY MAYDAY!"

        pathogenic_motif_gene_orientation = matching_result

    # Append the result to the results list
    results.append(pathogenic_motif_gene_orientation)

# Add the results as a new column in the DataFrame
df['pathogenic_motif_gene_orientation'] = results

# Save the updated DataFrame to a CSV file
df.to_csv("updated_in_progress_STR-disease-loci.csv", index=False)