In [3]:
from Bio import SeqIO

def cut_fasta_sequences(input_fasta, output_fasta, ranges_dict):
    """
    Cut a window out of selected FASTA records and write the windows to a new FASTA file.

    Records whose ID is not a key of ``ranges_dict`` are skipped (they are not
    copied to the output).

    Parameters:
    input_fasta (str): Path to the input FASTA file.
    output_fasta (str): Path to the output FASTA file (overwritten).
    ranges_dict (dict): Maps sequence IDs to ``(start, end)`` positions.
                        Positions are 1-based, and both start and end are inclusive.

    Raises:
    ValueError: If any range has ``start < 1`` or ``end < start``. All ranges are
                checked before any file is opened, so an invalid request never
                truncates an existing output file.
    """
    # Fail fast: a start below 1 would become a negative Python slice index and
    # silently select the wrong residues instead of reporting the mistake.
    for seq_id, (start, end) in ranges_dict.items():
        if start < 1 or end < start:
            raise ValueError(
                f"Invalid range {(start, end)} for sequence '{seq_id}': "
                "positions are 1-based and inclusive, so 1 <= start <= end is required."
            )

    with open(output_fasta, "w") as output_handle:
        for seq_record in SeqIO.parse(input_fasta, "fasta"):
            if seq_record.id not in ranges_dict:
                continue
            start, end = ranges_dict[seq_record.id]
            # 1-based inclusive [start, end] -> 0-based half-open [start-1, end).
            # Slicing the record (not just its .seq) keeps the ID and description.
            SeqIO.write(seq_record[start - 1:end], output_handle, "fasta")
end = 401  # last position (1-based, inclusive) kept from every listed sequence

# Every listed record gets the same window: positions 1..end.
ranges = {
    record_id: (1, end)
    for record_id in (
        "K401", "AcSu_575", "AcSu2_530", "AdPa_408", "BleSto_572", "DiPu_513",
        "HeAl_513", "Kif3_592", "Kifa_500", "NaGr_392", "ThTr_655", "TiLa_514",
        "B", "C", "D", "E", "F", "G",
    )
}

# Cut the listed records of the input FASTA and write them to the output FASTA.
cut_fasta_sequences("../../data/input_sequences.fasta", "../../data/output_sequences.fasta", ranges)

In [27]:
from Bio import SeqIO
from Bio.Seq import Seq

def convert_amino_acid_groups(input_fasta, output_fasta):
    """
    Re-encode every sequence of a FASTA file as amino-acid group codes and write a new FASTA file.

    Each residue is replaced by the one-letter code of its group:
    'L' nonpolar aliphatic, 'P' polar uncharged, 'A' aromatic,
    'R' positively charged, 'D' negatively charged.
    The lookup is case-insensitive, so lowercase residues are classified like
    their uppercase form. Gap characters ('-') are copied unchanged so that an
    aligned input stays aligned and gaps are not mistaken for residues. Any
    other symbol becomes 'X'. Output length always equals input length.

    Parameters:
    input_fasta (str): Path to the input FASTA file.
    output_fasta (str): Path to the output FASTA file (overwritten).
    """
    # Amino acid -> group code
    group_mapping = {
        'G': 'L', 'A': 'L', 'V': 'L', 'L': 'L', 'M': 'L', 'I': 'L',  # Nonpolar aliphatic
        'S': 'P', 'T': 'P', 'C': 'P', 'P': 'P', 'N': 'P', 'Q': 'P',  # Polar uncharged
        'F': 'A', 'Y': 'A', 'W': 'A',  # Aromatic
        'K': 'R', 'R': 'R', 'H': 'R',  # Positively charged
        'D': 'D', 'E': 'D'  # Negatively charged
    }
    gap_char = '-'

    with open(output_fasta, "w") as output_handle:
        for seq_record in SeqIO.parse(input_fasta, "fasta"):
            converted_seq = ''.join(
                aa if aa == gap_char else group_mapping.get(aa.upper(), 'X')
                for aa in str(seq_record.seq)
            )
            # Copy the record so ID/description are kept, then swap in the new letters.
            new_record = seq_record[:]
            new_record.seq = Seq(converted_seq)
            SeqIO.write([new_record], output_handle, "fasta")

# Example usage: convert every sequence of the input FASTA to group codes and
# write the result to a separate file in the same data directory.
convert_amino_acid_groups("../../data/input_sequences.fasta", "../../data/output_sequences_converted.fasta")

In [8]:
# (Re)define the path of the FASTA file read by this cell.
# The file name itself contains a comma and a space; it is one single path.
file_path = "../../data/AcSu_575, AcSu2_530.fasta"

# Function to read a FASTA file manually
def read_fasta(file_path):
    """
    Reads a FASTA file and returns a dictionary of sequences.

    The file is parsed by hand: every line starting with '>' opens a new record,
    and all following non-header lines are concatenated (after stripping
    surrounding whitespace) into that record's sequence. Blank lines are ignored
    wherever they occur, including before the first header.

    The dictionary key is the *entire* header line without the leading '>',
    i.e. including any description after the first space (a header such as
    '>H annotated' yields the key 'H annotated'). If a header occurs more than
    once, the last record with that header wins.

    Parameters:
    file_path (str): The path to the FASTA file to be read.

    Returns:
    dict: A dictionary where keys are header strings (str) and values are sequences (str).

    Raises:
    ValueError: If sequence data appears before the first header line.

    Example:
    >>> read_fasta("example.fasta")
    {'seq1': 'ATCG', 'seq2': 'GGTA'}
    """
    # Collect sequence lines per header and join once at the end; this avoids
    # rebuilding an ever-growing string for long multi-line records.
    chunks_by_header = {}
    current_header = None
    with open(file_path, "r") as fasta_file:
        for line_number, raw_line in enumerate(fasta_file, start=1):
            line = raw_line.strip()
            if not line:
                continue  # blank lines carry no data
            if line.startswith(">"):
                current_header = line[1:]  # Remove '>' and use the rest as key
                chunks_by_header[current_header] = []
            elif current_header is None:
                raise ValueError(
                    f"Line {line_number} of '{file_path}' contains sequence data "
                    "before the first '>' header."
                )
            else:
                chunks_by_header[current_header].append(line)
    return {header: "".join(chunks) for header, chunks in chunks_by_header.items()}

# Parse the FASTA file into a {header: sequence} dictionary
sequences = read_fasta(file_path)

# Show the parsed dictionary as the cell output.
# NOTE(review): this displays every sequence in full rather than a shortened preview.
# Later cells look up the keys 'ThTr_655', 'HeAl_513', 'AcSu2_530' and 'H annotated'
# in `sequences` -- confirm that the file loaded here actually provides them.
sequences

{'HeAl_513': 'MSSIRVVCRFRPQNKIELAQGGCSVVDVA-DDQTVTIKGNESNHTFTFDRIYTEKNSQKDVYDDAAKPVIEDIMQGYNGTIFVYGQTSSGKTHTMQGPS--IDDAELKGVIPRMINTVFDCITKADENIEFIVKASYIEIYMERIRDLLDVRKDNLKVREEKGKGVWVDGTTEVYIYREDDILEVMRAGQANRAIAETKMNAESSRSHSIFILTIQQKNLKEGSNKSGKLYLVDLAGSEKIAKTGAQGLTLDEAKMINKSLSSLGNVINALTDGKSTHIPYRDSKLTRVLQESLGGNSRTTLIINCSPSSYNETETLSTLRFGNRAKSIKNKAKINQERSAAELKILLSKAEKEIESLKDYIKELETVSGVPHSK----------------------------------------------------------------IGNNLDTDKSADVQGLKEKCIQLEKLLFQKEEEKKELSEQLDTISIQLQDKEQELETQTHQVTSLKDEASKYVSLSNENDILSAQLTEIKLLLEKKNYESVEQTLVIEELSAENASIKSQLQEKIESSKGVGGIGDHY-TPS-----------',
 'AcSu2_530': 'MSSIRVVCRFRPQNKIELAQGGCSIIDVS-DNQTVNIKGSESNHTFTFDRIYDERNSQKDVYDDAAKPVIEDIMLGYNGTIFVYGQTSSGKTHTMQGPS--IDDAELKGVIPRMINTVFECINKADQNVEFIVKASYIEIYMEKIRDLLDVRKDNLRVREEKGKGVWVEGTTEVYIYREEDILEVMRTGQANRAIAETKMNAESSRSHSIFILSIQQKNLKEGSNKHGKLYLVDLAGSEKVAKTGAQGLTLDEAKMINKSLSSLGNVINSLTDGKSAHIPYRDSKLTRVLQESLGGNSRTTLIINCSPSSYNEVETVSTLRFGNRAKNIKNKAKINQERSAAELKILLAKAEKEIESLKEYTKELESLTGVPSSKS

In [26]:
# Function to find the aligned position of a subsequence within a reference sequence
def find_aligned_position(ref_seq, subseq, from_right=False, return_end=False):
    """
    Maps a subsequence match in the ungapped reference back to an alignment column.

    Gaps ('-') are removed from the reference, the subsequence is searched in the
    ungapped string, and the matching residue is then located in the original
    gapped sequence.

    Parameters:
    ref_seq (str): The reference sequence which may contain gaps.
    subseq (str): The (ungapped) subsequence to locate within the reference sequence.
    from_right (bool): If True, the last occurrence is used instead of the first. Defaults to False.
    return_end (bool): If False (default), return the column of the FIRST residue of the match.
                       If True, return the column of the LAST residue of the match. The two
                       differ by more than len(subseq) - 1 when the reference has gap columns
                       inside the matched region.

    Returns:
    int: 0-based column index in the gapped reference sequence.

    Raises:
    ValueError: If the subsequence is not found, if the match cannot be mapped to a column,
                or if return_end is True and the subsequence is empty.
    """
    if return_end and not subseq:
        raise ValueError("An empty subsequence has no end position.")

    clean_ref_seq = ref_seq.replace("-", "")  # Remove gaps for accurate indexing
    if from_right:
        start_index = clean_ref_seq.rfind(subseq)
    else:
        start_index = clean_ref_seq.find(subseq)

    if start_index == -1:
        raise ValueError(f"Subsequence '{subseq}' not found in the reference sequence.")

    # Index (in the ungapped reference) of the residue whose column is wanted
    target_index = start_index + len(subseq) - 1 if return_end else start_index

    # Walk the gapped reference, counting only residues, until the target is reached
    ungapped_count = 0
    for column, char in enumerate(ref_seq):
        if char != "-":
            if ungapped_count == target_index:
                return column
            ungapped_count += 1

    raise ValueError("Failed to map the subsequence position to the alignment.")

# Function to truncate all sequences at the aligned position
def truncate_aligned_sequences(fasta_dict, ref_name, subseq_start, subseq_end, output_file):
    """
    Truncates all sequences of an alignment to a window defined on a reference sequence.

    The window starts at the column of the first residue of the first occurrence of
    ``subseq_start`` in the reference and ends at the column of the last residue of
    the last occurrence of ``subseq_end`` (both motifs are included). Every sequence
    in ``fasta_dict`` is sliced to that column window and written to ``output_file``
    in FASTA format, one line per sequence.

    The end column is taken from the reference's actual alignment columns, so gap
    columns inside the end motif do not cut the window short.

    Parameters:
    fasta_dict (dict): A dictionary containing sequence names as keys and aligned sequences as values.
    ref_name (str): The name of the reference sequence in the dictionary.
    subseq_start (str): The subsequence indicating where truncation should start.
    subseq_end (str): The subsequence indicating where truncation should end.
    output_file (str): The path to the output file where truncated sequences will be saved.

    Returns:
    str: The path to the output file containing the truncated sequences.

    Raises:
    ValueError: If the reference sequence is not found in the dictionary, if a subsequence
                position cannot be determined, or if the end motif lies before the start motif.
    """
    if ref_name not in fasta_dict:
        raise ValueError(f"Reference sequence '{ref_name}' not found in the alignment.")

    ref_seq = fasta_dict[ref_name]
    start_position = find_aligned_position(ref_seq, subseq_start)
    end_position = find_aligned_position(ref_seq, subseq_end, from_right=True, return_end=True)

    if end_position < start_position:
        raise ValueError(
            f"End subsequence '{subseq_end}' ends before start subsequence "
            f"'{subseq_start}' begins in the reference sequence."
        )

    # end_position is the column of the last kept residue, hence the + 1
    truncated_sequences = {
        key: seq[start_position:end_position + 1] for key, seq in fasta_dict.items()
    }

    # Write to a new FASTA file
    with open(output_file, "w") as out_file:
        for key, seq in truncated_sequences.items():
            out_file.write(f">{key}\n{seq}\n")

    return output_file

# Example usage
ref_sequence_name = "ThTr_655"  # Reference sequence name
sub_sequence_start = "MSTTPLGEVQNT"  # Motif where the kept region starts (first occurrence is used)
sub_sequence_end = "PALPSDNVLK"  # Motif where the kept region ends (last occurrence is used)
output_fasta_path = "../../data/output_truncated_sequences-nagr.fasta"

# Run the truncation and report the outcome explicitly: expressions inside a
# try/except are not displayed by the notebook, so the result must be printed.
try:
    output_file_path = truncate_aligned_sequences(sequences, ref_sequence_name, sub_sequence_start, sub_sequence_end, output_fasta_path)
except ValueError as e:
    print(f"Truncation failed: {e}")
else:
    print(f"Truncated sequences written to {output_file_path}")

In [11]:
def mask_sequence(sequence_preview, template_key='HeAl_513', target_key='AcSu2_530',
                  annotated_key='H annotated'):
    """
    Masks a target sequence, keeping only selected positions where it differs from the template.

    Three equally long (aligned) sequences are compared column by column. The
    target character is kept at a column when BOTH conditions hold:

    * the template character equals the annotated character, and
    * the template character differs from the target character.

    Every other column is replaced with a dash ('-'). The 0-based indices of the
    kept columns are printed before the masked string is returned.

    Parameters:
    sequence_preview (dict): Maps sequence names to sequence strings.
    template_key (str): Key of the template sequence. Defaults to 'HeAl_513'.
    target_key (str): Key of the target sequence. Defaults to 'AcSu2_530'.
    annotated_key (str): Key of the annotated sequence. Defaults to 'H annotated'.

    Returns:
    str: The masked sequence, same length as the inputs.

    Raises:
    KeyError: If one of the three keys is missing from ``sequence_preview``.
    ValueError: If the three sequences do not all have the same length (a
                column-wise comparison is meaningless in that case).
    """
    template_seq = sequence_preview[template_key]
    target_seq = sequence_preview[target_key]
    annotated_seq = sequence_preview[annotated_key]

    if not (len(template_seq) == len(target_seq) == len(annotated_seq)):
        raise ValueError(
            "Sequences must have equal length: "
            f"'{template_key}'={len(template_seq)}, "
            f"'{target_key}'={len(target_seq)}, "
            f"'{annotated_key}'={len(annotated_seq)}."
        )

    masked_seq = []
    retained_positions = []  # 0-based columns where the target character was kept

    columns = zip(template_seq, target_seq, annotated_seq)
    for i, (template_char, target_char, annotated_char) in enumerate(columns):
        if template_char == annotated_char and template_char != target_char:
            masked_seq.append(target_char)
            retained_positions.append(i)
        else:
            masked_seq.append('-')

    # Print the list of positions of the letters that didn't get replaced
    print("Positions of retained letters:", retained_positions)

    return ''.join(masked_seq)

# Example usage: mask the sequences held in `sequences` and print the masked string.
# mask_sequence itself prints the retained positions before it returns.
masked_sequence = mask_sequence(sequences)
print(masked_sequence)

Positions of retained letters: [25, 28, 35, 126, 128, 143, 156, 213, 240, 269, 276, 348, 361, 366, 367, 368, 460, 472, 477, 479, 487, 502, 508, 510, 515, 524, 552, 554, 555, 558, 559, 579, 580]
-------------------------I--S------N------------------------------------------------------------------------------------------Q-V--------------K------------R--------------------------------------------------------S--------------------------V----------------------------S------A-----------------------------------------------------------------------A------------T----SLT-------------------------------------------------------------------------------------------S-----------R----D-M-------H--------------A-----T-L----Q--------V---------------------------I-IL--DA-------------------A------------
