In [3]:
import pandas as pd
from google.colab import files

In [11]:
import pandas as pd

def extract_matching_fasta_sequences(
    csv_file: str,
    fasta_input: str,
    fasta_output: str,
) -> None:
    """
    Copy the FASTA records whose ID is listed in a CSV file into a new FASTA file.

    A record's ID is the first whitespace-delimited token that follows the ">"
    of its header line. Matching records are written in input order, with
    their header and sequence lines copied unchanged.

    Parameters:
    - csv_file (str): Path to the CSV file containing sequence IDs in the first column.
      The first row is consumed as the column header (pandas default), so it is
      never treated as an ID.
    - fasta_input (str): Path to the input FASTA (.faa) file.
    - fasta_output (str): Path to the output FASTA file to save matched sequences.
      The file is created, or overwritten if it already exists.
    """

    # Load list of target IDs from CSV.
    # dtype=str keeps IDs exactly as written: without it pandas infers numeric
    # columns, which breaks the .str accessor and drops leading zeros.
    df_ids = pd.read_csv(csv_file, dtype=str)
    id_list = set(df_ids.iloc[:, 0].dropna().str.strip())
    # A whitespace-only cell strips to "", which is not a usable ID.
    id_list.discard("")

    with open(fasta_input, "r") as infile, open(fasta_output, "w") as outfile:
        # True while the lines being read belong to a record that was requested.
        write_seq = False

        for line in infile:
            if line.startswith(">"):
                # Tokenise the text after ">" so that "> id desc" is handled
                # the same way as ">id desc"; a bare ">" has no ID at all.
                header_tokens = line[1:].split()
                write_seq = bool(header_tokens) and header_tokens[0] in id_list

            # Stream header and sequence lines straight through; there is no
            # need to buffer a record before writing it.
            if write_seq:
                outfile.write(line)

    print(f"✅ Extraction complete. Saved to: {fasta_output}")
