In [1]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

In [3]:
def remove_gap_positions(family1_file, family2_file, family1_output, family2_output, family1_ids_file, family2_ids_file):
    """Drop alignment columns that are gaps in every selected sequence of both families.

    Records are read from the two aligned FASTA files and kept only when their
    ID appears in the matching ID file. A column is removed only if it is "-"
    in every kept record across BOTH families, so the two output files keep
    the same column layout as each other.

    Args:
        family1_file: Path to the first family's aligned FASTA file.
        family2_file: Path to the second family's aligned FASTA file.
        family1_output: Path the filtered first-family FASTA is written to.
        family2_output: Path the filtered second-family FASTA is written to.
        family1_ids_file: Text file with one first-family sequence ID per line.
        family2_ids_file: Text file with one second-family sequence ID per line.

    Raises:
        ValueError: If no record matches the ID lists, or if the selected
            records do not all have the same length (i.e. they are not a
            single consistent alignment).
    """
    # Read the sequence IDs from the ID files (one ID per line)
    with open(family1_ids_file, "r") as file1, open(family2_ids_file, "r") as file2:
        family1_ids = set(file1.read().splitlines())
        family2_ids = set(file2.read().splitlines())

    # Read the sequences into lists of SeqRecord objects, keeping listed IDs only
    family1_records = [record for record in SeqIO.parse(family1_file, "fasta") if record.id in family1_ids]
    family2_records = [record for record in SeqIO.parse(family2_file, "fasta") if record.id in family2_ids]

    # Combine records from both families to identify gap-only columns
    combined_records = family1_records + family2_records
    if not combined_records:
        # Fail with a clear message instead of an IndexError on combined_records[0]
        raise ValueError(
            f"No sequences from {family1_file} or {family2_file} matched the supplied ID lists"
        )

    # Convert each sequence to a plain string once so the per-column scans
    # below use native str indexing instead of indexing Seq objects repeatedly
    sequences = [str(record.seq) for record in combined_records]

    # Every record must span the same number of columns; otherwise column
    # indices would either run off the end of a short record or silently
    # truncate a long one
    alignment_length = len(sequences[0])
    for record, sequence in zip(combined_records, sequences):
        if len(sequence) != alignment_length:
            raise ValueError(
                f"Sequence {record.id} has length {len(sequence)}, "
                f"expected {alignment_length}; inputs are not a consistent alignment"
            )

    # Identify columns that contain at least one non-gap character
    columns_to_keep = [
        i for i in range(alignment_length)
        if any(sequence[i] != "-" for sequence in sequences)
    ]

    def strip_columns(records):
        # Rebuild each record from the retained columns only; the description
        # is blanked so the FASTA header carries just the ID
        stripped = []
        for record in records:
            residues = str(record.seq)
            new_seq = Seq("".join([residues[i] for i in columns_to_keep]))
            stripped.append(SeqRecord(new_seq, id=record.id, description=""))
        return stripped

    # Write the filtered sequences to new FASTA files
    SeqIO.write(strip_columns(family1_records), family1_output, "fasta")
    SeqIO.write(strip_columns(family2_records), family2_output, "fasta")
    print(f"Filtered alignment written to {family1_output} and {family2_output}")

def extract_sequence_ids(input_file, output_ids_file):
    """Write the ID of every FASTA record in ``input_file`` to a text file.

    Args:
        input_file: Path to a FASTA file.
        output_ids_file: Path of the text file to create; it receives one
            sequence ID per line, in the order the records appear.
    """
    # Parse the whole input first so the output file is only opened once
    # every record has been read successfully
    sequence_ids = [entry.id for entry in SeqIO.parse(input_file, "fasta")]

    with open(output_ids_file, "w") as handle:
        handle.writelines(seq_id + "\n" for seq_id in sequence_ids)

    print(f"Sequence IDs written to {output_ids_file}")

In [20]:
# Path configuration for the three datasets (cd70, cd80, cd85).
# For each dataset:
#   *file      - aligned FASTA input, read from ../data/node_ids/<dataset>/
#   *_ungapped - destination of the alignment with all-gap columns removed
#   *_ids      - text file holding one sequence ID per line
# Output and ID paths are relative to the notebook's working directory; some
# carry a leading './' and some do not, but both forms point into the same
# per-dataset directory.
# NOTE(review): variables are labelled NR1/NR4 while the alignment files are
# named after nodes (N6, N81, N7, ...) - presumably each node corresponds to
# that family in the given dataset; confirm.

# cd70
cd70NR1file = '../data/node_ids/cd70/N6_ids_aligned.aln'
cd70NR1_ungapped = './cd70/N6_ids_ungapped.aln'
cd70NR4file = '../data/node_ids/cd70/N81_ids_aligned.aln'
cd70NR4_ungapped = './cd70/N81_ids_ungapped.aln'
cd70NR1_ids = 'cd70/NR1_ids.txt'
cd70NR4_ids = 'cd70/NR4_ids.txt'

# cd80
cd80NR1file = '../data/node_ids/cd80/N7_ids_aligned.aln'
cd80NR1_ungapped = './cd80/N7_ids_ungapped.aln'
cd80NR4file = '../data/node_ids/cd80/N186_ids_aligned.aln'
cd80NR4_ungapped = 'cd80/N186_ids_ungapped.aln'
cd80NR1_ids = 'cd80/NR1_ids.txt'
cd80NR4_ids = 'cd80/NR4_ids.txt'

# cd85
cd85NR1file = '../data/node_ids/cd85/N7_ids_aligned.aln'
cd85NR1_ungapped = './cd85/N7_ids_ungapped.aln'
cd85NR4file = '../data/node_ids/cd85/N299_ids_aligned.aln'
cd85NR4_ungapped = 'cd85/N299_ids_ungapped.aln'
cd85NR1_ids = 'cd85/NR1_ids.txt'
cd85NR4_ids = 'cd85/NR4_ids.txt'


In [21]:
# Run the same three steps for each dataset (cd70, cd80, cd85): dump the IDs of
# both family alignments, then strip the columns that are gaps in every
# sequence of the two families combined.
dataset_paths = (
    (cd70NR1file, cd70NR4file, cd70NR1_ungapped, cd70NR4_ungapped, cd70NR1_ids, cd70NR4_ids),
    (cd80NR1file, cd80NR4file, cd80NR1_ungapped, cd80NR4_ungapped, cd80NR1_ids, cd80NR4_ids),
    (cd85NR1file, cd85NR4file, cd85NR1_ungapped, cd85NR4_ungapped, cd85NR1_ids, cd85NR4_ids),
)

for nr1_aligned, nr4_aligned, nr1_out, nr4_out, nr1_id_list, nr4_id_list in dataset_paths:
    extract_sequence_ids(nr1_aligned, nr1_id_list)
    extract_sequence_ids(nr4_aligned, nr4_id_list)
    remove_gap_positions(nr1_aligned, nr4_aligned, nr1_out, nr4_out, nr1_id_list, nr4_id_list)



Sequence IDs written to cd70/NR1_ids.txt
Sequence IDs written to cd70/NR4_ids.txt
Filtered alignment written to ./cd70/N6_ids_ungapped.aln and ./cd70/N81_ids_ungapped.aln
Sequence IDs written to cd80/NR1_ids.txt
Sequence IDs written to cd80/NR4_ids.txt
Filtered alignment written to ./cd80/N7_ids_ungapped.aln and cd80/N186_ids_ungapped.aln
Sequence IDs written to cd85/NR1_ids.txt
Sequence IDs written to cd85/NR4_ids.txt
Filtered alignment written to ./cd85/N7_ids_ungapped.aln and cd85/N299_ids_ungapped.aln
