### ChatGPTs lösning

In [40]:
from Bio import SeqIO
from Bio.Align import PairwiseAligner

def align_to_reference(query_seq, reference_seq):
    """
    Align the query sequence to the reference sequence and return alignment details.
    """
    aligner = PairwiseAligner()
    aligner.mode = 'global'
    aligner.match_score = 1
    aligner.mismatch_score = -1
    aligner.open_gap_score = -2
    aligner.extend_gap_score = -1

    alignment = aligner.align(reference_seq, query_seq)[0]  # Get the best alignment
    aligned_ref = alignment.aligned[0]
    aligned_query = alignment.aligned[1]
    
    return aligned_ref, aligned_query

def extract_region_of_interest(seq, aligned_ref, aligned_query, ref_start, ref_end):
    """
    Extract the region of interest from the aligned query sequence based on reference coordinates.
    """
    query_start = None
    query_end = None

    # Map reference positions to query sequence positions
    for ref_range, query_range in zip(aligned_ref, aligned_query):
        # Check if the primer region falls within this reference range
        if ref_start >= ref_range[0] and ref_start < ref_range[1]:
            query_start = query_range[0] + (ref_start - ref_range[0])
        if ref_end > ref_range[0] and ref_end <= ref_range[1]:
            query_end = query_range[0] + (ref_end - ref_range[0])
    
    return query_start, query_end

    # Extract the region if both start and end were found
    if query_start is not None and query_end is not None:
        return seq[query_start:query_end]
    # return None

def process_sequences(fasta_file, reference_seq, primer_fwd, primer_rev, output_file):
    """
    Extract sequences between primers based on alignment to a reference sequence.
    """
    # Find positions of primers in the reference sequence
    ref_start = reference_seq.index(primer_fwd) + len(primer_fwd)
    ref_end = reference_seq.index(primer_rev)

    lengths = []

    with open(output_file, "w") as out_fasta:
        for record in SeqIO.parse(fasta_file, "fasta"):
            seq = str(record.seq).upper()

            # Align sequence to reference
            aligned_ref, aligned_query = align_to_reference(seq, reference_seq)

            start, end = extract_region_of_interest(seq, aligned_ref, aligned_query, ref_start, ref_end)

            if start is None or end is None:
                print(f">{record.id}\n{start, end}\n")
            #     out_fasta.write(f">{record.id}\n{start, end}\n")

            # # Extract region of interest
            # extracted_seq = extract_region_of_interest(seq, aligned_ref, aligned_query, ref_start, ref_end)

            # if extracted_seq:
            #     lengths.append(len(extracted_seq))
            # else:
            #     print(f"Region not found for sequence {record.id}")
            #     out_fasta.write(f">{record.id}\n{extracted_seq}\n")
    return lengths

In [41]:
from Bio import SeqIO

for record in SeqIO.parse("Ecoli.fasta", "fasta"):
    ecoli = record.seq

# From Extract_sequences_16S
# forward: GTAACAGGAAGAAGCTTGCTTCTTTGCTGAC
# reverse: GTAGGTAGCTTAACCTTCGGGAGGGCGCTTA
forward_V1 = 'CTTCTT'
reverse_V9 = 'AGCTTA'

# From Extract_sequences_16S
# forward: GTACTTTCAGCGGGGAGGAAGGGAGTAAAGTTAATACCTTTGCTCATTGACGTTACCCGCAGAAG
# reverse: 
# CGCACGCAGGCGGUUUGUUAAGUCAGAUGUGAAAUCCCCGGGCUCAACCUGGGAACUGCAUCUGAUACUGGCAAGCUUGAGUCUCGUAGAGGGGGGUAGAAUUCCAG
forward_V3 = 'GCGGGG'
reverse_V4 = 'GGCAAG'

In [43]:
process_sequences("Vaginal_species_all.fasta", ecoli, forward_V3, reverse_V4, "output_modified.fasta")

>NZ_GL878523.1
(None, 537)

>NZ_NQOJ01000020.1
(462, None)

>NZ_QJVB01000008.1
(438, None)

>NZ_NQOJ01000020.1
(462, None)

>NZ_NFMF01000019.1
(None, 622)

>NZ_JBBNGY010000024.1
(None, 539)

>NZ_AUFY01000038.1
(None, None)



[]

In [39]:
a = process_sequences("Vaginal_species_all.fasta", ecoli, forward_V3, reverse_V4, "output_modified.fasta")
print(sum(a)/len(a))

188.925
