In [None]:
pip install biopython

In [None]:
import Bio
import Bio.Blast.Applications
import Bio.Blast.NCBIXML as NCBIXML
import Bio.Blast.NCBIWWW as NCBIWWW
from Bio import SeqIO

In [None]:
# Show which Biopython release is in use.
print(Bio.__version__)

In [None]:
# Read the whole multi-FASTA file as text. The context manager closes the handle,
# and the relative path matches the "dna2.fasta" used by the other cells instead of
# a machine-specific absolute path.
with open("dna2.fasta", "r") as fasta_handle:
    multi_fasta = fasta_handle.read()

**1.- How many records are in the file? A record in a FASTA file is defined as a single-line header, followed by lines of sequence data. The header line is distinguished from the sequence data by a greater-than (">") symbol in the first column. The word following the ">" symbol is the identifier of the sequence, and the rest of the line is an optional description of the entry. There should be no space between the ">" and the first letter of the identifier.**

In [None]:
def count_records_in_fasta(multi_fasta):
    """Count the records in a FASTA file.

    Parameters
    ----------
    multi_fasta
        Handed unchanged to ``SeqIO.parse`` (a file path such as
        ``"dna2.fasta"`` or an open text handle).

    Returns
    -------
    int
        Number of records yielded by the parser.
    """
    # Consume the parser lazily; only the tally is kept in memory.
    return sum(1 for _ in SeqIO.parse(multi_fasta, "fasta"))


# Last expression of the cell, so the notebook displays the record count.
count_records_in_fasta("dna2.fasta")

**2.- What are the lengths of the sequences in the file? What is the longest sequence and what is the shortest sequence? Is there more than one longest or shortest sequence? What are their identifiers?**

In [None]:
from Bio import SeqIO
def count_sequence_lengths(multi_fasta):
    """Print the identifier and length (in bases) of every FASTA record.

    ``multi_fasta`` is forwarded unchanged to ``SeqIO.parse``. Nothing is
    returned; one line per record is written to standard output.
    """
    parsed_records = SeqIO.parse(multi_fasta, "fasta")
    # Lazily pair each record with its length so printing stays streamed.
    with_lengths = ((entry, len(entry.seq)) for entry in parsed_records)
    for record, seq_length in with_lengths:
        print(f" {record.id}: {seq_length} bases.")

# Path of the multi-FASTA file to analyse
file_path = "dna2.fasta"

# Print the length of every sequence in that file
count_sequence_lengths(multi_fasta=file_path)


**3.- In molecular biology, a reading frame is a way of dividing the DNA sequence of nucleotides into a set of consecutive, non-overlapping triplets (or codons). Depending on where we start, there are six possible reading frames: three in the forward (5' to 3') direction and three in the reverse (3' to 5'). For instance, the three possible forward reading frames for the sequence AGGTGACACCGCAAGCCTTATATTAGC are:** 

- AGG TGA CAC CGC AAG CCT TAT ATT AGC
- A GGT GAC ACC GCA AGC CTT ATA TTA GC
- AG GTG ACA CCG CAA GCC TTA TAT TAG C 

**These are called reading frames 1, 2, and 3 respectively. An open reading frame (ORF) is the part of a reading frame that has the potential to encode a protein. It starts with a start codon (ATG), and ends with a stop codon (TAA, TAG or TGA). For instance, ATGAAATAG is an ORF of length 9.
Given an input reading frame on the forward strand (1, 2, or 3) your program should be able to identify all ORFs present in each sequence of the FASTA file, and answer the following questions:**

- What is the length of the longest ORF in the file?
- What is the identifier of the sequence containing the longest ORF?
- For a given sequence identifier, what is the longest ORF contained in the sequence represented by that identifier?
- What is the starting position of the longest ORF in the sequence that contains it?

**The position should indicate the character number in the sequence. For instance, the following ORF in reading frame 1:
sequence1 ATGCCCTAG starts at position 1.
Note that because the following sequence:
sequence2 ATGAAAAAA
does not have any stop codon in reading frame 1, we do not consider it to be an ORF in reading frame 1.**

In [None]:
from Bio import SeqIO
from Bio.Seq import Seq

def find_orfs(dna_seq, stop_codons):
    """Find open reading frames in the three forward frames of ``dna_seq``.

    An ORF starts at the first in-frame ``ATG`` and ends at the next in-frame
    stop codon. A further ``ATG`` met while an ORF is already open is treated
    as part of that ORF and does not restart it. A start codon with no
    in-frame stop codon after it yields nothing.

    Parameters
    ----------
    dna_seq
        Sliceable sequence whose slices convert to ``str`` (a plain string or
        a Biopython ``Seq``). Matching is case-sensitive.
    stop_codons
        Container of three-letter stop codons, e.g. ``["TAA", "TAG", "TGA"]``.

    Returns
    -------
    list of tuple
        ``(start_pos, frame_number, orf)`` for every ORF, where ``start_pos``
        is the 0-based index of the start codon, ``frame_number`` is 1, 2 or 3,
        and ``orf`` is the nucleotide string from ``ATG`` through the stop
        codon inclusive. ORFs are listed frame by frame, in order of appearance.
    """
    orfs = []
    for frame in range(3):  # forward reading frames 1, 2 and 3
        start_pos = None  # index of the ATG opening the current ORF, if any
        # Step over complete codons only; a trailing partial codon can neither
        # start nor terminate an ORF.
        for pos in range(frame, len(dna_seq) - 2, 3):
            codon = str(dna_seq[pos:pos + 3])
            if start_pos is None:
                if codon == "ATG":
                    start_pos = pos
            elif codon in stop_codons:
                orfs.append((start_pos, frame + 1, str(dna_seq[start_pos:pos + 3])))
                start_pos = None  # the ORF is closed; look for the next start

    return orfs
stop_codons = ["TAA", "TAG", "TGA"]

# For every record, list the ORFs found on the forward strand and on its reverse
# complement, and write one summary line per ORF to orfs_results.txt.
with open("dna2.fasta", "r") as file, open("orfs_results.txt", "w") as output_file:  #in open, write your file
    for record in SeqIO.parse(file, "fasta"):
        original_dna = record.seq
        reverse_dna = record.seq.reverse_complement()

        orfs_forward = find_orfs(original_dna, stop_codons)
        orfs_reverse = find_orfs(reverse_dna, stop_codons)

        # find_orfs returns a 0-based start index and the ORF as a nucleotide string:
        # report the position 1-based (first character is position 1) and use
        # len(orf) directly, since it is already the length in base pairs.
        output_file.write(f"> {record.id} (Strand +1)\n")
        for pos, frame, orf in orfs_forward:
            output_file.write(f"ORF at position {pos + 1}, Frame {frame}: {orf[:30]}...{orf[-3:]} (Length: {len(orf)} bp)\n")

        # Positions on the reverse strand are relative to the reverse-complemented sequence.
        output_file.write(f"\n> {record.id} (Strand -1)\n")
        for pos, frame, orf in orfs_reverse:
            output_file.write(f"ORF at position {pos + 1}, Frame {frame}: {orf[:30]}...{orf[-3:]} (Length: {len(orf)} bp)\n")


In [None]:
# Display the ORF report; the context manager closes the file once it has been read.
with open("orfs_results.txt", "r") as f:
    print(f.read())

**4.- A repeat is a substring of a DNA sequence that occurs in multiple copies (more than one) somewhere in the sequence. Although repeats can occur on both the forward and reverse strands of the DNA sequence, we will only consider repeats on the forward strand here. Also we will allow repeats to overlap themselves. For example, the sequence ACACA contains two copies of the sequence ACA - once at position 1 (index 0 in Python), and once at position 3. Given a length n, your program should be able to identify all repeats of length n in all sequences in the FASTA file. Your program should also determine how many times each repeat occurs in the file, and which is the most frequent repeat of a given length.**

In [None]:
from Bio import SeqIO
from collections import Counter
import re

def find_repeats(sequence, repeat_length=3):
    """Count the substrings of a given length that occur more than once.

    Occurrences may overlap: ``"ACACA"`` contains ``"ACA"`` twice.

    Parameters
    ----------
    sequence : str
        Sequence to scan (forward strand only).
    repeat_length : int, optional
        Length of the substrings to look for (default 3).

    Returns
    -------
    collections.Counter
        Maps each substring of length ``repeat_length`` that appears at least
        twice to its number of occurrences, in order of first appearance.
        Empty when nothing repeats or the sequence is shorter than
        ``repeat_length``.
    """
    # Tally every window in a single pass. This avoids rescanning the whole
    # sequence for each window, and avoids building a regex from raw sequence
    # text (characters such as '*' would be read as regex syntax).
    window_total = len(sequence) - repeat_length + 1
    occurrences = Counter(
        sequence[start:start + repeat_length] for start in range(window_total)
    )

    # Keep only true repeats, i.e. substrings seen more than once.
    return Counter({sub: n for sub, n in occurrences.items() if n > 1})

# FASTA file to analyse
file_path = "dna2.fasta"
repeat_length = 12  # change the number if you need different length for the repeats

all_repeats = Counter()

# Read the FASTA file and accumulate the per-sequence repeat counts
with open(file_path, "r") as fasta_file:
    for record in SeqIO.parse(fasta_file, "fasta"):
        sequence = str(record.seq)  # sequence to string
        repeats = find_repeats(sequence, repeat_length)
        all_repeats.update(repeats)  # add this record's counts to the running total

# Most frequent repeat; most_common(1) is an empty list when nothing repeated,
# so guard before indexing it.
if all_repeats:
    most_common_repeat, most_common_count = all_repeats.most_common(1)[0]
else:
    most_common_repeat, most_common_count = None, 0

# Show results
print("Repeats found:", all_repeats)
if most_common_repeat is not None:
    print(f"Most frequent repeat: '{most_common_repeat}' appears {most_common_count} times")
else:
    print(f"No repeats of length {repeat_length} were found.")
