In [8]:
!pip install biopython



In [21]:
# Muhammad Fakhri Andika Mutiara
# 5023211056
# Computational Genomics (Q) - Open Reading Frame (ORF) Finder Assignment

!pip install biopython
from Bio import SeqIO
from Bio.Seq import Seq

def find_orfs(sequence, min_orf_length=100):
    """Find open reading frames (ORFs) in all six reading frames.

    An ORF runs from an ``ATG`` codon up to and including the first in-frame
    stop codon (``TAA``, ``TAG`` or ``TGA``).  Every in-frame ``ATG`` opens
    its own ORF, so nested ORFs that share one stop codon are reported
    separately.  An ``ATG`` with no in-frame stop codon after it is ignored.

    Args:
        sequence: Nucleotide sequence offering ``reverse_complement()``,
            ``upper()``, slicing and ``str()`` (e.g. a Biopython ``Seq``).
            Matching is case-insensitive: both strands are upper-cased
            before scanning.
        min_orf_length: Minimum ORF length in nucleotides, stop codon
            included.

    Returns:
        A list of ``(strand, frame, start, end, orf_sequence)`` tuples.
        ``strand`` is ``+1`` for the given sequence and ``-1`` for its
        reverse complement, ``frame`` is the 0-based frame offset, and
        ``start``/``end`` are 0-based, end-exclusive offsets into the strand
        that was scanned (for ``-1`` that is the reverse complement, not the
        original sequence).  ORFs are ordered by strand, then frame, then
        start position.
    """
    start_codon = 'ATG'
    stop_codons = frozenset(('TAA', 'TAG', 'TGA'))
    orfs = []

    for strand, nuc in ((+1, sequence), (-1, sequence.reverse_complement())):
        # Normalise case so lowercase / soft-masked input is still scanned.
        nuc = nuc.upper()
        # Scan a plain string: cheaper than slicing the sequence object
        # once per codon.
        bases = str(nuc)
        scan_end = len(bases) - 2  # last index at which a full codon starts, +1

        for frame in range(3):
            # Starts seen since the previous stop codon in this frame.  A
            # single pass replaces the per-ATG forward rescan, keeping the
            # search linear while producing the same ORFs in the same order.
            open_starts = []
            for pos in range(frame, scan_end, 3):
                codon = bases[pos:pos + 3]
                if codon == start_codon:
                    open_starts.append(pos)
                elif codon in stop_codons:
                    end = pos + 3
                    for start in open_starts:
                        if end - start >= min_orf_length:
                            orfs.append((strand, frame, start, end, nuc[start:end]))
                    open_starts = []
    return orfs

def print_orfs(orfs, sequence):
    """Print a report for each ORF, followed by its codon-by-codon translation.

    Args:
        orfs: Iterable of ``(strand, frame, start, end, seq)`` tuples.
        sequence: Not used by this function; kept for the existing call
            signature.

    Every ``ATG`` codon in an ORF is tagged as a start and every
    ``TAA``/``TAG``/``TGA`` codon as a stop.  ORFs and codons are numbered
    from 1.
    """
    stop_triplets = ("TAA", "TAG", "TGA")

    for orf_number, (strand, frame, start, end, seq) in enumerate(orfs, start=1):
        sign = '+' if strand == 1 else '-'
        print(f"\n[Frame Type: {sign}{frame+1}][Start: {start} Stop: {end}] [Length: {end-start}]")
        print(f"ORF[{orf_number}]: {seq}")
        print("Complete ORF Data")

        # Walk the ORF three bases at a time.
        for codon_number, offset in enumerate(range(0, len(seq), 3), start=1):
            codon = seq[offset:offset + 3]
            amino_acid = codon_to_amino_acid(codon)
            if codon == "ATG":
                label = " (Start)************"
            elif codon in stop_triplets:
                label = " (Stop Codon) *****************"
            else:
                label = ""
            print(f"Codon[{orf_number},{codon_number}]: {codon} {amino_acid}{label}")
        print("\n")

# DNA codon -> amino-acid name for all 64 codons; stop codons map to 'Stop'.
# Built once at definition time instead of on every lookup.
_CODON_TABLE = {
    'ATA':'Isoleucine', 'ATC':'Isoleucine', 'ATT':'Isoleucine', 'ATG':'Methionine',
    'ACA':'Threonine', 'ACC':'Threonine', 'ACG':'Threonine', 'ACT':'Threonine',
    'AAC':'Asparagine', 'AAT':'Asparagine', 'AAA':'Lysine', 'AAG':'Lysine',
    'AGC':'Serine', 'AGT':'Serine', 'AGA':'Arginine', 'AGG':'Arginine',
    'CTA':'Leucine', 'CTC':'Leucine', 'CTG':'Leucine', 'CTT':'Leucine',
    'CCA':'Proline', 'CCC':'Proline', 'CCG':'Proline', 'CCT':'Proline',
    'CAC':'Histidine', 'CAT':'Histidine', 'CAA':'Glutamine', 'CAG':'Glutamine',
    'CGA':'Arginine', 'CGC':'Arginine', 'CGG':'Arginine', 'CGT':'Arginine',
    'GTA':'Valine', 'GTC':'Valine', 'GTG':'Valine', 'GTT':'Valine',
    'GCA':'Alanine', 'GCC':'Alanine', 'GCG':'Alanine', 'GCT':'Alanine',
    'GAC':'Aspartic Acid', 'GAT':'Aspartic Acid', 'GAA':'Glutamic Acid', 'GAG':'Glutamic Acid',
    'GGA':'Glycine', 'GGC':'Glycine', 'GGG':'Glycine', 'GGT':'Glycine',
    'TCA':'Serine', 'TCC':'Serine', 'TCG':'Serine', 'TCT':'Serine',
    'TTC':'Phenylalanine', 'TTT':'Phenylalanine', 'TTA':'Leucine', 'TTG':'Leucine',
    'TAC':'Tyrosine', 'TAT':'Tyrosine', 'TAA':'Stop', 'TAG':'Stop',
    'TGC':'Cysteine', 'TGT':'Cysteine', 'TGA':'Stop', 'TGG':'Tryptophan',
}


def codon_to_amino_acid(codon):
    """Return the amino-acid name encoded by a DNA codon.

    Args:
        codon: Three-letter DNA codon.  Anything convertible with ``str()``
            is accepted (for example a slice of a sequence object), in
            either upper or lower case.

    Returns:
        The full amino-acid name, ``'Stop'`` for ``TAA``/``TAG``/``TGA``, or
        ``'?'`` when the text is not one of the 64 DNA codons (wrong length,
        ambiguous bases, and so on).
    """
    # Normalise to an upper-case str so the lookup does not rely on the
    # argument hashing and comparing exactly like a plain string.
    return _CODON_TABLE.get(str(codon).upper(), '?')

def sequence_statistics(sequence):
    """Print base counts and base frequencies for a nucleotide sequence.

    Counts A, T, G and C case-insensitively, prints each count and the total
    sequence length, then prints each base's share of the total length and
    the sum of those shares.  The sum falls below 1 when the sequence holds
    other symbols (for example ``N``).

    Args:
        sequence: Nucleotide sequence; anything convertible with ``str()``.
            An empty sequence is reported with all frequencies as 0 rather
            than raising ``ZeroDivisionError``.
    """
    bases = str(sequence).upper()
    total_bases = len(bases)
    base_counts = {base: bases.count(base) for base in "ATGC"}

    print("\nSequence Statistics:")
    for base, count in base_counts.items():
        print(f"total {base}: {count}")
    print(f"\nTotal Base Pair: {total_bases}")

    total_freq = 0
    for base, count in base_counts.items():
        # Guard the division so an empty record does not abort the report.
        freq = count / total_bases if total_bases else 0.0
        print(f"{base} frequency: {freq:.4f}")
        total_freq += freq
    print(f"Total frequency: {total_freq:.4f}\n")

def print_complete_cds_and_inv_complement(sequence):
    """Print the sequence and then its reverse complement, each under a heading.

    Args:
        sequence: Sequence object providing ``reverse_complement()``.
    """
    # Each body is produced lazily, right before it is printed, so the
    # reverse complement is only computed after the first section is out.
    sections = (
        ("\nComplete CDS:", lambda: sequence),
        ("\nInverse Complement:", lambda: sequence.reverse_complement()),
    )
    for heading, make_body in sections:
        print(heading)
        print(make_body())

def main(fasta_file):
    """Run the ORF finder on every record of a FASTA file and print a report.

    For each record this prints the header, the sequence and its reverse
    complement, the base statistics, the number of ORFs found and the
    per-ORF codon listing.

    Args:
        fasta_file: FASTA input passed straight to ``SeqIO.parse``.
    """
    print("Open Reading Frame (ORF) Finder ver1.0")
    for record in SeqIO.parse(fasta_file, "fasta"):
        # For FASTA input the record description is the full title line,
        # which already begins with the id; printing both repeated the
        # accession ("NC_001416.1 NC_001416.1 ...").
        if record.description.startswith(record.id):
            header = record.description
        else:
            header = f"{record.id} {record.description}"
        print(f"\nSequence file: {header}")

        sequence = record.seq
        print_complete_cds_and_inv_complement(sequence)
        orfs = find_orfs(sequence)
        sequence_statistics(sequence)
        print(f"ORFs Found: {len(orfs)}\n")
        print_orfs(orfs, sequence)

# Entry point: runs when executed as a script or as a notebook cell (where
# __name__ is also "__main__").
if __name__ == "__main__":
    # Input FASTA path, resolved relative to the current working directory.
    fasta_file = "examplefile.fasta"
    main(fasta_file)

Open Reading Frame (ORF) Finder ver1.0

Sequence file: NC_001416.1 NC_001416.1 Enterobacteria phage lambda

Complete CDS:
CCTCTCGGAGCTGGAAATGCAGCTATTGAGATCTTCGAATGCTGCGGAGCTGGAGGCGGAGGCAGCTGGGGAGGTCCGAGCGATGTGACCAGGCCGCCATCGCTCGTCTCTTCCTCTCTCCTGCCGCCTCCTGTGTCGAAAATAACTTTTTTAGTCTAAAGAAAGAAAGACAAAAGTAGTCGTCCGCCCCTCACGCCCTCTCTTCCTCTCAGCCTTCCGCCCGGTGAGGAAGCCCGGGGTGGCTGCTCCGCCGTCGGGGCCGCGCCGCCGAGCCCCAGCGCCCCGGGCCGCCCCCGCACGCCGCCCCCATGCATCCCTTCTACACCCGGGCCGCCACCATGATAGGCGAGATCGCCGCCGCCGTGTCCTTCATCTCCAAGTTTCTCCGCACCAAGGGGCTGACGAGCGAGCGACAGCTGCAGACCTTCAGCCAGAGCCTGCAGGAGCTGCTGGCAGAACATTATAAACATCACTGGTTCCCAGAAAAGCCATGCAAGGGATCGGGTTACCGTTGTATTCGCATCAACCATAAAATGGATC

Inv Complement:
GATCCATTTTATGGTTGATGCGAATACAACGGTAACCCGATCCCTTGCATGGCTTTTCTGGGAACCAGTGATGTTTATAATGTTCTGCCAGCAGCTCCTGCAGGCTCTGGCTGAAGGTCTGCAGCTGTCGCTCGCTCGTCAGCCCCTTGGTGCGGAGAAACTTGGAGATGAAGGACACGGCGGCGGCGATCTCGCCTATCATGGTGGCGGCCCGGGTGTAGAAGGGATGCATGGGGGCGGCGTGCGGGGGCGGCCCGGGGCGCTGGGGCTCGGCGGCGCGGCCCCGACGGCGGAGCAGCCACCCCGGGCTTCCTCACCGG