In [2]:
# Counting DNA Nucleotides
# Given: A DNA string s of length at most 1000 nt.
# Return: Four integers (separated by spaces) counting the respective number of times that the symbols 'A', 'C', 'G', and 'T' occur in s.

def nucleotides_count(genome_file):

    with open(genome_file, 'r') as file:
        sample = file.read()

    nuc_count = {'A': 0, 'C': 0, 'G': 0, 'T': 0}

    for nuc in sample:
        if nuc in nuc_count:
            nuc_count[nuc] += 1

    return " ".join(str(nuc_count[nucleotide]) for nucleotide in ['A', 'C', 'G', 'T'])

genome_file = "datasets/rosalind_dna.txt"
nucleotides_count(genome_file)

'220 252 258 232'

In [4]:
# Transcribing DNA into RNA
# Given: A DNA string t having length at most 1000 nt.
# Return: The transcribed RNA string of t.

def transcribe(genome_file):
    with open(genome_file, 'r') as file:
        dna = file.read()
    
    rna = dna.replace('T', 'U')

    return print(rna)

genome_file = "datasets/rosalind_rna.txt"

transcribe(genome_file)


CCGUUGGAGCACCACAGCUCCGUUUAGAGCGAGCCGUAUCUAUUACGCGAGCCUUGAAUCCAUUGGAAAGGCCAAGUAGAGACAGGCUUGUAUCACACUCUGUUAGAUAGCAUCUAGUGAGGUGUCGAGCGGACUGGGCGUGUAAAUAAAGCAGCAACUUAUUACAAUGGGGCCAGAGGAUCAACAUACCUAACCGAUUGUCCAAAGUUACCUUCGUGGUCUCAUAACGGACUGGCAUCGCUGUGGCAGCAUUGCACGAUGGUGCGCUCAGUACUGGGAUUCCCCUUAACCCUCCACUAGGUUCAGUGACCGAAGAGAGUGGCCCUCGUGUGAUACUGGAUGGCUCGAAUCGUAACAAACCCCGGUCGAUGUCUGUAGCGUAGUUUAUAAACAACAAAUGUCUUUUCGCCGGUUGCAUAACUGUUUCAUUGUACUUUCCAGCCCUCCGCGAAUGUGCUGCCAGGAGCUUGGAAUGCUCCGGAGACGAUUUGACUCAACUAUCUCUCGAACGAGUCGUCCUGUACUUGUGGGGACCAAUAUCAGAUAGGGGUUCUCUAAGCCUUACCAUUAGUAAUAUUUCUCGCUUCGGGGGGCGGACCUCGGAUGCUCUACCCGCGCCGAAGAGCCUCACACUAAAACUGACAGGGUCUACUUGGUGUAUAGGCGGGGGUUUCAAUGGGUCAUUGUUCUCAUCUGUCAACAACUCUUAUAUGUUCGUUCUGGUAUAACUGUGACGACGUAUAUGAAGGUAUUCGGCAGCAGCUAUGAUGGCGACAGAUCUUGUAGGAAUAGCCUGCCGCCAUUCGGGAUUCCUAUCCAGCCUGCCUCACAGGUCAGCUUGAAGCACCAAAGCAGUAAUAAUUGAGCCAUCGUAAAAGAGGUUAUACACUCUAUCGUGACGUUUCAUUCACGCGUUUCACAUGCUGUAUAAACGUCUUAGAAGAAACUAUCCACUUAGUGGUGACACGCAUUGACCCAGCC



In [10]:
# The Secondary and Tertiary Structures of DNA
# In DNA strings, symbols 'A' and 'T' are complements of each other, as are 'C' and 'G'.
# The reverse complement of a DNA string s is the string scformed by reversing the symbols of s, 
# then taking the complement of each symbol (e.g., the reverse complement of "GTCA" is "TGAC"

# Given: A DNA string s of length at most 1000 bp.
# Return: The reverse complement sc of s.

def reverse_complement(genome_file):
    with open(genome_file, 'r') as file:
        dna = file.read().strip()
    reversed_dna = dna[::-1]
    complement_map = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
    new_string = "".join(complement_map.get(base, base) for base in reversed_dna)
    return new_string

def reverse_complement_pythonic(genome_file):
    with open(genome_file, 'r') as file:
        dna = file.read().strip()
    return dna.translate(str.maketrans('ATGC', 'TAGC'))[::-1]

genome_file = "datasets/rosalind_revc.txt"
reverse_complement(genome_file)
reverse_complement_pythonic(genome_file)


'AATTAAACAGGCGTTGCCGCGCCACGATCCCAGGTAAAGCAGCAGGGAACTGGGAGACAACAGGTAGGCCGTGGGTTCGCCCTCCAGGACTGTTAATCACAGAGATTTTGTAAATCGACTGATTGCGTACGGGTAAATGTGTTTCAAATTCTGCTAGTTGTCCAGCGAGGTCAGCGACGCTTTGTCAAGCGATTTGAACAACAATGGAACGTCAGGCAATTTGAGACTCGGCAGATTGAAAACTTGCGACTATAGAGAGTGTGGTCAATGTCTAGCCATCGTAACACCGGTATGCACTCATATATTCCTTATTCTCAATGGGTTGGTCCCTATTCCCAGAAAGGGTTAGAGCTCAGTACTTTTATCAGCTTATTCACCCTCCTTTACAAGTGCGCTGACCAGTGGATCGAGCAACTCAGCGCGGTATGCGAGGATTCTTAATTGAACCAGCTGTGTCAAGGAGGCCATTGAGCCTAATAGTTGGGGAAGCAGAAGTATGTGGATCAGATCACCGACGTTGACATCTCTGATCTTCCAATCGTCAAAACCCCAGAATATAGGTCTCGTACGTTACCACAATCCACCCACGTACGATTTTTTATAGGGAACTTGCCGCACGGGTATCACGGCCGAAAACGACCTTTCGTCTACTAGTGTGGATGAGAAAGCATCCAGAAGGCTTCCGAACCTTAGCAGATCTAGCAACTCCCTGTTTGTGCTGTGCAAGTCACAGCCTCTTTGGCCGCGAGGTTAACACTTATGAATACATTCTCCTAGGTCCAGGAGTAAAGGAAGGTGTAGTTACACA'

In [2]:
# Rabbits and Recurrence Relations
# Given: Positive integers n ≤40 and k ≤5.
# Return: The total number of rabbit pairs that will be present after n months, 
# if we begin with 1 pair and in each generation, every pair of reproduction-age rabbits produces a litter of k rabbit pairs (instead of only 1 pair).

def fibonacci_rabbits(n, k):
    if n == 0:
        return 0 # No rabbits at the start
    elif n == 1:
        return 1 # Initial Pair
    else:
        return fibonacci_rabbits(n - 1, k) + k * fibonacci_rabbits(n - 2, k)

fibonacci_rabbits(33, 3)

249650241628

In practice, the GC-content of most eukaryotic genomes hovers around 50%. However, because genomes are so long, we may be able to distinguish species based on very small discrepancies in GC-content; furthermore, most prokaryotes have a GC-content significantly higher than 50%, so that GC-content can be used to quickly differentiate many prokaryotes and eukaryotes by using relatively small DNA samples

In [10]:
# Computing GC Content
# Given: At most 10 DNA strings in FASTA format (of length at most 1 kbp each).
# Return: The ID of the string having the highest GC-content, followed by the GC-content of that string. 
# Rosalind allows for a default error of 0.001 in all decimal answers unless otherwise stated; please see the note on absolute error below.

from Bio import SeqIO
from Bio.SeqUtils import gc_fraction

max_gc_content = 0
max_gc_seq_id = ""

for record in SeqIO.parse("datasets/rosalind_gc.txt", "fasta"):
    gc_content = gc_fraction(record.seq) * 100  # Calculate percentage
    if gc_content > max_gc_content:
        max_gc_content = gc_content
        max_gc_seq_id = record.id

print(f"{max_gc_seq_id}\n{max_gc_content:.6f}") 

Rosalind_4575
52.457372


In [None]:
# Counting Point Mutations

# Given: Two DNA strings s and t of equal length (not exceeding 1 kbp).
# Return: The Hamming distance dH(s,t).

