# Coding Exercises

These are some short pieces of code I've written while solving biology-related problems as part of my Python learning journey.

#### Counting DNA Nucleotides:
**Parameters:**

*dna* (string): string of dna nucleotides, expected to only contain ATCG
    
**Returns:**

*count* (string): the number of occurrences of each DNA nucleotide, in A C G T order, formatted as "A: n C: n G: n T: n"

Raises ValueError "Not valid nucleotide sequence for DNA" if any other nucleotides or contents are present in the string

In [None]:
def count_nucleotides(dna):
    """Count how often each nucleotide occurs in a DNA string.

    The input is lower-cased and stripped of surrounding whitespace before
    it is validated and counted.

    Args:
        dna: DNA sequence; only A, C, G and T (in any case) are accepted.

    Returns:
        A string of the form "A: <n> C: <n> G: <n> T: <n>".

    Raises:
        ValueError: if the sequence contains anything other than A, C, G
            or T.
    """
    bases = ("a", "c", "g", "t")
    sequence = dna.lower().strip()

    # Any character outside the four bases makes the whole input invalid.
    if not set(sequence) <= set(bases):
        raise ValueError("Not valid nucleotide sequence for DNA")

    tallies = {base: sequence.count(base) for base in bases}
    return " ".join(f"{base.upper()}: {total}" for base, total in tallies.items())

#### Transcribing DNA to RNA
**Parameters:**

*dna* (string): string of dna nucleotides, expected to only contain ATCG
    
**Returns:**

*rna* (string): the RNA version of the input DNA sequence (every T converted to U), in upper case

Raises ValueError "Not valid nucleotide sequence for DNA" if any other nucleotides or contents are present in the string

In [None]:
def transcribe_to_rna(dna):
    """Convert a DNA string into its RNA equivalent.

    The input is lower-cased and stripped of surrounding whitespace before
    validation; the result is returned in upper case.

    Args:
        dna: DNA sequence; only A, C, G and T (in any case) are accepted.

    Returns:
        The sequence in upper case with every T replaced by U.

    Raises:
        ValueError: if the sequence contains anything other than A, C, G
            or T.
    """
    allowed = {"a", "c", "g", "t"}
    strand = dna.lower().strip()

    if not all(base in allowed for base in strand):
        raise ValueError("Not valid nucleotide sequence for DNA")

    return strand.upper().replace("T", "U")

#### Reverse Complementing DNA
**Parameters:**

*dna* (string): string of dna nucleotides, expected to only contain ATCG
    
**Returns:**

*reverse_complement* (string): a reverse complement (reversed, then complemented) of the input dna strand

Raises ValueError "Not valid nucleotide sequence for DNA" if any other nucleotides or contents are present in the string

In [None]:
def reverse_complement_dna(dna):
    """Return the reverse complement of a DNA string.

    The input is lower-cased and stripped of surrounding whitespace before
    validation; the result is returned in upper case.

    Args:
        dna: DNA sequence; only A, C, G and T (in any case) are accepted.

    Returns:
        The sequence reversed, with each base swapped for its complement
        (A<->T, C<->G), in upper case.

    Raises:
        ValueError: if the sequence contains anything other than A, C, G
            or T.
    """
    valid_bases = {"a", "c", "g", "t"}
    strand = dna.lower().strip()

    if any(base not in valid_bases for base in strand):
        raise ValueError("Not valid nucleotide sequence for DNA")

    # Translation table pairing each base with its complement.
    pairing = str.maketrans("acgt", "tgca")
    return strand[::-1].translate(pairing).upper()


#### Hamming Distance
**Parameters:**

*dna1* (string): first string of dna nucleotides, expected to only contain ATCG

*dna2* (string): second string of dna nucleotides, expected to only contain ATCG

dna1 and dna2 are expected to be of equal length
    
**Returns:**

*Hamming Distance* (integer): number of point mutations between the two sequences

Raises ValueError "Not valid nucleotide sequence for DNA" if any other nucleotides or contents are present in the string

Raises ValueError "DNA sequences are not the same length" if input strings are of different lengths.

In [None]:
def hamming(dna1,dna2):
    """Compute the Hamming distance between two DNA strings.

    Both inputs are lower-cased and stripped of surrounding whitespace, so
    the comparison is case-insensitive.

    Args:
        dna1: first DNA sequence; only A, C, G and T are accepted.
        dna2: second DNA sequence; only A, C, G and T are accepted.

    Returns:
        The number of positions at which the two sequences differ.

    Raises:
        ValueError: "Not valid nucleotide sequence for DNA" if either
            sequence contains anything other than A, C, G or T;
            "DNA sequences are not the same length" if the lengths differ.
    """
    valid_bases = {"a", "c", "g", "t"}
    first = dna1.lower().strip()
    second = dna2.lower().strip()

    # Content is validated before length, first sequence before second.
    for strand in (first, second):
        if any(base not in valid_bases for base in strand):
            raise ValueError("Not valid nucleotide sequence for DNA")

    if len(first) != len(second):
        raise ValueError("DNA sequences are not the same length")

    return sum(1 for left, right in zip(first, second) if left != right)



#### Identifying Substrings in DNA
**Parameters:**

*dna1* (string): primary string of dna, which is searched for the substring; expected to only contain ACGT

*dna2* (string): second string of dna nucleotides, expected to be a substring of the larger string; expected to only contain ACGT
    
**Returns:**

*Indices* (string): a space-separated string containing the starting position (using 1-based numbering, vs 0-based numbering of python) of each occurrence of the substring within the primary dna sequence; overlapping occurrences are included

Raises ValueError "Not valid nucleotide sequence for DNA" if any other nucleotides or contents are present in the string

Raises ValueError "Substring is longer than primary string" if the secondary string is longer than the primary

In [None]:
def motif_dna(dna1,dna2):
    """Locate every occurrence of a motif inside a DNA string.

    Both inputs are lower-cased and stripped of surrounding whitespace
    before validation and searching.

    Args:
        dna1: the DNA sequence to search; only A, C, G and T are accepted.
        dna2: the motif to look for; only A, C, G and T are accepted.

    Returns:
        The 1-based start positions of each occurrence (overlapping ones
        included), joined by single spaces. Empty string when the motif
        does not occur.

    Raises:
        ValueError: "Not valid nucleotide sequence for DNA" if either
            sequence contains anything other than A, C, G or T;
            "Substring is longer than primary string" if the motif is
            longer than the sequence being searched.
    """
    valid_bases = {"a", "c", "g", "t"}
    haystack = dna1.lower().strip()
    needle = dna2.lower().strip()

    for strand in (haystack, needle):
        if any(base not in valid_bases for base in strand):
            raise ValueError("Not valid nucleotide sequence for DNA")

    if len(needle) > len(haystack):
        raise ValueError("Substring is longer than primary string")

    positions = []
    hit = haystack.find(needle)
    while hit != -1:
        positions.append(str(hit + 1))
        # Resume one character past the last hit so overlaps are found.
        hit = haystack.find(needle, hit + 1)
    return " ".join(positions)

#### Translating RNA into Protein
**Parameters:**

*rna* (string): string of rna sequence, expected to only contain ACGU
    
**Returns:**

*protein* (string): a string of the single-letter abbreviations of the amino acids encoded by the input rna string

Raises ValueError "Not valid nucleotide sequence for RNA" if any other nucleotides or contents are present in the string

Raises ValueError "No start codon" if no start codon present.

Will start only at a start codon ("AUG" - M) and end either at the end of the sequence, or a stop codon ("UAG","UAA","UGA").

In [None]:
import re

def transcribe(rna):
    """Translate an RNA sequence into its protein (amino-acid) string.

    The input is upper-cased and stripped of surrounding whitespace.
    Translation begins at the first "AUG" in the sequence; everything
    before it is ignored. It proceeds codon by codon until a stop codon
    ("UAA", "UAG", "UGA") or the end of the sequence is reached. A
    trailing incomplete codon is ignored.

    Args:
        rna: RNA sequence; only A, C, G and U (in any case) are accepted.

    Returns:
        The single-letter amino-acid codes of the translated protein,
        starting with "M" for the start codon.

    Raises:
        ValueError: "Not valid nucleotide sequence for RNA" if the
            sequence contains anything other than A, C, G or U;
            "No start codon" if the sequence contains no "AUG".
    """
    stop_codons = {"UAA", "UAG", "UGA"}
    codon_map = {
        "UUU": "F", "UUC": "F", "UUA": "L", "UUG": "L",
        "UCU": "S", "UCC": "S", "UCA": "S", "UCG": "S",
        "UAU": "Y", "UAC": "Y",
        "UGU": "C", "UGC": "C", "UGG": "W",
        "CUU": "L", "CUC": "L", "CUA": "L", "CUG": "L",
        "CCU": "P", "CCC": "P", "CCA": "P", "CCG": "P",
        "CAU": "H", "CAC": "H", "CAA": "Q", "CAG": "Q",
        "CGU": "R", "CGC": "R", "CGA": "R", "CGG": "R",
        "AUU": "I", "AUC": "I", "AUA": "I", "AUG": "M",
        "ACU": "T", "ACC": "T", "ACA": "T", "ACG": "T",
        "AAU": "N", "AAC": "N", "AAA": "K", "AAG": "K",
        "AGU": "S", "AGC": "S", "AGA": "R", "AGG": "R",
        "GUU": "V", "GUC": "V", "GUA": "V", "GUG": "V",
        "GCU": "A", "GCC": "A", "GCA": "A", "GCG": "A",
        "GAU": "D", "GAC": "D", "GAA": "E", "GAG": "E",
        "GGU": "G", "GGC": "G", "GGA": "G", "GGG": "G",
    }

    # Normalise the same way the DNA helpers do, so lower-case or
    # whitespace-padded input is accepted.
    rna = rna.upper().strip()
    if not set(rna) <= set("ACGU"):
        raise ValueError("Not valid nucleotide sequence for RNA")

    # Find the start codon explicitly; a sequence that is exactly "AUG"
    # (or ends with it) is still a valid start.
    start = rna.find("AUG")
    if start == -1:
        raise ValueError("No start codon")

    polypeptide = []
    # Stopping at len(rna) - 2 skips any trailing incomplete codon.
    for position in range(start, len(rna) - 2, 3):
        codon = rna[position:position + 3]
        if codon in stop_codons:
            break
        polypeptide.append(codon_map[codon])
    return "".join(polypeptide)