<a href="https://colab.research.google.com/github/e-white25/Rosalind_Solutions/blob/main/ROSALIND.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# DNA/RNA

In [46]:
#Update 8/24
def fasta_to_dict(fasta_file):
  ''' for parsing through a fasta file w/out Biopython (seqIO)
  for smaller files, use file.readlines(); saves to memory; only use this if need to access
  all reads at once (indexing, or analyzing total content(s))
  for larger files, stream one line at a time w/ line.strip() for converting file contents
  **save header as key: seq as value** '''

  fasta_dict = {}
  header = None
  seq = []

  with open(fasta_file, 'r') as file:   #read is default remember 'file' is an iterator
    for line in file:
      line = line.strip()
      if not line:
        continue      #ignores blank lines
      if line.startswith(">"):
        if header is not None:    #if not 1st header; flush/push read to record
          fasta_dict[header] = ''.join(seq)  #assing corresponding seq to header key using empty string as sep.
        header = line[1:].strip()   #removes '>' up to first white space (i.e just readID)
        seq = []    #reset seq/flush
      else:
        seq.append(line.upper())

  if header is not None:
      fasta_dict[header] = ''.join(seq)   #flush the header+read to record at the end

  return fasta_dict

In [50]:
#Update 8/24
#Test the above on a fasta_text snippet

fasta_text = """>seq1
atcgATTC
ggta
>seq2
ttttcccc
"""

with open("test.fa", "w") as f:
    f.write(fasta_text)

T = fasta_to_dict("test.fa")
T.items()

dict_items([('seq1', 'ATCGATTCGGTA'), ('seq2', 'TTTTCCCC')])

In [83]:
##ID: DNA
#Return: Four integers (separated by spaces) counting the respective number of times that the symbols 'A', 'C', 'G', and 'T' occur in s

# Sample Dataset
seq1 = 'AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGC'
# Sample Output

def nucleotides(seq: str):
  s = seq.upper()
  A = s.count('A')
  C = s.count('C')
  G = s.count('G')
  T = s.count('T')
  return A, C, G, T

nucleotides(seq1)


(20, 12, 17, 21)

In [84]:
#Update 8/24
#GC content as % - throught process

def GC_percent(seq: str, include_ambigulous=False) -> float:
  '''# GC content (%) over A/C/G/T only (ignores Ns if present)'''
  s = seq.upper()
  A = s.count('A')
  C = s.count('C')
  G = s.count('G')
  T = s.count('T')

  GC_cont = []
  GC_tot = sum([G] + [C])
  total_bp = len(seq)
  GC_cont = GC_tot % total_bp
  return GC_cont, '%'

#print(GC_percent((seq1), '%'))

# Improved method:

def GC_percent(seq: str, include_ambiguous=False) -> float: #False won't include b as N's
  '''# GC content (%) over A/C/G/T only (ignores Ns if present)'''
  s = seq.upper()
  A = s.count('A')
  C = s.count('C')
  G = s.count('G')
  T = s.count('T')
  GC = sum([G] + [C])

 if GC == 0:
        raise ValueError("No G/C bases countedd; cannot compute GC%.")
  if include_ambiguous: #set to true to include other bases (N)
      total = len(s)
      if total == 0:
        raise ValueError("No valid A/C/G/T bases found; cannot compute GC%.")
  else:
      total = sum([GC] + [G] + [T])
  return 100.0 * GC / total

print(f"{GC_percent(seq1):.2f}%")   #return a float (.2 = 2 deci)


43.28%


In [None]:
##ID: RNA
#Given: A DNA string
#Return: The RNA string (not complementary, just replace Thymine w/Uracil)

#Sample Dataset
DNA_seq = 'GATGGAACTTGACTACGTAAATT'

def transcribe(DNA_seq):
  return DNA_seq.replace('T', 'U')

transcribe(DNA_seq)

'GAUGGAACUUGACUACGUAAAUU'

In [87]:
##ID: REVC
# Given: A DNA string
# Return: The reverse complement

#Sample Dataset
DNA_seq = 'AAAACCCGGT'

# cDNA_lib = {'A':'T', 'T':'A', 'C':'G', 'G':'C'}
# for b in DNA_seq [0:len(DNA_seq)]:
#   seq_complement = cDNA_lib[b]
# from os import replace

### This only works because there is not redundancy (as there would be if cDNA was capitalized) but isn't ideal
# def reverse_complement(DNA_seq):
#   DNA_seq = DNA_seq.replace('A', 't')
#   DNA_seq = DNA_seq.replace('T', 'a')
#   DNA_seq = DNA_seq.replace('C', 'g')
#   DNA_seq = DNA_seq.replace('G', 'c')
#   return DNA_seq(reverse_complement)

cDNA_lib = {'A':'T', 'T':'A', 'C':'G', 'G':'C'}
seq_complement = "" #create an empty string
for base in DNA_seq:
  seq_complement += cDNA_lib[base] #append each base to empty string
print(seq_complement[::-1])

In [109]:
#Update 8/24
#As stated, += opperator isn't great for large files, practice looping through a fasta file

fasta_text2 = """>seq1
atcgATTCcGATCGTTTAacg
ggta
>seq2
ttttccccAAAAA
"""
#write to file
with open("test.fasta", "w") as file:
    file.write(fasta_text2)

!head test.fasta

#can also test this way to see format:
with open("test.fasta", "r") as file:
  for line in file:
    print(repr(line))

>seq1
atcgATTCcGATCGTTTAacg
ggta 
>seq2
ttttccccAAAAA
'>seq1\n'
'atcgATTCcGATCGTTTAacg\n'
'ggta \n'
'>seq2\n'
'ttttccccAAAAA\n'


In [153]:
# return reverse complment for each read
# Split into two functions:
  # > one just to generate reverse comp
  # > then function to parse through input and write output

def rev_comp_seq(seq: str) -> str:

  Rev_comp_dict = {'A' : 'T',
                 'G' : 'C',
                 'C' : 'G',
                 'T' : 'A'}

  input_seq = seq.upper()[::-1]
  return ''.join(Rev_comp_dict.get(b, b) for b in input_seq)    # dict.get(key, default): if key exists > returns value; if not returns input 'b'

# parse through input, generate reverse comp. using above function
# write to output attacted to header

def write_rev_comp_fasta(input_path: str, output_path: str) -> str:

    header = None
    raw_seq = []

    with open(input_path, "r") as fin, open(output_path, "w") as fout:
      for sample in fin:
        sample = sample.strip()
        if not sample:
          continue      #ignores blank lines

        if sample.startswith(">"):
          if header is not None: #flush previous record if exists
            seq = ''.join(raw_seq) #concat original sequences tied to sample
            rev_comp = rev_comp_seq(seq) #compute rev comp
            fout.write(f"{header}\n{rev_comp}\n")

        # next sample
          header = sample
          raw_seq = []
        else:
          raw_seq.append(sample)

        # flush last record at EOF (can do  parse into generator in future instead of flush)
        # make sure flush is outside of the loop
      if header is not None:
        seq = ''.join(raw_seq)
        rev_comp = rev_comp_seq(seq)
        fout.write(f"{header}\n{rev_comp}\n")

    return output_path

write_rev_comp_fasta("test.fasta", "test_rev.fasta")

!head test_rev.fasta

>seq1
TACCCGTTAAACGATCGGAATCGAT
>seq2
TTTTTGGGGAAAA


In [None]:
##ID: REVC
##Given the complementary output, provide the DNA seq Imput
Output = 'ACCGGGTTTT'

DNA_seq_lib = {'A':'T', 'T':'A', 'C':'G', 'G':'C'}
DNA_imput = ""
for b in Output:
  DNA_imput += DNA_seq_lib[b]
print(DNA_imput[::-1])

AAAACCCGGT


In [None]:
##ID: GC

#A commonly used method of string labeling for DNA > FASTA format. In this format, the string is introduced by a line that begins with '>',
# followed by labeling info. Subsequent lines contain the string itself; the first line to begin with '>' indicates the label of the next string.

#In Rosalind's implementation, a string in FASTA format will be labeled by the ID "Rosalind_xxxx", where "xxxx" denotes a four-digit code between 0000 and 9999.
#Given: At most 10 DNA strings in FASTA format (of length at most 1 kbp each).
#Return: The ID of the string having the highest GC-content, followed by the GC-content of that string.
#Rosalind allows for a default error of 0.001 in all decimal answers unless otherwise stated; please see the note on absolute error below.

In [None]:
import pandas as pd

# Sample dataset as a multi-line string
data = """>Rosalind_6404
CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCC
TCCCACTAATAATTCTGAGG
>Rosalind_5959
CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCT
ATATCCATTTGTCAGCAGACACGC
>Rosalind_0808
CCACCCTCGTGGTATGGCTAGGCATTCAGGAACCGGAGAACGCTTCAGACCAGCCCGGAC
TGGGAACCTGCGGGCAGTAGGTGGAAT"""

# Splitting data into lines
lines = data.split("\n")

# Dictionary to store sequence data; keys are the ID sequences are the value
sequences = {}

# Parse the data
for line in lines:
    if line.startswith(">"):
        current_label = line[1:]  # Remove ">" to get the identifier
        sequences[current_label] = ""  # Initialize an empty string for the sequence
    else:
        sequences[current_label] += line.strip()  # Append sequence

# Convert to a DataFrame
seq_df = pd.DataFrame(sequences.items(), columns=["ID", "Sequence"])

# Display DataFrame
print(seq_df)

#Return: The ID of the string having the highest GC-content, followed by the GC-content of that string.

# Function to calculate GC content
def GC_content(sequence):
    GC_count = sequence.count('G') + sequence.count('C')
    return (GC_count / len(sequence)) * 100

# Add column with GC content:
seq_df["GC_Content"] = seq_df["Sequence"].apply(GC_content)
print(seq_df)

# Print ID with max GC content
# .idxmax() function finds the index of the maximum value in the "GC_Content" column.
max_GC_ID = seq_df.loc[seq_df["GC_Content"].idxmax()]
print("Highest GC Content:", max_GC_ID)

              ID                                           Sequence
0  Rosalind_6404  CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGG...
1  Rosalind_5959  CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGC...
2  Rosalind_0808  CCACCCTCGTGGTATGGCTAGGCATTCAGGAACCGGAGAACGCTTC...
              ID                                           Sequence  \
0  Rosalind_6404  CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGG...   
1  Rosalind_5959  CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGC...   
2  Rosalind_0808  CCACCCTCGTGGTATGGCTAGGCATTCAGGAACCGGAGAACGCTTC...   

   GC_Content  
0   53.750000  
1   53.571429  
2   60.919540  
Highest GC Content: ID                                                Rosalind_0808
Sequence      CCACCCTCGTGGTATGGCTAGGCATTCAGGAACCGGAGAACGCTTC...
GC_Content                                             60.91954
Name: 2, dtype: object


In [None]:
##ID: HAMM

# The Hamming distance between these two strings is 7. Mismatched symbols are colored red.
### Hamming distance = the number of positions where two strings differ.

# Given two strings s and t of equal length, the Hamming distance between s and t, denoted dH(s,t),
# is the number of corresponding symbols that differ in s and t.

## PROBLEM: Given: Two DNA strings s and t of equal length (not exceeding 1 kbp).
## Return: The Hamming distance dH(s,t)

#Sample Dataset

s = 'GAGCCTACTAACGGGAT'
t = 'CATCGTAATGACGGCCT'

# Variable to store mismatches:
mismatched_bases = 0

# Iterate through each sequence - At each position i, it grabs the nucleotide (nuc_s) from s, and the corresponding one (nuc_t) from t.
for i in range(len(s)):
    nuc_s = s[i]
    nuc_t = t[i]

    if nuc_s != nuc_t:  # Compare
        mismatched_bases += 1  # Increase mismatch count if different
#So every != becomes a little "Hamming ping!"
print("Total mismatches:", mismatched_bases)


Total mismatches: 7


In [None]:

##ID: PROT

#Problem: The 20 commonly occurring amino acids are abbreviated by using 20 letters (all letters except for B, J, O, U, X, and Z).
#Protein strings are constructed from these 20 symbols. Henceforth, the term genetic string will incorporate protein strings along with DNA strings and RNA strings.

#The RNA codon table dictates the details regarding the encoding of specific codons into the amino acid alphabet.

##Given: An RNA string s corresponding to a strand of mRNA (of length at most 10 kbp).
##Return: The protein string encoded by s

#Sample Dataset
RNA = 'AUGGCCAUGGCGCCCAGAACUGAGAUCAAUAGUACCCGUAUUAACGGGUGA'

#Create Dictionary
aa = {}

codon_table = """
UUU F    CUU L    AUU I    GUU V
UUC F    CUC L    AUC I    GUC V
UUA L    CUA L    AUA I    GUA V
UUG L    CUG L    AUG M    GUG V
UCU S    CCU P    ACU T    GCU A
UCC S    CCC P    ACC T    GCC A
UCA S    CCA P    ACA T    GCA A
UCG S    CCG P    ACG T    GCG A
UAU Y    CAU H    AAU N    GAU D
UAC Y    CAC H    AAC N    GAC D
UAA Stop CAA Q    AAA K    GAA E
UAG Stop CAG Q    AAG K    GAG E
UGU C    CGU R    AGU S    GGU G
UGC C    CGC R    AGC S    GGC G
UGA Stop CGA R    AGA R    GGA G
UGG W    CGG R    AGG R    GGG G
"""

# Process the raw codon table
for line in codon_table.strip().split("\n"):  # Split into lines
    Codon_AA = line.split()  # Split each line into parts (codon, amino acid)
    for i in range(0, len(Codon_AA), 2):  # Process in pairs (codon, amino acid)
        codon = Codon_AA[i]
        amino_acid = Codon_AA[i+1]
        aa[codon] = amino_acid  # Add to dictionary

translated_RNA = ""
for i in range(0, len(RNA), 3):
    codon = RNA[i:i+3]
    #print(codon)
    if aa[codon] == "Stop":
        break
    translated_RNA += aa[codon] #appending the amino acid corresponding to each codon to a string called translated_RNA

print(translated_RNA)

MAMAPRTEINSTRING


In [None]:
###ID: SUBS
###Finding a Motif in DNA
#Problem: Given two strings s and t, t is a substring of s if t is contained as a contiguous collection of symbols in s (as a result, t must be no longer than s).
#Given: Two DNA strings s and t (each of length at most 1 kbp).
# Return: All locations of t as a substring of s.

### Sample Dataset
# GATATATGCATATACTT
# ATAT

## PROCESS:
#1 Identify where the motif "ATAT" appears in the string, starting with the first position, by iterating through s, checking if the substring starting at each position matches 𝑡:
### for loop: determine how many starting positions of the substring exist in s
#2 Store 1-based positions of matches
#3 Check if substring matches t
#4 Convert 0-based to 1-based index
s = 'GATATATGCATATACTT'
t = 'ATAT'

# for i in range(len(s) - len(t) + 1):  # Iterate through possible start positions within s
#     if s[i:i+len(t)] == t:  # extracts a substring from s, starting at index i and ending at i + len(t), and checks if it equals "ATAT"
#         print(i + 1)  # Convert to 1-based index

def ATAT_motif_locations(s, t):
    positions = []  # Store 1-based positions of matches
    for i in range(len(s) - len(t) + 1):  # Iterate through s
        if s[i:i+len(t)] == t:  # Check if substring matches t
            positions.append(i + 1)  # Convert 0-based to 1-based index
    return positions #return the positions list

# Call the function to get the positions and then print them
positions = ATAT_motif_locations(s, t)
print(positions)

[2, 4, 10]


In [None]:
###ID: PRTM
##Problem: In a weighted alphabet, every symbol is assigned a positive real number called a weight. A string formed from a weighted alphabet is called a weighted string,
#and its weight is equal to the sum of the weights of its symbols. The standard weight assigned to each member of the 20-symbol amino acid alphabet is the monoisotopic mass of the corresponding amino acid.

#Given: A protein string P of length at most 1000 aa.
#Return: The total weight of P (Consult the monoisotopic mass table)

### Approach:
#1 make a dictionary to store amino acid:specific monoisotopic mass
#2 Iterate Through the Protein Stringand retireve the mass from dictionary
#3 Sum up all the retrieved values

###Sample Dataset
p = 'SKADYEK'

monoisotopic_mass_table = {
    'A': 71.03711, 'C': 103.00919, 'D': 115.02694, 'E': 129.04259,
    'F': 147.06841, 'G': 57.02146, 'H': 137.05891, 'I': 113.08406,
    'K': 128.09496, 'L': 113.08406, 'M': 131.04049, 'N': 114.04293,
    'P': 97.05276, 'Q': 128.05858, 'R': 156.10111, 'S': 87.03203,
    'T': 101.04768, 'V': 99.06841, 'W': 186.07931, 'Y': 163.06333
}

total_mass = 0

for aa in p:
  mass = monoisotopic_mass_table[aa]
  total_mass += mass
print(total_mass)

821.3919199999999


In [None]:
###ID: SPLC (RNA Splicing)
#Problem:After identifying the exons and introns of an RNA string, we only need to delete the introns and concatenate the exons to form a new string ready for translation.
## Given: A DNA string s (of length at most 1 kbp) and a collection of substrings of sacting as introns. All strings are given in FASTA format.
## Return: A protein string resulting from transcribing and translating the exons of s (Note: Only one solution will exist for the dataset provided.)

#Sample Dataset
# >Rosalind_10
# ATGGTCTACATAGCTGACAAACAGCACGTAGCAATCGGTCGAATCTCGAGAGGCATATGGTCACATGATCGGTCGAGCGTGTTTCAAAGTTTGCGCCTAG
# >Rosalind_12
# ATCGGTCGAA
# >Rosalind_15
# ATCGGTCGAGCGTGT

##1 - Parse the FASTA file so that each FASTA seq can be accessed by its ID
fasta_dict = {
    "Rosalind_10": "ATGGTCTACATAGCTGACAAACAGCACGTAGCAATCGGTCGAATCTCGAGAGGCATATGGTCACATGATCGGTCGAGCGTGTTTCAAAGTTTGCGCCTAG",
    "Rosalind_12": "ATCGGTCGAA",
    "Rosalind_15": "ATCGGTCGAGCGTGT"
}
#print(fasta_dict)

##2 distinguish between DNA and introns
main_seq = list(fasta_dict.values())[0]  # Rosalind_10 is the first entry
introns = list(fasta_dict.values())[1:]  # everything else

##3 Remove introns
for seq in introns:
    main_seq = main_seq.replace(seq, "")  #replace every intron seq in main seq with 0 characters ("splice")

#print("Spliced sequence:", main_seq)

##4) turn this into a protein sequence - will need a dictionary

codon_AA = {
    'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
    'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
    'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
    'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
    'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
    'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
    'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
    'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
    'TAT': 'Y', 'TAC': 'Y', 'TAA': '',  'TAG': '',
    'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
    'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
    'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
    'TGT': 'C', 'TGC': 'C', 'TGA': '',  'TGG': 'W',
    'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
    'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
    'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'
  }

##5 parse through the spliced sequence by 3s...
# Find first occurrence of start codon (ATG)
start_index = main_seq.find("ATG")

# If found, slice the sequence starting from there
if start_index != -1: #any postive integer will refer to the index start site of ATG, but a -1 means no ATG detected
    coding_seq = main_seq[start_index:] # this starts the new string at the start codon and goes to the end of the string
else:
    coding_seq = "No start codon found"
## create an empty string for the protein code:
protein = ""

## Iterate by 3's
for i in range(0, len(coding_seq), 3):#start at index 0 (i.e. ATG); go through entire string; 3 characters/iteration
  codon = coding_seq[i:i+3]  #creating a 'string' for each codon by grabbing 3 characters at a time—starting at position i (stored in memory for next function/use)
  aa = codon_AA.get(codon, '')
  if aa == " ": #indicates stop codon (TAA, e.g.)
        break
  if len(codon) < 3:
        break  # Skip incomplete codons
  protein += aa #append  new amino acid to the protein string
print(protein)

MVYIADKQHVASREAYGHMFKVCA


**Figure 2. Palindromic Recognition Site**

![alt text](https://rosalind.info/media/problems/revp/palindrome.png)

In [None]:
##ID: REVP
# Problem
# A DNA string is a reverse palindrome if it is equal to its reverse complement.
# For instance, GCATGC is a reverse palindrome because its reverse complement is GCATGC. See Figure 2.

# Given: A DNA string of length at most 1 kbp in FASTA format.
# Return: The position and length of every reverse palindrome in the string having length between 4 and 12. You may return these pairs in any order.

Sample_Dataset = \
">Rosalind_24\n\
TCAATGCATGCGGGTCTATATGCAT"

# #1 Exract the sequence:
# fasta_dict = {
#     "Rosalind_24": "TCAATGCATGCGGGTCTATATGCAT"
# }

fasta_dict = {}
#can also do it without knowing:
for line in Sample_Dataset.split('\n'):
      line = line.strip()
      if line.startswith(">"):
        current_id = line[1:]  # slice from the 2nd character/index 1
        fasta_dict[current_id] = ""  # initialize new string in dictionary with key 'current_ID' and value = empty string
      else: #anything but the header - in this case it's the sequence
        fasta_dict[current_id] += line  # add/append sequence data to its ID
print(fasta_dict)

#2) Grab the sequence
seq = list(fasta_dict.values())[0]

#Incase you wanted to reference the ID:
list(fasta_dict.keys())[0]

#3) For every substring of length 4 to 12, check if it’s equal to its reverse complement.
    ### Loop through all substrings 4-12 characters
    ### if substring == reverse_complement; then it's a reverse palindrome, print:
    # starting position and length

def rev_comp(seq):
    comp = {"A": "T", "T": "A", "C": "G", "G": "C"}
    rev_seq = seq[::-1]  #reversed string
    result = "" #initialize empty string
    for base in rev_seq:
        result += comp[base]
    return result
print(rev_comp(seq))

#A more efficienct way:  return ''.join(comp[base] for base in seq[::-1]); doesn't rewrite the string each time

for i in range(len(seq)):
    for length in range(4, 13):  # Lengths 4 to 12 inclusive (sliding window-like)
        if i + length > len(seq):
            continue  # Skip if it goes past the end

        substring = seq[i:i+length] #indexing each new substring as it progresses through the seq
        rev_comp_substring = rev_comp(substring) # call function and store return value to new variable

        if substring == rev_comp_substring:  # compare original and reverse to check for palindromes
          print(i + 1, length)  # 1-based indexing

{'Rosalind_24': 'TCAATGCATGCGGGTCTATATGCAT'}
ATGCATATAGACCCGCATGCATTGA
4 6
5 4
6 6
7 4
17 4
18 4
20 6
21 4


In [None]:
##ID: TRAN
# Problem: For DNA strings s1 and s2 having the same length, their transition/transversion ratio R(s1,s2) is the ratio of the total number of transitions to the total number of transversions
# where symbol substitutions are inferred from mismatched corresponding symbols as when calculating Hamming distance (see “Counting Point Mutations”)
## Given: Two DNA strings s1 and s2 of equal length (at most 1 kbp).
## Return: The transition/transversion ratio R(s1,s2)

Sample_Dataset_T = \
">Rosalind_0209\n\
GCAACGCACAACGAAAACCCTTAGGGACTGGATTATTTCGTGATCGTTGTAGTTATTGGA\n\
AGTACGGGCATCAACCCAGTT\n\
>Rosalind_2200\n\
TTATCTGACAAAGAAAGCCGTCAACGGCTGGATAATTTCGCGATCGTGCTGGTTACTGGC\n\
GGTACGAGTGTTCCTTTGGGT"

#1) As always make FASTA sequences into a dictionary
fasta_dict_T = {}
for line in Sample_Dataset_T.split('\n'):
      line = line.strip()
      if line.startswith(">"):
        current_id = line[1:]  # slice from the 2nd character/index 1
        fasta_dict_T[current_id] = ""  # initialize new string in dictionary with key 'current_ID' and value = empty string
      else: #anything but the header - in this case it's the sequence
        fasta_dict_T[current_id] += line  # add/append sequence data to its ID
print(fasta_dict_T)

#2) Assigning variables from problem
s1 = fasta_dict_T["Rosalind_0209"]
s2 = fasta_dict_T["Rosalind_2200"]

#3) Create a function for identifying transversions and transitions
### Transitions: purine-to-purine (A ↔ G) or pyrimidine-to-pyrimidine (C ↔ T) substitutions
### Transversions: purine-to-pyrimidine or vice versa (A/G ↔ C/T).

transition_pairs = [('A', 'G'), ('G', 'A'), ('C', 'T'), ('T', 'C')]

# Step 2: Set counters to zero
# transitions = 0
# transversions = 0

#3 Loop through both sequences at once
# for i in range(len(s1)):
#     base1 = s1[i]
#     base2 = s2[i]

#     if base1 != base2:
#         if (base1, base2) in transition_pairs:
#             transitions += 1
#         else:
#             transversions += 1

#5) Create a function

def calculate_transition_transversion_ratio(s1, s2):
    transition_pairs = [('A', 'G'), ('G', 'A'), ('C', 'T'), ('T', 'C')]
    transitions = 0
    transversions = 0

    for i in range(len(s1)):
        base1 = s1[i]
        base2 = s2[i]

        if base1 != base2:
            if (base1, base2) in transition_pairs:
                transitions += 1
            else:
                transversions += 1

    if transversions == 0: # You'll get a division by zero
        return float('inf')  # Instead of crashing, return "infinity"
    return transitions / transversions, transitions, transversions

#If you want to see # of transitions and transverssions alone,
#you have to Assign the returned values to variables outside the function (Call the function with the desired variables)
ratio, transitions, transversions = calculate_transition_transversion_ratio(s1, s2)

print("Ratio", ratio)
print("Transversions:", transversions)
print("Transitions:", transitions)


{'Rosalind_0209': 'GCAACGCACAACGAAAACCCTTAGGGACTGGATTATTTCGTGATCGTTGTAGTTATTGGAAGTACGGGCATCAACCCAGTT', 'Rosalind_2200': 'TTATCTGACAAAGAAAGCCGTCAACGGCTGGATAATTTCGCGATCGTGCTGGTTACTGGCGGTACGAGTGTTCCTTTGGGT'}
Ratio 1.2142857142857142
Transversions: 14
Transitions: 17


# ID: LCSM
Problem
A common substring of a collection of strings is a substring of every member of the collection.
We say that a common substring is a longest common substring if there does not exist a longer common substring.
For example, "CG" is a common substring of "ACGTACGT" and "AACCGTATA", but it is not as long as possible; in this case, "CGTA" is a longest common substring of "ACGTACGT" and "AACCGTATA".
Note that the longest common substring is not necessarily unique; for a simple example, "AA" and "CC" are both longest common substrings of "AACC" and "CCAA".

Given: A collection of k(k≤100) DNA strings of length at most 1 kbp each in FASTA format.
Return: A longest common substring of the collection. (If multiple solutions exist, you may return any single solution.)

In [None]:
Sample_Dataset_LCSM = """
>Rosalind_1
GATTACA
>Rosalind_2
TAGACCA
>Rosalind_3
ATACA
"""

#Logic... start with one string and generate all the substrings of that string
#The substrings should be 1 base - length of the string
#### Example - rosalind_3:
# Len 5: "ATACA"
# Len 4: "ATAC", "TACA"
# Len 3: "ATA", "TAC", "ACA"
# Len 2: "AT", "TA", "AC", "CA"
# Len 1: "A", "T", "C"

#1) as always set up the FASTA data structure
fasta_dict_LCSM = {}
current_id = None

for line in Sample_Dataset_LCSM.strip().split('\n'): #.strip() moves everything ito one line (but keeps \n); .split('\n') splits string into a list every newline
      line = line.strip()
      if line.startswith(">"):
        current_id = line[1:]  # slice from the 2nd character/index 1
        fasta_dict_LCSM[current_id] = ""  # initialize new string in dictionary with key 'current_ID' and value = empty string
      else: #anything but the header - in this case it's the sequence
        fasta_dict_LCSM[current_id] +=line

#print (fasta_dict_LCSM)
#looking at just the sequences:

sequences = list(fasta_dict_LCSM.values())
#print(sequences)
shortest_seq = min(sequences, key=len) #min(...) finds the smallest thing in the list
#print(shortest_seq)

#Logic... end goal is to find the longest identical substring across all sequences

def ID_longest_substring(sequences):
  shortest_seq = min(sequences, key=len)
  longest_substring = ""
  # Step 1: make a list of all possible substrings
  substrings = []
  for i in range(len(shortest_seq)):
    for e in range(i + 1, len(shortest_seq) + 1): #e is the exlusive position at end of the sequence; include last base by +1
      substrings.append(shortest_seq[i:e])
#print(substrings)
#3) now we want to see if any of the substrings are in the other sequences, and then find the longest one
  longest_substring = ""
  for substring in substrings:
    is_present = True
    for seq in sequences:
        if substring not in seq:
          is_present = False
          break  # stop checking if even one sequence doesn't have it

    if is_present and len(substring) > len(longest_substring):
            longest_substring = substring
  return longest_substring

print(ID_longest_substring(sequences))

# since there are multiple longest common substrings of the same length — and you want to return all of them, not just the first.

def ID_all_longest_common_substrings(sequences):
    shortest_seq = min(sequences, key=len)
    substrings = []

    for i in range(len(shortest_seq)):
        for e in range(i + 1, len(shortest_seq) + 1):
            substrings.append(shortest_seq[i:e])

    longest_substrings = [] #this is a list instead of a string
    max_len = 0             # track the longest substring (length) to use a reference for keeping or discarding future substrings

    for substring in substrings:
        is_present = True
        for seq in sequences:
            if substring not in seq:
                is_present = False
                break

        if is_present:
            if len(substring) > max_len:
                # New longest found — reset the list
                max_len = len(substring)
                longest_substrings = [substring]
            elif len(substring) == max_len:
                # Same length as current longest — add to the list
                longest_substrings.append(substring)

    return longest_substrings
print (ID_all_longest_common_substrings(sequences))

TA
['TA', 'AC', 'CA']


# ID: ORF

Either strand of a DNA double helix can serve as the coding strand for RNA transcription. Hence, a given DNA string implies six total reading frames, or ways in which the same region of DNA can be translated into amino acids: three reading frames result from reading the string itself, whereas three more result from reading its reverse complement.

An open reading frame (ORF) is one which starts from the start codon and ends by stop codon, without any other stop codons in between. Thus, a candidate protein string is derived by translating an open reading frame into amino acids until a stop codon is reached.

Given: A DNA string s
 of length at most 1 kbp in FASTA format.

Return: Every distinct candidate protein string that can be translated from ORFs of s
. Strings can be returned in any order.

Sample Dataset
>Rosalind_99
AGCCATGTAGCTAACTCAGGTTACATGGGGATGACCCCGCGACTTGGATTAGAGTCTCTTTTGGAATAAGCCTGAATGATCCGAGTAGCATCTCAG

In [None]:
pip install Biopython 

In [None]:
from Bio.Data import CodonTable

import types
print(type(CodonTable))
print(dir(CodonTable)) #list contents of module

#looking for prebuilt dictionaries:
[n for n in dir(CodonTable) if "by_" in n]
type(CodonTable.unambiguous_dna_by_name)
list(CodonTable.unambiguous_dna_by_name.keys())[:10]  

In [None]:
#grab standard:
std = CodonTable.unambiguous_dna_by_name["Standard"]

print(dir(std))
print(std.forward_table)
print(std.start_codons)

#assign to variable
aa_dict = std.forward_table
list(aa_dict.items())[:5]

In [None]:
#Import Seq
Sample_Data = '''
>Rosalind_99
AGCCATGTAGCTAACTCAGGTTACATGGGGATGACCCCGCGACTTGGATTAGAGTCTCTTTTGGAATAAGCCTGAATGATCCGAGTAGCATCTCAG
'''

seq2 = "".join(line.strip() for line in Sample_Data.splitlines()
    if line and not line.startswith(">"))

In [None]:
#ID start codons backwards and fwds **STILL WORKING ON THIS***

start_codons = set(std.start_codons)
stop_codons  = set(std.stop_codons)

def translate(DNA_seq):
    protein = []
    for i in range(0, len(DNA_seq)-2, 3):#len(seq)-2 --> stop 2 bases before the end
        codon = DNA_seq[i:i+3]
        if codon in stop_codons:
            break
        if codon in start_codons:
           protein.append('M')   # start codon always codes Methionine
        else:
            protein.append(aa_dict.get(codon, 'x'))
    return ''.join(protein)  

translate(seq2)

'SHVANSGYMGMTPRLGLESLME'

In [None]:
# def chunk_string(s, n=60):    #s: the string you pass into the function (full DNA sequence like "ATCGGGTATTAA").
#     return [s[i:i+n] for i in range(0, len(s), n)]    #n = # of characters

# fasta_list = {k: chunk_string(v) for k, v in fasta_str.items()}