# Computing GC Content
https://rosalind.info/problems/gc/

The GC-content of a DNA string is the percentage of symbols in the string that are "C" or "G".

### Practice

In [1]:
import util

In [2]:
# Location of the sample FASTA file used for practice.
# NOTE(review): absolute, machine-specific path; it must be adjusted to run this notebook elsewhere.
path = "/Users/claudiamameli/Desktop/uni/mec9_programming/python/python_hw_CM/rosalind/trial.txt"

In [3]:
# Load the file through the local `util` helper. Judging by the output displayed
# in the next cell, it yields a list of strings, one per line, without newlines.
fasta_file = util.read_file_into_lines(path)

In [4]:
# Display the loaded lines to inspect the FASTA layout (header lines start with '>').
fasta_file

['>Rosalind_6404',
 'CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCC',
 'TCCCACTAATAATTCTGAGG',
 '>Rosalind_5959',
 'CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCT',
 'ATATCCATTTGTCAGCAGACACGC',
 '>Rosalind_0808',
 'CCACCCTCGTGGTATGGCTAGGCATTCAGGAACCGGAGAACGCTTCAGACCAGCCCGGAC',
 'TGGGAACCTGCGGGCAGTAGGTGGAAT']

In [5]:
# First-draft solution: parse the FASTA lines, compute the GC percentage of each
# record, and print the record with the highest percentage.

# Step 1: group sequence lines under the most recent header line.
fasta_dic = {}
for s in fasta_file:
    if '>' in s:  # treated as a header if '>' appears anywhere in the line
        key = s[1:]  # drop the first character (the '>' when it leads the line)
        fasta_dic[key] = ""
    else:
        fasta_dic[key] = fasta_dic[key]+s  # append this line to the current record

# Step 2: GC percentage (0-100) for every record.
percentage = {}
for key, value in fasta_dic.items():
    s = fasta_dic[key]  # same string as `value`
    percentage[key] = ((s.count('C') + s.count('G'))/len(s))*100

# Step 3: linear scan for the largest percentage.
result = 0  # NOTE: with this start value a record at exactly 0% can never be selected
for key, value in percentage.items():
    if value > result:
        result = max(result, value)  # equals `value` here, because value > result
        final_key = key
        final_value = value
    else:
        next  # bare name: evaluates the builtin and does nothing (it is not `continue`)
print(final_key)
print(final_value)

Rosalind_0808
60.91954022988506


### Polishing the code after the class

In [6]:
# Step 1: group sequence lines under the most recent header line.
fasta_dic = {}
for line in fasta_file:
    if line.startswith('>'): # this is a better convention because sometimes we might have '>' in other lines as well
        key = line[1:]
        fasta_dic[key] = ""
    else:
        fasta_dic[key] += line # more concise way

# Step 2: GC percentage (0-100) for every record.
percentage = {}
for key, value in fasta_dic.items():
    percentage[key] = ((value.count('C') + value.count('G'))/len(value)) * 100

# Step 3: linear scan for the largest percentage.
# Both names are bound before the loop so that an empty input cannot silently
# print values left over from a previously executed cell.
final_key = None
final_value = -1 # starting below 0 is better because there might be some that are 0!
for key, value in percentage.items():
    if value > final_value:  # strict '>' keeps the first record in case of a tie
        final_key = key
        final_value = value
print(final_key)
print(final_value)

Rosalind_0808
60.91954022988506


### Creating functions as a solution

In [7]:
def text_to_fasta(fasta_file):
    """
    Parse FASTA-formatted lines into a dictionary.

    This function takes an already imported fasta file separated
    in lines (any iterable of strings) and returns a dictionary with
    the identifiers (the header text after '>') as keys and the
    concatenated DNA strings as values.

    Leading/trailing whitespace, including newline characters, is
    stripped from every line and blank lines are skipped, so stray
    newlines never end up inside an identifier or a sequence.
    If the same identifier appears twice, the later record replaces
    the earlier one.

    Raises:
        ValueError: if sequence data appears before the first header.
    """
    fasta_dic = {}
    key = None  # identifier of the record currently being filled
    for raw_line in fasta_file:
        line = raw_line.strip()
        if not line:
            continue  # ignore blank lines (e.g. a trailing empty line)
        if line.startswith('>'): # only a leading '>' marks a header
            key = line[1:]
            fasta_dic[key] = ""
        elif key is None:
            raise ValueError("FASTA sequence data found before the first '>' header line")
        else:
            fasta_dic[key] += line
    return fasta_dic

In [8]:
# Same value as the `path` assigned in the Practice section.
# NOTE(review): absolute, machine-specific path; it must be adjusted to run this notebook elsewhere.
path = "/Users/claudiamameli/Desktop/uni/mec9_programming/python/python_hw_CM/rosalind/trial.txt"

In [9]:
# Reload the sample file as a list of lines via the local `util` helper.
fasta_file = util.read_file_into_lines(path)

In [10]:
# Parse the loaded lines into an {identifier: sequence} dictionary and display it.
fasta_dic = text_to_fasta(fasta_file)
fasta_dic

{'Rosalind_6404': 'CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCCTCCCACTAATAATTCTGAGG',
 'Rosalind_5959': 'CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCTATATCCATTTGTCAGCAGACACGC',
 'Rosalind_0808': 'CCACCCTCGTGGTATGGCTAGGCATTCAGGAACCGGAGAACGCTTCAGACCAGCCCGGACTGGGAACCTGCGGGCAGTAGGTGGAAT'}

In [11]:
def gc_content_count(fasta_dic):
    """
    Calculate the GC content of several DNA strings.

    Takes a dictionary formatted as {"ID" : "DnaString"} and returns
    another dictionary with, for each "ID", the percentage (0-100) of
    symbols in its string that are 'C' or 'G'. Only uppercase 'C' and
    'G' are counted.

    An empty DNA string has no symbols to count, so its GC content is
    reported as 0.0 instead of raising ZeroDivisionError.
    """
    percentages = {}
    for key, value in fasta_dic.items():
        if not value:
            # Guard against division by zero for records without sequence data.
            percentages[key] = 0.0
            continue
        percent = ((value.count('C') + value.count('G'))/len(value))
        percentages[key] = percent * 100
    return percentages

In [12]:
# Compute the GC percentage of every record and display the resulting dictionary.
rosalind_gc = gc_content_count(fasta_dic)
rosalind_gc

{'Rosalind_6404': 53.75,
 'Rosalind_5959': 53.57142857142857,
 'Rosalind_0808': 60.91954022988506}

In [13]:
# Now to put it all together and have a solution for the Rosalind's exercise
path = "/Users/claudiamameli/Desktop/uni/mec9_programming/python/python_hw_CM/rosalind/trial.txt"
fasta_file = util.read_file_into_lines(path)

rosalind_dic = text_to_fasta(fasta_file)
rosalind_gc = gc_content_count(rosalind_dic)

# max() with key=dict.get returns the first identifier holding the largest
# percentage (same tie-breaking as a strict '>' scan). On an empty dictionary it
# raises ValueError instead of printing values left over from an earlier cell.
final_key = max(rosalind_gc, key=rosalind_gc.get)
final_value = rosalind_gc[final_key]
print(final_key)
print(final_value)

Rosalind_0808
60.91954022988506


### Rosalind's problem

In [14]:
# Path to the Rosalind GC dataset (presumably the downloaded problem input).
# NOTE(review): absolute, machine-specific path; it must be adjusted to run this notebook elsewhere.
path = "/Users/claudiamameli/Downloads/rosalind_gc.txt"

In [15]:
# Load the Rosalind dataset as a list of lines via the local `util` helper.
fasta_file = util.read_file_into_lines(path)

In [16]:
# Same pipeline as the polished version earlier in this notebook, applied to the
# Rosalind dataset.

# Step 1: group sequence lines under the most recent header line.
fasta_dic = {}
for line in fasta_file:
    if line.startswith('>'):  # only a leading '>' marks a header line
        key = line[1:]
        fasta_dic[key] = ""
    else:
        fasta_dic[key] += line

# Step 2: GC percentage (0-100) for every record.
percentage = {}
for key, value in fasta_dic.items():
    percentage[key] = ((value.count('C') + value.count('G'))/len(value))*100

# Step 3: linear scan for the largest percentage.
# Both names are bound before the loop so that an empty input cannot silently
# print values left over from a previously executed cell.
final_key = None
final_value = -1  # start below 0 so that a record with 0% GC can still be selected
for key, value in percentage.items():
    if value > final_value:  # strict '>' keeps the first record in case of a tie
        final_key = key
        final_value = value
print(final_key)
print(final_value)

Rosalind_7971
51.24056094929881


In [17]:
# Alternative version using functions 

path = "/Users/claudiamameli/Downloads/rosalind_gc.txt"
fasta_file = util.read_file_into_lines(path)

rosalind_dic = text_to_fasta(fasta_file)
rosalind_gc = gc_content_count(rosalind_dic)

# max() with key=dict.get returns the first identifier holding the largest
# percentage (same tie-breaking as a strict '>' scan). On an empty dictionary it
# raises ValueError instead of printing values left over from an earlier cell.
final_key = max(rosalind_gc, key=rosalind_gc.get)
final_value = rosalind_gc[final_key]
print(final_key)
print(final_value)

Rosalind_7971
51.24056094929881


### In class solution 
