# Final exam

In [191]:
import os
from pprint import pprint

import numpy as np
import pandas as pd

In [24]:
def read_fasta(path):
    assert os.path.isfile(path)
    try:
        fh = open(path, 'r')
    except IOError:
        print('error opening file', path)
    
    seqs = {}
    for line in fh:
        line = line.rstrip()
        if line.startswith('>'):
            words = line.split()
            name = words[0][1:]
            seqs[name] = ''
        else:
            seqs[name] += line
    fh.close()
    return seqs

In [81]:
#path = f'{os.path.expanduser("~")}/Downloads/dna.example.fasta'
path = f'{os.path.expanduser("~")}/Downloads/dna2.fasta'

In [82]:
seqs = read_fasta(path)

### Q1, Q2, & Q3

In [202]:
n_seqs = len(seqs)
print(n_seqs)

18


In [203]:
sequence_lengths = [len(v) for k, v in seqs.items()]
print(max(sequence_lengths))
print(min(sequence_lengths))

4894
115


## Q4: What is the length of the longest ORF appearing in reading frame 2 of any of the sequences?

In [206]:
def get_codons_from_seq(seq, frame):
    x = seq[(frame - 1):]
    return [x[i:i+3] for i in range(0, len(x), 3)]

def get_orfs_from_codons(codons):
    start = []
    stop = []
    counter = 0
    for i, codon in enumerate(codons):
        if codon.upper() == 'ATG':
            if len(start) == counter:
                start.append(i)
        if codon.upper() in ['TAA', 'TAG', 'TGA']:
            if len(start) == (counter + 1):
                stop.append(i)
                counter += 1
    df = pd.DataFrame({
        'start_codon': start[:len(stop)], 
        'stop_codon': stop
    })
    df['n_codons'] = df['stop_codon'] - df['start_codon'] + 1
    df['start_base'] = 3 * df['start_codon'] + frame
    df['stop_base'] = 3 * df['stop_codon'] + frame
    df['n_bases'] = 3 * df['n_codons']
    return df

def get_orfs_from_seq(seq, frame):
    return get_orfs_from_codons(get_codons_from_seq(seq, frame=frame))

def get_orfs_from_seqs(seqs, frame):
    return {name: get_orfs_from_seq(seq, frame) for name, seq in seqs.items()}    

In [207]:
orfs = {frame: get_orfs_from_seqs(seqs, frame=frame) for frame in [1,2,3]}

In [208]:
res = {}
for frame in [1,2,3]:
    res[frame] = {
        'longest_orf_name': '',
        'longest_orf_value': 0,
    }
    for name in orfs[frame]:
        _ = orfs[frame][name]['n_bases'].max()
        if _ > res[frame]['longest_orf_value']:
            res[frame]['longest_orf_name'] = name
            res[frame]['longest_orf_value'] = _
#pprint(res)

In [209]:
print(res[2]['longest_orf_name'])
print(res[2]['longest_orf_value'])

gi|142022655|gb|EQ086233.1|16
1458


## Q5: What is the starting position of the longest ORF in reading frame 3 in any of the sequences?

In [194]:
frame = 3
orfs[frame][res[frame]['longest_orf_name']]

Unnamed: 0,start_codon,stop_codon,n_codons,start_base,stop_base,n_bases
0,6,16,11,21,51,33
1,58,176,119,177,531,357
2,211,817,607,636,2454,1821


## Q6: What is the length of the longest ORF appearing in any sequence and in any forward reading frame?

In [195]:
pprint(res)

{1: {'longest_orf_name': 'gi|142022655|gb|EQ086233.1|45',
     'longest_orf_value': np.int64(2394)},
 2: {'longest_orf_name': 'gi|142022655|gb|EQ086233.1|16',
     'longest_orf_value': np.int64(1458)},
 3: {'longest_orf_name': 'gi|142022655|gb|EQ086233.1|527',
     'longest_orf_value': np.int64(1821)}}


## Q7: What is the length of the longest forward ORF that appears in the sequence with the identifier  gi|142022655|gb|EQ086233.1|16?

In [201]:
name = 'gi|142022655|gb|EQ086233.1|16'
pprint({frame: orfs[frame][name] for frame in [1,2,3]})

{1:    start_codon  stop_codon  n_codons  start_base  stop_base  n_bases
0           88         109        22         267        330       66
1          192         200         9         579        603       27
2          424         497        74        1275       1494      222
3          509        1011       503        1530       3036     1509
4         1573        1595        23        4722       4788       69,
 2:    start_codon  stop_codon  n_codons  start_base  stop_base  n_bases
0          153         284       132         462        855      396
1          301         506       206         906       1521      618
2          598         650        53        1797       1953      159
3          659         707        49        1980       2124      147
4         1023        1508       486        3072       4527     1458,
 3:    start_codon  stop_codon  n_codons  start_base  stop_base  n_bases
0           36         474       439         111       1425     1317
1          479      

## Q8: Find the most frequently occurring repeat of length 6 in all sequences. How many times does it occur in all?

In [265]:
from collections import Counter

def get_subsequences_from_seq(seq, n):
    return [seq[i:(i+n)] for i in range(len(seq)-n+1)]

def count_subsequences(subseqs):
    df = pd.DataFrame({
        'subseq': Counter(subseqs).keys(),
        'n_repeats': Counter(subseqs).values(),
    })
    return df

def get_subsequence_counts_from_seq(seq, n):
    subseqs = get_subsequences_from_seq(seq, n)
    return count_subsequences(subseqs)

def get_subsequence_counts_from_seqs(seqs, n):
    counts = {name: get_subsequence_counts_from_seq(seq, n=n) for name, seq in seqs.items()}
    df = pd.concat(counts)\
        .reset_index(level=0)\
        .rename(columns={'level_0':'name'})\
        .groupby('subseq')\
        .sum('n_repeats')\
        .sort_values('n_repeats', ascending=False)\
        .reset_index()
    return df

In [266]:
df = get_subsequence_counts_from_seqs(seqs, n=6)
df.head()

Unnamed: 0,subseq,n_repeats
0,GCGCGC,153
1,CGCGCG,151
2,GCCGCG,147
3,GCGCCG,135
4,CGCGGC,131


## Q9:Find all repeats of length 12 in the input file. Let's use Max to specify the number of copies of the most frequent repeat of length 12.  How many different 12-base sequences occur Max times?

In [267]:
df = get_subsequence_counts_from_seqs(seqs, n=12)
df.head(10)

Unnamed: 0,subseq,n_repeats
0,TCGCCATTCGCC,10
1,ATTCGCCATTCG,10
2,CATTCGCCATTC,10
3,TTCGCCATTCGC,10
4,GCCATTCGCCAT,9
5,CGCCATTCGCCA,9
6,CCATTCGCCATT,9
7,CCAGGTCGCGCC,3
8,CCGGCGCGGCCG,3
9,CGACGAGCTGGT,3


## Q10: Which one of the following repeats of length 7 has a maximum number of occurrences?

In [268]:
df = get_subsequence_counts_from_seqs(seqs, n=7)

In [269]:
df.loc[df['subseq'].isin(['CATCGCC', 'TGCGCGC', 'AATGGCA', 'CGCGCCG']), :]

Unnamed: 0,subseq,n_repeats
0,CGCGCCG,63
40,TGCGCGC,36
709,CATCGCC,13
5497,AATGGCA,2
