In [89]:
#from collections import Counter

In [1]:
!head -10 dna.example.fasta

>gi|142022655|gb|EQ086233.1|43 marine metagenome JCVI_SCAF_1096627390048 genomic scaffold, whole genome shotgun sequence
TCGGGCGAAGGCGGCAGCAAGTCGTCCACGCGCAGCGCGGCACCGCGGGCCTCTGCCGTGCGCTGCTTGG
CCATGGCCTCCAGCGCACCGATCGGATCAAAGCCGCTGAAGCCTTCGCGCATCAGGCGGCCATAGTTGGC
GCCAGTGACCGTACCAACCGCCTTGATGCGGCGCTCGGTCATCGCTGCATTGATCGAGTAGCCACCGCCG
CCGCAAATGCCCAGCACGCCAATGCGTTCTTCATCCACATAGGGGAGCGTTACGAGGTAGTCGCAGACCA
CGCGGAAATCCTCGACGCGCAGTGTCGGGTCTTCGGTAAAACGTGGTTCGCCGCCGCTGGCACCCTGGAA
GCTGGCGTCGAAGGCGATGACGACGAAACCTTCCTTGGCCAGCGCCTCGCCATACACGTTCCCCGATGTT
TGCTCCTTGCAGCTGCCGATCGGATGCGCGCTGATGATGGCGGGATATTTCTTGCCTTCGTCGAAGTTCG
GCGGGAAGTGGATGTCGGCTGCGATATCCCAATACACATTCTTGATCTTGACGCTTTTCATGACAGCTCC
GTTCAGGGGGAGGGGGTAAGTTCGCCAGGCCGAATCGTTGGTAGCCAAGCGGCAACGACTCGAATATAGA


1) How many records in the file?

In [2]:
#FILE = './dna.example.fasta'  # dev
FILE = './dna2.fasta'          # exam

In [3]:
class Record:
    def __init__(self, record_str):
        self.record_str = record_str
        self.header, seq = record_str.split('\n', 1)
        self.seq = seq.replace('\n', '')
        self.a, self.id, self.c, self.call, self.descr = (
            self.header.split('|'))

    def __str__(self):
        return self.record_str
    
    def __len__(self):
        return len(self.seq)

In [18]:
class DNASeq:
    def __init__(self, seq: str):
        self.seq = seq
        self.start_codon = 'ATG'
        self.stop_codons = ['TAA', 'TAG', 'TGA']
        self.n = len(seq)
        
    def to_codons(self, frame_start=1):
        codons = [
            self.seq[i:i+3] for i in range(frame_start - 1, self.n, 3)]
        return [c for c in codons if len(c) == 3]

    def get_orfs(self, frame_start=1):
        'ORF: open reading frame'
        orfs = []
        codons = self.to_codons(frame_start)
        while codons:
            if self.start_codon in codons:
                start = codons.index(self.start_codon)
                first_stop = len(codons) + 1
                for stop in self.stop_codons:
                    if stop in codons[start + 1:]:
                        stop_idx = codons.index(stop, start + 1)
                        if stop_idx < first_stop:
                            first_stop = stop_idx
                if first_stop <= len(codons):
                    orf = ''.join(codons[start:first_stop + 1])
                    orfs.append([orf, start * 3])
                    codons = codons[first_stop + 1:]
                else:
                    break
            else:
                break
        return orfs
    
    def get_repeats(self, n):
        'n is the length of the repeat sequence'
        repeats = []
        #for start in range(len(self.seq)):
        #    seq = self.seq[start:start + n]
        #    rep = seq
        #    for next_frame_start in range(start + n, len(self.seq), n):
        #        if (self.seq[next_frame_start:next_frame_start + n] 
        #            == seq):
        #            rep += seq
        #        else:
        #            break
        #    if len(rep) > n:
        #        repeats.append(rep)
        # with overlaps:
        for start in range(len(self.seq)):
            seq = self.seq[start:start + n]
            rep = seq
            count = 1
            for next_frame_start in range(start + 1, start + 1 + n):
                if (self.seq[
                        next_frame_start:next_frame_start + n] == seq):
                    rep = self.seq[start:next_frame_start + n]
                    next_frame_start += 1
                    count += 1
                    continue
            if len(rep) > n:
                repeats.append((rep, seq, count))
        return repeats

In [19]:
seq = DNASeq('ACACA')
print(seq.get_repeats(3))
print()

seq = (
    'TCGGGCGAAGGCGGCAGCAAGTCGTCCACGCGCAGCGCGGCACCGCGGGCCTCTGCCGTGCGCTG'
    'CTTGGCCATGGCCTCCAGCGCACCGATCGGATCAAAGCCGCTGAAGCCTTCGCGCATCAGGCGGC'
    'CATAGTTGGCGCCAGTGACCGTACCAACCGCCTTGATGCGGCGCTCGGTCATCGCTGCATTGATC'
    'GAGTAGCCACCGCCGCCGCAAATGCCCAGCACGCCAATGCGTTCTTCATCCACATAGGGGAGCGT'
    'TACGAGGTAGTCGCAGACCACGCGGAAATCCTCGACGCGCAGTGTCGGGTCTTCGGTAAAACGTG'
    'GTTCGCCGCCGCTGGCACCCTGGAAGCTGGCGTCGAAGGCGATGACGACGAAACCTTCCTTGGCC'
    'AGCGCCTCGCCATACACGTTCCCCGATGTTTGCTCCTTGCAGCTGCCGATCGGATGCGCGCTGAT'
    'GATGGCGGGATATTTCTTGCCTTCGTCGAAGTTCGGCGGGAAGTGGATGTCGGCTGCGATATCCC'
    'AATACACATTCTTGATCTTGACGCTTTTCATGACAGCTCCGTTCAGGGGGAGGGGGTAAGTTCGC'
    'CAGGCCGAATCGTTGGTAGCCAAGCGGCAACGACTCGAATATAGA')
seq = DNASeq(seq)
print(seq.get_repeats(3))

[('ACACA', 'ACA', 2)]

[('GGCGGC', 'GGC', 2), ('GCAGCA', 'GCA', 2), ('GTCGTC', 'GTC', 2), ('CGCGC', 'CGC', 2), ('GCGCG', 'GCG', 2), ('GCTGCT', 'GCT', 2), ('CGCGC', 'CGC', 2), ('GGCGGC', 'GGC', 2), ('GCGGCG', 'GCG', 2), ('CCGCCG', 'CCG', 2), ('CGCCGC', 'CGC', 2), ('GCCGCC', 'GCC', 2), ('CCGCCG', 'CCG', 2), ('CGCCGC', 'CGC', 2), ('TTCTTC', 'TTC', 2), ('GGGG', 'GGG', 2), ('CGCGC', 'CGC', 2), ('AAAA', 'AAA', 2), ('CGCCGC', 'CGC', 2), ('GCCGCC', 'GCC', 2), ('CCGCCG', 'CCG', 2), ('CGCCGC', 'CGC', 2), ('GACGAC', 'GAC', 2), ('ACGACG', 'ACG', 2), ('CGACGA', 'CGA', 2), ('CCCC', 'CCC', 2), ('GCGCG', 'GCG', 2), ('CGCGC', 'CGC', 2), ('TGATGA', 'TGA', 2), ('GATGAT', 'GAT', 2), ('ATGATG', 'ATG', 2), ('TCGTCG', 'TCG', 2), ('CGGCGG', 'CGG', 2), ('ACACA', 'ACA', 2), ('TTTT', 'TTT', 2), ('GGGGG', 'GGG', 3), ('GGGG', 'GGG', 2), ('GGGGG', 'GGG', 3), ('GGGG', 'GGG', 2), ('ATATA', 'ATA', 2)]


In [20]:
seq.get_orfs(2)

[['ATGTTTGCTCCTTGCAGCTGCCGATCGGATGCGCGCTGA', 414]]

In [7]:
seq.get_orfs(3)

['ATGCGCGCTGATGATGGCGGGATATTTCTTGCCTTCGTCGAAGTTCGGCGGGAAGTGGATGTCGGCTGCGATATCCCAATACACATTCTTGATCTTGACGCTTTTCATGACAGCTCCGTTCAGGGGGAGGGGGTAAGTTCGCCAGGCCGAATCGTTGGTAGCCAAGCGGCAACGACTCGAATATAG']

In [8]:
def get_records(path):
    with open(path, 'r') as f:
        t = ''.join(f.readlines())
    records = t.split('>')[1:]
    return [Record(record) for record in records]

In [9]:
records = get_records(FILE)
print(len(records))
print()
print(records[-1])

18

gi|142022655|gb|EQ086233.1|527 marine metagenome JCVI_SCAF_1096627390048 genomic scaffold, whole genome shotgun sequence
GAGAACCGGGAACCGGAACCATGACAGCCCCGCGCCGGTTTTACGCGAGATAGCCGGAAACGCCGTCCCA
GAGCAGTTTCAATGCGGTCACCGCCAGCAATCCGTAGCAGCTCCGGTAGATCAGGCGCTGGTCCAGCCTG
CCGTGAAGCCGCCAGCCGAACACCACGCCGGCCGGAATGGCAAGCAGGCACACCGCCATCAACGCCCAGA
CGTTCGCGGTCGGCTGCACGATCAGCAGCCACGGCACTGCCTTGATCGCATTGCCCACGGTGAAGAACAG
GCTCGTCGTTCCCGCGTACATCTCCTTGCTGAGGCCAAGCGGCAGCAGATACATCGCGAGCGGCGGCCCG
CCCGAGTGCGCGACCATCGTCGTGACGCCCGATGCAAGGCCGGCCGAGACTGCCTTCGGCGACGAACGCG
GACGAACCGTCGGCTCCGCCCCGCCCCTCACCCACAGCCCGACGAAGACCAGCGTGACCACCGCCATCAA
AAGCTCGATGGCGCGATGGTCGAGGAAGCGGAAAGCCAGGTAACCGAACCCGATACCGACCACCAGCCCC
GGCAGGAGCAGCACGAGGTCGGGCTTCGACCATGTCGACGGCTTCCAGTACCGCAGCGCGAACAGGTCCA
TCGCGATGAACAGCGGGGCGAGCAAGCCGCCGGCCGTCACGGGGTCCATCACGAGGGACAGCAGCGGAAT
GCCGATGATCGCGAATCCACCACCGAACGCGCCGCGCATGAACGCGATCACGAACACGCCGGCAAACGCG
ATCAGGATCGTGGCCAGCGTCAATTGCAGGCCCATCGCAGCAGGGGTCGCCATCACGACCTCCATGCCGG
TTCGAATCGCGGCGTGGCGGACA

In [11]:
#print(records[0])

2) What are the lengths of the records?  Shortest?  Longest?

In [12]:
for r in records:
    print(len(r))

4635
1151
4894
3511
4076
2867
442
890
967
4338
1352
4564
4804
964
2095
1432
115
2646


In [13]:
def get_longest(records):
    longest = 0
    out = []
    for record in records:
        seq_len = len(record)
        if seq_len > longest:
            longest = seq_len
            out = [record]
        elif seq_len == longest:
            out.append(record)
    return out

In [14]:
def get_shortest(records):
    shortest = 9e999
    out = []
    for record in records:
        seq_len = len(record)
        if seq_len < shortest:
            shortest = seq_len
            out = [record]
        elif seq_len == shortest:
            out.append(record)
    return out

In [15]:
longest = get_longest(records)
shortest = get_shortest(records)

print('longest:')
for r in longest:
    print(r.header, len(r))
print('\nshortest:')
for r in shortest:
    print(r.header, len(r))

longest:
gi|142022655|gb|EQ086233.1|255 marine metagenome JCVI_SCAF_1096627390048 genomic scaffold, whole genome shotgun sequence 4894

shortest:
gi|142022655|gb|EQ086233.1|346 marine metagenome JCVI_SCAF_1096627390048 genomic scaffold, whole genome shotgun sequence 115


3) Get longest ORF

In [27]:
def get_longest_orf(records, frame_start):
    longest = 0
    header = None
    pos = None
    for record in records:
        seq = DNASeq(record.seq)
        orfs = seq.get_orfs(frame_start)
        if orfs:
            n = max([len(o[0]) for o in orfs])
            if n > longest:
                longest = n
                for o in orfs:
                    if len(o[0]) == n:
                        pos = o[1]
                header = record.header
    return longest, pos, header

In [33]:
longest, pos, header = get_longest_orf(records, 1)
print(f'longest: {longest} (at position {pos})\n{header}')

longest: 2394 (at position 384)
gi|142022655|gb|EQ086233.1|45 marine metagenome JCVI_SCAF_1096627390048 genomic scaffold, whole genome shotgun sequence


In [31]:
longest, pos, header = get_longest_orf(records, 2)
print(f'longest: {longest} (at position {pos})\n{header}')

longest: 1458 (at position 945)
gi|142022655|gb|EQ086233.1|16 marine metagenome JCVI_SCAF_1096627390048 genomic scaffold, whole genome shotgun sequence


In [32]:
longest, pos, header = get_longest_orf(records, 3)
print(f'longest: {longest} (at position {pos})\n{header}')

longest: 1821 (at position 102)
gi|142022655|gb|EQ086233.1|527 marine metagenome JCVI_SCAF_1096627390048 genomic scaffold, whole genome shotgun sequence


In [37]:
identifier = 'gi|142022655|gb|EQ086233.1|16'
id_record = None
for r in records:
    if identifier in r.header:
        id_record = r
    
id_record.header

'gi|142022655|gb|EQ086233.1|16 marine metagenome JCVI_SCAF_1096627390048 genomic scaffold, whole genome shotgun sequence'

In [42]:
def get_longest_orf_in_record(record):
    seq = DNASeq(record.seq)
    longest = 0
    for frame in [1, 2, 3]:
        print('Start frame:', frame)
        orfs = seq.get_orfs(frame)
        lens = [o[1] for o in orfs]
        frame_longest = max(lens)
        print('Longest in frame:', frame_longest)
        if frame_longest > longest:
            longest = frame_longest
    return longest

In [43]:
longest = get_longest_orf_in_record(id_record)
print('Longest:', longest)

Start frame: 1
Longest in frame: 1683
Start frame: 2
Longest in frame: 945
Start frame: 3
Longest in frame: 276
Longest: 1683


4) Get repeats

In [62]:
counts = {}

for record in records:
    seq = DNASeq(record.seq)
    for n in [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]:
        repeats = seq.get_repeats(n)
        for r in repeats:
            pattern = r[1]
            if not isinstance(pattern, str):
                print('\n\nERROR:')
                print(pattern)
                print(record.seq)
                break
            reps = r[2]
            key = (pattern, reps)
            if key in counts:
                counts[key] = counts[key] + 1
            else:
                counts[key] = 1
    #print(repeats)
    #print()

In [63]:
most_common = max(counts.values())
for k, v in counts.items():
    if v == most_common:
        print(k)

('CG', 2)


In [70]:
six_counts = {}
for record in records:
    seq = DNASeq(record.seq)
    reps = seq.get_repeats(6)
    for rep in reps:
        key = (rep[1], rep[2])
        if key in six_counts:
            six_counts[key] = six_counts[key] + 1
        else:
            six_counts[key] = 1
                
six_counts

{('GCCGGC', 2): 5,
 ('CCGGCC', 2): 2,
 ('CGCCGC', 2): 9,
 ('CGGCGG', 2): 4,
 ('GGCGGC', 2): 5,
 ('GCGGCG', 2): 3,
 ('CGCGCG', 2): 17,
 ('CCGCCG', 2): 3,
 ('CATGCG', 2): 1,
 ('CGTTGT', 2): 1,
 ('GTTGTC', 2): 1,
 ('TTGTCG', 2): 1,
 ('TGTCGT', 2): 1,
 ('GACGTT', 2): 1,
 ('GCGCGC', 3): 3,
 ('GCGCGC', 2): 15,
 ('GATGAT', 2): 2,
 ('GCGCGT', 2): 1,
 ('CGCGTG', 2): 1,
 ('CGACGC', 2): 1,
 ('GCCGGG', 2): 1,
 ('CCGGGC', 2): 1,
 ('CCGCGC', 2): 1,
 ('CCTGGC', 2): 1,
 ('GGCCGT', 2): 1,
 ('TGCTGC', 3): 1,
 ('GCTGCT', 2): 4,
 ('CTGCTG', 2): 5,
 ('TGCTGC', 2): 5,
 ('CGTACA', 2): 1,
 ('AGCAGC', 2): 5,
 ('GGGGCG', 2): 1,
 ('GGCGCG', 2): 1,
 ('CTTCGC', 2): 1,
 ('GACGAC', 2): 3,
 ('AAAAAA', 2): 2,
 ('TGGGGC', 2): 1,
 ('CGAGCC', 2): 1,
 ('CATCAT', 2): 1,
 ('GTGACG', 2): 1,
 ('CGGCCG', 2): 3,
 ('GGCCGG', 2): 1,
 ('GATCAG', 2): 1,
 ('CACCAT', 2): 1,
 ('ACCATC', 2): 1,
 ('GCGACG', 2): 2,
 ('CGACGA', 2): 6,
 ('GCGAGC', 2): 2,
 ('CGAGCG', 2): 2,
 ('GCCGCC', 2): 3,
 ('CAGCAG', 2): 1,
 ('CGCGCA', 2): 1,
 ('CGCGTC'

In [71]:
seven_counts = {}
for record in records:
    seq = DNASeq(record.seq)
    reps = seq.get_repeats(7)
    for rep in reps:
        key = (rep[1], rep[2])
        if key in seven_counts:
            seven_counts[key] = seven_counts[key] + 1
        else:
            seven_counts[key] = 1
                
seven_counts

{('GCCGGCC', 2): 2,
 ('CGGCGGC', 2): 2,
 ('CGTTGTC', 2): 1,
 ('GTTGTCG', 2): 1,
 ('TTGTCGT', 2): 1,
 ('GGCGGCG', 2): 1,
 ('GTCTCTC', 2): 1,
 ('CGCGGTG', 2): 1,
 ('GCGGTGC', 2): 1,
 ('GCGCGCG', 2): 8,
 ('CGCGCGC', 2): 6,
 ('GCGCGTG', 2): 1,
 ('GCCGGGC', 2): 1,
 ('TGCTGCT', 2): 3,
 ('GCTGCTG', 2): 2,
 ('CTGCTGC', 2): 3,
 ('TCGCGCT', 2): 1,
 ('CGCGCTT', 2): 1,
 ('GCGCTTC', 2): 1,
 ('CGCTTCG', 2): 1,
 ('GCTTCGC', 2): 1,
 ('CTTCGCG', 2): 1,
 ('CATTCGC', 2): 10,
 ('ATTCGCC', 2): 10,
 ('TTCGCCA', 2): 9,
 ('TCGCCAT', 2): 9,
 ('CGCCATT', 2): 9,
 ('GCCATTC', 2): 9,
 ('CCATTCG', 2): 9,
 ('CACCATC', 2): 1,
 ('GCGAGCG', 2): 2,
 ('CGCCGCC', 2): 2,
 ('CAGCAGC', 2): 1,
 ('CGCGTCG', 2): 1,
 ('GCGTCGC', 2): 1,
 ('CGTCGCG', 2): 1,
 ('CGCGCGC', 3): 1,
 ('GTTGTTG', 2): 1,
 ('CGCGAAC', 2): 1,
 ('CGCGCGA', 2): 1,
 ('TGCTGAT', 2): 1,
 ('TCGTCGT', 3): 1,
 ('CGTCGTC', 2): 1,
 ('GTCGTCG', 2): 1,
 ('TCGTCGT', 2): 1,
 ('CCGGCCG', 2): 1,
 ('CGATGCG', 2): 1,
 ('GCCCGCC', 2): 1,
 ('CCCGCCC', 2): 1,
 ('AGCAGCA', 2): 1

In [64]:
sixes = [[k, v] for k, v in counts.items() if len(k[0]) * k[1] == 6]
sixes

[[('GG', 3), 93],
 [('TT', 3), 44],
 [('CC', 3), 93],
 [('AA', 3), 55],
 [('CGC', 2), 539],
 [('ACG', 2), 50],
 [('GTG', 2), 36],
 [('GGG', 2), 77],
 [('GGA', 2), 5],
 [('CGG', 2), 73],
 [('GGC', 2), 81],
 [('GCG', 2), 520],
 [('TCG', 2), 89],
 [('GCA', 2), 39],
 [('CAG', 2), 41],
 [('AGC', 2), 40],
 [('GCC', 2), 87],
 [('CCG', 2), 80],
 [('TTT', 2), 35],
 [('CAT', 2), 18],
 [('ATC', 2), 16],
 [('CGA', 2), 96],
 [('GAC', 2), 39],
 [('ATA', 2), 8],
 [('GAA', 2), 13],
 [('CTG', 2), 47],
 [('TGC', 2), 64],
 [('GTT', 2), 6],
 [('TTG', 2), 6],
 [('CCC', 2), 77],
 [('AGG', 2), 4],
 [('TCT', 2), 12],
 [('CTT', 2), 5],
 [('TTC', 2), 10],
 [('AAA', 2), 37],
 [('CGT', 2), 61],
 [('AAG', 2), 5],
 [('AGA', 2), 10],
 [('GAT', 2), 21],
 [('ACC', 2), 15],
 [('CCA', 2), 13],
 [('GTA', 2), 4],
 [('GTC', 2), 42],
 [('CAC', 2), 32],
 [('CTC', 2), 20],
 [('TGA', 2), 16],
 [('TTA', 2), 1],
 [('TAT', 2), 2],
 [('TAA', 2), 1],
 [('CCT', 2), 6],
 [('ATG', 2), 9],
 [('TGT', 2), 18],
 [('TGG', 2), 13],
 [('GCT'

In [65]:
twelves = [[k, v] for k, v in counts.items() if len(k[0]) * k[1] == 12]
twelves

[[('CCC', 4), 2],
 [('CGCG', 3), 20],
 [('CCCC', 3), 2],
 [('GCCGGC', 2), 5],
 [('CCGGCC', 2), 2],
 [('CGCCGC', 2), 9],
 [('CGGCGG', 2), 4],
 [('GGCGGC', 2), 5],
 [('GCGGCG', 2), 3],
 [('CGCGCG', 2), 17],
 [('CCGCCG', 2), 3],
 [('CATGCG', 2), 1],
 [('CGTTGT', 2), 1],
 [('GTTGTC', 2), 1],
 [('TTGTCG', 2), 1],
 [('TGTCGT', 2), 1],
 [('GACGTT', 2), 1],
 [('GCGC', 3), 18],
 [('GCGCGC', 2), 15],
 [('GATGAT', 2), 2],
 [('GCGCGT', 2), 1],
 [('CGCGTG', 2), 1],
 [('AAA', 4), 6],
 [('GGG', 4), 3],
 [('AAAA', 3), 4],
 [('GGGG', 3), 3],
 [('CGACGC', 2), 1],
 [('GCCGGG', 2), 1],
 [('CCGGGC', 2), 1],
 [('CCGCGC', 2), 1],
 [('CCTGGC', 2), 1],
 [('GGCCGT', 2), 1],
 [('GCTGCT', 2), 4],
 [('CTGCTG', 2), 5],
 [('TGCTGC', 2), 5],
 [('CGTACA', 2), 1],
 [('AGCAGC', 2), 5],
 [('GGGGCG', 2), 1],
 [('GGCGCG', 2), 1],
 [('CTTCGC', 2), 1],
 [('GACGAC', 2), 3],
 [('AAAAAA', 2), 2],
 [('TGGGGC', 2), 1],
 [('CGAGCC', 2), 1],
 [('CATCAT', 2), 1],
 [('GTGACG', 2), 1],
 [('CGGCCG', 2), 3],
 [('GGCCGG', 2), 1],
 [('GAT