# Counting nucleotides

## Test code

Define a function to test the implementations.

In [1]:
def test_nucl_count_funcs(func):
    test_values = [
        ('AACAATG', {'A': 4, 'C': 1, 'G': 1, 'T': 1, 'invalid': 0}),
        ('',        {'A': 0, 'C': 0, 'G': 0, 'T': 0, 'invalid': 0}),
        ('C',       {'A': 0, 'C': 1, 'G': 0, 'T': 0, 'invalid': 0}),
        ('AQTTAC',  {'A': 2, 'C': 1, 'G': 0, 'T': 2, 'invalid': 1}),
    ]
    for dna_str, target_counts in test_values:
        counts = func(dna_str)
        if counts != target_counts:
            print(f'error for {dna_str}, expecting {str(target_counts)}, got {str(counts)}')

## Implementations

### Four variables

In [2]:
def count_nucl_4_vars(dna_str):
    nr_A = nr_C = nr_G = nr_T = nr_invalids = 0
    for nucl in dna_str:
        if nucl == 'A':
            nr_A += 1
        elif nucl == 'C':
            nr_C += 1
        elif nucl == 'G':
            nr_G += 1
        elif nucl == 'T':
            nr_T += 1
        else:
            nr_invalids += 1
    return {'A': nr_A, 'C': nr_C, 'G': nr_G, 'T': nr_T, 'invalid': nr_invalids}

In [3]:
test_nucl_count_funcs(count_nucl_4_vars)

### Standard `dict`

In [4]:
def count_nucl_dict(dna_str):
    counts = {
        'A': 0,
        'C': 0,
        'T': 0,
        'G': 0,
        'invalid': 0,
    }
    for nucl in dna_str:
        if nucl in 'ACGT':
            counts[nucl] += 1
        else:
            counts['invalid'] += 1
    return counts

In [5]:
test_nucl_count_funcs(count_nucl_dict)

### `collentions.Counter`

In [6]:
def count_nucl_counter(dna_str):
    from collections import Counter
    counts = Counter()
    for key in ['A', 'C', 'G', 'T', 'invalid']:
        counts[key] = 0
    for nucl in dna_str:
        if nucl in 'ACGT':
            counts[nucl] += 1
        else:
            counts['invalid'] += 1
    return counts

In [7]:
test_nucl_count_funcs(count_nucl_counter)

### `str.count`

In [8]:
def count_nucl_str_count(dna_str):
    counts = dict()
    for key in ['A', 'C', 'G', 'T', 'invalid']:
        counts[key] = 0
    for nucl in 'ACGT':
        counts[nucl] = dna_str.count(nucl)
    counts['invalid'] = len(dna_str) - sum(counts.values())
    return counts

In [9]:
test_nucl_count_funcs(count_nucl_str_count)

## Speed comparison

In [10]:
import random

In [11]:
dna_str = random.choices('ACGT', k=10_000_000)

In [12]:
%timeit count_nucl_4_vars(dna_str)

1.04 s ± 90.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [13]:
%timeit count_nucl_dict(dna_str)

1.13 s ± 95.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [14]:
%timeit count_nucl_counter(dna_str)

2.64 s ± 230 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [15]:
%timeit count_nucl_str_count(dna_str)

698 ms ± 147 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [16]:
dna_str = random.choices('ACGT', k=100_000)

In [17]:
%timeit count_nucl_4_vars(dna_str)

21.6 ms ± 10.1 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [18]:
%timeit count_nucl_dict(dna_str)

14.8 ms ± 6.71 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [19]:
%timeit count_nucl_counter(dna_str)

25.9 ms ± 2.76 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [20]:
%timeit count_nucl_str_count(dna_str)

9.4 ms ± 5.46 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
