In [23]:
from math import log, log10, ceil

import numpy as np
import pandas as pd

Total length of all proteins in ENSEMBL 97 human protein data

In [12]:
proteome_size = 4e7
proteome_size

40000000.0

In [14]:
n_best_for_aa20 = 20**7
n_best_for_aa20

1280000000

# For an alphabet size $\Sigma$, what is the appropriate $k$?

## Know that 7 is the best size for proteins, so how does that inform the rest?


$20 ^ 7 = \Sigma ^ k$

$7 log(20) = k log(\Sigma) $

$k = 7 \frac{log(20)}{log(\Sigma)}$

## Or, using number of human protein sequences (40 million, 4e7)


$4 \times 10 ^ 7 = \Sigma ^ k$

$log(4 \times 10 ^ 7) = k  $

$k = \frac{log(4 \times 10 ^ 7)}{log(\Sigma)}$

In [24]:
def get_best_kmer_size(sigma, n_items):
    return int(ceil(log(n_items)/log(sigma)))

get_best_kmer_size(20, n_best_for_aa20)

7

In [15]:
get_best_kmer_size(20, proteome_size)

5.843108934201204

In [29]:
VALID_PEPTIDE_MOLECULES = 'protein', 'peptide', \
                          'protein20', 'peptide20', \
                          'aa20', \
                          'dayhoff', 'dayhoff6', \
                          'botvinnik', 'botvinnik8', \
                          'hydrophobic-polar', 'hp', 'hp2', \
                          'aa9', \
                          'gbmr4', \
                          'sdm12', 'hsdm17'

In [32]:
dict.frnomkeys(VALID_PEPTIDE_MOLECULES)


{'protein': None,
 'peptide': None,
 'protein20': None,
 'peptide20': None,
 'aa20': None,
 'dayhoff': None,
 'dayhoff6': None,
 'botvinnik': None,
 'botvinnik8': None,
 'hydrophobic-polar': None,
 'hp': None,
 'hp2': None,
 'aa9': None,
 'gbmr4': None,
 'sdm12': None,
 'hsdm17': None}

In [38]:
sigmas = {'protein': 20,
 'peptide': 20,
 'protein20': 20,
 'peptide20': 20,
 'aa20': 20,
 'dayhoff': 6,
 'dayhoff6': 6,
 'botvinnik': 8,
 'botvinnik8': 8,
 'hydrophobic-polar': 2,
 'hp': 2,
 'hp2': 2,
 'aa9': 9 ,
 'gbmr4': 4,
 'sdm12': 12,
 'hsdm17': 17}

In [36]:
alphabet_sizes = pd.Series(range(2, 21), name='sigma').to_frame()
alphabet_sizes['from_proteome_size'] = np.ceil(alphabet_sizes['sigma'].map(lambda x: get_best_kmer_size(x, proteome_size)))
alphabet_sizes['from_best_ksize_for_aa20'] = np.ceil(alphabet_sizes['sigma'].map(lambda x: get_best_kmer_size(x, n_best_for_aa20)))
alphabet_sizes['from_best_ksize_for_1e7'] = np.ceil(alphabet_sizes['sigma'].map(lambda x: get_best_kmer_size(x, 1e7)))
alphabet_sizes = alphabet_sizes.astype(int)
alphabet_sizes = alphabet_sizes.set_index('sigma')
alphabet_sizes

Unnamed: 0_level_0,from_proteome_size,from_best_ksize_for_aa20,from_best_ksize_for_1e7
sigma,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,26,31,24
3,16,20,15
4,13,16,12
5,11,14,11
6,10,12,9
7,9,11,9
8,9,11,8
9,8,10,8
10,8,10,7
11,8,9,7


In [42]:
best_alphabet_sizes = {key: get_best_kmer_size(sigma, n_best_for_aa20) for key, sigma in sigmas.items()}
best_alphabet_sizes

{'protein': 7,
 'peptide': 7,
 'protein20': 7,
 'peptide20': 7,
 'aa20': 7,
 'dayhoff': 12,
 'dayhoff6': 12,
 'botvinnik': 11,
 'botvinnik8': 11,
 'hydrophobic-polar': 31,
 'hp': 31,
 'hp2': 31,
 'aa9': 10,
 'gbmr4': 16,
 'sdm12': 9,
 'hsdm17': 8}

In [4]:
log(4e7)

17.50439001207821

In [8]:
7 * log10(4)

4.214419939295737

In [3]:
20**7

1280000000

In [None]:
log()