In [8]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [16]:
import itertools
import os
import glob

import pandas as pd
from joblib import Parallel, delayed
import khmer
import screed

from tqdm import tqdm

import numpy as np
from orpheum.index import (
    maybe_make_peptide_bloom_filter,
    make_peptide_bloom_filter,
    maybe_save_peptide_bloom_filter,
    ALPHABET_SIZES,
)

from path_constants import MAMMALIA_BUSCO_SUBSET_FOLDER

In [23]:
filenames = glob.glob(os.path.join(MAMMALIA_BUSCO_SUBSET_FOLDER, '*.fasta'))
assert len(filenames) == 18

In [24]:
fastas = {os.path.basename(x).split('__')[-1].split('.fasta')[0]: x for x in filenames}
fastas

{'homo_sapiens': '/Users/olgabot/Downloads/00-orpheum-benchmarking/00-external-data/mammalia_busco_subsets/9606__homo_sapiens.fasta',
 'ornithorhynchus_anatinus': '/Users/olgabot/Downloads/00-orpheum-benchmarking/00-external-data/mammalia_busco_subsets/9258__ornithorhynchus_anatinus.fasta',
 'nannospalax_galili': '/Users/olgabot/Downloads/00-orpheum-benchmarking/00-external-data/mammalia_busco_subsets/1026970__nannospalax_galili.fasta',
 'mus_musculus': '/Users/olgabot/Downloads/00-orpheum-benchmarking/00-external-data/mammalia_busco_subsets/10090__mus_musculus.fasta',
 'erinaceus_europaeus': '/Users/olgabot/Downloads/00-orpheum-benchmarking/00-external-data/mammalia_busco_subsets/9365__erinaceus_europaeus.fasta',
 'phascolarctos_cinereus': '/Users/olgabot/Downloads/00-orpheum-benchmarking/00-external-data/mammalia_busco_subsets/38626__phascolarctos_cinereus.fasta',
 'rhinolophus_sinicus': '/Users/olgabot/Downloads/00-orpheum-benchmarking/00-external-data/mammalia_busco_subsets/89399__

In [25]:
lines = []


def create_index_and_compute_stats(name, fasta, alphabet, ksize, tablesize=int(1e8)):
    """Create peptide bloom filter (index) and compute k-mer complexity"""
    index = make_peptide_bloom_filter(fasta, ksize, alphabet, tablesize=tablesize)
    expected_collisions = khmer.calc_expected_collisions(index)
    maybe_save_peptide_bloom_filter(fasta, index, alphabet, save_peptide_bloom_filter=True)

    n_unique_kmers = index.n_unique_kmers()
    sigma = ALPHABET_SIZES[alphabet]
    line = [name, alphabet, ksize, sigma, expected_collisions, n_unique_kmers]
    del index
    return line

In [26]:
N_JOBS = 8


def format_index_stats(lines):
    columns = [
        "name",
        "molecule",
        "ksize",
        "sigma",
        "expected_collisions",
        "n_unique_kmers",
    ]
    df = pd.DataFrame(lines, columns=columns)
    df["n_theoretical_kmers_log10"] = df["ksize"] * np.log10(df["sigma"])
    df["n_unique_kmers_log10"] = np.log10(df["n_unique_kmers"])
    df["unique_over_theoretical_log10"] = (
        df["n_unique_kmers_log10"] - df["n_theoretical_kmers_log10"]
    )
    return df

In [27]:
dfs = []

alpha_ksizes = (("dayhoff6", 17), ("protein20", 8))

for name, fasta in fastas.items():
    print(f"name: {name}")
    #     iterator = itertools.product(['protein20', 'dayhoff6', 'hp2'], range(2, 51))

    #     line = create_index_and_compute_stats(name, fasta, 'dayhoff6', 17)
    this_fasta_lines = Parallel(n_jobs=N_JOBS, verbose=True)(
        delayed(create_index_and_compute_stats)(name, fasta, alphabet, ksize)
        for alphabet, ksize in alpha_ksizes
    )
    df = format_index_stats(this_fasta_lines)
    csv = os.path.join(
        MAMMALIA_BUSCO_SUBSET_FOLDER, f"busco_mammalia_kmer_complexity__{name}.csv"
    )
    df.to_csv(csv, index=False)
    dfs.append(df)

index_stats = pd.concat(dfs)
# index_stats['n_theoretical_kmers'] = np.power(index_stats.sigma, index_stats.ksize)
# index_stats['unique_over_theoretical'] = index_stats.n_unique_kmers / index_stats.n_theoretical_kmers
# index_stats['unique_over_theoretical_log10'] = -1 * np.log10(index_stats['unique_over_theoretical'])
csv = os.path.join(
    MAMMALIA_BUSCO_SUBSET_FOLDER, "busco_mammalia_kmer_complexity_all_species.csv"
)

index_stats.to_csv(csv, index=False)
print(index_stats.shape)
index_stats.head()

name: homo_sapiens


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
15066it [00:12, 1253.51it/s][Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:   16.1s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:   16.1s finished


name: ornithorhynchus_anatinus


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.

0it [00:00, ?it/s]
1265it [00:00, 1421.85it/s][Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    1.2s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    1.2s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.

3it [00:00, 1624.23it/s]
3it [00:00, 1744.24it/s]

name: nannospalax_galili


[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.3s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.

0it [00:00, ?it/s]
123it [00:00, 1192.81it/s]

name: mus_musculus


14519it [00:11, 1261.01it/s][Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:   11.8s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:   11.8s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.

0it [00:00, ?it/s]
146it [00:00, 1454.37it/s]

name: erinaceus_europaeus


13448it [00:11, 1215.19it/s][Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:   11.3s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:   11.3s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.

0it [00:00, ?it/s]
6it [00:00, 1169.14it/s]

name: phascolarctos_cinereus


[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.3s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.

1it [00:00, 710.78it/s]
1it [00:00, 792.42it/s]

name: rhinolophus_sinicus


[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.3s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.

0it [00:00, ?it/s]
8it [00:00, 2578.93it/s]

name: lipotes_vexillifer


[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.3s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.

0it [00:00, ?it/s]
213it [00:00, 1893.71it/s]

name: capra_hircus


[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.4s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.

0it [00:00, ?it/s]
123it [00:00, 1221.80it/s]

name: aotus_nancymaae


7261it [00:05, 1287.49it/s][Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    5.9s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    5.9s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.

0it [00:00, ?it/s]
133it [00:00, 1323.25it/s]

name: oryctolagus_cuniculus


4424it [00:03, 1351.15it/s][Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    3.5s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    3.5s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.

1it [00:00, 1392.99it/s]
1it [00:00, 1508.20it/s]

name: peromyscus_maniculatus_bairdii


[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.3s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.

0it [00:00, ?it/s]
176it [00:00, 1738.27it/s]

name: tupaia_chinensis


1338it [00:00, 1882.59it/s][Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    1.0s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    1.0s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.

0it [00:00, ?it/s]
149it [00:00, 1489.28it/s]

name: macaca_mulatta


5008it [00:03, 1346.12it/s][Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    4.0s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    4.0s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.

4it [00:00, 1671.70it/s]
4it [00:00, 1481.56it/s]

name: camelus_bactrianus


[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.3s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.

0it [00:00, ?it/s]
36it [00:00, 1654.41it/s]

name: sorex_araneus


[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.3s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.

0it [00:00, ?it/s]
3it [00:00, 2066.50it/s]

name: chinchilla_lanigera


[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.3s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.

1it [00:00, 3104.59it/s]
1it [00:00, 3463.50it/s]

name: ceratotherium_simum_simum
(36, 9)


[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.3s finished


Unnamed: 0,name,molecule,ksize,sigma,expected_collisions,n_unique_kmers,n_theoretical_kmers_log10,n_unique_kmers_log10,unique_over_theoretical_log10
0,homo_sapiens,dayhoff6,17,6,7.852379e-06,5439082,13.228571,6.735526,-6.493046
1,homo_sapiens,protein20,8,20,8.194001e-06,5498718,10.40824,6.740261,-3.667979
0,ornithorhynchus_anatinus,dayhoff6,17,6,1.73045e-09,647083,13.228571,5.81096,-7.417611
1,ornithorhynchus_anatinus,protein20,8,20,1.852019e-09,658167,10.40824,5.818336,-4.589904
0,nannospalax_galili,dayhoff6,17,6,1.925671e-20,1178,13.228571,3.071145,-10.157426


## See all per-species k-mer complexity files

In [28]:
ls -lha $MAMMALIA_BUSCO_SUBSET_FOLDER/*csv

-rw-r--r-- 1 olgabot  370 Jul 22 12:25 /Users/olgabot/Downloads/00-orpheum-benchmarking/00-external-data/mammalia_busco_subsets/busco_mammalia_kmer_complexity__aotus_nancymaae.csv
-rw-r--r-- 1 olgabot  370 Jul 22 12:25 /Users/olgabot/Downloads/00-orpheum-benchmarking/00-external-data/mammalia_busco_subsets/busco_mammalia_kmer_complexity__camelus_bactrianus.csv
-rw-r--r-- 1 olgabot  362 Jul 22 12:25 /Users/olgabot/Downloads/00-orpheum-benchmarking/00-external-data/mammalia_busco_subsets/busco_mammalia_kmer_complexity__capra_hircus.csv
-rw-r--r-- 1 olgabot  382 Jul 22 12:25 /Users/olgabot/Downloads/00-orpheum-benchmarking/00-external-data/mammalia_busco_subsets/busco_mammalia_kmer_complexity__ceratotherium_simum_simum.csv
-rw-r--r-- 1 olgabot  373 Jul 22 12:25 /Users/olgabot/Downloads/00-orpheum-benchmarking/00-external-data/mammalia_busco_subsets/busco_mammalia_kmer_complexity__chinchilla_lanigera.csv
-rw-r--r-- 1 olgabot  377 Jul 22 12:25 /Users/olgabot/Downloads/00-orpheum-benchmarkin