In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import itertools
import os
import glob

import pandas as pd
from joblib import Parallel, delayed
import khmer
import screed

from tqdm import tqdm

import numpy as np
from orpheum.index import (
    maybe_make_peptide_bloom_filter,
    make_peptide_bloom_filter,
    maybe_save_peptide_bloom_filter,
    ALPHABET_SIZES,
)



In [3]:
from path_constants import MAMMALIA_BUSCO_SUBSET_FOLDER

In [4]:
filenames = glob.glob(os.path.join(MAMMALIA_BUSCO_SUBSET_FOLDER, '*.fasta'))
filenames

['../kmer-homology-data/01--processed-data/orpheum-benchmarking/mammalia_busco_subsets/37293__aotus_nancymaae.fasta',
 '../kmer-homology-data/01--processed-data/orpheum-benchmarking/mammalia_busco_subsets/246437__tupaia_chinensis.fasta',
 '../kmer-homology-data/01--processed-data/orpheum-benchmarking/mammalia_busco_subsets/38626__phascolarctos_cinereus.fasta',
 '../kmer-homology-data/01--processed-data/orpheum-benchmarking/mammalia_busco_subsets/9365__erinaceus_europaeus.fasta',
 '../kmer-homology-data/01--processed-data/orpheum-benchmarking/mammalia_busco_subsets/73337__ceratotherium_simum_simum.fasta',
 '../kmer-homology-data/01--processed-data/orpheum-benchmarking/mammalia_busco_subsets/9837__camelus_bactrianus.fasta',
 '../kmer-homology-data/01--processed-data/orpheum-benchmarking/mammalia_busco_subsets/34839__chinchilla_lanigera.fasta',
 '../kmer-homology-data/01--processed-data/orpheum-benchmarking/mammalia_busco_subsets/1026970__nannospalax_galili.fasta',
 '../kmer-homology-data

In [5]:
fastas = {os.path.basename(x).split('__')[-1].split('.fasta')[0]: x for x in filenames}
fastas

{'aotus_nancymaae': '../kmer-homology-data/01--processed-data/orpheum-benchmarking/mammalia_busco_subsets/37293__aotus_nancymaae.fasta',
 'tupaia_chinensis': '../kmer-homology-data/01--processed-data/orpheum-benchmarking/mammalia_busco_subsets/246437__tupaia_chinensis.fasta',
 'phascolarctos_cinereus': '../kmer-homology-data/01--processed-data/orpheum-benchmarking/mammalia_busco_subsets/38626__phascolarctos_cinereus.fasta',
 'erinaceus_europaeus': '../kmer-homology-data/01--processed-data/orpheum-benchmarking/mammalia_busco_subsets/9365__erinaceus_europaeus.fasta',
 'ceratotherium_simum_simum': '../kmer-homology-data/01--processed-data/orpheum-benchmarking/mammalia_busco_subsets/73337__ceratotherium_simum_simum.fasta',
 'camelus_bactrianus': '../kmer-homology-data/01--processed-data/orpheum-benchmarking/mammalia_busco_subsets/9837__camelus_bactrianus.fasta',
 'chinchilla_lanigera': '../kmer-homology-data/01--processed-data/orpheum-benchmarking/mammalia_busco_subsets/34839__chinchilla_l

In [6]:
lines = []


def create_index_and_compute_stats(name, fasta, alphabet, ksize, tablesize=int(1e8)):
    """Create peptide bloom filter (index) and compute k-mer complexity"""
    index = make_peptide_bloom_filter(fasta, ksize, alphabet, tablesize=tablesize)
    expected_collisions = khmer.calc_expected_collisions(index)
    maybe_save_peptide_bloom_filter(fasta, index, alphabet, save_peptide_bloom_filter=True)

    n_unique_kmers = index.n_unique_kmers()
    sigma = ALPHABET_SIZES[alphabet]
    line = [name, alphabet, ksize, sigma, expected_collisions, n_unique_kmers]
    del index
    return line

In [7]:
N_JOBS = 32


def format_index_stats(lines):
    columns = [
        "name",
        "molecule",
        "ksize",
        "sigma",
        "expected_collisions",
        "n_unique_kmers",
    ]
    df = pd.DataFrame(lines, columns=columns)
    df["n_theoretical_kmers_log10"] = df["ksize"] * np.log10(df["sigma"])
    df["n_unique_kmers_log10"] = np.log10(df["n_unique_kmers"])
    df["unique_over_theoretical_log10"] = (
        df["n_unique_kmers_log10"] - df["n_theoretical_kmers_log10"]
    )
    return df

In [None]:
dfs = []

alpha_ksizes = (("dayhoff6", 17), ("protein20", 8))

for name, fasta in fastas.items():
    print(f"name: {name}")
    #     iterator = itertools.product(['protein20', 'dayhoff6', 'hp2'], range(2, 51))

    #     line = create_index_and_compute_stats(name, fasta, 'dayhoff6', 17)
    this_fasta_lines = Parallel(n_jobs=2, verbose=True)(
        delayed(create_index_and_compute_stats)(name, fasta, alphabet, ksize)
        for alphabet, ksize in alpha_ksizes
    )
    df = format_index_stats(this_fasta_lines)
    csv = os.path.join(
        MAMMALIA_BUSCO_SUBSET_FOLDER, f"busco_mammalia_kmer_complexity__{name}.csv"
    )
    df.to_csv(csv, index=False)
    dfs.append(df)

index_stats = pd.concat(dfs)
# index_stats['n_theoretical_kmers'] = np.power(index_stats.sigma, index_stats.ksize)
# index_stats['unique_over_theoretical'] = index_stats.n_unique_kmers / index_stats.n_theoretical_kmers
# index_stats['unique_over_theoretical_log10'] = -1 * np.log10(index_stats['unique_over_theoretical'])
csv = os.path.join(
    MAMMALIA_BUSCO_SUBSET_FOLDER, "busco_mammalia_kmer_complexity_all_species.csv"
)

index_stats.to_csv(csv, index=False)
print(index_stats.shape)
index_stats.head()

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


name: aotus_nancymaae


[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:   14.1s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:   14.1s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


name: tupaia_chinensis


[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    6.0s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    6.0s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


name: phascolarctos_cinereus


[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    5.2s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    5.2s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


name: erinaceus_europaeus


[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:   20.0s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:   20.0s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


name: ceratotherium_simum_simum


[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    5.0s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    5.0s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


name: camelus_bactrianus


[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    5.1s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    5.1s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


name: chinchilla_lanigera


[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    5.1s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    5.1s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


name: nannospalax_galili


[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    5.0s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    5.0s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


name: peromyscus_maniculatus_bairdii


[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    5.3s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    5.3s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


name: sorex_araneus


[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    5.2s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    5.2s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


name: oryctolagus_cuniculus


[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    4.7s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    4.7s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


name: ornithorhynchus_anatinus


[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    1.4s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    1.4s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.2s finished


name: rhinolophus_sinicus
name: lipotes_vexillifer


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.3s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


name: macaca_mulatta


[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    5.3s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    5.3s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


name: capra_hircus


[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.4s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


name: mus_musculus


[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:   15.8s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:   15.8s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


name: homo_sapiens


In [None]:
ls -lha $MAMMALIA_BUSCO_SUBSET_FOLDER/*csv