In [1]:
%load_ext autoreload
%autoreload 2

In [53]:
import itertools
import os
import glob

import pandas as pd
from joblib import Parallel, delayed
import khmer
import screed

from tqdm import tqdm

import numpy as np
from orpheum.index import maybe_save_peptide_index, make_peptide_index, ALPHABET_SIZES

In [43]:


fasta_folder = '/home/olga/data_lg/czbiohub-reference/uniprot/releases/2019_11/manually_downloaded/mammalia_busco_subsets'

In [44]:
! hostname

tesla


In [45]:
! ls -lha $fasta_folder

total 43M
drwxrwxr-x 2 olga  czb 4.0K Oct 22  2020 .
drwxrwxrwx 3 olga  czb 256K Oct 14  2020 ..
-rw-r--r-- 1 lekha czb 9.8M Oct 22  2020 10090__mus_musculus.fasta
-rw-r--r-- 1 lekha czb 1.5K Oct 22  2020 1026970__nannospalax_galili.fasta
-rw-r--r-- 1 lekha czb 3.1K Oct 22  2020 118797__lipotes_vexillifer.fasta
-rw-r--r-- 1 lekha czb  535 Oct 22  2020 230844__peromyscus_maniculatus_bairdii.fasta
-rw-r--r-- 1 lekha czb 672K Oct 22  2020 246437__tupaia_chinensis.fasta
-rw-r--r-- 1 lekha czb 1.3K Oct 22  2020 34839__chinchilla_lanigera.fasta
-rw-r--r-- 1 lekha czb 4.9M Oct 22  2020 37293__aotus_nancymaae.fasta
-rw-r--r-- 1 lekha czb 4.7K Oct 22  2020 38626__phascolarctos_cinereus.fasta
-rw-r--r-- 1 lekha czb  20K Oct 22  2020 42254__sorex_araneus.fasta
-rw-r--r-- 1 lekha czb  249 Oct 22  2020 73337__ceratotherium_simum_simum.fasta
-rw-r--r-- 1 lekha czb 1009 Oct 22  2020 89399__rhinolophus_sinicus.fasta
-rw-r--r-- 1 lekha czb 794K Oct 22  2020 9258__ornithorhynchus_anatinus.fasta
-rw-r--r

In [46]:
filenames = glob.glob(os.path.join(fasta_folder, '*.fasta'))
filenames

['/home/olga/data_lg/czbiohub-reference/uniprot/releases/2019_11/manually_downloaded/mammalia_busco_subsets/246437__tupaia_chinensis.fasta',
 '/home/olga/data_lg/czbiohub-reference/uniprot/releases/2019_11/manually_downloaded/mammalia_busco_subsets/9837__camelus_bactrianus.fasta',
 '/home/olga/data_lg/czbiohub-reference/uniprot/releases/2019_11/manually_downloaded/mammalia_busco_subsets/10090__mus_musculus.fasta',
 '/home/olga/data_lg/czbiohub-reference/uniprot/releases/2019_11/manually_downloaded/mammalia_busco_subsets/118797__lipotes_vexillifer.fasta',
 '/home/olga/data_lg/czbiohub-reference/uniprot/releases/2019_11/manually_downloaded/mammalia_busco_subsets/9925__capra_hircus.fasta',
 '/home/olga/data_lg/czbiohub-reference/uniprot/releases/2019_11/manually_downloaded/mammalia_busco_subsets/89399__rhinolophus_sinicus.fasta',
 '/home/olga/data_lg/czbiohub-reference/uniprot/releases/2019_11/manually_downloaded/mammalia_busco_subsets/73337__ceratotherium_simum_simum.fasta',
 '/home/olga

In [47]:
fastas = {os.path.basename(x).split('__')[-1].split('.fasta')[0]: x for x in filenames}
fastas

{'tupaia_chinensis': '/home/olga/data_lg/czbiohub-reference/uniprot/releases/2019_11/manually_downloaded/mammalia_busco_subsets/246437__tupaia_chinensis.fasta',
 'camelus_bactrianus': '/home/olga/data_lg/czbiohub-reference/uniprot/releases/2019_11/manually_downloaded/mammalia_busco_subsets/9837__camelus_bactrianus.fasta',
 'mus_musculus': '/home/olga/data_lg/czbiohub-reference/uniprot/releases/2019_11/manually_downloaded/mammalia_busco_subsets/10090__mus_musculus.fasta',
 'lipotes_vexillifer': '/home/olga/data_lg/czbiohub-reference/uniprot/releases/2019_11/manually_downloaded/mammalia_busco_subsets/118797__lipotes_vexillifer.fasta',
 'capra_hircus': '/home/olga/data_lg/czbiohub-reference/uniprot/releases/2019_11/manually_downloaded/mammalia_busco_subsets/9925__capra_hircus.fasta',
 'rhinolophus_sinicus': '/home/olga/data_lg/czbiohub-reference/uniprot/releases/2019_11/manually_downloaded/mammalia_busco_subsets/89399__rhinolophus_sinicus.fasta',
 'ceratotherium_simum_simum': '/home/olga/

In [48]:


lines = []

def create_index_and_compute_stats(name, fasta, alphabet, ksize, tablesize=int(1e8)):
    index = make_peptide_index(fasta, ksize, alphabet, tablesize=tablesize)
    expected_collisions = khmer.calc_expected_collisions(index)
    maybe_save_peptide_index(fasta, index, alphabet, save_peptide_index=True)

    n_unique_kmers = index.n_unique_kmers()
    sigma = ALPHABET_SIZES[alphabet]
    line = [name, alphabet, ksize, sigma, expected_collisions, n_unique_kmers]
    del index
    return line

In [49]:
N_JOBS = 32


def format_index_stats(lines):
    columns = ['name', 'molecule', 'ksize', 'sigma', 'expected_collisions', 'n_unique_kmers']
    df = pd.DataFrame(lines, columns=columns)
    df['n_theoretical_kmers_log10'] = df['ksize'] * np.log10(df['sigma'])
    df['n_unique_kmers_log10'] = np.log10(df['n_unique_kmers'])
    df['unique_over_theoretical_log10'] = df['n_unique_kmers_log10'] - df['n_theoretical_kmers_log10']
    return df

In [61]:
dfs = []

alpha_ksizes = (('dayhoff6', 17), ('protein20', 8))

for name, fasta in fastas.items():
    print(f'name: {name}')
#     iterator = itertools.product(['protein20', 'dayhoff6', 'hp2'], range(2, 51))
    
#     line = create_index_and_compute_stats(name, fasta, 'dayhoff6', 17)
    this_fasta_lines = Parallel(n_jobs=2, verbose=True)(
        delayed(create_index_and_compute_stats)(
            name, fasta, alphabet, ksize) for alphabet, ksize in alpha_ksizes)
    df = format_index_stats(this_fasta_lines)
    df.to_csv(f'busco_mammalia_kmer_complexity__{name}.csv', index=False)
    dfs.append(df)

index_stats = pd.concat(dfs)
# index_stats['n_theoretical_kmers'] = np.power(index_stats.sigma, index_stats.ksize)
# index_stats['unique_over_theoretical'] = index_stats.n_unique_kmers / index_stats.n_theoretical_kmers
# index_stats['unique_over_theoretical_log10'] = -1 * np.log10(index_stats['unique_over_theoretical'])
index_stats.to_csv('busco_mammalia_kmer_complexity_all_species.csv', index=False)
print(index_stats.shape)
index_stats.head()

name: tupaia_chinensis


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    2.4s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    2.4s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


name: camelus_bactrianus


[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.2s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


name: mus_musculus


[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:   10.4s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:   10.4s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


name: lipotes_vexillifer


[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.3s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


name: capra_hircus


[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.3s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


name: rhinolophus_sinicus


[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.2s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.2s finished


name: ceratotherium_simum_simum
name: nannospalax_galili


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.2s finished


name: homo_sapiens


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:   10.6s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:   10.6s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


name: erinaceus_europaeus


[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:   10.2s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:   10.2s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


name: peromyscus_maniculatus_bairdii


[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.2s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


name: macaca_mulatta


[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    3.5s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    3.5s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


name: aotus_nancymaae


[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    5.2s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    5.2s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


name: phascolarctos_cinereus


[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.2s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


name: chinchilla_lanigera


[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.2s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


name: sorex_araneus


[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.2s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


name: oryctolagus_cuniculus


[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    3.1s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    3.1s finished
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


name: ornithorhynchus_anatinus
(36, 9)


[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    1.0s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    1.0s finished


Unnamed: 0,name,molecule,ksize,sigma,expected_collisions,n_unique_kmers,n_theoretical_kmers_log10,n_unique_kmers_log10,unique_over_theoretical_log10
0,tupaia_chinensis,dayhoff6,17,6,6.506573e-10,506371,13.228571,5.704469,-7.524102
1,tupaia_chinensis,protein20,8,20,7.144468e-10,518316,10.40824,5.714595,-4.693645
0,camelus_bactrianus,dayhoff6,17,6,2.747611e-21,724,13.228571,2.859739,-10.368833
1,camelus_bactrianus,protein20,8,20,3.197955e-21,752,10.40824,2.876218,-7.532022
0,mus_musculus,dayhoff6,17,6,7.880357e-06,5443866,13.228571,6.735907,-6.492664


In [58]:
pwd

'/home/olga/code/2020-test-sencha--olgabot/human-qfo-v2/notebooks'

In [60]:
ls -lha *csv

-rw-r--r-- 1 olga czb  370 Apr 27 09:44 busco_mammalia_kmer_complexity__aotus_nancymaae.csv
-rw-r--r-- 1 olga czb  370 Apr 27 09:43 busco_mammalia_kmer_complexity__camelus_bactrianus.csv
-rw-r--r-- 1 olga czb  362 Apr 27 09:43 busco_mammalia_kmer_complexity__capra_hircus.csv
-rw-r--r-- 1 olga czb  382 Apr 27 09:43 busco_mammalia_kmer_complexity__ceratotherium_simum_simum.csv
-rw-r--r-- 1 olga czb  373 Apr 27 09:44 busco_mammalia_kmer_complexity__chinchilla_lanigera.csv
-rw-r--r-- 1 olga czb 4.3K Apr 27 09:44 busco_mammalia_kmer_complexity.csv
-rw-r--r-- 1 olga czb  377 Apr 27 09:43 busco_mammalia_kmer_complexity__erinaceus_europaeus.csv
-rw-r--r-- 1 olga czb  363 Apr 27 09:43 busco_mammalia_kmer_complexity__homo_sapiens.csv
-rw-r--r-- 1 olga czb  373 Apr 27 09:43 busco_mammalia_kmer_complexity__lipotes_vexillifer.csv
-rw-r--r-- 1 olga czb  370 Apr 27 09:43 busco_mammalia_kmer_complexity__macaca_mulatta.csv
-rw-r--r-- 1 olga czb  365 Apr 27 09:43 busco_mammalia_kmer_complexity__mus_musc