In [1]:
import re
import pandas as pd 
import numpy as np 
from aerobot.io import RESULTS_PATH
from aerobot.dataset import dataset_load_feature_order
from aerobot.kmer import kmer_sequence_to_kmers
import os 
from tqdm import tqdm

The Black Sea genome data is in the form of nucleotide contigs. I ran Prodigal on the raw nucleotide data (using `prodigal -i bs_contigs.fna -o bs_contigs.gbk -a bs_contigs.faa`), which produced a FASTA file of predicted amino acid sequences. Each header string contains a unique ID, e.g. GCA_014383885.1.1_3, where the first number following the genome ID (GCA_014383885.1) identifies the contig from which the predicted amino acid sequence was produced, and the number after the underscore is the index of the predicted gene on that contig.

The goal is to re-train an amino acid trimer-based classifier using only the amino acids from contigs. I can then compare the results to the performance of the whole-genome classifier by mapping the predictions from each contig to the predictions of the genome to which the contig belongs. 

In [2]:
# Directory under RESULTS_PATH for the Black Sea data; bs_contigs.faa is read from here below.
BS_PATH = os.path.join(RESULTS_PATH, 'black_sea')

In [4]:
# Amino acid 3-mer feature names, in the order returned by dataset_load_feature_order.
# Used below to select and order the k-mer columns of the contig table.
aa_3mer_features = dataset_load_feature_order('aa_3mer')

# Disabled debugging check: prints every feature name that contains '*'.
# NOTE(review): presumably '*' marks a stop codon in the translated sequences — confirm.
# for feature in aa_3mer_features:
#     if '*' in feature:
#         print(feature)

In [5]:
# Build the per-contig k-mer table in a single chain: compute the k-mers for the
# predicted proteins, keep only 'contig_id' plus the 3-mer feature columns (in
# feature order), then index the rows by contig.
# NOTE(review): get_contig_kmers is not imported or defined in the cells shown
# here — confirm it is defined before this cell when run on a fresh kernel.
kmer_df = (
    get_contig_kmers(os.path.join(BS_PATH, 'bs_contigs.faa'))
    .loc[:, ['contig_id'] + aa_3mer_features]
    .set_index('contig_id')
)

get_contig_kmers: 100%|██████████| 389156/389156 [01:03<00:00, 6136.37it/s] 


: 

In [None]:
# TODO: The kernel crashes when the full k-mer table is written to a file in one go, so it needs to be written out in chunks.

Unnamed: 0_level_0,VTP,TPA,PAM,AMF,MFY,FYA,YAD,ADK,DKH,KHR,...,WNH,YCH,GVX,VXX,XXX,XXC,CKW,CMD,CQN,CNW
contig_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GCA_014382875.1.1,1.0,2.0,0.0,0.0,0.0,2.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
GCA_014382875.1.10,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
GCA_014382875.1.11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
GCA_014382875.1.12,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
GCA_014382875.1.13,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GCA_014383885.1.60,2.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
GCA_014383885.1.61,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
GCA_014383885.1.7,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
GCA_014383885.1.8,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
