In [2]:
import re
import pandas as pd 
import numpy as np 
from aerobot.io import RESULTS_PATH
from aerobot.dataset import dataset_load_feature_order
from aerobot.kmer import kmer_sequence_to_kmers
import os 
from tqdm import tqdm

The Black Sea genome data is in the form of nucleotide contigs. I ran Prodigal on the raw nucleotide data (using `prodigal -i bs_contigs.fna -o bs_contigs.gbk -a bs_contigs.faa`), which produced a FASTA file of predicted amino acid sequences. Each header string contains a unique ID, e.g. GCA_014383885.1.1_3, where the first number following the genome ID (GCA_014383885.1) is the index of the contig from which the predicted amino acid sequence was produced. 

The goal is to re-train an amino acid trimer-based classifier using only the amino acids from contigs. I can then compare the results to the performance of the whole-genome classifier by mapping the predictions from each contig to the predictions of the genome to which the contig belongs. 

In [3]:
# Directory under RESULTS_PATH that holds the Black Sea files read in this notebook.
BS_PATH = os.path.join(RESULTS_PATH, 'black_sea')

In [4]:
# Feature order for the aa_3mer feature type; used below to select and order the k-mer columns.
aa_3mer_features = dataset_load_feature_order('aa_3mer')

# Load in the contig k-mers, which were generated by running the count-kmers.py script on HPC. 
# The table is expected to have a contig_id column, which later cells use to recover the genome ID.
kmer_df = pd.read_csv(os.path.join(BS_PATH, 'bs_aa_3mer_from_contigs.csv'))

In [14]:
# Report the number of rows in the contig k-mer table (presumably one row per contig -- TODO confirm).
print(f'K-mer counts generated for {len(kmer_df)} contigs.')

K-mer counts generated for 54544 contigs.


In [5]:
# The k-mer counts seem pretty sparse. In order to make sure everything worked correctly, I am going to merge the contig k-mer counts
# by genome, and compare them to the k-mer counts in the bs_aa_3mer.csv file (which Josh generated).

def get_genome_id(contig_id:str) -> str:
    '''Get the genome ID from the contig ID, which is of the form {genome_id}.{contig_index}.

    :param contig_id: A contig identifier containing a versioned GCA accession followed by a
        contig index, e.g. GCA_014383885.1.1. Trailing text after the contig index is ignored.
    :return: The genome ID portion of the identifier, e.g. GCA_014383885.1.
    :raises ValueError: If contig_id does not contain a GCA accession followed by a contig index.
    '''
    # Use a raw string so that \d and \. are passed to the regex engine unchanged, rather than
    # being interpreted as (invalid) string escape sequences.
    match = re.search(r'(GCA_\d+\.\d+)\.\d+', contig_id)
    if match is None:
        # Fail with a descriptive message instead of an opaque AttributeError on None.
        raise ValueError(f'Could not extract a genome ID from contig ID {contig_id!r}.')
    return match.group(1) # Extract the genome ID from the pattern and return. 

# Tag each contig row with the genome it came from...
kmer_df['genome_id'] = kmer_df['contig_id'].map(get_genome_id)
# ...then drop the contig label and add up the remaining columns within each genome.
kmer_df_by_genome = (
    kmer_df
    .drop(columns='contig_id')
    .groupby('genome_id')
    .sum()
)

In [9]:
# Load in the original genome-level k-mer counts (bs_aa_3mer.csv), using the first CSV column as the row index. 
kmer_df_by_genome_ref = pd.read_csv(os.path.join(BS_PATH, 'bs_aa_3mer.csv'), index_col=0)

In [11]:
# Restrict both tables to the aa_3mer feature columns, in the shared feature order.
kmer_df_by_genome, kmer_df_by_genome_ref = (
    counts[aa_3mer_features] for counts in (kmer_df_by_genome, kmer_df_by_genome_ref)
)
# Align the two tables on their row index so that they can be compared row by row.
kmer_df_by_genome, kmer_df_by_genome_ref = kmer_df_by_genome.align(kmer_df_by_genome_ref, axis='index')

In [12]:
# Preview the genome-level counts obtained by summing the contig-level counts.
kmer_df_by_genome.head()

Unnamed: 0,TTA,TAT,ATG,TGG,GGT,GTW,TWS,WSI,SID,IDL,...,QCC,MYC,CQC,NCC,MYW,MWC,CMC,CNW,CCM,CWC
GCA_014381205.1,117.0,143.0,221.0,222.0,216.0,30.0,31.0,18.0,102.0,160.0,...,2.0,3.0,0.0,0.0,1.0,3.0,1.0,2.0,1.0,0.0
GCA_014381215.1,125.0,141.0,171.0,191.0,218.0,49.0,35.0,30.0,156.0,230.0,...,1.0,3.0,0.0,5.0,11.0,0.0,2.0,3.0,2.0,2.0
GCA_014381225.1,128.0,146.0,302.0,220.0,241.0,21.0,11.0,13.0,95.0,195.0,...,2.0,3.0,2.0,0.0,1.0,0.0,0.0,2.0,0.0,1.0
GCA_014381265.1,91.0,93.0,86.0,109.0,99.0,28.0,18.0,24.0,205.0,236.0,...,4.0,3.0,1.0,5.0,2.0,0.0,2.0,8.0,1.0,0.0
GCA_014381285.1,107.0,128.0,118.0,134.0,153.0,12.0,18.0,19.0,196.0,228.0,...,1.0,2.0,2.0,6.0,7.0,4.0,1.0,2.0,2.0,0.0


In [13]:
# Counts in the reference and contig-based k-mer DataFrames seem very similar, though not identical. I am pretty sure Prodigal is 
# deterministic, so perhaps Josh used a different set of contigs, or merged the contig sequences before counting k-mers. I 
# will note that it seems as though the counts in the reference are consistently higher than the counts I generated, so that
# explanation is plausible. 
# NOTE(review): in the rows shown by head(), the reference is not higher for every genome (e.g. GCA_014381265.1),
# so the "consistently higher" observation should be confirmed over the full table.
kmer_df_by_genome_ref.head()

Unnamed: 0,TTA,TAT,ATG,TGG,GGT,GTW,TWS,WSI,SID,IDL,...,QCC,MYC,CQC,NCC,MYW,MWC,CMC,CNW,CCM,CWC
GCA_014381205.1,135.0,162.0,215.0,217.0,213.0,37.0,34.0,19.0,102.0,173.0,...,2.0,4.0,0.0,1.0,1.0,2.0,1.0,1.0,2.0,0.0
GCA_014381215.1,124.0,141.0,170.0,193.0,216.0,47.0,36.0,30.0,154.0,223.0,...,1.0,3.0,0.0,6.0,11.0,0.0,2.0,3.0,2.0,2.0
GCA_014381225.1,129.0,151.0,294.0,219.0,239.0,21.0,12.0,14.0,95.0,193.0,...,2.0,3.0,2.0,0.0,2.0,0.0,1.0,2.0,0.0,1.0
GCA_014381265.1,88.0,91.0,83.0,103.0,98.0,28.0,18.0,24.0,199.0,231.0,...,3.0,3.0,1.0,5.0,2.0,0.0,2.0,8.0,0.0,0.0
GCA_014381285.1,106.0,120.0,117.0,129.0,149.0,12.0,17.0,19.0,191.0,221.0,...,0.0,2.0,2.0,6.0,7.0,4.0,1.0,2.0,1.0,0.0


We have trained amino acid trimer-based models (both logistic and nonlinear) saved in the notebooks directory. These can be used to generate predictions for the contig trimers. Predictions needed to be generated on the HPC (they require too much memory to run locally). They were produced using the commands below. 

In [None]:
# Commands used to train and save the aa_3mer nonlinear and logistic models. They are kept here as a record
# and are commented out, so they are not executed by this notebook.
# ! python ../scripts/run.py 'nonlinear' --feature-type 'aa_3mer' --save-model 1 --n-epochs 500 --save-model-path 'aa_3mer_nonlinear_model.joblib'
# ! python ../scripts/run.py 'logistic' --feature-type 'aa_3mer' --save-model 1 --max-iter 10000 --save-model-path 'aa_3mer_logistic_model.joblib'

In [15]:
# TODO: Create a models directory to store pre-trained models for better organization. 

# Commands used to generate predictions for the contig k-mer counts with the pre-trained models (commented out; 
# not executed by this notebook).
# NOTE(review): the model files passed via -m (aa_3mer_logistic.joblib, aa_3mer_nonlinear.joblib) do not match the
# --save-model-path names used in the training commands (aa_3mer_logistic_model.joblib, aa_3mer_nonlinear_model.joblib).
# Confirm which filenames are correct.
# ! python ../scripts/run-pretrained.py -i ../../results/black_sea/bs_aa_3mer_from_contigs.csv -o ../../results/black_sea/bs_run_pretrained_results_logistic_from_contigs.csv -m aa_3mer_logistic.joblib
# ! python ../scripts/run-pretrained.py -i ../../results/black_sea/bs_aa_3mer_from_contigs.csv -o ../../results/black_sea/bs_run_pretrained_results_nonlinear_from_contigs.csv -m aa_3mer_nonlinear.joblib

0 aa_3mer features are missing from the input data. Filled missing data with 0.
