In [1]:
import os
import pandas as pd
from pathlib import Path



In [29]:
# Initialize the output dataframe
output_data = []


In [30]:
# Base directory containing all antibiotic folders
base_dir = '/mnt/raid1b/kdan_data/Paper/Machine_Learning/results_new'

# List of species to look for
species_list = ['Escherichia', 'Enterococcus', 'Staphylococcus', 'Klebsiella', 
                'Acinetobacter', 'Pseudomonas', 'Enterobacter']


In [31]:
# Iterate through each antibiotic folder
for antibiotic_dir in os.listdir(base_dir):
    models_dir = os.path.join(base_dir, antibiotic_dir, 'models')
    
    # Check if models directory exists
    if os.path.exists(models_dir):
        # Initialize metrics dictionary for this antibiotic
        metrics = {'Antibiotic': antibiotic_dir}
        
        # Find the final metrics file
        final_file = next(Path(models_dir).glob('*kmer_rf_xgboost_output_final.txt'), None)
        
        if final_file:
            with open(final_file, 'r') as f:
                lines = f.readlines()
                # Extract final metrics
                accuracy = float(lines[2].split()[-1])
                for line in lines:
                    if line.strip().startswith('macro avg'):
                        parts = line.split()
                        macro_precision = float(parts[2])
                        macro_recall = float(parts[3])
                        macro_f1 = float(parts[4])
                        break
                metrics.update({
                    f'Accuracy_Final': accuracy,
                    f'Precision_Final': macro_precision,
                    f'Recall_Final': macro_recall,
                    f'F1_Score_Final': macro_f1
                })
        
        # Initialize species metrics with NA
        for species in species_list:
            metrics.update({
                f'Accuracy_{species}': 'NA',
                f'Precision_{species}': 'NA',
                f'Recall_{species}': 'NA',
                f'F1_Score_{species}': 'NA'
            })
        
        # Find species-specific files
        for species_file in Path(models_dir).glob('*_output.txt'):
            for species in species_list:
                if species in str(species_file):
                    with open(species_file, 'r') as f:
                        lines = f.readlines()
                        # Update species metrics
                        accuracy = float(lines[2].split()[-1])
                        for line in lines:
                            if line.strip().startswith('macro avg'):
                                parts = line.split()
                                macro_precision = float(parts[2])
                                macro_recall = float(parts[3])
                                macro_f1 = float(parts[4])
                                break
                        metrics.update({
                            f'Accuracy_{species}': accuracy,
                            f'Precision_{species}': macro_precision,
                            f'Recall_{species}': macro_recall,
                            f'F1_Score_{species}': macro_f1
                        })
        
        output_data.append(metrics)


In [32]:
df = pd.DataFrame(output_data)
df

Unnamed: 0,Antibiotic,Accuracy_Final,Precision_Final,Recall_Final,F1_Score_Final,Accuracy_Escherichia,Precision_Escherichia,Recall_Escherichia,F1_Score_Escherichia,Accuracy_Enterococcus,...,Recall_Acinetobacter,F1_Score_Acinetobacter,Accuracy_Pseudomonas,Precision_Pseudomonas,Recall_Pseudomonas,F1_Score_Pseudomonas,Accuracy_Enterobacter,Precision_Enterobacter,Recall_Enterobacter,F1_Score_Enterobacter
0,cefazolin,0.937153,0.94,0.93,0.93,0.937153,0.94,0.93,0.93,,...,,,,,,,,,,
1,chloramphenicol,0.995542,0.99,0.99,0.99,0.99681,0.99,0.99,0.99,,...,,,,,,,,,,
2,oxacillin,0.987013,0.98,0.98,0.98,,,,,,...,,,,,,,,,,
3,nitrofurantoin,0.933566,0.94,0.93,0.93,0.96,0.48,0.5,0.49,,...,,,,,,,0.75,0.8,0.8,0.75
4,meropenem,0.972773,0.96,0.95,0.95,0.998148,1.0,0.91,0.95,,...,0.95,0.95,0.898649,0.9,0.89,0.9,0.944444,0.94,0.93,0.93
5,ampicillin-sulbactam,0.947887,0.94,0.94,0.94,0.903846,0.86,0.91,0.88,,...,0.94,0.94,,,,,,,,
6,levofloxacin,0.970662,0.95,0.96,0.96,0.982143,0.96,0.96,0.96,,...,0.96,0.95,0.964789,0.95,0.97,0.96,0.906977,0.93,0.9,0.9
7,amoxicillin-clavulanicAcid,0.982317,0.98,0.96,0.97,0.984903,0.98,0.93,0.95,,...,,,,,,,,,,
8,colistin,0.86758,0.81,0.78,0.79,0.960526,0.98,0.81,0.87,,...,0.78,0.84,,,,,,,,
9,tigecycline,0.967033,0.98,0.62,0.69,,,,,,...,,,,,,,,,,


In [33]:
df.to_csv('consolidated_stats_with_species_macroAvg.csv', index=False)

In [7]:
#Test the testing dataset of ciprofloxacin

In [8]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import numpy as np
import pickle
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from scipy.sparse import csr_matrix
from rich.progress import track
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib as mpl
from enum import Enum
import shap
import argparse
from sklearn.model_selection import cross_val_score

print(os.getcwd())
# Save the model. Create the model folder if it does not exist
os.makedirs('./ciprofloxacin/models', exist_ok=True)



/mnt/raid1b/kdan_data/Paper/Machine_Learning/results_new


In [133]:
# -------------------------
# Load the data
# -------------------------
antibiogram = pd.read_csv("/mnt/raid1b/kdan_data/Paper/Data_Acquisition/Antibiograms/QC_antibiotics/used_antibiotics/ciprofloxacin/ciprofloxacin_QCed_metadata.csv")
antibiogram['phenotype'].value_counts()

print(f"Preparing data for antibiotic: {antibiogram['antibiotic'].iloc[0]}")
print("-" * 40)


# Remove antibiograms N phenotype
antibiogram = antibiogram[antibiogram['phenotype'] != 'N']
antibiogram = antibiogram[antibiogram['phenotype'] != 'I']
antibiogram = antibiogram[antibiogram['phenotype'] != 'NS']

antibiogram_unique = antibiogram.drop_duplicates(subset='id')

# Convert 'Element' to binary values
antibiotic_binary = antibiogram_unique['phenotype'].map({'S': 0, 'R': 1})
antibiotic_binary = antibiotic_binary.to_numpy().astype(np.int8)

filename = "/mnt/raid1b/kdan_data/Paper/Data_Acquisition/Assemblies/filtered_assemblies/"
# Open the file and read the lines into a list
files = os.listdir(filename)
assemblies = [os.path.basename(file).split('.fna')[0] for file in files]
# Find the indices of the elements in the sublist
indices = [assemblies.index(element) for element in antibiogram_unique['id']]

#New
# Resetting index for antibiogram_gentamycin_unique
antibiogram_unique.reset_index(drop=True, inplace=True)
antibiogram_unique['organism_name'] = antibiogram_unique['organism_name'].replace('E.coli and', 'Escherichia coli')
indices_antibiograms=antibiogram_unique.index.tolist()

#New
# Extract the genus (first word) from the organism_name column
antibiogram_unique['genus'] = antibiogram_unique['organism_name'].str.split().str[0]

# Find how many from each genus are there
genera_counts = antibiogram_unique['genus'].value_counts()

# Keep the indexes of each found genus
genera_indices = antibiogram_unique.groupby('genus').apply(lambda x: x.index.tolist())

# Report total number of rows
total_rows = antibiogram_unique.shape[0]


# Get the antibiotic from the first row
antibiotic_name = antibiogram_unique.loc[0, 'antibiotic']  # Assuming 'antibiotic' is the column name

# List of genera to include in the output
genera_list = ['Escherichia', 'Klebsiella', 'Acinetobacter', 'Staphylococcus', 'Pseudomonas', 'Enterobacter', 'Enterococcus']

# Example genera_counts data (replace with actual values from your dataset)
genera_counts = antibiogram_unique['genus'].value_counts()  # Replace df with your actual DataFrame

# Dictionary to store the results
summary_data = {
    'Total number': [total_rows],  # Total number of rows in your dataset
    'Antibiotic': [antibiotic_name]  # Add the antibiotic name as a row
}


Preparing data for antibiotic: ciprofloxacin
----------------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  antibiogram_unique['organism_name'] = antibiogram_unique['organism_name'].replace('E.coli and', 'Escherichia coli')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  antibiogram_unique['genus'] = antibiogram_unique['organism_name'].str.split().str[0]


In [3]:
def load_data(kmer_size = 3):
    print(f"Loading data for kmer size {kmer_size}")

    
    # Make the final concatenated dataset 
    X_dna = np.load(f"/mnt/raid1b/kdan_data/Paper/Machine_Learning/build_dataset/data/kmer_{kmer_size}_flatteneddna_.npy", allow_pickle=True)
    X_protein = np.load(f"/mnt/raid1b/kdan_data/Paper/Machine_Learning/build_dataset/data/kmer_{kmer_size}_flattenedprotein_.npy", allow_pickle=True)
    X_rrna = np.load(f"/mnt/raid1b/kdan_data/Paper/Machine_Learning/build_dataset/data/kmer_{kmer_size}_flattenedrrna_.npy", allow_pickle=True)

    X_rrna_subset = X_rrna[indices]
    X_dna_subset = X_dna[indices]
    X_protein_subset = X_protein[indices]
    
    del X_dna, X_protein, X_rrna

    X = np.concatenate((X_dna_subset, X_rrna_subset, X_protein_subset), axis=1)
    X = X.astype(np.int16)
    y = antibiotic_binary

     # Combine y and group labels for stratification
    stratification_labels = [f"{y_class}_{group}" for y_class, group in zip(y, antibiogram_unique['genus'])]
    
    # Perform the 80-20 split with both stratification and group constraints
    train_indices, test_indices = train_test_split(
        range(len(X)), 
        test_size=0.2, 
        random_state=42, 
        stratify=stratification_labels
    )
    
    # Use these indices to filter the original DataFrame
    #train_df = antibiogram_unique.iloc[train_indices]
    #test_df = antibiogram_unique.iloc[test_indices]
    
    # Optionally, you can retrieve specific columns if needed
    X_train, X_test = X[train_indices], X[test_indices]
    y_train, y_test = y[train_indices], y[test_indices]
    
    X_train = csr_matrix(X_train)
    X_test = csr_matrix(X_test)

    del X, y
    del X_dna_subset, X_protein_subset, X_rrna_subset

    return X_train, X_test, y_train, y_test, kmer_size, train_indices, test_indices



In [4]:
X_train, X_test, y_train, y_test, kmer_size, train_indices, test_indices = load_data(3)


Loading data for kmer size 3


In [151]:
with open("/mnt/raid1b/kdan_data/Paper/Machine_Learning/results_new/ciprofloxacin/models/ciprofloxacin_xgboost_kmer_3.pkl", "rb") as f:
    model = pickle.load(f)

In [124]:
len(assemblies)

18916

In [137]:
index = assemblies.index("GCA_006492705.1")
index

16382

In [135]:
antibiogram_unique

Unnamed: 0,X,antibiotic,sign,value,phenotype,measurement,organism,platform,standard,database,id,organism_name,sign_value,mic_value,old_phenotype,genus
0,80554,ciprofloxacin,>=,4.00,R,mg/L,Escherichia coli VREC1450,VITEK 2,EUCAST,BV BRC,ERR2138491,Escherichia coli,>=4,4.00,R,Escherichia
1,80560,ciprofloxacin,>=,4.00,R,mg/L,Escherichia coli strain M50,VITEK 2,CLSI,BV BRC,SRR3588896,Escherichia coli,>=4,4.00,R,Escherichia
2,80562,ciprofloxacin,<=,0.25,S,mg/L,Escherichia coli strain URMC_94,,CLSI,BV BRC,GCA_004566395.1,Escherichia coli,<=0.25,0.25,S,Escherichia
3,80567,ciprofloxacin,>,8.00,R,mg/L,Escherichia coli strain MUGSI_313,,CLSI,BV BRC,SRR4065643,Escherichia coli,>8,16.00,R,Escherichia
4,80591,ciprofloxacin,>=,4.00,R,mg/L,Escherichia coli VREC1254,VITEK 2,EUCAST,BV BRC,GCF_001606545.1,Escherichia coli,>=4,4.00,R,Escherichia
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11162,310781,ciprofloxacin,<=,0.25,S,mg/L,Enterobacter roggenkampii,Vitek,CLSI,NDARO,GCA_017345915.1,Enterobacter roggenkampii,<=0.25,0.25,S,Enterobacter
11163,310797,ciprofloxacin,<=,0.25,S,mg/L,Enterobacter roggenkampii,Vitek,CLSI,NDARO,GCA_017345875.1,Enterobacter roggenkampii,<=0.25,0.25,S,Enterobacter
11164,311061,ciprofloxacin,<=,0.25,S,mg/L,Enterobacter roggenkampii,Vitek,CLSI,NDARO,GCA_030868435.1,Enterobacter roggenkampii,<=0.25,0.25,S,Enterobacter
11165,311082,ciprofloxacin,<=,0.25,S,mg/L,Enterobacter roggenkampii,Vitek,CLSI,NDARO,GCA_030868415.1,Enterobacter roggenkampii,<=0.25,0.25,S,Enterobacter


In [138]:
#antibiogram_unique
row = antibiogram_unique[antibiogram_unique['id'] == 'GCA_006492705.1']
row

Unnamed: 0,X,antibiotic,sign,value,phenotype,measurement,organism,platform,standard,database,id,organism_name,sign_value,mic_value,old_phenotype,genus
8468,50627,ciprofloxacin,>=,4.0,R,mg/L,Acinetobacter baumannii strain MRSN3874,,CLSI,BV BRC,GCA_006492705.1,Acinetobacter baumannii,>=4,4.0,R,Acinetobacter


In [139]:
index = train_indices.index(8468)  # Find index of 8468

print(index)

139


In [140]:
y_train[139]

1

In [141]:
X_GCA_006492705 = X_train[139]

In [15]:
X_dna = np.load(f"/mnt/raid1b/kdan_data/Paper/Machine_Learning/build_dataset/data/kmer_3_flatteneddna_.npy", allow_pickle=True)
X_protein = np.load(f"/mnt/raid1b/kdan_data/Paper/Machine_Learning/build_dataset/data/kmer_3_flattenedprotein_.npy", allow_pickle=True)
X_rrna = np.load(f"/mnt/raid1b/kdan_data/Paper/Machine_Learning/build_dataset/data/kmer_3_flattenedrrna_.npy", allow_pickle=True)

X_rrna_subset = X_rrna[indices]
X_dna_subset = X_dna[indices]
X_protein_subset = X_protein[indices]

del X_dna, X_protein, X_rrna

X = np.concatenate((X_dna_subset, X_rrna_subset, X_protein_subset), axis=1)
X = X.astype(np.int16)
y = antibiotic_binary

In [231]:
kmer_file = "/mnt/raid1b/kdan_data/Paper/test_new_assemblies/kmers_Acinetobacter/kmer_3_flattened_GCA_006492705.1_ASM649270v1_genomic.fna_.npz"
X_npz = np.load(kmer_file, allow_pickle=True)

In [232]:
X_dna = X_npz['all_kmer_counts_dna'].reshape(1, -1)
X_rrna = X_npz['all_kmer_counts_rrna'].reshape(1, -1)
X_protein = X_npz['all_kmer_counts_protein'].reshape(1, -1)
X = np.concatenate((X_dna, X_rrna, X_protein), axis=1)
X

array([[0, 0, 0, ..., 0, 0, 0]])

In [233]:
len(X[0])

717254

In [146]:
X[0][100]

0

In [145]:
dense_GCA_006492705

array([[0, 0, 0, ..., 0, 0, 0]], dtype=int16)

In [147]:
dense_GCA_006492705[0][100]

0

In [241]:
matching_indexes = [i for i in range(len(dense_GCA_006492705[0])) if dense_GCA_006492705[0][i] != X[0][i]]

In [242]:
len(matching_indexes)

5258

In [240]:
len(X[0])

717254

In [225]:
dense_GCA_006492705

array([[0, 0, 0, ..., 0, 0, 0]], dtype=int16)

In [226]:
X_GCA_006492705

<1x717254 sparse matrix of type '<class 'numpy.int16'>'
	with 24012 stored elements in Compressed Sparse Row format>

In [20]:
dense_GCA_006492705 = X_GCA_006492705.toarray()

In [36]:
dense_GCA_006492705[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int16)

In [23]:
len(dense_GCA_006492705[0])

717254

In [43]:
common = set(X[0]) & set(dense_GCA_006492705[0])

In [37]:
X_GCA_006492705

<1x717254 sparse matrix of type '<class 'numpy.int16'>'
	with 24012 stored elements in Compressed Sparse Row format>

In [33]:
len(X[0])

717254

In [218]:
dtest = xgb.DMatrix(X_GCA_006492705)

In [219]:
X_GCA_006492705

<1x717254 sparse matrix of type '<class 'numpy.int16'>'
	with 24012 stored elements in Compressed Sparse Row format>

In [221]:
y_pred= model.predict(dtest)
y_pred

array([1.], dtype=float32)

In [224]:
dtest = xgb.DMatrix(X_GCA_006492705)
y_pred= model.predict(dtest)
y_pred

array([1.], dtype=float32)

In [234]:
dtest = xgb.DMatrix(csr_matrix(X))

In [235]:
y_pred= model.predict(dtest)
y_pred

array([1.], dtype=float32)

### Test the creation of kmers 

In [237]:
import sys
print(sys.version)

3.9.15 (main, Nov 24 2022, 14:31:59) 
[GCC 11.2.0]


In [156]:
import os
import pickle
import os
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import argparse

def build_dataset(kmer_size, assembly, path_dna, path_protein, path_5s, path_16s, path_23s):
    """
    Build the dataset including upstream sequences and matching them with k-mer vocabularies.

    Args:
        kmer_size (int): Size of the k-mers.
        assembly (str): Name of the assembly.
        directory (str): Directory containing upstream sequence files.

    Returns:
        tuple: Contains filtered data lists X, X_dna_chars, and headers_DNA.
    """
    # Get upstream sequences as an additional feature for the model
    upstream = {}
    
    # Get upstream sequences based on the assembly
    #path = os.path.abspath(os.path.join(directory, 'upstream_seqs_final', f'{assembly}.fna'))
    
    with open(path_dna, 'r') as file:
        # Read the fasta file and store a dictionary with the assembly as the key and the sequence as the value
        fasta = {}
        for line in file:
            if line.startswith('>'):
                header = line.strip()
                fasta[header] = ''
            else:
                fasta[header] += line.strip()

        upstream[assembly] = fasta
    print("Got upstream sequences")

        # Get proteins sequences as an additional feature for the model
    proteins = {}
    
    # Get upstream sequences based on the assembly
    #path = os.path.abspath(os.path.join(directory, 'aligned_proteins_final', f'{assembly}.fna'))
    
    with open(path_protein, 'r') as file:
        # Read the fasta file and store a dictionary with the assembly as the key and the sequence as the value
        fasta = {}
        for line in file:
            if line.startswith('>'):
                header = line.strip()
                fasta[header] = ''
            else:
                fasta[header] += line.strip()

        proteins[assembly] = fasta
    print("Got proteins sequences")

        # Get 5s sequences as an additional feature for the model
    rrna_5s = {}
    
    # Get upstream sequences based on the assembly
    #path = os.path.abspath(os.path.join(directory, 'rRNAs/final_5s_seqs', f'{assembly}.fna.txt'))
    
    with open(path_5s, 'r') as file:
        rrna_5s[assembly] = file.read().strip()
    print("Got 5s rRNA sequences")

        # Get 16s sequences as an additional feature for the model
    rrna_16s = {}
    
    # Get upstream sequences based on the assembly
    #path = os.path.abspath(os.path.join(directory, 'rRNAs/final_16s_seqs', f'{assembly}.fna.txt'))
    
    with open(path_16s, 'r') as file:
        rrna_16s[assembly] = file.read().strip()
    print("Got 16s rRNA sequences")

    # Get 23s sequences as an additional feature for the model
    rrna_23s = {}
    
    # Get upstream sequences based on the assembly
    #path = os.path.abspath(os.path.join(directory, 'rRNAs/final_23s_seqs', f'{assembly}.fna.txt'))
    
    with open(path_23s, 'r') as file:
        rrna_23s[assembly] = file.read().strip()
    print("Got 23s rRNA sequences")

    # Prepare the data
    X = []
    count = 0
    kmer_val = kmer_size
    X_dna_chars = []
    headers_DNA = []
    X_protein_chars = []
    headers_PROT = []
    X_rrna_chars = []
    headers_RRNA = []
    sep_features = []

    # Add each upstream sequence of fasta as a separate feature
    for header, sequence in upstream[assembly].items():
        # Replace sequences made entirely of 'X' with empty strings
        if sequence == 'X' * len(sequence):
            sep_features.append('X' * kmer_val)
        else:
            # Clean up sequences and handle unwanted characters
            cleaned_sequence = sequence.split('*')[0].replace('-', '').replace('K', 'G').replace('M', 'A').replace('R', 'G').replace('Y', 'C').replace('S', 'G')
            sep_features.append(cleaned_sequence)
        
        # Collect headers only once if count is 0
        if count == 0:
            headers_DNA.append(header)
        
    X_dna_chars.append(sep_features)
    #print(f"Collected headers: {headers_DNA}")

    sep_features = []
    # Add each protein sequence of fasta as a separate feature
    for header, sequence in proteins[assembly].items():
        if sequence == 'X' * len(sequence):
            sep_features.append('X'*kmer_val)
        else:
            sep_features.append(sequence.split('*')[0].replace('-', '').replace('B', 'D'))
        if count==0:
            headers_PROT.append(header)
    X_protein_chars.append(sep_features)
    # print(count)

    
    sep_features = []
    
    # Get the 5s rRNA sequence
    rna_5s = rrna_5s[assembly]
    # Get the 16s rRNA sequence
    rna_16s = rrna_16s[assembly]
    # Get the 23s rRNA sequence
    rna_23s = rrna_23s[assembly]
    if count==0:
        headers_RRNA.append(['rna_5s', 'rna_16s', 'rna_23s'])
    
    sep_features.append(rna_5s.replace('-', '').replace('R', 'G').replace('Y', 'C').replace('K', 'G').replace('M', 'A').replace('D', 'G'))
    sep_features.append(rna_16s.replace('-', '').replace('R', 'G').replace('Y', 'C').replace('K', 'G').replace('M', 'A').replace('D', 'G'))
    sep_features.append(rna_23s.replace('-', '').replace('R', 'G').replace('Y', 'C').replace('K', 'G').replace('M', 'A').replace('D', 'G'))
    
    X_rrna_chars.append(sep_features)
    
    # Return the prepared data
    return X, X_dna_chars, headers_DNA, X_protein_chars, headers_PROT, X_rrna_chars, headers_RRNA 


def process_kmers(kmer_size, assembly, headers_DNA, X_dna_chars, headers_PROT, X_protein_chars, headers_RRNA, X_rrna_chars):
    """
    Processes k-mers for DNA, protein, and rRNA sequences and returns k-mer counts.

    Parameters:
    kmer_size (int): Size of the k-mers.
    dna_vocab_path (str): File path to the k-mer vocabulary for DNA.
    protein_vocab_path (str): File path to the k-mer vocabulary for protein.
    rrna_vocab_path (str): File path to the k-mer vocabulary for rRNA.
    headers_DNA (list): List of headers for DNA sequences.
    X_dna_chars (list): List of DNA sequences corresponding to the headers.
    headers_PROT (list): List of headers for protein sequences.
    X_protein_chars (list): List of protein sequences corresponding to the headers.
    headers_RRNA (list): List of headers for rRNA sequences.
    X_rrna_chars (list): List of rRNA sequences corresponding to the headers.

    Returns:
    tuple: Tuple containing lists of k-mer counts for DNA, protein, and rRNA.
    """

    # Load the k-mer vocabulary for DNA, proteins, and rRNA
    kmer_vocab_dna = np.load('/mnt/raid1b/kdan_data/Paper/Machine_Learning/build_dataset/data/kmer_'+str(kmer_size)+'_flatteneddna_stats_.npy', allow_pickle=True)
    kmer_vocab_protein = np.load('/mnt/raid1b/kdan_data/Paper/Machine_Learning/build_dataset/data/kmer_'+str(kmer_size)+'_flattenedprotein_stats_.npy', allow_pickle=True)
    kmer_vocab_rrna = np.load('/mnt/raid1b/kdan_data/Paper/Machine_Learning/build_dataset/data/kmer_'+str(kmer_size)+'_flattenedrrna_stats_.npy', allow_pickle=True)


    # Convert the loaded data into DataFrames
    dna_kmer_df = pd.DataFrame(kmer_vocab_dna, columns=['Index', 'Kmer', 'DNA'])
    protein_kmer_df = pd.DataFrame(kmer_vocab_protein, columns=['Index', 'Kmer', 'Protein'])
    rrna_kmer_df = pd.DataFrame(kmer_vocab_rrna, columns=['Index', 'Kmer', 'rRNA'])
    
    # Clean the 'rRNA' column to remove parentheses and commas
    rrna_kmer_df['rRNA'] = rrna_kmer_df['rRNA'].apply(lambda x: x[0])

    # Group k-mers by their associated DNA, protein, and rRNA
    dna_kmer_dict = dna_kmer_df.groupby('DNA')['Kmer'].apply(list).to_dict()
    protein_kmer_dict = protein_kmer_df.groupby('Protein')['Kmer'].apply(list).to_dict()
    rrna_kmer_dict = rrna_kmer_df.groupby('rRNA')['Kmer'].apply(list).to_dict()

    # Initialize lists to hold k-mer counts for DNA, proteins, and rRNA
    all_kmer_counts_dna = []
    all_kmer_counts_protein = []
    all_kmer_counts_rrna = []

    # Process k-mers for DNA sequences
    for dna, kmer_vocab in dna_kmer_dict.items():
        dna_key = dna.split('_')[0]
        matching_index = None
        for i, header in enumerate(headers_DNA):
            header_key = header.split('_')[0]
            if header_key == dna_key:
                matching_index = i
                break

        if matching_index is not None:
            sequence = X_dna_chars[0][matching_index]
            vectorizer = CountVectorizer(analyzer='char', ngram_range=(len(kmer_vocab[0]), len(kmer_vocab[0])), vocabulary=kmer_vocab)
            kmer_matrix = vectorizer.transform([sequence])
            kmer_counts = kmer_matrix.toarray().flatten().tolist()
            all_kmer_counts_dna.extend(kmer_counts)
        else:
            print(f"DNA {dna} not found in headers_DNA. Filling with 0s.")
            all_kmer_counts_dna.extend([0] * len(kmer_vocab))

    # Process k-mers for protein sequences
    for protein, kmer_vocab in protein_kmer_dict.items():
        protein_key = protein.split('_')[0]
        matching_index = None
        for i, header in enumerate(headers_PROT):
            header_key = header.split('_')[0]
            if header_key == protein_key:
                matching_index = i
                break

        if matching_index is not None:
            sequence = X_protein_chars[0][matching_index]
            vectorizer = CountVectorizer(analyzer='char', ngram_range=(len(kmer_vocab[0]), len(kmer_vocab[0])), vocabulary=kmer_vocab)
            kmer_matrix = vectorizer.transform([sequence])
            kmer_counts = kmer_matrix.toarray().flatten().tolist()
            all_kmer_counts_protein.extend(kmer_counts)
        else:
            print(f"Protein {protein} not found in headers_PROT. Filling with 0s.")
            all_kmer_counts_protein.extend([0] * len(kmer_vocab))

    # Process k-mers for rRNA sequences
    for rrna, kmer_vocab in rrna_kmer_dict.items():
        matching_index = None
        for i, header in enumerate(headers_RRNA[0]):
            if header == rrna:
                matching_index = i
                break

        if matching_index is not None:
            sequence = X_rrna_chars[0][matching_index]
            vectorizer = CountVectorizer(analyzer='char', ngram_range=(len(kmer_vocab[0]), len(kmer_vocab[0])), vocabulary=kmer_vocab)
            kmer_matrix = vectorizer.transform([sequence])
            kmer_counts = kmer_matrix.toarray().flatten().tolist()
            all_kmer_counts_rrna.extend(kmer_counts)
        else:
            print(f"rRNA {rrna} not found in headers_RRNA. Filling with 0s.")
            all_kmer_counts_rrna.extend([0] * len(kmer_vocab))

    #np.save(directory+'kmer_' + str(kmer_size) + '_flattened_' + "rrna_" + assembly + '_.npy', all_kmer_counts_rrna)
    #np.save(directory+'kmer_' + str(kmer_size) + '_flattened_' + "dna_" + assembly + '_.npy', all_kmer_counts_dna)
    #np.save(directory+'kmer_' + str(kmer_size) + '_flattened_' + "protein_" + assembly + '_.npy', all_kmer_counts_protein)
	
	

# Return the k-mer counts
    return all_kmer_counts_dna, all_kmer_counts_protein, all_kmer_counts_rrna






In [253]:
assembly = "/mnt/raid1b/kdan_data/Paper/test_new_assemblies/acinetobacter/GCA_006492705.1_ASM649270v1_genomic.fna"
path_dna = "/mnt/raid1b/kdan_data/Paper/test_new_assemblies/annotated_assemblies_Acinetobacter/upstream_seqs_final/GCA_006492705.1_ASM649270v1_genomic.fna"
path_protein = "/mnt/raid1b/kdan_data/Paper/test_new_assemblies/annotated_assemblies_Acinetobacter/aligned_proteins_final/GCA_006492705.1_ASM649270v1_genomic.fna"
path_5s = "/mnt/raid1b/kdan_data/Paper/test_new_assemblies/annotated_assemblies_Acinetobacter/rRNAs/final_5s_seqs/GCA_006492705.1_ASM649270v1_genomic.fna.txt"
path_16s = "/mnt/raid1b/kdan_data/Paper/test_new_assemblies/annotated_assemblies_Acinetobacter/rRNAs/final_16s_seqs/GCA_006492705.1_ASM649270v1_genomic.fna.txt"
path_23s = "/mnt/raid1b/kdan_data/Paper/test_new_assemblies/annotated_assemblies_Acinetobacter/rRNAs/final_23s_seqs/GCA_006492705.1_ASM649270v1_genomic.fna.txt"


In [254]:
X, X_dna_chars, headers_DNA, X_protein_chars, headers_PROT, X_rrna_chars, headers_RRNA = build_dataset(3, assembly, path_dna, path_protein, path_5s, path_16s, path_23s)

Got upstream sequences
Got proteins sequences
Got 5s rRNA sequences
Got 16s rRNA sequences
Got 23s rRNA sequences


In [160]:

# Find indexes where elements are not 'XXX'
indexes_not_xxx = [i for i, val in enumerate(X_protein_chars[0]) if val != 'XXX']

print("Indexes without 'XXX':", indexes_not_xxx)


Indexes without 'XXX': [119, 218, 269, 412, 492, 589, 590, 675, 679, 717, 821, 822, 911, 929, 930, 931, 982, 1024, 1034, 1060, 1112, 1118, 1119, 1139, 1156, 1241, 1253, 1269, 1270, 1272, 1322, 1326, 1327, 1328, 1329, 1365, 1530, 1650, 1946, 1947, 1948, 2109, 2206, 2303, 2331, 2360, 2361, 2429, 2433, 2469]


In [161]:
filtered_headers = [headers_PROT[i] for i in indexes_not_xxx]
print(filtered_headers)

[">AAA72105.1_ant(2'')-Ia_VHET01000093.1", '>AAB59083.4_aadA1_VHET01000089.1', '>AAC44316.1_qacE_VHET01000093.1', '>AAF34177.1_dfrA1_VHET01000089.1', '>AAG07648.1_rplB_VHET01000003.1', '>AAL14439.1_adeA_VHET01000090.1', '>AAL14440.1_adeB_VHET01000090.1', '>AAN82549.1_Ecol_EFTu_PLV_VHET01000113.1', '>AAN87714.1_msr(E)_VHET01000105.1', '>AAP40270.1_blaOXA_VHET01000102.1', '>AAX14802.1_adeJ_VHET01000044.1', '>AAX14803.1_adeK_VHET01000044.1', '>ABI20451.1_mphE_VHET01000105.1', ">ABO10616.2_ant(3'')-IIa_VHET01000052.1", '>ABO11759.2_abaF_VHET01000037.1', '>ABO13164.2_pmrA_VHET01000044.1', '>ABY60434.1_sul1_VHET01000093.1', '>ACJ41739.1_adeI_VHET01000044.1', '>ACL01107.1_sat2_VHET01000089.1', '>ACQ82816.1_amvA_VHET01000017.1', '>ADK46845.1_pmrB_VHET01000044.1', '>ADM92605.1_adeR_VHET01000090.1', '>ADM92606.1_adeS_VHET01000090.1', '>ADY84088.1_nreB_VHET01000039.1', '>AEK25668.1_ftsI_VHET01000005.1', ">AGI04227.1_aph(3')-VI_VHET01000116.1", '>AGV28567.1_adeN_VHET01000016.1', '>AHB92962.1_Abau_

In [162]:
kmer_size=3

In [163]:
# Load the k-mer vocabulary for DNA, proteins, and rRNA
kmer_vocab_dna = np.load('/mnt/raid1b/kdan_data/Paper/Machine_Learning/build_dataset/data/kmer_'+str(kmer_size)+'_flatteneddna_stats_.npy', allow_pickle=True)
kmer_vocab_protein = np.load('/mnt/raid1b/kdan_data/Paper/Machine_Learning/build_dataset/data/kmer_'+str(kmer_size)+'_flattenedprotein_stats_.npy', allow_pickle=True)
kmer_vocab_rrna = np.load('/mnt/raid1b/kdan_data/Paper/Machine_Learning/build_dataset/data/kmer_'+str(kmer_size)+'_flattenedrrna_stats_.npy', allow_pickle=True)


# Convert the loaded data into DataFrames
dna_kmer_df = pd.DataFrame(kmer_vocab_dna, columns=['Index', 'Kmer', 'DNA'])
protein_kmer_df = pd.DataFrame(kmer_vocab_protein, columns=['Index', 'Kmer', 'Protein'])
rrna_kmer_df = pd.DataFrame(kmer_vocab_rrna, columns=['Index', 'Kmer', 'rRNA'])

# Clean the 'rRNA' column to remove parentheses and commas
rrna_kmer_df['rRNA'] = rrna_kmer_df['rRNA'].apply(lambda x: x[0])

# Group k-mers by their associated DNA, protein, and rRNA
dna_kmer_dict = dna_kmer_df.groupby('DNA')['Kmer'].apply(list).to_dict()
protein_kmer_dict = protein_kmer_df.groupby('Protein')['Kmer'].apply(list).to_dict()
rrna_kmer_dict = rrna_kmer_df.groupby('rRNA')['Kmer'].apply(list).to_dict()


In [164]:
kmer_vocab_protein

array([[0, 'acr', '>AAA16241.1_ompR_ABFGGZ010000015.1'],
       [1, 'add', '>AAA16241.1_ompR_ABFGGZ010000015.1'],
       [2, 'ade', '>AAA16241.1_ompR_ABFGGZ010000015.1'],
       ...,
       [640706, 'yyl', '>tet(S/M)_1_HM367711'],
       [640707, 'yys', '>tet(S/M)_1_HM367711'],
       [640708, 'yyv', '>tet(S/M)_1_HM367711']], dtype=object)

In [255]:
len(dense_GCA_006492705[0])

717254

In [166]:
indices = np.nonzero(dense_GCA_006492705)[1]  # Use [1] to get column indices

In [167]:
len(indices)

24012

In [170]:
indices

array([    74,    154,    219, ..., 715878, 716451, 717214])

In [168]:
dense_GCA_006492705[0][indices]

array([1, 1, 1, ..., 1, 1, 1], dtype=int16)

In [169]:
dna_kmer_df

Unnamed: 0,Index,Kmer,DNA
0,0,aaa,>AAA16241.1_ompR_ABFGGZ010000015.1
1,1,aac,>AAA16241.1_ompR_ABFGGZ010000015.1
2,2,aag,>AAA16241.1_ompR_ABFGGZ010000015.1
3,3,aat,>AAA16241.1_ompR_ABFGGZ010000015.1
4,4,aca,>AAA16241.1_ompR_ABFGGZ010000015.1
...,...,...,...
76141,76141,tta,>tet(S/M)_1_HM367711
76142,76142,ttc,>tet(S/M)_1_HM367711
76143,76143,ttg,>tet(S/M)_1_HM367711
76144,76144,ttt,>tet(S/M)_1_HM367711


In [171]:
dna_kmer_df.loc[219,:]

Index                 219
Kmer                  xxx
DNA      >AAA18786.1_cdtB
Name: 219, dtype: object

In [256]:
def process_kmers(kmer_size, assembly, headers_DNA, X_dna_chars, headers_PROT, X_protein_chars, headers_RRNA, X_rrna_chars):
    """
    Processes k-mers for DNA, protein, and rRNA sequences and returns k-mer counts.

    Parameters:
    kmer_size (int): Size of the k-mers.
    dna_vocab_path (str): File path to the k-mer vocabulary for DNA.
    protein_vocab_path (str): File path to the k-mer vocabulary for protein.
    rrna_vocab_path (str): File path to the k-mer vocabulary for rRNA.
    headers_DNA (list): List of headers for DNA sequences.
    X_dna_chars (list): List of DNA sequences corresponding to the headers.
    headers_PROT (list): List of headers for protein sequences.
    X_protein_chars (list): List of protein sequences corresponding to the headers.
    headers_RRNA (list): List of headers for rRNA sequences.
    X_rrna_chars (list): List of rRNA sequences corresponding to the headers.

    Returns:
    tuple: Tuple containing lists of k-mer counts for DNA, protein, and rRNA.
    """

    # Load the k-mer vocabulary for DNA, proteins, and rRNA
    kmer_vocab_dna = np.load('/mnt/raid1b/kdan_data/Paper/Machine_Learning/build_dataset/data/kmer_'+str(kmer_size)+'_flatteneddna_stats_.npy', allow_pickle=True)
    kmer_vocab_protein = np.load('/mnt/raid1b/kdan_data/Paper/Machine_Learning/build_dataset/data/kmer_'+str(kmer_size)+'_flattenedprotein_stats_.npy', allow_pickle=True)
    kmer_vocab_rrna = np.load('/mnt/raid1b/kdan_data/Paper/Machine_Learning/build_dataset/data/kmer_'+str(kmer_size)+'_flattenedrrna_stats_.npy', allow_pickle=True)


    # Convert the loaded data into DataFrames
    dna_kmer_df = pd.DataFrame(kmer_vocab_dna, columns=['Index', 'Kmer', 'DNA'])
    protein_kmer_df = pd.DataFrame(kmer_vocab_protein, columns=['Index', 'Kmer', 'Protein'])
    rrna_kmer_df = pd.DataFrame(kmer_vocab_rrna, columns=['Index', 'Kmer', 'rRNA'])
    
    # Clean the 'rRNA' column to remove parentheses and commas
    rrna_kmer_df['rRNA'] = rrna_kmer_df['rRNA'].apply(lambda x: x[0])

    # Group k-mers by their associated DNA, protein, and rRNA
    dna_kmer_dict = dna_kmer_df.groupby('DNA')['Kmer'].apply(list).to_dict()
    protein_kmer_dict = protein_kmer_df.groupby('Protein')['Kmer'].apply(list).to_dict()
    rrna_kmer_dict = rrna_kmer_df.groupby('rRNA')['Kmer'].apply(list).to_dict()

    # Initialize lists to hold k-mer counts for DNA, proteins, and rRNA
    all_kmer_counts_dna = []
    all_kmer_counts_protein = []
    all_kmer_counts_rrna = []

    # Process k-mers for DNA sequences
    for dna, kmer_vocab in dna_kmer_dict.items():
        dna_key = dna.split('_')[0]
        matching_index = None
        for i, header in enumerate(headers_DNA):
            header_key = header.split('_')[0]
            if header_key == dna_key:
                matching_index = i
                break

        if matching_index is not None:
            sequence = X_dna_chars[0][matching_index]
            vectorizer = CountVectorizer(analyzer='char', ngram_range=(len(kmer_vocab[0]), len(kmer_vocab[0])), vocabulary=kmer_vocab)
            kmer_matrix = vectorizer.transform([sequence])
            kmer_counts = kmer_matrix.toarray().flatten().tolist()
            all_kmer_counts_dna.extend(kmer_counts)
        else:
            print(f"DNA {dna} not found in headers_DNA. Filling with 0s.")
            all_kmer_counts_dna.extend([0] * len(kmer_vocab))

    # Process k-mers for protein sequences
    for protein, kmer_vocab in protein_kmer_dict.items():
        protein_key = protein.split('_')[0]
        matching_index = None
        for i, header in enumerate(headers_PROT):
            header_key = header.split('_')[0]
            if header_key == protein_key:
                matching_index = i
                break

        if matching_index is not None:
            sequence = X_protein_chars[0][matching_index]
            vectorizer = CountVectorizer(analyzer='char', ngram_range=(len(kmer_vocab[0]), len(kmer_vocab[0])), vocabulary=kmer_vocab)
            kmer_matrix = vectorizer.transform([sequence])
            kmer_counts = kmer_matrix.toarray().flatten().tolist()
            all_kmer_counts_protein.extend(kmer_counts)
        else:
            print(f"Protein {protein} not found in headers_PROT. Filling with 0s.")
            all_kmer_counts_protein.extend([0] * len(kmer_vocab))

    # Process k-mers for rRNA sequences
    for rrna, kmer_vocab in rrna_kmer_dict.items():
        matching_index = None
        for i, header in enumerate(headers_RRNA[0]):
            if header == rrna:
                matching_index = i
                break

        if matching_index is not None:
            sequence = X_rrna_chars[0][matching_index]
            vectorizer = CountVectorizer(analyzer='char', ngram_range=(len(kmer_vocab[0]), len(kmer_vocab[0])), vocabulary=kmer_vocab)
            kmer_matrix = vectorizer.transform([sequence])
            kmer_counts = kmer_matrix.toarray().flatten().tolist()
            all_kmer_counts_rrna.extend(kmer_counts)
        else:
            print(f"rRNA {rrna} not found in headers_RRNA. Filling with 0s.")
            all_kmer_counts_rrna.extend([0] * len(kmer_vocab))

    #np.save(directory+'kmer_' + str(kmer_size) + '_flattened_' + "rrna_" + assembly + '_.npy', all_kmer_counts_rrna)
    #np.save(directory+'kmer_' + str(kmer_size) + '_flattened_' + "dna_" + assembly + '_.npy', all_kmer_counts_dna)
    #np.save(directory+'kmer_' + str(kmer_size) + '_flattened_' + "protein_" + assembly + '_.npy', all_kmer_counts_protein)
	
	

# Return the k-mer counts
    return all_kmer_counts_dna, all_kmer_counts_protein, all_kmer_counts_rrna



In [257]:
all_kmer_counts_dna, all_kmer_counts_protein, all_kmer_counts_rrna = process_kmers(3,assembly, headers_DNA, X_dna_chars, headers_PROT, X_protein_chars, headers_RRNA, X_rrna_chars)

In [258]:
# Convert lists to NumPy arrays first
all_kmer_counts_dna = np.array(all_kmer_counts_dna)
all_kmer_counts_rrna = np.array(all_kmer_counts_rrna)
all_kmer_counts_protein = np.array(all_kmer_counts_protein)
X_dna = all_kmer_counts_dna.reshape(1, -1)
X_rrna = all_kmer_counts_rrna.reshape(1, -1)
X_protein = all_kmer_counts_protein.reshape(1, -1)
X = np.concatenate((X_dna, X_rrna, X_protein), axis=1)
X

array([[0, 0, 0, ..., 0, 0, 0]])

In [259]:
dense_GCA_006492705

array([[0, 0, 0, ..., 0, 0, 0]], dtype=int16)

In [228]:
X_GCA_006492705

<1x717254 sparse matrix of type '<class 'numpy.int16'>'
	with 24012 stored elements in Compressed Sparse Row format>

In [260]:
dtest = xgb.DMatrix(csr_matrix(X))
y_pred= model.predict(dtest)
y_pred

array([1.], dtype=float32)

In [261]:
y_pred

array([1.], dtype=float32)

In [262]:
len(X[0])

717254

In [175]:
indices_X = np.nonzero(X)[1]  # Use [1] to get column indices

In [176]:
indices_GCA = np.nonzero(dense_GCA_006492705)[1]  # Use [1] to get column indices

In [177]:
len(indices_X)

23551

In [178]:
len(indices_GCA)

24012

In [179]:
indices_GCA

array([    74,    154,    219, ..., 715878, 716451, 717214])

In [263]:
matching_indexes = [i for i in range(len(dense_GCA_006492705[0])) if dense_GCA_006492705[0][i] != X[0][i]]
len(matching_indexes)

5258

In [102]:
X[0][4538]

0

In [243]:
matching_indexes

[8072,
 8975,
 8976,
 8977,
 8978,
 8979,
 8980,
 8981,
 8982,
 8983,
 8984,
 8985,
 8986,
 8987,
 8988,
 8989,
 8990,
 8991,
 8992,
 8993,
 8994,
 8997,
 8998,
 8999,
 9000,
 9001,
 9002,
 9003,
 9004,
 9005,
 9006,
 9007,
 9008,
 9009,
 9010,
 9011,
 9012,
 9013,
 9014,
 9015,
 9016,
 9017,
 9018,
 9019,
 9020,
 9021,
 9022,
 9034,
 9035,
 9036,
 9037,
 9038,
 9040,
 9041,
 9042,
 9043,
 9044,
 9045,
 9046,
 9047,
 9048,
 9049,
 9050,
 10613,
 10615,
 10616,
 10617,
 10618,
 10619,
 10620,
 10621,
 10622,
 10623,
 10624,
 10626,
 10627,
 10628,
 10629,
 10630,
 10631,
 10632,
 10633,
 10634,
 10635,
 10636,
 10637,
 10638,
 10639,
 10640,
 10641,
 10642,
 10643,
 10644,
 10645,
 10646,
 10647,
 10648,
 10649,
 10650,
 10651,
 10652,
 10653,
 10654,
 10656,
 10657,
 10658,
 10659,
 10660,
 10675,
 10676,
 10677,
 10678,
 10679,
 10680,
 10681,
 10682,
 10683,
 10684,
 10685,
 10686,
 10687,
 10688,
 10689,
 15320,
 15321,
 15322,
 15323,
 15324,
 15325,
 15326,
 15327,
 15328,
 15329,

In [184]:
X[0][706213]

0

In [185]:
dense_GCA_006492705[0][706213]

2

In [187]:
protein_kmer_df.loc[706213:706215,:]

Unnamed: 0,Index,Kmer,Protein


In [72]:
len(X[0])

717254

In [112]:
X = pickle.load(open("/mnt/raid1b/kdan_data/Paper/Machine_Learning/build_dataset/data/X_dna_chars.pkl", 'rb'))

In [127]:
len(X)

18916

In [128]:
len(assemblies)

18916

In [131]:
assemblies[16382]

'GCA_023877555.1'

In [130]:
X[16382]

['XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',
 'XXXXX',


In [122]:
headers=pickle.load(open("/mnt/raid1b/kdan_data/Paper/Machine_Learning/build_dataset/data/headers_" + "dna" +".pkl", 'rb'))

In [123]:
headers

[">AAA03550.1_aac(2')-Ia",
 '>AAA16241.1_ompR_ABFGGZ010000015.1',
 '>AAA16360.1_stxA2',
 '>AAA16527.1_ant(9)-Ib',
 '>AAA18472.1_ystA',
 '>AAA18786.1_cdtB',
 '>AAA19190.1_stxB2',
 '>AAA19777.1_seh',
 '>AAA19882.1_blaF',
 '>AAA19915.1_aac(3)-IId',
 '>AAA20116.1_tetA(P)',
 '>AAA20117.1_tetB(P)',
 '>AAA20192.1_erm(C)',
 '>AAA20885.1_astA',
 '>AAA21094.1_arsA',
 '>AAA21095.1_arsB_ABFGGZ010000001.1',
 '>AAA21096.1_arsC_ABFGGZ010000001.1',
 '>AAA21532.1_cepA',
 ">AAA21889.1_aac(6')-I",
 '>AAA22081.1_catB1',
 '>AAA22190.1_aadK',
 '>AAA22277.1_bmr',
 '>AAA22289.1_cat86',
 '>AAA22419.1_erm(G)',
 '>AAA22595.1_erm(D)',
 '>AAA22851.1_tet(L)',
 '>AAA22904.1_cfiA',
 '>AAA23018.1_catA13',
 '>AAA23033.2_tet(O)',
 '>AAA23215.1_catA16',
 '>AAA23235.1_etx',
 '>AAA23263.1_bont_F6',
 '>AAA23405.1_fasA',
 '>AAA23421.1_fim41a',
 '>AAA23523.1_bmaE',
 '>AAA23737.1_f17a',
 '>AAA24032.1_faeG',
 '>AAA24035.1_faeG',
 '>AAA24293.1_papG-II',
 '>AAA24300.1_ftsI_ABFGGZ010000004.1',
 '>AAA24632.1_stxA2',
 '>AAA24653.1_s

In [195]:
kmer_df = pd.concat([dna_kmer_df, protein_kmer_df, rrna_kmer_df], ignore_index=True, sort=False)

In [244]:
filtered_df = kmer_df.loc[matching_indexes, :]
filtered_df = filtered_df[filtered_df['Kmer'] != 'xxx']

In [250]:
X[0][714401]

1

In [249]:
filtered_df.loc[714401,:]

Index                   638255
Kmer                       ssn
DNA                        NaN
Protein    >erm(42)_2_AB601890
rRNA                       NaN
Name: 714401, dtype: object

In [197]:
print(indices_GCA)

[    74    154    219 ... 715878 716451 717214]
