In [1]:
import os
import pandas as pd
import gzip
import numpy as np
from itertools import product
from joblib import Parallel, delayed
from tqdm.auto import tqdm
from collections import defaultdict, Counter, OrderedDict
import re

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
os.chdir("../../data/10famsim")

In [3]:
pwd

'/home/jovyan/work/RCSparK/data/10famsim'

In [4]:
# Function to read FASTA file and yield name and sequence
def fasta_iter(fasta_file):
    with gzip.open(fasta_file, 'rt') as file:
        name = None
        seqs = []
        for line in file:
            line = line.strip()
            if line.startswith(">"):
                if name:
                    yield name, ''.join(seqs)
                name = line[1:]  # Skip '>'
                seqs = []
            else:
                seqs.append(line)
        if name:
            yield name, ''.join(seqs)  # Yield the last entry

# Function to read FASTQ file and yield name and sequence
def fastq_iter(fastq_file):
    with gzip.open(fastq_file, 'rt') as file:
        while True:
            name = file.readline().strip()[1:]  # Skip '@' in FASTQ
            seq = file.readline().strip()
            file.readline()  # Skip the '+' line
            file.readline()  # Skip the quality line
            if not name or not seq:
                break
            yield name, seq

# Fungsi untuk menghitung frekuensi global k-mer
def compute_global_kmer_abundance(seq_iter, kmer_len):
    global_kmer_counts = Counter()
    for h, seq in seq_iter:
        norm_seq = str(seq).upper()
        for i in range(len(norm_seq) - kmer_len + 1):
            kmer = norm_seq[i:i+kmer_len]
            if set(kmer) <= {'A', 'T', 'G', 'C'}:
                global_kmer_counts[kmer] += 1
    return global_kmer_counts

# Function to generate feature mapping for k-mers with sequences as keys
def generate_feature_mapping(kmer_len):
    BASE_COMPLEMENT = {"A": "T", "T": "A", "G": "C", "C": "G"}
    kmer_hash = {}
    kmer_names = []
    counter = 0
    for kmer in product("ATGC", repeat=kmer_len):
        kmer = ''.join(kmer)
        rev_compl = ''.join([BASE_COMPLEMENT[x] for x in reversed(kmer)])  # Generate reverse complement
        if kmer not in kmer_hash and rev_compl not in kmer_hash:
            # Use canonical k-mer (lexicographically smaller of kmer and rev_compl)
            canonical_kmer = min(kmer, rev_compl)
            kmer_hash[canonical_kmer] = counter
            kmer_names.append(canonical_kmer)  # Store the k-mer for naming features
            counter += 1
    return kmer_hash, kmer_names, counter
    
# Function to detect file type based on file extension
def detect_file_type(file_path):
    ext = os.path.splitext(file_path)[-1].lower()  # Get file extension and convert to lowercase
    # Handle gzipped files
    if ext == '.gz':
        ext = os.path.splitext(file_path[:-3])[-1].lower()  # Check the extension before ".gz"
        
    if ext in ['.fasta', '.fa', '.fna']:
        return 'fasta'
    elif ext in ['.fastq', '.fq']:
        return 'fastq'
    else:
        raise ValueError("Unsupported file extension. Please provide a valid FASTA or FASTQ file.")
        
# Function to generate k-mer features from FASTA or FASTQ with abundance feature
def generate_kmer_features(folder_path, length_threshold, kmer_len, split=False, split_threshold=0):
    # List all files in the folder with .fna.gz extension
    files = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith('.fna.gz')]
    
    kmer_dict, kmer_names, nr_features = generate_feature_mapping(kmer_len)
    composition = OrderedDict()
    abundance = OrderedDict()  # Dictionary to store abundance (total k-mer count per sequence)

    for file_path in files:
        # Automatically detect file type from file extension
        file_type = detect_file_type(file_path)

        # Select the appropriate file iterator based on the file type
        if file_type == 'fasta':
            seq_iter = fasta_iter(file_path)
        elif file_type == 'fastq':
            seq_iter = fastq_iter(file_path)
        else:
            raise ValueError("Unsupported file type. Use 'fasta' or 'fastq'.")

        def seq_list():
            for h, seq in seq_iter:
                # Ambil hanya ID sebelum karakter "|"
                h = h.split('|')[0].strip()
                
                if not split:
                    yield h, seq
                elif len(seq) >= split_threshold:
                    half = len(seq) // 2
                    yield (h + '_1', seq[:half])
                    yield (h + '_2', seq[half:])

        for h, seq in seq_list():
            if len(seq) < length_threshold:
                continue
            norm_seq = str(seq).upper()
            kmers = [kmer_dict[norm_seq[i:i+kmer_len]]
                     for i in range(len(norm_seq) - kmer_len + 1)
                     if norm_seq[i:i+kmer_len] in kmer_dict]  # Ignore kmers with non-canonical bases
            
            # Count the abundance of k-mers and store in composition and abundance dictionaries
            composition[f"{file_path}_{h}"] = np.bincount(np.array(kmers, dtype=np.int64), minlength=nr_features)
            abundance[f"{file_path}_{h}"] = len(kmers)  # Total k-mer count for abundance

    # Create DataFrame with k-mer names as column headers
    df = pd.DataFrame.from_dict(composition, orient='index', dtype=float, columns=kmer_names)

    # Add abundance column
    df['abundance'] = pd.Series(abundance)

    # Apply smoothing and normalize by row (sequence)
    df.iloc[:, :-1] = df.iloc[:, :-1].apply(lambda x: x + 1e-5)  # Smoothing
    df.iloc[:, :-1] = df.iloc[:, :-1].div(df.iloc[:, :-1].sum(axis=1), axis=0)  # Normalize by row (sequence), excluding abundance

    return df


# Contigs

## Feature

In [5]:
fasta_folder = './'  # Updated file path to your uploaded file
kmer_len = 5  # Original k-mer length
length_threshold = 100  # Example sequence length threshold

# Generate spaced k-mer features
kmer_df = generate_kmer_features(fasta_folder, length_threshold, kmer_len)
kmer_df = pd.DataFrame(kmer_df)
kmer_df

Unnamed: 0,AAAAA,AAAAT,AAAAG,AAAAC,AAATA,AAATT,AAATG,AAATC,AAAGA,AAAGT,...,CGGAG,CCTCG,CGACG,CCACG,CCCCG,CGCCG,CCGCG,CCAGG,CCCGG,abundance
./GCF_000009905-Exact.d324fd4b.fna.gz_r1.1,4.716867e-08,4.716867e-08,4.716867e-08,4.716914e-03,4.716867e-08,4.716867e-08,4.716867e-08,4.716867e-08,4.716914e-03,4.716914e-03,...,9.433782e-03,4.716867e-08,4.716914e-03,4.716867e-08,4.716914e-03,0.014151,4.716867e-08,4.716867e-08,1.415065e-02,212
./GCF_000009905-Exact.d324fd4b.fna.gz_r2.1,3.936929e-08,3.936929e-08,3.936968e-03,3.936929e-08,3.936929e-08,3.936929e-08,3.936929e-08,3.936929e-08,3.936968e-03,3.936929e-08,...,3.936929e-08,3.936929e-08,3.936929e-08,3.936968e-03,3.936968e-03,0.031495,3.936968e-03,7.873896e-03,3.936968e-03,254
./GCF_000009905-Exact.d324fd4b.fna.gz_r3.1,3.936929e-08,3.936929e-08,3.936929e-08,3.936929e-08,3.936929e-08,3.936929e-08,3.936929e-08,3.936929e-08,3.936968e-03,3.936929e-08,...,3.936929e-08,3.936968e-03,3.936968e-03,3.936929e-08,3.936929e-08,0.003937,3.936929e-08,7.873896e-03,3.936929e-08,254
./GCF_000009905-Exact.d324fd4b.fna.gz_r4.1,3.802207e-08,3.802207e-08,3.802207e-08,3.802207e-08,3.802207e-08,3.802207e-08,3.802207e-08,3.802207e-08,3.802207e-08,3.802207e-08,...,3.802207e-08,3.802207e-08,2.281328e-02,3.802207e-08,7.604453e-03,0.015209,3.802245e-03,3.802207e-08,3.802245e-03,263
./GCF_000009905-Exact.d324fd4b.fna.gz_r5.1,3.999918e-08,3.999918e-08,3.999918e-08,3.999918e-08,3.999918e-08,3.999918e-08,3.999918e-08,3.999918e-08,3.999958e-03,3.999918e-08,...,3.999918e-08,7.999876e-03,3.999958e-03,3.999918e-08,7.999876e-03,0.012000,7.999876e-03,1.599971e-02,3.999958e-03,250
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
./GCF_000012985-Exact.d32508a2.fna.gz_r9996.1,4.237196e-08,4.237196e-08,4.237196e-08,4.237239e-03,4.237196e-08,4.237239e-03,4.237196e-08,4.237239e-03,8.474435e-03,4.237196e-08,...,4.237196e-08,4.237196e-08,4.237196e-08,4.237196e-08,4.237196e-08,0.004237,4.237239e-03,4.237239e-03,4.237196e-08,236
./GCF_000012985-Exact.d32508a2.fna.gz_r9997.1,4.048499e-08,4.048540e-03,4.048540e-03,4.048499e-08,4.048499e-08,4.048499e-08,4.048540e-03,4.048499e-08,4.048499e-08,4.048540e-03,...,4.048540e-03,4.048499e-08,4.048499e-08,4.048540e-03,4.048499e-08,0.028340,4.048540e-03,4.048499e-08,4.048499e-08,247
./GCF_000012985-Exact.d32508a2.fna.gz_r9998.1,1.090892e-02,1.090892e-02,1.090892e-02,3.636296e-08,3.636296e-08,7.272628e-03,3.636332e-03,7.272628e-03,7.272628e-03,3.636296e-08,...,3.636332e-03,3.636332e-03,3.636296e-08,3.636296e-08,3.636296e-08,0.003636,3.636332e-03,3.636332e-03,3.636296e-08,275
./GCF_000012985-Exact.d32508a2.fna.gz_r9999.1,3.890973e-08,3.891012e-03,3.890973e-08,3.890973e-08,7.781985e-03,3.891012e-03,3.890973e-08,3.890973e-08,3.890973e-08,3.890973e-08,...,7.781985e-03,3.890973e-08,3.890973e-08,3.890973e-08,3.891012e-03,0.007782,3.890973e-08,3.890973e-08,3.891012e-03,257


## Label

In [6]:
# label_path = "./contigs/gsa_mapping.tsv.gz"
# label_df = pd.read_csv(label_path, sep='\t')
# label_df = label_df[["genome_id"]]
# label_df

def generate_label_dataframe(file_path):
    labels = OrderedDict()
    
    # Automatically detect file type
    file_type = detect_file_type(file_path)
    if file_type == 'fasta':
        seq_iter = fasta_iter(file_path)
    elif file_type == 'fastq':
        seq_iter = fastq_iter(file_path)
    else:
        raise ValueError("Unsupported file type. Use 'fasta' or 'fastq'.")

    for h, seq in seq_iter:
        # Ambil hanya ID sebelum karakter "|"
        seq_id = h.split('|')[0].strip()
        # Ekstrak label di antara tanda petik dua ("")
        label_match = re.search(r'"(.*?)"', h)
        label = label_match.group(1) if label_match else "Unknown"
        
        # Simpan ID dan label
        labels[seq_id] = label

    # Buat DataFrame dari dictionary labels
    label_df = pd.DataFrame.from_dict(labels, orient='index', columns=['Label'])
    
    return label_df

folder_path = './'
files = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith('.fna.gz')]

# Menggabungkan semua label dari beberapa file
label_df = pd.concat([generate_label_dataframe(f) for f in files])

# Reset indeks jika diperlukan
label_df.reset_index(inplace=True)
label_df.columns = ['Sequence_ID', 'Label']
label_df =  label_df[['Label']]
label_df

Unnamed: 0,Label
0,NC_006177.1 Symbiobacterium thermophilum IAM 1...
1,NC_006177.1 Symbiobacterium thermophilum IAM 1...
2,NC_006177.1 Symbiobacterium thermophilum IAM 1...
3,NC_006177.1 Symbiobacterium thermophilum IAM 1...
4,NC_006177.1 Symbiobacterium thermophilum IAM 1...
...,...
99995,"NC_007645.1 Hahella chejuensis KCTC 2396, comp..."
99996,"NC_007645.1 Hahella chejuensis KCTC 2396, comp..."
99997,"NC_007645.1 Hahella chejuensis KCTC 2396, comp..."
99998,"NC_007645.1 Hahella chejuensis KCTC 2396, comp..."


In [7]:
# Tampilkan statistik deskriptif dari data fitur
print("Statistik Deskriptif Data Fitur:")
print(kmer_df.describe())

# Tampilkan distribusi kelas dari data label
print("\nDistribusi Kelas pada Data Label:")
print(label_df.value_counts())

Statistik Deskriptif Data Fitur:
              AAAAA         AAAAT         AAAAG         AAAAC         AAATA  \
count  1.000000e+05  1.000000e+05  1.000000e+05  1.000000e+05  1.000000e+05   
mean   7.192815e-03  5.690635e-03  4.235023e-03  3.654365e-03  4.729090e-03   
std    8.771448e-03  6.020118e-03  4.864990e-03  4.166099e-03  5.601509e-03   
min    2.481358e-08  2.358462e-08  2.358462e-08  2.577286e-08  2.358462e-08   
25%    4.328908e-08  4.405187e-08  4.255226e-08  4.219318e-08  4.201590e-08   
50%    4.149331e-03  4.166619e-03  3.787843e-03  3.662972e-03  3.846117e-03   
75%    1.153827e-02  8.695502e-03  7.352840e-03  5.434686e-03  7.812383e-03   
max    1.051199e-01  4.498194e-02  4.297934e-02  3.401305e-02  5.395588e-02   

              AAATT         AAATG         AAATC         AAAGA         AAAGT  \
count  1.000000e+05  1.000000e+05  1.000000e+05  1.000000e+05  1.000000e+05   
mean   4.254107e-03  3.268449e-03  3.496504e-03  3.605722e-03  2.634730e-03   
std    5.126718e-0

## Classifier

In [8]:
from sklearn.model_selection import train_test_split
import time
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

### Pembagian Data

In [9]:
features = kmer_df
labels = label_df

# Pisahkan data menjadi training dan testing set (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

print("Jumlah data latih per kelas:")
print(np.unique(y_train, return_counts=True))

print("\nJumlah data uji per kelas:")
print(np.unique(y_test, return_counts=True))

Jumlah data latih per kelas:
(array(['NC_000964.3 Bacillus subtilis subsp. subtilis str. 168 complete genome',
       'NC_003910.7 Colwellia psychrerythraea 34H, complete sequence',
       'NC_006177.1 Symbiobacterium thermophilum IAM 14863, complete sequence',
       'NC_007645.1 Hahella chejuensis KCTC 2396, complete sequence',
       'NC_007795.1 Staphylococcus aureus subsp. aureus NCTC 8325 chromosome, complete genome',
       'NC_008346.1 Syntrophomonas wolfei subsp. wolfei str. Goettingen G311, complete sequence',
       'NC_008709.1 Psychromonas ingrahamii 37, complete sequence',
       'NC_009922.1 Alkaliphilus oremlandii OhILAs, complete sequence',
       'NC_011830.1 Desulfitobacterium hafniense DCB-2, complete sequence',
       'NC_013422.1 Halothiobacillus neapolitanus c2, complete sequence'],
      dtype=object), array([8023, 7962, 7995, 8014, 8008, 8000, 7998, 7987, 8015, 7998]))

Jumlah data uji per kelas:
(array(['NC_000964.3 Bacillus subtilis subsp. subtilis str. 168 c

### Random Forest

In [10]:
# List model yang akan digunakan
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Logistic Regression": LogisticRegression(random_state=42),
    "Support Vector Machine": SVC(random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    #"Gradient Boosting": GradientBoostingClassifier(n_estimators=100, random_state=42),
    #"MLP Classifier": MLPClassifier(random_state=42, max_iter=500)
}

# Menyimpan hasil metrik dan waktu komputasi
results = []

# Loop melalui semua model dan evaluasi performanya
for model_name, model in models.items():
    start_time = time.time()  # Mulai penghitungan waktu
    
    # Latih model
    model.fit(X_train, y_train)
    
    # Prediksi menggunakan data testing
    y_pred = model.predict(X_test)
    
    # Hitung waktu komputasi
    elapsed_time = time.time() - start_time
    
    # Evaluasi model dengan berbagai metrik
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    
    # Simpan hasil dalam dictionary
    results.append({
        "Model": model_name,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1,
        "Time (seconds)": elapsed_time
    })
    
    # Tampilkan hasil classification report untuk setiap model
    print(f"\nClassification Report untuk {model_name}:")
    print(classification_report(y_test, y_pred))

# Tampilkan hasil dalam bentuk dataframe untuk mempermudah pembacaan
import pandas as pd
results_df = pd.DataFrame(results)
results_df

  return fit_method(estimator, *args, **kwargs)



Classification Report untuk Random Forest:
                                                                                         precision    recall  f1-score   support

                 NC_000964.3 Bacillus subtilis subsp. subtilis str. 168 complete genome       0.74      0.71      0.73      1977
                           NC_003910.7 Colwellia psychrerythraea 34H, complete sequence       0.66      0.74      0.70      2038
                  NC_006177.1 Symbiobacterium thermophilum IAM 14863, complete sequence       0.98      0.96      0.97      2005
                            NC_007645.1 Hahella chejuensis KCTC 2396, complete sequence       0.81      0.78      0.80      1986
  NC_007795.1 Staphylococcus aureus subsp. aureus NCTC 8325 chromosome, complete genome       0.74      0.88      0.80      1992
NC_008346.1 Syntrophomonas wolfei subsp. wolfei str. Goettingen G311, complete sequence       0.75      0.77      0.76      2000
                              NC_008709.1 Psychromon

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Classification Report untuk Logistic Regression:
                                                                                         precision    recall  f1-score   support

                 NC_000964.3 Bacillus subtilis subsp. subtilis str. 168 complete genome       0.69      0.63      0.65      1977
                           NC_003910.7 Colwellia psychrerythraea 34H, complete sequence       0.57      0.55      0.56      2038
                  NC_006177.1 Symbiobacterium thermophilum IAM 14863, complete sequence       0.91      0.95      0.93      2005
                            NC_007645.1 Hahella chejuensis KCTC 2396, complete sequence       0.77      0.71      0.74      1986
  NC_007795.1 Staphylococcus aureus subsp. aureus NCTC 8325 chromosome, complete genome       0.52      0.80      0.63      1992
NC_008346.1 Syntrophomonas wolfei subsp. wolfei str. Goettingen G311, complete sequence       0.57      0.55      0.56      2000
                              NC_008709.1 Psyc

  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Classification Report untuk Support Vector Machine:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                                                                                         precision    recall  f1-score   support

                 NC_000964.3 Bacillus subtilis subsp. subtilis str. 168 complete genome       0.13      0.06      0.08      1977
                           NC_003910.7 Colwellia psychrerythraea 34H, complete sequence       0.00      0.00      0.00      2038
                  NC_006177.1 Symbiobacterium thermophilum IAM 14863, complete sequence       0.00      0.00      0.00      2005
                            NC_007645.1 Hahella chejuensis KCTC 2396, complete sequence       0.10      0.03      0.04      1986
  NC_007795.1 Staphylococcus aureus subsp. aureus NCTC 8325 chromosome, complete genome       0.16      0.04      0.06      1992
NC_008346.1 Syntrophomonas wolfei subsp. wolfei str. Goettingen G311, complete sequence       0.10      0.90      0.19      2000
                              NC_008709.1 Psychromonas ingrahamii 37, complete sequence       0.

  return self._fit(X, y)



Classification Report untuk K-Nearest Neighbors:
                                                                                         precision    recall  f1-score   support

                 NC_000964.3 Bacillus subtilis subsp. subtilis str. 168 complete genome       0.43      0.61      0.50      1977
                           NC_003910.7 Colwellia psychrerythraea 34H, complete sequence       0.42      0.57      0.48      2038
                  NC_006177.1 Symbiobacterium thermophilum IAM 14863, complete sequence       0.94      0.92      0.93      2005
                            NC_007645.1 Hahella chejuensis KCTC 2396, complete sequence       0.61      0.61      0.61      1986
  NC_007795.1 Staphylococcus aureus subsp. aureus NCTC 8325 chromosome, complete genome       0.57      0.61      0.59      1992
NC_008346.1 Syntrophomonas wolfei subsp. wolfei str. Goettingen G311, complete sequence       0.53      0.47      0.50      2000
                              NC_008709.1 Psyc

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,Time (seconds)
0,Random Forest,0.7728,0.772993,0.7728,0.770303,139.039397
1,Logistic Regression,0.6531,0.654736,0.6531,0.648167,144.464875
2,Support Vector Machine,0.10615,0.082078,0.10615,0.04371,4011.303041
3,K-Nearest Neighbors,0.561,0.568342,0.561,0.559343,48.983661


In [11]:
# List model yang akan digunakan
models = {
    # "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    # "Logistic Regression": LogisticRegression(random_state=42),
    # "Support Vector Machine": SVC(random_state=42),
    # "K-Nearest Neighbors": KNeighborsClassifier(),
    # "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, random_state=42),
    "MLP Classifier": MLPClassifier(random_state=42, max_iter=100)
}

# Menyimpan hasil metrik dan waktu komputasi
results = []

# Loop melalui semua model dan evaluasi performanya
for model_name, model in models.items():
    start_time = time.time()  # Mulai penghitungan waktu
    
    # Latih model
    model.fit(X_train, y_train)
    
    # Prediksi menggunakan data testing
    y_pred = model.predict(X_test)
    
    # Hitung waktu komputasi
    elapsed_time = time.time() - start_time
    
    # Evaluasi model dengan berbagai metrik
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    
    # Simpan hasil dalam dictionary
    results.append({
        "Model": model_name,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1,
        "Time (seconds)": elapsed_time
    })
    
    # Tampilkan hasil classification report untuk setiap model
    print(f"\nClassification Report untuk {model_name}:")
    print(classification_report(y_test, y_pred))

dff = pd.DataFrame(results)
dff

  y = column_or_1d(y, warn=True)



Classification Report untuk MLP Classifier:
                                                                                         precision    recall  f1-score   support

                 NC_000964.3 Bacillus subtilis subsp. subtilis str. 168 complete genome       0.82      0.83      0.82      1977
                           NC_003910.7 Colwellia psychrerythraea 34H, complete sequence       0.85      0.67      0.75      2038
                  NC_006177.1 Symbiobacterium thermophilum IAM 14863, complete sequence       1.00      0.94      0.97      2005
                            NC_007645.1 Hahella chejuensis KCTC 2396, complete sequence       0.85      0.89      0.87      1986
  NC_007795.1 Staphylococcus aureus subsp. aureus NCTC 8325 chromosome, complete genome       0.79      0.92      0.85      1992
NC_008346.1 Syntrophomonas wolfei subsp. wolfei str. Goettingen G311, complete sequence       0.77      0.84      0.80      2000
                              NC_008709.1 Psychromo

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,Time (seconds)
0,MLP Classifier,0.83,0.835395,0.83,0.830154,615.483605


In [12]:
hasil = pd.concat([results_df, dff], ignore_index=True)
hasil

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,Time (seconds)
0,Random Forest,0.7728,0.772993,0.7728,0.770303,139.039397
1,Logistic Regression,0.6531,0.654736,0.6531,0.648167,144.464875
2,Support Vector Machine,0.10615,0.082078,0.10615,0.04371,4011.303041
3,K-Nearest Neighbors,0.561,0.568342,0.561,0.559343,48.983661
4,MLP Classifier,0.83,0.835395,0.83,0.830154,615.483605


In [13]:
hasil.to_csv('../../result/Kmers+abundance_10fam.csv', index=False)

# Reads

## Feature

In [12]:
read_file = './reads/anonymous_reads.fq.gz'  # Updated file path to your uploaded file
kmer_len = 4  # Original k-mer length
length_threshold = 100  # Example sequence length threshold

# Generate spaced k-mer features
featureR_df = generate_kmer_features(read_file, length_threshold, kmer_len)
print(featureR_df)

FileNotFoundError: [Errno 2] No such file or directory: './reads/anonymous_reads.fq.gz'

In [13]:
labelR_path = "./reads/reads_mapping.tsv.gz"
labelR_df = pd.read_csv(labelR_path, sep='\t')
labelR_df = labelR_df[["genome_id"]]
labelR_df

FileNotFoundError: [Errno 2] No such file or directory: './reads/reads_mapping.tsv.gz'

In [14]:
features = featureR_df
labels = labelR_df

# Pisahkan data menjadi training dan testing set (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

print("Jumlah data latih per kelas:")
print(np.unique(y_train, return_counts=True))

print("\nJumlah data uji per kelas:")
print(np.unique(y_test, return_counts=True))

NameError: name 'featureR_df' is not defined

In [15]:
# Inisialisasi model Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Latih model
rf_model.fit(X_train, y_train)

# Prediksi menggunakan data testing
y_pred = rf_model.predict(X_test)

# Tampilkan hasil akurasi dan classification report
print("\nAkurasi Model:")
print(accuracy_score(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

NameError: name 'RandomForestClassifier' is not defined