In [1]:
import os
import gzip
import glob
import re
import pandas as pd
import numpy as np
from collections import Counter, defaultdict, OrderedDict
from joblib import Parallel, delayed
from tqdm import tqdm

In [2]:
os.chdir("../../data/10famsim")

In [3]:
def file_iter(file_path):
    sequences = {}
    with gzip.open(file_path, 'rt') as file:
        first_char = file.read(1)
    
    if first_char == '>':
        # Jika file FASTA
        with gzip.open(file_path, 'rt') as file:
            sequence_id = None
            current_sequence = ''
            for line in file:
                if line.startswith('>'):
                    if sequence_id is not None:
                        sequences[sequence_id] = current_sequence
                    sequence_id = line.strip().split(' ')[0]
                    current_sequence = ''
                else:
                    current_sequence += line.strip()
            if sequence_id is not None:
                sequences[sequence_id] = current_sequence
    elif first_char == '@':
        # Jika file FASTQ
        with gzip.open(file_path, 'rt') as file:
            line_count = 0
            sequence_id = None
            current_sequence = ''
            for line in file:
                line_count += 1
                if line_count % 4 == 1:
                    sequence_id = line.strip().split(' ')[0]  # Ambil sequence ID
                elif line_count % 4 == 2:
                    current_sequence = line.strip()  # Urutan nukleotida
                    sequences[sequence_id] = current_sequence
    return sequences
    
def get_complement(sequence):
    BASE_COMPLEMENT = {"A": "T", "T": "A", "G": "C", "C": "G", "*": "*"}
    return ''.join([BASE_COMPLEMENT[base] for base in reversed(sequence)])
     
def extract_spaced_kmers(sequence, pattern):
    kmer_length = len(pattern)
    kmers = []
    for i in range(len(sequence) - kmer_length + 1):
        kmer = ''.join(sequence[i + j] if pattern[j] == '1' else '*' for j in range(kmer_length))
        if all(base in 'ATGC*' for base in kmer):
            # Get complement using the user's suggested approach
            kmer_complement = get_complement(kmer)
            # Choose canonical k-mer
            canonical_kmer = min(kmer, kmer_complement)
            kmers.append(canonical_kmer)
    return kmers

def process_sequence(sequence_id, sequence, patterns):
    all_kmers = []
    for pattern in patterns:
        all_kmers.extend(extract_spaced_kmers(sequence, pattern))
    return sequence_id, Counter(all_kmers)

def count_spaced_kmers(sequences, patterns, n_jobs=8):
    kmer_counts = Parallel(n_jobs=n_jobs)(
        delayed(process_sequence)(sequence_id, sequence, patterns)
        for sequence_id, sequence in tqdm(sequences.items())
    )
    return dict(kmer_counts)

def build_feature_space(kmer_counts, sequences):
    all_kmers = set()
    for kmer_count in kmer_counts.values():
        all_kmers.update(kmer_count.keys())

    all_kmers = sorted(all_kmers)
    feature_space = defaultdict(list)

    for sequence_id, kmer_count in kmer_counts.items():
        sequence_length = len(sequences[sequence_id])
        feature_vector = [(kmer_count.get(kmer, 0) / sequence_length) for kmer in all_kmers]
        feature_space[sequence_id] = feature_vector
    
    return all_kmers, feature_space

In [4]:
# def file_iter(file_path):
#     sequences = {}
    
#     # Cek format file berdasarkan karakter pembuka ('>' untuk FASTA, '@' untuk FASTQ)
#     with gzip.open(file_path, 'rt') as file:
#         first_char = file.read(1)
    
#     if first_char == '>':
#         # Jika file FASTA
#         with gzip.open(file_path, 'rt') as file:
#             sequence_id = None
#             current_sequence = ''
#             for line in file:
#                 if line.startswith('>'):
#                     if sequence_id is not None:
#                         sequences[sequence_id] = current_sequence
#                     sequence_id = line.strip().split(' ')[0]
#                     current_sequence = ''
#                 else:
#                     current_sequence += line.strip()
#             if sequence_id is not None:
#                 sequences[sequence_id] = current_sequence
#     elif first_char == '@':
#         # Jika file FASTQ
#         with gzip.open(file_path, 'rt') as file:
#             line_count = 0
#             sequence_id = None
#             current_sequence = ''
#             for line in file:
#                 line_count += 1
#                 if line_count % 4 == 1:
#                     sequence_id = line.strip().split(' ')[0]  # Ambil sequence ID
#                 elif line_count % 4 == 2:
#                     current_sequence = line.strip()  # Urutan nukleotida
#                     sequences[sequence_id] = current_sequence
#     return sequences

#     def get_complement(sequence):
#         BASE_COMPLEMENT = {"A": "T", "T": "A", "G": "C", "C": "G", "*": "*"}
#         return ''.join([BASE_COMPLEMENT[base] for base in reversed(sequence)])
    
#     def extract_spaced_kmers_with_reversecomplement(sequence, pattern):
#         kmer_length = len(pattern)
#         kmers = []
#         for i in range(len(sequence) - kmer_length + 1):
#             kmer = ''.join(sequence[i + j] if pattern[j] == '1' else '*' for j in range(kmer_length))
#             if all(base in 'ATGC*' for base in kmer):
#                 # Get complement using the user's suggested approach
#                 kmer_complement = get_complement(kmer)
#                 # Choose canonical k-mer
#                 canonical_kmer = min(kmer, kmer_complement)
#                 kmers.append(canonical_kmer)
#         return kmers

# def process_sequence(sequence_id, sequence, patterns):
#     all_kmers = []
#     for pattern in patterns:
#         all_kmers.extend(extract_spaced_kmers_with_reversecomplement(sequence, pattern))
#     return sequence_id, Counter(all_kmers)

# # Fungsi untuk menghitung jumlah spaced k-mers dalam sequences
# def count_spaced_kmers(sequences, patterns, n_jobs=8):
#     print("Menghitung masing-masing fitur dengan paralelisme")
#     kmer_counts = Parallel(n_jobs=n_jobs)(
#         delayed(process_sequence)(sequence_id, sequence, patterns)
#         for sequence_id, sequence in tqdm(sequences.items())
#     )
#     return dict(kmer_counts)

# # Fungsi untuk membangun ruang fitur dari k-mer counts
# def build_feature_space(kmer_counts, sequences):
#     print("Membangun feature space")
#     all_kmers = set()
#     for kmer_count in kmer_counts.values():
#         all_kmers.update(kmer_count.keys())

#     all_kmers = sorted(all_kmers)
#     feature_space = defaultdict(list)

#     for sequence_id, kmer_count in kmer_counts.items():
#         sequence_length = len(sequences[sequence_id])
#         feature_vector = [(kmer_count.get(kmer, 0) / sequence_length) for kmer in all_kmers]
#         feature_space[sequence_id] = feature_vector
    
#     return all_kmers, feature_space

In [5]:
# # Fungsi untuk menghasilkan pola spaced k-mers acak
# def generate_random_spaced_kmers():
#     patterns = []
#     for _ in range(1):  # Assuming we create 20 initial patterns
#         pattern_3 = '1' + ''.join(random.choice('01') for _ in range(1)) + '1'
#         pattern_4 = '1' + ''.join(random.choice('01') for _ in range(2)) + '1'
#         pattern_5 = '1' + '0' + ''.join(random.choice('01') for _ in range(2)) + '1'
#         patterns.append([pattern_3, pattern_4, pattern_5])
#     return patterns

# Contigs

## Feature

In [6]:
# Mendapatkan semua file di dalam folder
folder_path = './'
file_paths = glob.glob(os.path.join(folder_path, '*.fna.gz'))

patterns = [['111', '1111', '11111']]
all_features = []
file_names = []  # List untuk menyimpan nama file

for file_path in file_paths:
    print(f"Memproses file: {file_path}")  # Cetak nama file yang sedang diproses
    sequences = file_iter(file_path)
    for pattern_group in patterns:
        kmer_counts = count_spaced_kmers(sequences, pattern_group)
        feature_name, feature_space = build_feature_space(kmer_counts, sequences)
        features = np.array(list(feature_space.values()))
        all_features.append(features)
        file_names.extend([file_path] * len(features))  # Tambahkan nama file untuk setiap sequence

# Gabungkan semua fitur menjadi satu DataFrame
all_features_combined = np.vstack(all_features)
feature_df = pd.DataFrame(all_features_combined)

# Menampilkan DataFrame hasil
feature_df

Memproses file: ./GCF_000009905-Exact.d324fd4b.fna.gz


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:04<00:00, 2039.39it/s]


Memproses file: ./GCF_000018325-Exact.d3251aed.fna.gz


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:04<00:00, 2165.98it/s]


Memproses file: ./GCF_000009045-Exact.d324f740.fna.gz


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:04<00:00, 2138.35it/s]


Memproses file: ./GCF_000012325-Exact.d3250346.fna.gz


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:05<00:00, 1861.14it/s]


Memproses file: ./GCF_000024765-Exact.d325265a.fna.gz


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:04<00:00, 2172.05it/s]


Memproses file: ./GCF_000014725-Exact.d3250ec9.fna.gz


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:04<00:00, 2068.68it/s]


Memproses file: ./GCF_000021925-Exact.d3252020.fna.gz


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:04<00:00, 2185.01it/s]


Memproses file: ./GCF_000015285-Exact.d32514fa.fna.gz


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:04<00:00, 2171.06it/s]


Memproses file: ./GCF_000013425-Exact.d324f178.fna.gz


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:04<00:00, 2161.84it/s]


Memproses file: ./GCF_000012985-Exact.d32508a2.fna.gz


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:04<00:00, 2175.29it/s]


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,146,147,148,149,150,151,152,153,154,155
0,0.064,0.086,0.112,0.026,0.044,0.098,0.128,0.022,0.002,0.002,...,0.000,0.000,0.000,0.000,0.004,0.012,0.004,0.006,0.006,0.000
1,0.040,0.094,0.114,0.024,0.040,0.122,0.118,0.008,0.002,0.000,...,0.000,0.002,0.002,0.000,0.004,0.012,0.002,0.006,0.004,0.000
2,0.056,0.100,0.140,0.030,0.040,0.122,0.128,0.034,0.000,0.004,...,0.002,0.002,0.000,0.000,0.004,0.010,0.004,0.012,0.000,0.000
3,0.028,0.120,0.108,0.028,0.046,0.116,0.132,0.010,0.000,0.000,...,0.002,0.008,0.002,0.000,0.004,0.008,0.010,0.002,0.006,0.000
4,0.030,0.112,0.122,0.014,0.024,0.126,0.126,0.010,0.000,0.002,...,0.000,0.002,0.002,0.000,0.000,0.010,0.002,0.006,0.006,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0.112,0.122,0.108,0.072,0.116,0.152,0.120,0.050,0.002,0.012,...,0.000,0.002,0.002,0.004,0.028,0.004,0.008,0.020,0.002,0.002
99996,0.092,0.156,0.106,0.038,0.116,0.142,0.112,0.032,0.012,0.012,...,0.006,0.002,0.006,0.002,0.010,0.012,0.002,0.008,0.008,0.000
99997,0.136,0.100,0.140,0.060,0.144,0.124,0.100,0.064,0.026,0.006,...,0.012,0.002,0.004,0.000,0.008,0.014,0.002,0.006,0.004,0.004
99998,0.108,0.126,0.114,0.058,0.104,0.130,0.130,0.050,0.006,0.008,...,0.012,0.010,0.006,0.000,0.006,0.010,0.002,0.008,0.002,0.006


## Label

In [7]:
# label_path = "./contigs/gsa_mapping.tsv.gz"
# label_df = pd.read_csv(label_path, sep='\t')
# label_df = label_df[["genome_id"]]
# label_df

def fasta_iter(fasta_file):
    with gzip.open(fasta_file, 'rt') as file:
        name = None
        seqs = []
        for line in file:
            line = line.strip()
            if line.startswith(">"):
                if name:
                    yield name, ''.join(seqs)
                name = line[1:]  # Skip '>'
                seqs = []
            else:
                seqs.append(line)
        if name:
            yield name, ''.join(seqs)  # Yield the last entry
            
# Function to detect file type based on file extension
def detect_file_type(file_path):
    ext = os.path.splitext(file_path)[-1].lower()  # Get file extension and convert to lowercase
    # Handle gzipped files
    if ext == '.gz':
        ext = os.path.splitext(file_path[:-3])[-1].lower()  # Check the extension before ".gz"
        
    if ext in ['.fasta', '.fa', '.fna']:
        return 'fasta'
    elif ext in ['.fastq', '.fq']:
        return 'fastq'
    else:
        raise ValueError("Unsupported file extension. Please provide a valid FASTA or FASTQ file.")

def generate_label_dataframe(file_path):
    labels = OrderedDict()
    
    # Automatically detect file type
    file_type = detect_file_type(file_path)
    if file_type == 'fasta':
        seq_iter = fasta_iter(file_path)
    elif file_type == 'fastq':
        seq_iter = fastq_iter(file_path)
    else:
        raise ValueError("Unsupported file type. Use 'fasta' or 'fastq'.")

    for h, seq in seq_iter:
        # Ambil hanya ID sebelum karakter "|"
        seq_id = h.split('|')[0].strip()
        # Ekstrak label di antara tanda petik dua ("")
        label_match = re.search(r'"(.*?)"', h)
        label = label_match.group(1) if label_match else "Unknown"
        
        # Simpan ID dan label
        labels[seq_id] = label

    # Buat DataFrame dari dictionary labels
    label_df = pd.DataFrame.from_dict(labels, orient='index', columns=['Label'])
    
    return label_df

folder_path = './'
files = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith('.fna.gz')]

# Menggabungkan semua label dari beberapa file
label_df = pd.concat([generate_label_dataframe(f) for f in files])

# Reset indeks jika diperlukan
label_df.reset_index(inplace=True)
label_df.columns = ['Sequence_ID', 'Label']
label_df =  label_df[['Label']]
label_df

Unnamed: 0,Label
0,NC_006177.1 Symbiobacterium thermophilum IAM 1...
1,NC_006177.1 Symbiobacterium thermophilum IAM 1...
2,NC_006177.1 Symbiobacterium thermophilum IAM 1...
3,NC_006177.1 Symbiobacterium thermophilum IAM 1...
4,NC_006177.1 Symbiobacterium thermophilum IAM 1...
...,...
99995,"NC_007645.1 Hahella chejuensis KCTC 2396, comp..."
99996,"NC_007645.1 Hahella chejuensis KCTC 2396, comp..."
99997,"NC_007645.1 Hahella chejuensis KCTC 2396, comp..."
99998,"NC_007645.1 Hahella chejuensis KCTC 2396, comp..."


## Classifier

In [8]:
from sklearn.model_selection import train_test_split
import time
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

## Pembagian data

In [9]:
features = feature_df
labels = label_df

# Pisahkan data menjadi training dan testing set (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

print("Jumlah data latih per kelas:")
print(np.unique(y_train, return_counts=True))

print("\nJumlah data uji per kelas:")
print(np.unique(y_test, return_counts=True))

Jumlah data latih per kelas:
(array(['NC_000964.3 Bacillus subtilis subsp. subtilis str. 168 complete genome',
       'NC_003910.7 Colwellia psychrerythraea 34H, complete sequence',
       'NC_006177.1 Symbiobacterium thermophilum IAM 14863, complete sequence',
       'NC_007645.1 Hahella chejuensis KCTC 2396, complete sequence',
       'NC_007795.1 Staphylococcus aureus subsp. aureus NCTC 8325 chromosome, complete genome',
       'NC_008346.1 Syntrophomonas wolfei subsp. wolfei str. Goettingen G311, complete sequence',
       'NC_008709.1 Psychromonas ingrahamii 37, complete sequence',
       'NC_009922.1 Alkaliphilus oremlandii OhILAs, complete sequence',
       'NC_011830.1 Desulfitobacterium hafniense DCB-2, complete sequence',
       'NC_013422.1 Halothiobacillus neapolitanus c2, complete sequence'],
      dtype=object), array([8023, 7962, 7995, 8014, 8008, 8000, 7998, 7987, 8015, 7998]))

Jumlah data uji per kelas:
(array(['NC_000964.3 Bacillus subtilis subsp. subtilis str. 168 c

## Random Forest

In [10]:
# List model yang akan digunakan
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Logistic Regression": LogisticRegression(random_state=42),
    "Support Vector Machine": SVC(random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    #"Gradient Boosting": GradientBoostingClassifier(n_estimators=100, random_state=42),
    #"MLP Classifier": MLPClassifier(random_state=42, max_iter=500)
}

# Menyimpan hasil metrik dan waktu komputasi
results = []

# Loop melalui semua model dan evaluasi performanya
for model_name, model in models.items():
    start_time = time.time()  # Mulai penghitungan waktu
    
    # Latih model
    model.fit(X_train, y_train)
    
    # Prediksi menggunakan data testing
    y_pred = model.predict(X_test)
    
    # Hitung waktu komputasi
    elapsed_time = time.time() - start_time
    
    # Evaluasi model dengan berbagai metrik
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    
    # Simpan hasil dalam dictionary
    results.append({
        "Model": model_name,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1,
        "Time (seconds)": elapsed_time
    })
    
    # Tampilkan hasil classification report untuk setiap model
    print(f"\nClassification Report untuk {model_name}:")
    print(classification_report(y_test, y_pred))

# Tampilkan hasil dalam bentuk dataframe untuk mempermudah pembacaan
import pandas as pd
results_df = pd.DataFrame(results)
results_df

  return fit_method(estimator, *args, **kwargs)



Classification Report untuk Random Forest:
                                                                                         precision    recall  f1-score   support

                 NC_000964.3 Bacillus subtilis subsp. subtilis str. 168 complete genome       0.85      0.84      0.84      1977
                           NC_003910.7 Colwellia psychrerythraea 34H, complete sequence       0.78      0.82      0.80      2038
                  NC_006177.1 Symbiobacterium thermophilum IAM 14863, complete sequence       0.99      0.97      0.98      2005
                            NC_007645.1 Hahella chejuensis KCTC 2396, complete sequence       0.92      0.84      0.88      1986
  NC_007795.1 Staphylococcus aureus subsp. aureus NCTC 8325 chromosome, complete genome       0.82      0.91      0.87      1992
NC_008346.1 Syntrophomonas wolfei subsp. wolfei str. Goettingen G311, complete sequence       0.86      0.82      0.84      2000
                              NC_008709.1 Psychromon

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Classification Report untuk Logistic Regression:
                                                                                         precision    recall  f1-score   support

                 NC_000964.3 Bacillus subtilis subsp. subtilis str. 168 complete genome       0.79      0.82      0.80      1977
                           NC_003910.7 Colwellia psychrerythraea 34H, complete sequence       0.72      0.77      0.75      2038
                  NC_006177.1 Symbiobacterium thermophilum IAM 14863, complete sequence       0.98      0.96      0.97      2005
                            NC_007645.1 Hahella chejuensis KCTC 2396, complete sequence       0.85      0.82      0.83      1986
  NC_007795.1 Staphylococcus aureus subsp. aureus NCTC 8325 chromosome, complete genome       0.76      0.86      0.81      1992
NC_008346.1 Syntrophomonas wolfei subsp. wolfei str. Goettingen G311, complete sequence       0.80      0.69      0.74      2000
                              NC_008709.1 Psyc

  y = column_or_1d(y, warn=True)



Classification Report untuk Support Vector Machine:
                                                                                         precision    recall  f1-score   support

                 NC_000964.3 Bacillus subtilis subsp. subtilis str. 168 complete genome       0.86      0.85      0.86      1977
                           NC_003910.7 Colwellia psychrerythraea 34H, complete sequence       0.78      0.82      0.80      2038
                  NC_006177.1 Symbiobacterium thermophilum IAM 14863, complete sequence       0.98      0.98      0.98      2005
                            NC_007645.1 Hahella chejuensis KCTC 2396, complete sequence       0.89      0.89      0.89      1986
  NC_007795.1 Staphylococcus aureus subsp. aureus NCTC 8325 chromosome, complete genome       0.84      0.90      0.87      1992
NC_008346.1 Syntrophomonas wolfei subsp. wolfei str. Goettingen G311, complete sequence       0.86      0.79      0.82      2000
                              NC_008709.1 P

  return self._fit(X, y)



Classification Report untuk K-Nearest Neighbors:
                                                                                         precision    recall  f1-score   support

                 NC_000964.3 Bacillus subtilis subsp. subtilis str. 168 complete genome       0.79      0.86      0.83      1977
                           NC_003910.7 Colwellia psychrerythraea 34H, complete sequence       0.72      0.82      0.76      2038
                  NC_006177.1 Symbiobacterium thermophilum IAM 14863, complete sequence       0.99      0.98      0.98      2005
                            NC_007645.1 Hahella chejuensis KCTC 2396, complete sequence       0.86      0.82      0.84      1986
  NC_007795.1 Staphylococcus aureus subsp. aureus NCTC 8325 chromosome, complete genome       0.81      0.89      0.85      1992
NC_008346.1 Syntrophomonas wolfei subsp. wolfei str. Goettingen G311, complete sequence       0.77      0.79      0.78      2000
                              NC_008709.1 Psyc

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,Time (seconds)
0,Random Forest,0.8514,0.85273,0.8514,0.850984,47.385388
1,Logistic Regression,0.79635,0.796612,0.79635,0.794912,32.201012
2,Support Vector Machine,0.85385,0.853765,0.85385,0.853334,426.004485
3,K-Nearest Neighbors,0.8209,0.821821,0.8209,0.819811,2.158871


In [12]:
# List model yang akan digunakan
models = {
    # "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    # "Logistic Regression": LogisticRegression(random_state=42),
    # "Support Vector Machine": SVC(random_state=42),
    # "K-Nearest Neighbors": KNeighborsClassifier(),
    # "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, random_state=42),
    "MLP Classifier": MLPClassifier(random_state=42, max_iter=100)
}

# Menyimpan hasil metrik dan waktu komputasi
results = []

# Loop melalui semua model dan evaluasi performanya
for model_name, model in models.items():
    start_time = time.time()  # Mulai penghitungan waktu
    
    # Latih model
    model.fit(X_train, y_train)
    
    # Prediksi menggunakan data testing
    y_pred = model.predict(X_test)
    
    # Hitung waktu komputasi
    elapsed_time = time.time() - start_time
    
    # Evaluasi model dengan berbagai metrik
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    
    # Simpan hasil dalam dictionary
    results.append({
        "Model": model_name,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1,
        "Time (seconds)": elapsed_time
    })
    
    # Tampilkan hasil classification report untuk setiap model
    print(f"\nClassification Report untuk {model_name}:")
    print(classification_report(y_test, y_pred))

dff = pd.DataFrame(results)
dff

  y = column_or_1d(y, warn=True)



Classification Report untuk MLP Classifier:
                                                                                         precision    recall  f1-score   support

                 NC_000964.3 Bacillus subtilis subsp. subtilis str. 168 complete genome       0.89      0.84      0.86      1977
                           NC_003910.7 Colwellia psychrerythraea 34H, complete sequence       0.82      0.79      0.80      2038
                  NC_006177.1 Symbiobacterium thermophilum IAM 14863, complete sequence       0.99      0.98      0.98      2005
                            NC_007645.1 Hahella chejuensis KCTC 2396, complete sequence       0.89      0.91      0.90      1986
  NC_007795.1 Staphylococcus aureus subsp. aureus NCTC 8325 chromosome, complete genome       0.87      0.90      0.88      1992
NC_008346.1 Syntrophomonas wolfei subsp. wolfei str. Goettingen G311, complete sequence       0.84      0.83      0.83      2000
                              NC_008709.1 Psychromo

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,Time (seconds)
0,MLP Classifier,0.8661,0.866534,0.8661,0.866121,312.043707


In [13]:
hasil = pd.concat([results_df, dff], ignore_index=True)
hasil

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,Time (seconds)
0,Random Forest,0.8514,0.85273,0.8514,0.850984,47.385388
1,Logistic Regression,0.79635,0.796612,0.79635,0.794912,32.201012
2,Support Vector Machine,0.85385,0.853765,0.85385,0.853334,426.004485
3,K-Nearest Neighbors,0.8209,0.821821,0.8209,0.819811,2.158871
4,MLP Classifier,0.8661,0.866534,0.8661,0.866121,312.043707


# Reads

## Feature

In [9]:
read_file = './reads/anonymous_reads.fq.gz' 
sequences = file_iter(read_file)
patterns = [['101', '1111', '10001']]
for i, pattern_group in enumerate(patterns, start=1):
    kmer_counts = count_spaced_kmers(sequences, pattern_group)
    feature_name, feature_space = build_feature_space(kmer_counts, sequences)
    features = np.array(list(feature_space.values()))

feature_df = pd.DataFrame(features)
feature_df

Menghitung masing-masing fitur dengan paralelisme


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 666654/666654 [01:54<00:00, 5797.08it/s]


Membangun feature space


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,146,147,148,149,150,151,152,153,154,155
0,0.046667,0.093333,0.140000,0.026667,0.053333,0.133333,0.073333,0.040000,0.000000,0.006667,...,0.006667,0.000000,0.000000,0.000000,0.000000,0.006667,0.013333,0.000000,0.006667,0.0
1,0.026667,0.160000,0.126667,0.020000,0.040000,0.153333,0.106667,0.026667,0.006667,0.000000,...,0.000000,0.006667,0.000000,0.000000,0.000000,0.013333,0.006667,0.000000,0.000000,0.0
2,0.033333,0.100000,0.140000,0.046667,0.046667,0.173333,0.093333,0.026667,0.000000,0.006667,...,0.000000,0.000000,0.000000,0.000000,0.006667,0.000000,0.006667,0.000000,0.000000,0.0
3,0.080000,0.106667,0.153333,0.040000,0.066667,0.126667,0.113333,0.060000,0.000000,0.000000,...,0.000000,0.000000,0.013333,0.000000,0.006667,0.006667,0.020000,0.000000,0.000000,0.0
4,0.073333,0.100000,0.140000,0.020000,0.073333,0.140000,0.106667,0.020000,0.000000,0.006667,...,0.000000,0.013333,0.000000,0.000000,0.013333,0.006667,0.006667,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
666649,0.080000,0.133333,0.126667,0.033333,0.073333,0.093333,0.140000,0.053333,0.006667,0.006667,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.006667,0.026667,0.000000,0.000000,0.0
666650,0.080000,0.106667,0.080000,0.040000,0.040000,0.113333,0.153333,0.020000,0.000000,0.000000,...,0.000000,0.006667,0.000000,0.000000,0.006667,0.000000,0.026667,0.006667,0.006667,0.0
666651,0.100000,0.100000,0.120000,0.053333,0.093333,0.093333,0.180000,0.033333,0.006667,0.013333,...,0.000000,0.000000,0.000000,0.000000,0.006667,0.000000,0.006667,0.006667,0.006667,0.0
666652,0.086667,0.120000,0.133333,0.026667,0.086667,0.140000,0.106667,0.033333,0.000000,0.000000,...,0.000000,0.006667,0.000000,0.000000,0.013333,0.020000,0.000000,0.006667,0.013333,0.0


In [10]:
labelR_path = "./reads/reads_mapping.tsv.gz"
labelR_df = pd.read_csv(labelR_path, sep='\t')
labelR_df = labelR_df[["genome_id"]]
labelR_df

Unnamed: 0,genome_id
0,Genome19.0
1,Genome19.0
2,Genome19.0
3,Genome19.0
4,Genome18.0
...,...
666649,Genome18.0
666650,Genome19.0
666651,Genome19.0
666652,Genome19.0


In [11]:
features = feature_df
labels = labelR_df

# Pisahkan data menjadi training dan testing set (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

print("Jumlah data latih per kelas:")
print(np.unique(y_train, return_counts=True))

print("\nJumlah data uji per kelas:")
print(np.unique(y_test, return_counts=True))

Jumlah data latih per kelas:
(array(['Genome10.0', 'Genome10.1', 'Genome11.0', 'Genome12.0',
       'Genome13.0', 'Genome14.0', 'Genome15.0', 'Genome16.0',
       'Genome17.0', 'Genome18.0', 'Genome19.0', 'Genome2.0',
       'Genome20.0', 'Genome22.0', 'Genome23.0', 'Genome24.0',
       'Genome3.0', 'Genome4.0', 'Genome4.1', 'Genome5.0', 'Genome6.0',
       'Genome7.0', 'Genome8.0', 'Genome9.0'], dtype=object), array([  2843,   9214,     37,    438,   2081,   8187,   5417,    338,
         1936,  69103, 367831,  12720,    853,  13194,   2856,    444,
         9619,   5141,   1501,    340,   7383,    656,  10800,    391]))

Jumlah data uji per kelas:
(array(['Genome10.0', 'Genome10.1', 'Genome11.0', 'Genome12.0',
       'Genome13.0', 'Genome14.0', 'Genome15.0', 'Genome16.0',
       'Genome17.0', 'Genome18.0', 'Genome19.0', 'Genome2.0',
       'Genome20.0', 'Genome22.0', 'Genome23.0', 'Genome24.0',
       'Genome3.0', 'Genome4.0', 'Genome4.1', 'Genome5.0', 'Genome6.0',
       'Genome7.0'

In [12]:
# Inisialisasi model Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Latih model
rf_model.fit(X_train, y_train)

# Prediksi menggunakan data testing
y_pred = rf_model.predict(X_test)

# Tampilkan hasil akurasi dan classification report
print("\nAkurasi Model:")
print(accuracy_score(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

  return fit_method(estimator, *args, **kwargs)



Akurasi Model:
0.6924421177370604

Classification Report:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

  Genome10.0       0.00      0.00      0.00       759
  Genome10.1       0.00      0.00      0.00      2264
  Genome11.0       0.00      0.00      0.00         9
  Genome12.0       0.00      0.00      0.00       106
  Genome13.0       0.00      0.00      0.00       491
  Genome14.0       0.00      0.00      0.00      2001
  Genome15.0       0.00      0.00      0.00      1323
  Genome16.0       0.00      0.00      0.00        94
  Genome17.0       0.00      0.00      0.00       446
  Genome18.0       0.19      0.00      0.00     17141
  Genome19.0       0.69      1.00      0.82     92345
   Genome2.0       0.00      0.00      0.00      3170
  Genome20.0       0.00      0.00      0.00       189
  Genome22.0       0.00      0.00      0.00      3268
  Genome23.0       0.00      0.00      0.00       730
  Genome24.0       0.00      0.00      0.00       118
   Genome3.0       0.00      0.00      0.00      2399
   Genome4.0       0.00    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
