In [11]:
# RANDOM FOREST CLASSIFIER

import pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import os
from sklearn.metrics import classification_report

# Load the pickled dataset. The context manager closes the file handle as soon
# as the load finishes instead of leaving it open for the life of the kernel.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load data.pickle from a trusted source.
with open('../data.pickle', 'rb') as f:
    data_dict = pickle.load(f)

# Keep only samples with exactly 42 features, and keep each sample paired with
# its own label. Filtering samples and labels together matters: slicing the
# label list to the first len(valid_data) entries would shift every label that
# follows a dropped sample.
valid_pairs = [
    (sample, label)
    for sample, label in zip(data_dict['data'], data_dict['labels'])
    if len(sample) == 42
]
valid_data = [sample for sample, _ in valid_pairs]

# Convert to NumPy arrays; data is expected to have shape (n_samples, 42).
data = np.array(valid_data)
labels = np.array([label for _, label in valid_pairs])

# Map a running index to each distinct label, in sorted label order.
labels_dict = {i: label for i, label in enumerate(sorted(set(labels)))}

# Hold out 30% of the samples for testing. The split is stratified so every
# class keeps its proportion, and seeded so the reported score is reproducible
# across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    data, labels, test_size=0.30, shuffle=True, stratify=labels, random_state=42
)

# Fit a random forest with default hyperparameters; seeded for the same reason.
model = RandomForestClassifier(random_state=42)
model.fit(x_train, y_train)

# Predict on the held-out set and report accuracy.
# accuracy_score is documented as (y_true, y_pred), so pass the truth first.
y_predict = model.predict(x_test)
score = accuracy_score(y_test, y_predict)
print('{}% of samples were classified correctly !'.format(score * 100))

# If a file named 'model.p' already exists in the current working directory,
# rename it to the first unused numbered name: model1.p, model2.p, ...
# NOTE(review): the model saved further down goes to '../model/', not to
# 'model.p', so this only moves a pre-existing file aside - confirm it is
# still wanted.
model_filename = 'model.p'
if os.path.exists(model_filename):
    stem, ext = os.path.splitext(model_filename)
    suffix = 1
    # Advance the counter until the candidate name is free.
    while os.path.exists(f'{stem}{suffix}{ext}'):
        suffix += 1
    os.rename(model_filename, f'{stem}{suffix}{ext}')

# Make sure the output folder exists. exist_ok=True replaces the separate
# exists() check, so there is no gap between checking and creating.
model_folder = '../model/'
os.makedirs(model_folder, exist_ok=True)

# Pick a file name that is not taken yet: 'rf_model.p' first, then
# 'rf_model_1.p', 'rf_model_2.p', ... so earlier models are not overwritten.
i = 1
model_filename = os.path.join(model_folder, 'rf_model.p')
while os.path.exists(model_filename):
    model_filename = os.path.join(model_folder, f'rf_model_{i}.p')
    i += 1

# Pickle the trained model wrapped in a dict under the key 'model'.
with open(model_filename, 'wb') as f:
    pickle.dump({'model': model}, f)

# Per-class precision / recall / F1 on the held-out set. The display names are
# the values of labels_dict taken in ascending key order.
class_names = [labels_dict[key] for key in sorted(labels_dict)]
report = classification_report(y_test, y_predict, target_names=class_names)
print(report)

97.5866851595007% of samples were classified correctly !
              precision    recall  f1-score   support

           A       0.98      0.99      0.99       150
           B       0.99      1.00      0.99       150
           C       0.98      0.99      0.99       148
           D       0.99      0.99      0.99       139
           E       0.99      0.97      0.98       149
           F       0.97      0.98      0.98       148
           G       0.98      0.98      0.98       130
           H       0.98      0.90      0.94       108
           I       0.94      0.99      0.97       149
           J       0.98      0.96      0.97        83
           K       0.96      0.97      0.97       149
           L       0.98      0.98      0.98       149
           M       0.99      0.98      0.99       149
           N       0.97      0.98      0.98       149
           O       0.99      0.97      0.98       144
           P       0.96      0.96      0.96       140
           Q       0.95 

In [12]:
import pickle
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import os

# Load the pickled dataset; the context manager closes the file handle.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load data.pickle from a trusted source.
with open('../data.pickle', 'rb') as f:
    data_dict = pickle.load(f)

# Keep only samples with exactly 42 features, together with their own labels.
# Samples and labels are filtered as pairs so that dropping a sample also drops
# its label; taking just the first len(valid_data) labels would misalign every
# label after a dropped sample.
valid_pairs = [
    (sample, label)
    for sample, label in zip(data_dict['data'], data_dict['labels'])
    if len(sample) == 42
]
valid_data = [sample for sample, _ in valid_pairs]
data = np.array(valid_data)
labels = np.array([label for _, label in valid_pairs])

# Map a running index to each distinct label, in sorted label order.
labels_dict = {i: label for i, label in enumerate(sorted(set(labels)))}

# Hold out 30% of the samples as the final test set (stratified and seeded).
x_train_full, x_test, y_train_full, y_test = train_test_split(
    data, labels, test_size=0.30, stratify=labels, random_state=42
)

# 10-fold stratified cross-validation over the training portion only; the test
# set is not touched inside the loop.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
accuracies = []
all_y_true = []
all_y_pred = []

for fold, (train_index, val_index) in enumerate(skf.split(x_train_full, y_train_full), 1):
    x_train, x_val = x_train_full[train_index], x_train_full[val_index]
    y_train, y_val = y_train_full[train_index], y_train_full[val_index]

    # A fresh, identically seeded forest per fold.
    model = RandomForestClassifier(random_state=42)
    model.fit(x_train, y_train)

    y_pred = model.predict(x_val)
    acc = accuracy_score(y_val, y_pred)
    accuracies.append(acc)

    # Pool the out-of-fold predictions so one report can cover all folds.
    all_y_true.extend(y_val)
    all_y_pred.extend(y_pred)

    print(f"Fold {fold} Accuracy: {acc:.4f}")

# The per-fold models exist only to estimate performance, and each one saw just
# nine tenths of the training split. The model that is evaluated on the test
# set and saved is therefore refit on the whole training split rather than
# being whichever model the last fold happened to produce.
final_model = RandomForestClassifier(random_state=42)
final_model.fit(x_train_full, y_train_full)

# Class display names (labels_dict values in ascending key order), built once
# and shared by both reports below.
target_names = [labels_dict[key] for key in sorted(labels_dict)]

# Mean accuracy over the cross-validation folds.
mean_kfold_acc = np.mean(accuracies)
print("\nMean K-Fold Accuracy:", mean_kfold_acc)

# Report computed from the pooled out-of-fold predictions.
report_kfold = classification_report(all_y_true, all_y_pred, target_names=target_names)
print("Classification Report (K-Fold):\n", report_kfold)

# Score final_model on the held-out test set.
y_test_pred = final_model.predict(x_test)
test_acc = accuracy_score(y_test, y_test_pred)
print("Final Test Accuracy:", test_acc)

# Report for the test-set predictions.
report_test = classification_report(y_test, y_test_pred, target_names=target_names)
print("Classification Report (Test Data):\n", report_test)

# Make sure the output folder exists. exist_ok=True replaces the separate
# exists() check, so there is no gap between checking and creating.
model_folder = '../model/'
os.makedirs(model_folder, exist_ok=True)

# Pick a file name that is not taken yet: 'rf_model_kfold.p' first, then
# 'rf_model_kfold_1.p', 'rf_model_kfold_2.p', ... so earlier models are kept.
i = 1
model_filename = os.path.join(model_folder, 'rf_model_kfold.p')
while os.path.exists(model_filename):
    model_filename = os.path.join(model_folder, f'rf_model_kfold_{i}.p')
    i += 1

# Pickle the final model wrapped in a dict under the key 'model'.
with open(model_filename, 'wb') as f:
    pickle.dump({'model': final_model}, f)

Fold 1 Accuracy: 0.9786
Fold 2 Accuracy: 0.9845
Fold 3 Accuracy: 0.9738
Fold 4 Accuracy: 0.9655
Fold 5 Accuracy: 0.9822
Fold 6 Accuracy: 0.9631
Fold 7 Accuracy: 0.9786
Fold 8 Accuracy: 0.9715
Fold 9 Accuracy: 0.9643
Fold 10 Accuracy: 0.9762

Mean K-Fold Accuracy: 0.9738406658739596
Classification Report (K-Fold):
               precision    recall  f1-score   support

           A       0.99      0.99      0.99       350
           B       0.99      1.00      0.99       350
           C       0.99      0.99      0.99       346
           D       0.99      0.99      0.99       323
           E       0.97      0.97      0.97       349
           F       0.97      0.98      0.98       344
           G       0.97      0.97      0.97       304
           H       0.97      0.94      0.96       251
           I       0.97      0.97      0.97       346
           J       0.96      0.97      0.97       193
           K       0.96      0.97      0.97       347
           L       0.98      0.97  